Avoid O(n²) in removeDuplicateGradients
The original implementation of removeDuplicateGradients does an O(n²)
pairwise search over all gradients to find duplicates. In images with many
gradients (such as [MediaWiki_logo_1.svg]), this becomes a significant
overhead; that logo alone contains over 900 duplicated gradients.
We solve this by computing a key for each gradient from exactly the
attributes we use for duplicate detection. The key is constructed so
that two gradients are duplicates (for our purposes) if and only if
their keys are equal; if the keys differ, the gradients are guaranteed
to differ as well. With such a key, we can rely on a dict to handle
the duplicate detection (which it does very well, in O(1) per lookup).
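A minimal sketch of the bucketing idea (illustrative only: gradients are
reduced to plain attribute dicts, the key covers just four attributes, and
bucketKey/findDuplicates are hypothetical names for this sketch; the real
computeGradientBucketKey in the diff below covers all relevant gradient
and stop attributes):

from collections import defaultdict

def bucketKey(grad):
    # Stand-in for computeGradientBucketKey(): join the attributes used
    # for duplicate detection with a separator unlikely to occur in them.
    return "\x1e".join(grad.get(a, '') for a in ('x1', 'y1', 'x2', 'y2'))

def findDuplicates(grads):
    buckets = defaultdict(list)
    for grad in grads:  # one O(1) dict insert per gradient -> O(n) overall
        buckets[bucketKey(grad)].append(grad)
    # the first gradient in a bucket is the master; the rest are duplicates
    return [(b[0], b[1:]) for b in buckets.values() if len(b) > 1]

grads = [{'x1': '0', 'x2': '1'}, {'x1': '0', 'x2': '1'}, {'x1': '0', 'x2': '2'}]
print(findDuplicates(grads))
# -> [({'x1': '0', 'x2': '1'}, [{'x1': '0', 'x2': '1'}])]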
This change reduces the runtime on [MediaWiki_logo_1.svg] to roughly
a quarter of the original (8m51s -> 1m56s for 5 runs).
Original:
$ time for I in $(seq 1 5) ; do \
PYTHONPATH=. python3 -m scour.scour MediaWiki_logo_1.svg out.svg ; \
done
Scour processed file "heavy.svg" in 105042 ms: 1582746/4989544 bytes new/orig -> 31.7%
Scour processed file "heavy.svg" in 103412 ms: 1582746/4989544 bytes new/orig -> 31.7%
Scour processed file "heavy.svg" in 105334 ms: 1582746/4989544 bytes new/orig -> 31.7%
Scour processed file "heavy.svg" in 107902 ms: 1582746/4989544 bytes new/orig -> 31.7%
Scour processed file "heavy.svg" in 108161 ms: 1582746/4989544 bytes new/orig -> 31.7%
real 8m51.855s
...
Optimized:
$ time for I in $(seq 1 5) ; do \
PYTHONPATH=. python3 -m scour.scour MediaWiki_logo_1.svg out.svg ; \
done
Scour processed file "heavy.svg" in 21559 ms: 1582746/4989544 bytes new/orig -> 31.7%
Scour processed file "heavy.svg" in 21936 ms: 1582746/4989544 bytes new/orig -> 31.7%
Scour processed file "heavy.svg" in 21540 ms: 1582746/4989544 bytes new/orig -> 31.7%
Scour processed file "heavy.svg" in 21518 ms: 1582746/4989544 bytes new/orig -> 31.7%
Scour processed file "heavy.svg" in 21664 ms: 1582746/4989544 bytes new/orig -> 31.7%
real 1m56.400s
...
[MediaWiki_logo_1.svg]: https://upload.wikimedia.org/wikipedia/commons/archive/5/54/20120822053933%21MediaWiki_logo_1.svg
Signed-off-by: Niels Thykier <niels@thykier.net>
parent 0776d32179
commit 69471b8e8d
1 changed file with 41 additions and 50 deletions
@@ -57,7 +57,7 @@ import sys
 import time
 import xml.dom.minidom
 from xml.dom import Node, NotFoundErr
-from collections import namedtuple
+from collections import namedtuple, defaultdict
 from decimal import Context, Decimal, InvalidOperation, getcontext
 
 import six
@@ -1355,6 +1355,32 @@ def collapseSinglyReferencedGradients(doc):
     return num
 
 
+def computeGradientBucketKey(grad):
+    # Compute a key (hashable opaque value; here a string) from each
+    # gradient such that "key(grad1) == key(grad2)" is the same as
+    # saying that grad1 is a duplicate of grad2.
+    gradBucketAttr = ['gradientUnits', 'spreadMethod', 'gradientTransform',
+                      'x1', 'y1', 'x2', 'y2', 'cx', 'cy', 'fx', 'fy', 'r']
+    gradStopBucketsAttr = ['offset', 'stop-color', 'stop-opacity', 'style']
+
+    # A linearGradient can never be a duplicate of a
+    # radialGradient (and vice versa)
+    subKeys = [grad.getAttribute(a) for a in gradBucketAttr]
+    subKeys.append(grad.getAttributeNS(NS['XLINK'], 'href'))
+    stops = grad.getElementsByTagName('stop')
+    if stops.length:
+        for i in range(stops.length):
+            stop = stops.item(i)
+            for attr in gradStopBucketsAttr:
+                stopKey = stop.getAttribute(attr)
+                subKeys.append(stopKey)
+
+    # Use a raw ASCII "record separator" control character as it is
+    # not likely to be used in any of these values (without having to
+    # be escaped).
+    return "\x1e".join(subKeys)
+
+
 def removeDuplicateGradients(doc):
     global _num_elements_removed
     num = 0
@@ -1364,58 +1390,23 @@ def removeDuplicateGradients(doc):
 
     for gradType in ['linearGradient', 'radialGradient']:
         grads = doc.getElementsByTagName(gradType)
+        gradBuckets = defaultdict(list)
 
         for grad in grads:
-            # TODO: should slice grads from 'grad' here to optimize
-            for ograd in grads:
-                # do not compare gradient to itself
-                if grad == ograd:
-                    continue
+            key = computeGradientBucketKey(grad)
+            gradBuckets[key].append(grad)
 
-                # compare grad to ograd (all properties, then all stops)
-                # if attributes do not match, go to next gradient
-                someGradAttrsDoNotMatch = False
-                for attr in ['gradientUnits', 'spreadMethod', 'gradientTransform',
-                             'x1', 'y1', 'x2', 'y2', 'cx', 'cy', 'fx', 'fy', 'r']:
-                    if grad.getAttribute(attr) != ograd.getAttribute(attr):
-                        someGradAttrsDoNotMatch = True
-                        break
-
-                if someGradAttrsDoNotMatch:
-                    continue
-
-                # compare xlink:href values too
-                if grad.getAttributeNS(NS['XLINK'], 'href') != ograd.getAttributeNS(NS['XLINK'], 'href'):
-                    continue
-
-                # all gradient properties match, now time to compare stops
-                stops = grad.getElementsByTagName('stop')
-                ostops = ograd.getElementsByTagName('stop')
-
-                if stops.length != ostops.length:
-                    continue
-
-                # now compare stops
-                stopsNotEqual = False
-                for i in range(stops.length):
-                    if stopsNotEqual:
-                        break
-                    stop = stops.item(i)
-                    ostop = ostops.item(i)
-                    for attr in ['offset', 'stop-color', 'stop-opacity', 'style']:
-                        if stop.getAttribute(attr) != ostop.getAttribute(attr):
-                            stopsNotEqual = True
-                            break
-                if stopsNotEqual:
-                    continue
-
-                # ograd is a duplicate of grad, we schedule it to be removed UNLESS
-                # ograd is ALREADY considered a 'master' element
-                if ograd not in gradientsToRemove:
-                    if ograd not in duplicateToMaster:
-                        if grad not in gradientsToRemove:
-                            gradientsToRemove[grad] = []
-                        gradientsToRemove[grad].append(ograd)
-                        duplicateToMaster[ograd] = grad
+        for bucket in six.itervalues(gradBuckets):
+            if len(bucket) < 2:
+                # The gradient must be unique if it is the only one in
+                # this bucket.
+                continue
+            master = bucket[0]
+            duplicates = bucket[1:]
+
+            gradientsToRemove[master] = duplicates
+            for ograd in duplicates:
+                duplicateToMaster[ograd] = master
 
     # get a collection of all elements that are referenced and their referencing elements
     referencedIDs = findReferencedElements(doc.documentElement)
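A note on the "\x1e" join in computeGradientBucketKey: without a field
separator, distinct attribute tuples could collapse into the same key.
A tiny illustration (not part of the commit):

# Joining without a separator conflates different field boundaries:
assert "".join(('ab', 'c')) == "".join(('a', 'bc'))
# The ASCII record separator (0x1E) keeps the fields distinct:
assert "\x1e".join(('ab', 'c')) != "\x1e".join(('a', 'bc'))

Since the values are joined unescaped, a value that itself contains
"\x1e" could still collide in theory; the comment in the diff accepts
this as unlikely for SVG attribute values.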