Avoid O(n²) in removeDuplicateGradient (#171)

The original implementation of removeDuplicateGradient does O(n²)
search over all gradients to remove duplicates.  In images with many
gradients (such as [MediaWiki_logo_1.svg]), this becomes a significant
overhead as that logo has over 900 duplicated gradients.

We solve this by creating a key for each gradient based on the
attributes we use for duplication detection.  This key is generated
such that if two gradients have the same key, they are duplicates (for
our purpose) and the keys are different then the gradients are
guaranteed to be different as well.  With such a key, we can rely on a
dict to handle the duplication detection (which it does very well).

This change improves the runtime performance on [MediaWiki_logo_1.svg]
by about 25% (8m51s -> 1m56s on 5 runs).

Signed-off-by: Niels Thykier <niels@thykier.net>
This commit is contained in:
Niels Thykier 2018-03-10 15:47:30 +01:00 committed by Eduard Braun
parent 7115e82cb8
commit cae0faefa0

View file

@ -57,7 +57,7 @@ import sys
import time
import xml.dom.minidom
from xml.dom import Node, NotFoundErr
from collections import namedtuple
from collections import namedtuple, defaultdict
from decimal import Context, Decimal, InvalidOperation, getcontext
import six
@ -1355,6 +1355,32 @@ def collapseSinglyReferencedGradients(doc):
return num
def computeGradientBucketKey(grad):
# Compute a key (hashable opaque value; here a string) from each
# gradient such that "key(grad1) == key(grad2)" is the same as
# saying that grad1 is a duplicate of grad2.
gradBucketAttr = ['gradientUnits', 'spreadMethod', 'gradientTransform',
'x1', 'y1', 'x2', 'y2', 'cx', 'cy', 'fx', 'fy', 'r']
gradStopBucketsAttr = ['offset', 'stop-color', 'stop-opacity', 'style']
# A linearGradient can never be a duplicate of a
# radialGradient (and vice versa)
subKeys = [grad.getAttribute(a) for a in gradBucketAttr]
subKeys.append(grad.getAttributeNS(NS['XLINK'], 'href'))
stops = grad.getElementsByTagName('stop')
if stops.length:
for i in range(stops.length):
stop = stops.item(i)
for attr in gradStopBucketsAttr:
stopKey = stop.getAttribute(attr)
subKeys.append(stopKey)
# Use a raw ASCII "record separator" control character as it is
# not likely to be used in any of these values (without having to
# be escaped).
return "\x1e".join(subKeys)
def removeDuplicateGradients(doc):
global _num_elements_removed
num = 0
@ -1364,58 +1390,23 @@ def removeDuplicateGradients(doc):
for gradType in ['linearGradient', 'radialGradient']:
grads = doc.getElementsByTagName(gradType)
gradBuckets = defaultdict(list)
for grad in grads:
# TODO: should slice grads from 'grad' here to optimize
for ograd in grads:
# do not compare gradient to itself
if grad == ograd:
key = computeGradientBucketKey(grad)
gradBuckets[key].append(grad)
for bucket in six.itervalues(gradBuckets):
if len(bucket) < 2:
# The gradient must be unique if it is the only one in
# this bucket.
continue
master = bucket[0]
duplicates = bucket[1:]
# compare grad to ograd (all properties, then all stops)
# if attributes do not match, go to next gradient
someGradAttrsDoNotMatch = False
for attr in ['gradientUnits', 'spreadMethod', 'gradientTransform',
'x1', 'y1', 'x2', 'y2', 'cx', 'cy', 'fx', 'fy', 'r']:
if grad.getAttribute(attr) != ograd.getAttribute(attr):
someGradAttrsDoNotMatch = True
break
if someGradAttrsDoNotMatch:
continue
# compare xlink:href values too
if grad.getAttributeNS(NS['XLINK'], 'href') != ograd.getAttributeNS(NS['XLINK'], 'href'):
continue
# all gradient properties match, now time to compare stops
stops = grad.getElementsByTagName('stop')
ostops = ograd.getElementsByTagName('stop')
if stops.length != ostops.length:
continue
# now compare stops
stopsNotEqual = False
for i in range(stops.length):
if stopsNotEqual:
break
stop = stops.item(i)
ostop = ostops.item(i)
for attr in ['offset', 'stop-color', 'stop-opacity', 'style']:
if stop.getAttribute(attr) != ostop.getAttribute(attr):
stopsNotEqual = True
break
if stopsNotEqual:
continue
# ograd is a duplicate of grad, we schedule it to be removed UNLESS
# ograd is ALREADY considered a 'master' element
if ograd not in gradientsToRemove:
if ograd not in duplicateToMaster:
if grad not in gradientsToRemove:
gradientsToRemove[grad] = []
gradientsToRemove[grad].append(ograd)
duplicateToMaster[ograd] = grad
gradientsToRemove[master] = duplicates
for ograd in duplicates:
duplicateToMaster[ograd] = master
# get a collection of all elements that are referenced and their referencing elements
referencedIDs = findReferencedElements(doc.documentElement)