From a8e4d0efbf0fa3e3b8f857a7ec1e5bab5458628f Mon Sep 17 00:00:00 2001 From: Niels Thykier Date: Sat, 17 Feb 2018 15:05:50 +0000 Subject: [PATCH 1/4] Attempt to reuse results from findReferencedElements better Signed-off-by: Niels Thykier --- scour/scour.py | 33 ++++++++++++++++++++------------- 1 file changed, 20 insertions(+), 13 deletions(-) diff --git a/scour/scour.py b/scour/scour.py index 1998482..de75e24 100644 --- a/scour/scour.py +++ b/scour/scour.py @@ -1295,14 +1295,14 @@ def removeDuplicateGradientStops(doc): return num -def collapseSinglyReferencedGradients(doc): +def collapseSinglyReferencedGradients(doc, referencedIDs): global _num_elements_removed num = 0 identifiedElements = findElementsWithId(doc.documentElement) # make sure to reset the ref'ed ids for when we are running this in testscour - for rid, nodes in six.iteritems(findReferencedElements(doc.documentElement)): + for rid, nodes in six.iteritems(referencedIDs): # Make sure that there's actually a defining element for the current ID name. # (Cyn: I've seen documents with #id references but no element with that ID!) if len(nodes) == 1 and rid in identifiedElements: @@ -1355,7 +1355,7 @@ def collapseSinglyReferencedGradients(doc): return num -def removeDuplicateGradients(doc): +def removeDuplicateGradients(doc, referencedIDs): global _num_elements_removed num = 0 @@ -1417,8 +1417,6 @@ def removeDuplicateGradients(doc): gradientsToRemove[grad].append(ograd) duplicateToMaster[ograd] = grad - # get a collection of all elements that are referenced and their referencing elements - referencedIDs = findReferencedElements(doc.documentElement) for masterGrad in gradientsToRemove: master_id = masterGrad.getAttribute('id') for dupGrad in gradientsToRemove[masterGrad]: @@ -3470,27 +3468,37 @@ def scourString(in_string, options=None): elem.parentNode.removeChild(elem) _num_elements_removed += 1 + referencedIDs = None if options.strip_ids: + referencedIDs = findReferencedElements(doc.documentElement) bContinueLooping = True while bContinueLooping: identifiedElements = unprotected_ids(doc, options) - referencedIDs = findReferencedElements(doc.documentElement) bContinueLooping = (removeUnreferencedIDs(referencedIDs, identifiedElements) > 0) + referencedIDs = findReferencedElements(doc.documentElement) while removeDuplicateGradientStops(doc) > 0: - pass + # removeDuplicateGradientStops does not need referencedIDs, but it can + # remove nodes. Make no assumptions about referencedIDs still being valid + # after this call. + referencedIDs = None + + if referencedIDs is None: + referencedIDs = findReferencedElements(doc.documentElement) # remove gradients that are only referenced by one other gradient - while collapseSinglyReferencedGradients(doc) > 0: - pass + while collapseSinglyReferencedGradients(doc, referencedIDs) > 0: + referencedIDs = findReferencedElements(doc.documentElement) # remove duplicate gradients - while removeDuplicateGradients(doc) > 0: - pass + while removeDuplicateGradients(doc, referencedIDs) > 0: + referencedIDs = findReferencedElements(doc.documentElement) # create elements if there are runs of elements with the same attributes. # this MUST be before moveCommonAttributesToParentGroup. if options.group_create: + # This does remove nodes, so referencedIDs should remain + # valid after this. createGroupsForCommonAttributes(doc.documentElement) # move common attributes to parent group @@ -3498,9 +3506,8 @@ def scourString(in_string, options=None): # all have the same value for an attribute, it must not # get moved to the element. The element # doesn't accept fill=, stroke= etc.! - referencedIds = findReferencedElements(doc.documentElement) for child in doc.documentElement.childNodes: - _num_attributes_removed += moveCommonAttributesToParentGroup(child, referencedIds) + _num_attributes_removed += moveCommonAttributesToParentGroup(child, referencedIDs) # remove unused attributes from parent _num_attributes_removed += removeUnusedAttributesOnParent(doc.documentElement) From 180d7f8ddb76bbdfe7279af6e12f503624851261 Mon Sep 17 00:00:00 2001 From: Niels Thykier Date: Sat, 17 Feb 2018 14:07:35 +0000 Subject: [PATCH 2/4] Noop indentation Same behaviour; just force a level of indentation. Mostly useful to make the next commit more readable. Signed-off-by: Niels Thykier --- scour/scour.py | 84 ++++++++++++++++++++++++++------------------------ 1 file changed, 43 insertions(+), 41 deletions(-) diff --git a/scour/scour.py b/scour/scour.py index de75e24..6f32dd3 100644 --- a/scour/scour.py +++ b/scour/scour.py @@ -561,53 +561,55 @@ def findReferencedElements(node, ids=None): global referencingProps if ids is None: ids = {} - # TODO: input argument ids is clunky here (see below how it is called) - # GZ: alternative to passing dict, use **kwargs - # if this node is a style element, parse its text into CSS - if node.nodeName == 'style' and node.namespaceURI == NS['SVG']: - # one stretch of text, please! (we could use node.normalize(), but - # this actually modifies the node, and we don't want to keep - # whitespace around if there's any) - stylesheet = "".join([child.nodeValue for child in node.childNodes]) - if stylesheet != '': - cssRules = parseCssString(stylesheet) - for rule in cssRules: - for propname in rule['properties']: - propval = rule['properties'][propname] - findReferencingProperty(node, propname, propval, ids) - return ids + if 1: # Indent-only + # TODO: input argument ids is clunky here (see below how it is called) + # GZ: alternative to passing dict, use **kwargs - # else if xlink:href is set, then grab the id - href = node.getAttributeNS(NS['XLINK'], 'href') - if href != '' and len(href) > 1 and href[0] == '#': - # we remove the hash mark from the beginning of the id - id = href[1:] - if id in ids: - ids[id].append(node) - else: - ids[id] = [node] + # if this node is a style element, parse its text into CSS + if node.nodeName == 'style' and node.namespaceURI == NS['SVG']: + # one stretch of text, please! (we could use node.normalize(), but + # this actually modifies the node, and we don't want to keep + # whitespace around if there's any) + stylesheet = "".join([child.nodeValue for child in node.childNodes]) + if stylesheet != '': + cssRules = parseCssString(stylesheet) + for rule in cssRules: + for propname in rule['properties']: + propval = rule['properties'][propname] + findReferencingProperty(node, propname, propval, ids) + return ids - # now get all style properties and the fill, stroke, filter attributes - styles = node.getAttribute('style').split(';') + # else if xlink:href is set, then grab the id + href = node.getAttributeNS(NS['XLINK'], 'href') + if href != '' and len(href) > 1 and href[0] == '#': + # we remove the hash mark from the beginning of the id + id = href[1:] + if id in ids: + ids[id].append(node) + else: + ids[id] = [node] - for style in styles: - propval = style.split(':') - if len(propval) == 2: - prop = propval[0].strip() - val = propval[1].strip() - findReferencingProperty(node, prop, val, ids) + # now get all style properties and the fill, stroke, filter attributes + styles = node.getAttribute('style').split(';') - for attr in referencingProps: - val = node.getAttribute(attr).strip() - if not val: - continue - findReferencingProperty(node, attr, val, ids) + for style in styles: + propval = style.split(':') + if len(propval) == 2: + prop = propval[0].strip() + val = propval[1].strip() + findReferencingProperty(node, prop, val, ids) - if node.hasChildNodes(): - for child in node.childNodes: - if child.nodeType == Node.ELEMENT_NODE: - findReferencedElements(child, ids) + for attr in referencingProps: + val = node.getAttribute(attr).strip() + if not val: + continue + findReferencingProperty(node, attr, val, ids) + + if node.hasChildNodes(): + for child in node.childNodes: + if child.nodeType == Node.ELEMENT_NODE: + findReferencedElements(child, ids) return ids From c182faba0ddb8a4b2201e9b6e72ac662f9120716 Mon Sep 17 00:00:00 2001 From: Niels Thykier Date: Sat, 17 Feb 2018 14:18:14 +0000 Subject: [PATCH 3/4] Rewrite findReferencedElements without recursion A classic "Depth-First-Search via call stack recusion to Depth-First-Search with stack" rewrite. The only thing worth noting is that the nodes with same parent are visited in opposite order than originally. While trivially fixable, none of the callers appear to rely on the order for anything at all, so I have left that out. Signed-off-by: Niels Thykier --- scour/scour.py | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/scour/scour.py b/scour/scour.py index 6f32dd3..16af5b5 100644 --- a/scour/scour.py +++ b/scour/scour.py @@ -549,20 +549,23 @@ def findElementsWithId(node, elems=None): referencingProps = ['fill', 'stroke', 'filter', 'clip-path', 'mask', 'marker-start', 'marker-end', 'marker-mid'] -def findReferencedElements(node, ids=None): +def findReferencedElements(start_node): """ Returns IDs of all referenced elements - - node is the node at which to start the search. + - start_node is the node at which to start the search. - returns a map which has the id as key and each value is is a list of nodes Currently looks at 'xlink:href' and all attributes in 'referencingProps' """ global referencingProps - if ids is None: - ids = {} + ids = {} - if 1: # Indent-only + # Search all nodes in depth-first search + stack = [start_node] + + while stack: + node = stack.pop() # TODO: input argument ids is clunky here (see below how it is called) # GZ: alternative to passing dict, use **kwargs @@ -578,7 +581,7 @@ def findReferencedElements(node, ids=None): for propname in rule['properties']: propval = rule['properties'][propname] findReferencingProperty(node, propname, propval, ids) - return ids + continue # else if xlink:href is set, then grab the id href = node.getAttributeNS(NS['XLINK'], 'href') @@ -607,9 +610,8 @@ def findReferencedElements(node, ids=None): findReferencingProperty(node, attr, val, ids) if node.hasChildNodes(): - for child in node.childNodes: - if child.nodeType == Node.ELEMENT_NODE: - findReferencedElements(child, ids) + stack.extend(n for n in node.childNodes if n.nodeType == Node.ELEMENT_NODE) + return ids From 8c9c6253b9b9b4328b191740130275966c71c876 Mon Sep 17 00:00:00 2001 From: Niels Thykier Date: Sat, 17 Feb 2018 13:44:44 +0000 Subject: [PATCH 4/4] Move loop inside a function to reduce the number of calls In a profile of scour on a given input file, removeDefaultAttributeValue is one of the top 3 most frequent function being called with 332288 calls (with minidom.getAttribute and minidom.hasAttribute being the only ones with similar number of calls). The high number of calls occurs because removeDefaultAttributeValue only removes a single attribute at a time. Given it is always called in a trivial for loop, we can just move the loop inside removeDefaultAttributeValue to reduce the number of calls. At the same time, rename the function to better reflect that it works on multiple attributes. Signed-off-by: Niels Thykier --- scour/scour.py | 49 ++++++++++++++++++++++++++----------------------- 1 file changed, 26 insertions(+), 23 deletions(-) diff --git a/scour/scour.py b/scour/scour.py index 16af5b5..d5c9e8a 100644 --- a/scour/scour.py +++ b/scour/scour.py @@ -1896,34 +1896,38 @@ def taint(taintedSet, taintedAttribute): return taintedSet -def removeDefaultAttributeValue(node, attribute): +def removeDefaultAttributeValuesFromNode(node, attributes): """ - Removes the DefaultAttribute 'attribute' from 'node' if specified conditions are fulfilled + Removes each of the 'DefaultAttribute's from 'node' if specified conditions are fulfilled """ - if not node.hasAttribute(attribute.name): - return 0 + count = 0 + for attribute in attributes: + if not node.hasAttribute(attribute.name): + continue - if (attribute.elements is not None) and (node.nodeName not in attribute.elements): - return 0 + if (attribute.elements is not None) and (node.nodeName not in attribute.elements): + continue - # differentiate between text and numeric values - if isinstance(attribute.value, str): - if node.getAttribute(attribute.name) == attribute.value: - if (attribute.conditions is None) or attribute.conditions(node): - node.removeAttribute(attribute.name) - return 1 - else: - nodeValue = SVGLength(node.getAttribute(attribute.name)) - if ((attribute.value is None) - or ((nodeValue.value == attribute.value) and not (nodeValue.units == Unit.INVALID))): - if ((attribute.units is None) - or (nodeValue.units == attribute.units) - or (isinstance(attribute.units, list) and nodeValue.units in attribute.units)): + # differentiate between text and numeric values + if isinstance(attribute.value, str): + if node.getAttribute(attribute.name) == attribute.value: if (attribute.conditions is None) or attribute.conditions(node): node.removeAttribute(attribute.name) - return 1 + count += 1 + continue + else: + nodeValue = SVGLength(node.getAttribute(attribute.name)) + if ((attribute.value is None) + or ((nodeValue.value == attribute.value) and not (nodeValue.units == Unit.INVALID))): + if ((attribute.units is None) + or (nodeValue.units == attribute.units) + or (isinstance(attribute.units, list) and nodeValue.units in attribute.units)): + if (attribute.conditions is None) or attribute.conditions(node): + node.removeAttribute(attribute.name) + count += 1 + continue - return 0 + return count def removeDefaultAttributeValues(node, options, tainted=set()): @@ -1935,8 +1939,7 @@ def removeDefaultAttributeValues(node, options, tainted=set()): return 0 # Conditionally remove all default attributes defined in 'default_attributes' (a list of 'DefaultAttribute's) - for attribute in default_attributes: - num += removeDefaultAttributeValue(node, attribute) + num += removeDefaultAttributeValuesFromNode(node, default_attributes) # Summarily get rid of default properties attributes = [node.attributes.item(i).nodeName for i in range(node.attributes.length)]