From a8e4d0efbf0fa3e3b8f857a7ec1e5bab5458628f Mon Sep 17 00:00:00 2001
From: Niels Thykier <niels@thykier.net>
Date: Sat, 17 Feb 2018 15:05:50 +0000
Subject: [PATCH 1/4] Attempt to reuse results from findReferencedElements
 better

Signed-off-by: Niels Thykier <niels@thykier.net>
---
 scour/scour.py | 33 ++++++++++++++++++++-------------
 1 file changed, 20 insertions(+), 13 deletions(-)
diff --git a/scour/scour.py b/scour/scour.py
index 1998482..de75e24 100644
--- a/scour/scour.py
+++ b/scour/scour.py
@@ -1295,14 +1295,14 @@ def removeDuplicateGradientStops(doc):
     return num
 
 
-def collapseSinglyReferencedGradients(doc):
+def collapseSinglyReferencedGradients(doc, referencedIDs):
     global _num_elements_removed
     num = 0
 
     identifiedElements = findElementsWithId(doc.documentElement)
 
     # make sure to reset the ref'ed ids for when we are running this in testscour
-    for rid, nodes in six.iteritems(findReferencedElements(doc.documentElement)):
+    for rid, nodes in six.iteritems(referencedIDs):
         # Make sure that there's actually a defining element for the current ID name.
         # (Cyn: I've seen documents with #id references but no element with that ID!)
         if len(nodes) == 1 and rid in identifiedElements:
@@ -1355,7 +1355,7 @@ def collapseSinglyReferencedGradients(doc):
     return num
 
 
-def removeDuplicateGradients(doc):
+def removeDuplicateGradients(doc, referencedIDs):
     global _num_elements_removed
     num = 0
 
@@ -1417,8 +1417,6 @@ def removeDuplicateGradients(doc):
                         gradientsToRemove[grad].append(ograd)
                         duplicateToMaster[ograd] = grad
 
-    # get a collection of all elements that are referenced and their referencing elements
-    referencedIDs = findReferencedElements(doc.documentElement)
     for masterGrad in gradientsToRemove:
         master_id = masterGrad.getAttribute('id')
         for dupGrad in gradientsToRemove[masterGrad]:
@@ -3470,27 +3468,37 @@ def scourString(in_string, options=None):
                 elem.parentNode.removeChild(elem)
                 _num_elements_removed += 1
 
+    referencedIDs = None
     if options.strip_ids:
+        referencedIDs = findReferencedElements(doc.documentElement)
         bContinueLooping = True
         while bContinueLooping:
             identifiedElements = unprotected_ids(doc, options)
-            referencedIDs = findReferencedElements(doc.documentElement)
             bContinueLooping = (removeUnreferencedIDs(referencedIDs, identifiedElements) > 0)
+            referencedIDs = findReferencedElements(doc.documentElement)
 
     while removeDuplicateGradientStops(doc) > 0:
-        pass
+        # removeDuplicateGradientStops does not need referencedIDs, but it can
+        # remove nodes.  Make no assumptions about referencedIDs still being valid
+        # after this call.
+        referencedIDs = None
+
+    if referencedIDs is None:
+        referencedIDs = findReferencedElements(doc.documentElement)
 
     # remove gradients that are only referenced by one other gradient
-    while collapseSinglyReferencedGradients(doc) > 0:
-        pass
+    while collapseSinglyReferencedGradients(doc, referencedIDs) > 0:
+        referencedIDs = findReferencedElements(doc.documentElement)
 
     # remove duplicate gradients
-    while removeDuplicateGradients(doc) > 0:
-        pass
+    while removeDuplicateGradients(doc, referencedIDs) > 0:
+        referencedIDs = findReferencedElements(doc.documentElement)
 
     # create <g> elements if there are runs of elements with the same attributes.
     # this MUST be before moveCommonAttributesToParentGroup.
     if options.group_create:
+        # This does remove nodes, so referencedIDs should remain
+        # valid after this.
         createGroupsForCommonAttributes(doc.documentElement)
 
     # move common attributes to parent group
@@ -3498,9 +3506,8 @@ def scourString(in_string, options=None):
     # all have the same value for an attribute, it must not
     # get moved to the <svg> element. The <svg> element
     # doesn't accept fill=, stroke= etc.!
-    referencedIds = findReferencedElements(doc.documentElement)
     for child in doc.documentElement.childNodes:
-        _num_attributes_removed += moveCommonAttributesToParentGroup(child, referencedIds)
+        _num_attributes_removed += moveCommonAttributesToParentGroup(child, referencedIDs)
 
     # remove unused attributes from parent
     _num_attributes_removed += removeUnusedAttributesOnParent(doc.documentElement)

From 180d7f8ddb76bbdfe7279af6e12f503624851261 Mon Sep 17 00:00:00 2001
From: Niels Thykier <niels@thykier.net>
Date: Sat, 17 Feb 2018 14:07:35 +0000
Subject: [PATCH 2/4] Noop indentation

Same behaviour; just force a level of indentation.  Mostly useful to
make the next commit more readable.

Signed-off-by: Niels Thykier <niels@thykier.net>
---
 scour/scour.py | 84 ++++++++++++++++++++++++++------------------------
 1 file changed, 43 insertions(+), 41 deletions(-)

diff --git a/scour/scour.py b/scour/scour.py
index de75e24..6f32dd3 100644
--- a/scour/scour.py
+++ b/scour/scour.py
@@ -561,53 +561,55 @@ def findReferencedElements(node, ids=None):
     global referencingProps
     if ids is None:
         ids = {}
-    # TODO: input argument ids is clunky here (see below how it is called)
-    # GZ: alternative to passing dict, use **kwargs
 
-    # if this node is a style element, parse its text into CSS
-    if node.nodeName == 'style' and node.namespaceURI == NS['SVG']:
-        # one stretch of text, please! (we could use node.normalize(), but
-        # this actually modifies the node, and we don't want to keep
-        # whitespace around if there's any)
-        stylesheet = "".join([child.nodeValue for child in node.childNodes])
-        if stylesheet != '':
-            cssRules = parseCssString(stylesheet)
-            for rule in cssRules:
-                for propname in rule['properties']:
-                    propval = rule['properties'][propname]
-                    findReferencingProperty(node, propname, propval, ids)
-        return ids
+    if 1:  # Indent-only
+        # TODO: input argument ids is clunky here (see below how it is called)
+        # GZ: alternative to passing dict, use **kwargs
 
-    # else if xlink:href is set, then grab the id
-    href = node.getAttributeNS(NS['XLINK'], 'href')
-    if href != '' and len(href) > 1 and href[0] == '#':
-        # we remove the hash mark from the beginning of the id
-        id = href[1:]
-        if id in ids:
-            ids[id].append(node)
-        else:
-            ids[id] = [node]
+        # if this node is a style element, parse its text into CSS
+        if node.nodeName == 'style' and node.namespaceURI == NS['SVG']:
+            # one stretch of text, please! (we could use node.normalize(), but
+            # this actually modifies the node, and we don't want to keep
+            # whitespace around if there's any)
+            stylesheet = "".join([child.nodeValue for child in node.childNodes])
+            if stylesheet != '':
+                cssRules = parseCssString(stylesheet)
+                for rule in cssRules:
+                    for propname in rule['properties']:
+                        propval = rule['properties'][propname]
+                        findReferencingProperty(node, propname, propval, ids)
+            return ids
 
-    # now get all style properties and the fill, stroke, filter attributes
-    styles = node.getAttribute('style').split(';')
+        # else if xlink:href is set, then grab the id
+        href = node.getAttributeNS(NS['XLINK'], 'href')
+        if href != '' and len(href) > 1 and href[0] == '#':
+            # we remove the hash mark from the beginning of the id
+            id = href[1:]
+            if id in ids:
+                ids[id].append(node)
+            else:
+                ids[id] = [node]
 
-    for style in styles:
-        propval = style.split(':')
-        if len(propval) == 2:
-            prop = propval[0].strip()
-            val = propval[1].strip()
-            findReferencingProperty(node, prop, val, ids)
+        # now get all style properties and the fill, stroke, filter attributes
+        styles = node.getAttribute('style').split(';')
 
-    for attr in referencingProps:
-        val = node.getAttribute(attr).strip()
-        if not val:
-            continue
-        findReferencingProperty(node, attr, val, ids)
+        for style in styles:
+            propval = style.split(':')
+            if len(propval) == 2:
+                prop = propval[0].strip()
+                val = propval[1].strip()
+                findReferencingProperty(node, prop, val, ids)
 
-    if node.hasChildNodes():
-        for child in node.childNodes:
-            if child.nodeType == Node.ELEMENT_NODE:
-                findReferencedElements(child, ids)
+        for attr in referencingProps:
+            val = node.getAttribute(attr).strip()
+            if not val:
+                continue
+            findReferencingProperty(node, attr, val, ids)
+
+        if node.hasChildNodes():
+            for child in node.childNodes:
+                if child.nodeType == Node.ELEMENT_NODE:
+                    findReferencedElements(child, ids)
     return ids
 
 

From c182faba0ddb8a4b2201e9b6e72ac662f9120716 Mon Sep 17 00:00:00 2001
From: Niels Thykier <niels@thykier.net>
Date: Sat, 17 Feb 2018 14:18:14 +0000
Subject: [PATCH 3/4] Rewrite findReferencedElements without recursion

A classic "Depth-First-Search via call stack recusion to
Depth-First-Search with stack" rewrite.  The only thing worth noting
is that the nodes with same parent are visited in opposite order than
originally.  While trivially fixable, none of the callers appear to rely
on the order for anything at all, so I have left that out.

Signed-off-by: Niels Thykier <niels@thykier.net>
---
 scour/scour.py | 20 +++++++++++---------
 1 file changed, 11 insertions(+), 9 deletions(-)

diff --git a/scour/scour.py b/scour/scour.py
index 6f32dd3..16af5b5 100644
--- a/scour/scour.py
+++ b/scour/scour.py
@@ -549,20 +549,23 @@ def findElementsWithId(node, elems=None):
 referencingProps = ['fill', 'stroke', 'filter', 'clip-path', 'mask',  'marker-start', 'marker-end', 'marker-mid']
 
 
-def findReferencedElements(node, ids=None):
+def findReferencedElements(start_node):
     """
     Returns IDs of all referenced elements
-    - node is the node at which to start the search.
+    - start_node is the node at which to start the search.
     - returns a map which has the id as key and
       each value is is a list of nodes
 
     Currently looks at 'xlink:href' and all attributes in 'referencingProps'
     """
     global referencingProps
-    if ids is None:
-        ids = {}
+    ids = {}
 
-    if 1:  # Indent-only
+    # Search all nodes in depth-first search
+    stack = [start_node]
+
+    while stack:
+        node = stack.pop()
         # TODO: input argument ids is clunky here (see below how it is called)
         # GZ: alternative to passing dict, use **kwargs
 
@@ -578,7 +581,7 @@ def findReferencedElements(node, ids=None):
                     for propname in rule['properties']:
                         propval = rule['properties'][propname]
                         findReferencingProperty(node, propname, propval, ids)
-            return ids
+            continue
 
         # else if xlink:href is set, then grab the id
         href = node.getAttributeNS(NS['XLINK'], 'href')
@@ -607,9 +610,8 @@ def findReferencedElements(node, ids=None):
             findReferencingProperty(node, attr, val, ids)
 
         if node.hasChildNodes():
-            for child in node.childNodes:
-                if child.nodeType == Node.ELEMENT_NODE:
-                    findReferencedElements(child, ids)
+            stack.extend(n for n in node.childNodes if n.nodeType == Node.ELEMENT_NODE)
+
     return ids
 
 

From 8c9c6253b9b9b4328b191740130275966c71c876 Mon Sep 17 00:00:00 2001
From: Niels Thykier <niels@thykier.net>
Date: Sat, 17 Feb 2018 13:44:44 +0000
Subject: [PATCH 4/4] Move loop inside a function to reduce the number of calls

In a profile of scour on a given input file,
removeDefaultAttributeValue is one of the top 3 most frequent function
being called with 332288 calls (with minidom.getAttribute and
minidom.hasAttribute being the only ones with similar number of
calls).

The high number of calls occurs because removeDefaultAttributeValue
only removes a single attribute at a time.  Given it is always called
in a trivial for loop, we can just move the loop inside
removeDefaultAttributeValue to reduce the number of calls.  At the
same time, rename the function to better reflect that it works on
multiple attributes.

Signed-off-by: Niels Thykier <niels@thykier.net>
---
 scour/scour.py | 49 ++++++++++++++++++++++++++-----------------------
 1 file changed, 26 insertions(+), 23 deletions(-)

diff --git a/scour/scour.py b/scour/scour.py
index 16af5b5..d5c9e8a 100644
--- a/scour/scour.py
+++ b/scour/scour.py
@@ -1896,34 +1896,38 @@ def taint(taintedSet, taintedAttribute):
     return taintedSet
 
 
-def removeDefaultAttributeValue(node, attribute):
+def removeDefaultAttributeValuesFromNode(node, attributes):
     """
-    Removes the DefaultAttribute 'attribute' from 'node' if specified conditions are fulfilled
+    Removes each of the 'DefaultAttribute's from 'node' if specified conditions are fulfilled
     """
-    if not node.hasAttribute(attribute.name):
-        return 0
+    count = 0
+    for attribute in attributes:
+        if not node.hasAttribute(attribute.name):
+            continue
 
-    if (attribute.elements is not None) and (node.nodeName not in attribute.elements):
-        return 0
+        if (attribute.elements is not None) and (node.nodeName not in attribute.elements):
+            continue
 
-    # differentiate between text and numeric values
-    if isinstance(attribute.value, str):
-        if node.getAttribute(attribute.name) == attribute.value:
-            if (attribute.conditions is None) or attribute.conditions(node):
-                node.removeAttribute(attribute.name)
-                return 1
-    else:
-        nodeValue = SVGLength(node.getAttribute(attribute.name))
-        if ((attribute.value is None)
-                or ((nodeValue.value == attribute.value) and not (nodeValue.units == Unit.INVALID))):
-            if ((attribute.units is None)
-                    or (nodeValue.units == attribute.units)
-                    or (isinstance(attribute.units, list) and nodeValue.units in attribute.units)):
+        # differentiate between text and numeric values
+        if isinstance(attribute.value, str):
+            if node.getAttribute(attribute.name) == attribute.value:
                 if (attribute.conditions is None) or attribute.conditions(node):
                     node.removeAttribute(attribute.name)
-                    return 1
+                    count += 1
+                    continue
+        else:
+            nodeValue = SVGLength(node.getAttribute(attribute.name))
+            if ((attribute.value is None)
+                    or ((nodeValue.value == attribute.value) and not (nodeValue.units == Unit.INVALID))):
+                if ((attribute.units is None)
+                        or (nodeValue.units == attribute.units)
+                        or (isinstance(attribute.units, list) and nodeValue.units in attribute.units)):
+                    if (attribute.conditions is None) or attribute.conditions(node):
+                        node.removeAttribute(attribute.name)
+                        count += 1
+                        continue
 
-    return 0
+    return count
 
 
 def removeDefaultAttributeValues(node, options, tainted=set()):
@@ -1935,8 +1939,7 @@ def removeDefaultAttributeValues(node, options, tainted=set()):
         return 0
 
     # Conditionally remove all default attributes defined in 'default_attributes' (a list of 'DefaultAttribute's)
-    for attribute in default_attributes:
-        num += removeDefaultAttributeValue(node, attribute)
+    num += removeDefaultAttributeValuesFromNode(node, default_attributes)
 
     # Summarily get rid of default properties
     attributes = [node.attributes.item(i).nodeName for i in range(node.attributes.length)]