Optimize remove duplicate gradients (#248)

Optimize remove duplicate gradients
2020-06-09 21:32:59 +02:00 · 2020-06-09 21:32:59 +02:00 · cc11ef3d5e
commit cc11ef3d5e
parent 985cb58a26 ca2b32c0b3
1 changed files with 103 additions and 60 deletions
--- a/scour/scour.py
+++ b/scour/scour.py
@ -556,7 +556,7 @@ def findReferencedElements(node, ids=None):
    Returns IDs of all referenced elements
    - node is the node at which to start the search.
    - returns a map which has the id as key and
-      each value is is a list of nodes
+      each value is a set of nodes
    Currently looks at 'xlink:href' and all attributes in 'referencingProps'
    """
@ -586,9 +586,9 @@ def findReferencedElements(node, ids=None):
        # we remove the hash mark from the beginning of the id
        id = href[1:]
        if id in ids:
-            ids[id].append(node)
+            ids[id].add(node)
        else:
-            ids[id] = [node]
+            ids[id] = {node}
    # now get all style properties and the fill, stroke, filter attributes
    styles = node.getAttribute('style').split(';')
@ -619,9 +619,9 @@ def findReferencingProperty(node, prop, val, ids):
        if len(val) >= 7 and val[0:5] == 'url(#':
            id = val[5:val.find(')')]
            if id in ids:
-                ids[id].append(node)
+                ids[id].add(node)
            else:
-                ids[id] = [node]
+                ids[id] = {node}
        # if the url has a quote in it, we need to compensate
        elif len(val) >= 8:
            id = None
@ -633,9 +633,9 @@ def findReferencingProperty(node, prop, val, ids):
                id = val[6:val.find("')")]
            if id is not None:
                if id in ids:
-                    ids[id].append(node)
+                    ids[id].add(node)
                else:
-                    ids[id] = [node]
+                    ids[id] = {node}
 def removeUnusedDefs(doc, defElem, elemsToRemove=None, referencedIDs=None):
@ -1457,7 +1457,7 @@ def collapseSinglyReferencedGradients(doc):
                elem.namespaceURI == NS['SVG']
            ):
                # found a gradient that is referenced by only 1 other element
-                refElem = nodes[0]
+                refElem = nodes.pop()
                if refElem.nodeType == Node.ELEMENT_NODE and refElem.nodeName in ['linearGradient', 'radialGradient'] \
                        and refElem.namespaceURI == NS['SVG']:
                    # elem is a gradient referenced by only one other gradient (refElem)
@ -1532,82 +1532,126 @@ def computeGradientBucketKey(grad):
    return "\x1e".join(subKeys)
-def removeDuplicateGradients(doc):
+def detect_duplicate_gradients(*grad_lists):
-    global _num_elements_removed
+    """Detects duplicate gradients from each iterable/generator given as argument
    num = 0
-    gradientsToRemove = {}
+    Yields (master_id, duplicates_ids, duplicates) tuples where:
-
+      * master_id: The ID attribute of the master element.  This will always be non-empty
-    for gradType in ['linearGradient', 'radialGradient']:
+        and not None as long as at least one of the gradients has a valid ID.
-        grads = doc.getElementsByTagName(gradType)
+      * duplicates_id: List of ID attributes of the duplicate gradients elements (can be
-        gradBuckets = defaultdict(list)
+        empty where the gradient had no ID attribute)
      * duplicates: List of elements that are duplicates of the `master` element.  Will
        never include the `master` element.  Has the same order as `duplicates_id` - i.e.
        `duplicates[X].getAttribute("id") == duplicates_id[X]`.
    """
    for grads in grad_lists:
        grad_buckets = defaultdict(list)
        for grad in grads:
            key = computeGradientBucketKey(grad)
-            gradBuckets[key].append(grad)
+            grad_buckets[key].append(grad)
-        for bucket in six.itervalues(gradBuckets):
+        for bucket in six.itervalues(grad_buckets):
            if len(bucket) < 2:
                # The gradient must be unique if it is the only one in
                # this bucket.
                continue
            master = bucket[0]
            duplicates = bucket[1:]
            duplicates_ids = [d.getAttribute('id') for d in duplicates]
            master_id = master.getAttribute('id')
            if not master_id:
                # If our selected "master" copy does not have an ID,
                # then replace it with one that does (assuming any of
                # them has one).  This avoids broken images like we
                # saw in GH#203
-                for i in range(len(duplicates)):
+                for i in range(len(duplicates_ids)):
-                    dup = duplicates[i]
+                    dup_id = duplicates_ids[i]
                    dup_id = dup.getAttribute('id')
                    if dup_id:
                        # We do not bother updating the master field
                        # as it is not used any more.
                        master_id = duplicates_ids[i]
                        duplicates[i] = master
-                        master = dup
+                        # Clear the old id to avoid a redundant remapping
                        duplicates_ids[i] = ""
                        break
-            gradientsToRemove[master] = duplicates
+            yield master_id, duplicates_ids, duplicates
 def dedup_gradient(master_id, duplicates_ids, duplicates, referenced_ids):
    """Re-map references from duplicate gradients to the master and remove the duplicates.

    - master_id is the ID that every reference to a duplicate is rewritten to.
    - duplicates_ids is the list of IDs of the duplicate gradients; it is
      paired positionally with `duplicates`.
    - duplicates is the list of duplicate gradient elements.  Elements that
      no longer have a parent node are skipped.
    - referenced_ids is a map which has the id as key and a set of the
      referencing nodes as value.  It is updated in place: the entries of
      the duplicate IDs are removed and merged into the entry of master_id.

    The 'fill' and 'stroke' attributes, the 'xlink:href' attribute and the
    style properties of each referencing element are rewritten.  Every
    duplicate that still has a parent is removed from its parent.
    """
    func_iri = None
    for dup_id, dup_grad in zip(duplicates_ids, duplicates):
        # if the duplicate gradient no longer has a parent that means it was
        # already re-mapped to another master gradient
        if not dup_grad.parentNode:
            continue

        # With --keep-unreferenced-defs, we can end up with
        # unreferenced gradients.  See GH#156.
        if dup_id in referenced_ids:
            if func_iri is None:
                # Compiled lazily (and only once) as many duplicates are never referenced.
                # The IDs are escaped so that characters with a special meaning in a
                # regular expression (e.g. the '.' in "grad.1") are matched literally
                # instead of matching unrelated IDs or making the pattern invalid.
                # matches url(#<ANY_DUP_ID>), url('#<ANY_DUP_ID>') and url("#<ANY_DUP_ID>")
                dup_id_regex = "|".join(re.escape(some_id) for some_id in duplicates_ids)
                func_iri = re.compile('url\\([\'"]?#(?:' + dup_id_regex + ')[\'"]?\\)')
            master_iri = 'url(#' + master_id + ')'
            for elem in referenced_ids[dup_id]:
                # find out which attribute referenced the duplicate gradient
                for attr in ['fill', 'stroke']:
                    v = elem.getAttribute(attr)
                    (v_new, n) = func_iri.subn(master_iri, v)
                    if n > 0:
                        elem.setAttribute(attr, v_new)
                if elem.getAttributeNS(NS['XLINK'], 'href') == '#' + dup_id:
                    elem.setAttributeNS(NS['XLINK'], 'href', '#' + master_id)
                styles = _getStyle(elem)
                for style in styles:
                    v = styles[style]
                    (v_new, n) = func_iri.subn(master_iri, v)
                    if n > 0:
                        styles[style] = v_new
                _setStyle(elem, styles)

        # now that all referencing elements have been re-mapped to the master
        # it is safe to remove this gradient from the document
        dup_grad.parentNode.removeChild(dup_grad)

    # If the gradients have an ID, we update referenced_ids to match the newly remapped IDs.
    # This enables us to avoid calling findReferencedElements once per loop, which is helpful
    # as it is one of the slowest functions in scour.
    if master_id:
        try:
            master_references = referenced_ids[master_id]
        except KeyError:
            master_references = set()

        for dup_id in duplicates_ids:
            references = referenced_ids.pop(dup_id, None)
            if references is None:
                continue
            master_references.update(references)

        # Storing the set back is only strictly required when the master
        # gradient had no entry in referenced_ids to begin with.
        referenced_ids[master_id] = master_references
 def removeDuplicateGradients(doc):
    prev_num = -1
    num = 0
    # get a collection of all elements that are referenced and their referencing elements
-    referencedIDs = findReferencedElements(doc.documentElement)
+    referenced_ids = findReferencedElements(doc.documentElement)
    for masterGrad in gradientsToRemove:
        master_id = masterGrad.getAttribute('id')
        for dupGrad in gradientsToRemove[masterGrad]:
            # if the duplicate gradient no longer has a parent that means it was
            # already re-mapped to another master gradient
            if not dupGrad.parentNode:
                continue
-            # for each element that referenced the gradient we are going to replace dup_id with master_id
+    while prev_num != num:
-            dup_id = dupGrad.getAttribute('id')
+        prev_num = num
            funcIRI = re.compile('url\\([\'"]?#' + dup_id + '[\'"]?\\)')  # matches url(#a), url('#a') and url("#a")
-            # With --keep-unreferenced-defs, we can end up with
+        linear_gradients = doc.getElementsByTagName('linearGradient')
-            # unreferenced gradients.  See GH#156.
+        radial_gradients = doc.getElementsByTagName('radialGradient')
-            if dup_id in referencedIDs:
+
-                for elem in referencedIDs[dup_id]:
+        for master_id, duplicates_ids, duplicates in detect_duplicate_gradients(linear_gradients, radial_gradients):
-                    # find out which attribute referenced the duplicate gradient
+            dedup_gradient(master_id, duplicates_ids, duplicates, referenced_ids)
-                    for attr in ['fill', 'stroke']:
+            num += len(duplicates)
                        v = elem.getAttribute(attr)
                        (v_new, n) = funcIRI.subn('url(#' + master_id + ')', v)
                        if n > 0:
                            elem.setAttribute(attr, v_new)
                    if elem.getAttributeNS(NS['XLINK'], 'href') == '#' + dup_id:
                        elem.setAttributeNS(NS['XLINK'], 'href', '#' + master_id)
                    styles = _getStyle(elem)
                    for style in styles:
                        v = styles[style]
                        (v_new, n) = funcIRI.subn('url(#' + master_id + ')', v)
                        if n > 0:
                            styles[style] = v_new
                    _setStyle(elem, styles)
            # now that all referencing elements have been re-mapped to the master
            # it is safe to remove this gradient from the document
            dupGrad.parentNode.removeChild(dupGrad)
            _num_elements_removed += 1
            num += 1
    return num
@ -3754,8 +3798,7 @@ def scourString(in_string, options=None):
        pass
    # remove duplicate gradients
-    while removeDuplicateGradients(doc) > 0:
+    _num_elements_removed += removeDuplicateGradients(doc)
        pass
    if options.group_collapse:
        _num_elements_removed += mergeSiblingGroupsWithCommonAttributes(doc.documentElement)