From 5881890e44d8c24c184a7516609d8b752180469e Mon Sep 17 00:00:00 2001 From: Niels Thykier Date: Mon, 18 May 2020 20:46:43 +0000 Subject: [PATCH 1/6] removeUnreferencedElements: Remove defs before unref elements The `removeUnusedDefs` function can take `referencedIDs` as a parameter and its work does not invalidate it. By moving it up in `removeUnreferencedElements` we can save a call to `findReferencedElements` per call to `removeUnreferencedElements`. Signed-off-by: Niels Thykier --- scour/scour.py | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/scour/scour.py b/scour/scour.py index 18a81d2..c9eff0e 100644 --- a/scour/scour.py +++ b/scour/scour.py @@ -674,6 +674,16 @@ def removeUnreferencedElements(doc, keepDefs): identifiedElements = findElementsWithId(doc.documentElement) referencedIDs = findReferencedElements(doc.documentElement) + if not keepDefs: + # Remove most unreferenced elements inside defs + defs = doc.documentElement.getElementsByTagName('defs') + for aDef in defs: + elemsToRemove = removeUnusedDefs(doc, aDef, referencedIDs=referencedIDs) + for elem in elemsToRemove: + elem.parentNode.removeChild(elem) + _num_elements_removed += 1 + num += 1 + for id in identifiedElements: if id not in referencedIDs: goner = identifiedElements[id] @@ -684,15 +694,6 @@ def removeUnreferencedElements(doc, keepDefs): num += 1 _num_elements_removed += 1 - if not keepDefs: - # Remove most unreferenced elements inside defs - defs = doc.documentElement.getElementsByTagName('defs') - for aDef in defs: - elemsToRemove = removeUnusedDefs(doc, aDef) - for elem in elemsToRemove: - elem.parentNode.removeChild(elem) - _num_elements_removed += 1 - num += 1 return num From c5362743c3d582943ca4d2cc10a6fba8d9d6b3c6 Mon Sep 17 00:00:00 2001 From: Niels Thykier Date: Mon, 18 May 2020 21:04:20 +0000 Subject: [PATCH 2/6] _getStyle: Avoid calling getAttribute twice for no reason _getStyle accounted for ~8.9% (~17700) of all calls to getAttribute on 
devices/hidef/secure-card.svgz file from the Oxygen icon theme. This commit removes this part of the dead weight. Signed-off-by: Niels Thykier --- scour/scour.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/scour/scour.py b/scour/scour.py index c9eff0e..1bb4980 100644 --- a/scour/scour.py +++ b/scour/scour.py @@ -1610,9 +1610,12 @@ def removeDuplicateGradients(doc): def _getStyle(node): u"""Returns the style attribute of a node as a dictionary.""" - if node.nodeType == Node.ELEMENT_NODE and len(node.getAttribute('style')) > 0: + if node.nodeType != Node.ELEMENT_NODE: + return {} + style_attribute = node.getAttribute('style') + if style_attribute: styleMap = {} - rawStyles = node.getAttribute('style').split(';') + rawStyles = style_attribute.split(';') for style in rawStyles: propval = style.split(':') if len(propval) == 2: From 528ad91418a4d705bc1d1d1af9e946057217e05e Mon Sep 17 00:00:00 2001 From: Niels Thykier Date: Thu, 21 May 2020 11:26:10 +0000 Subject: [PATCH 3/6] removeUnusedDefs: Call getAttribute at most once per element Signed-off-by: Niels Thykier --- scour/scour.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/scour/scour.py b/scour/scour.py index 1bb4980..78ae200 100644 --- a/scour/scour.py +++ b/scour/scour.py @@ -647,8 +647,12 @@ def removeUnusedDefs(doc, defElem, elemsToRemove=None, referencedIDs=None): keepTags = ['font', 'style', 'metadata', 'script', 'title', 'desc'] for elem in defElem.childNodes: # only look at it if an element and not referenced anywhere else - if elem.nodeType == Node.ELEMENT_NODE and (elem.getAttribute('id') == '' or - elem.getAttribute('id') not in referencedIDs): + if elem.nodeType != Node.ELEMENT_NODE: + continue + + elem_id = elem.getAttribute('id') + + if elem_id == '' or elem_id not in referencedIDs: # we only inspect the children of a group in a defs if the group # is not referenced anywhere else if elem.nodeName == 'g' and elem.namespaceURI == NS['SVG']: From 
29a7474f746b2807058385ca6a6d9cf33812f4b2 Mon Sep 17 00:00:00 2001 From: Niels Thykier Date: Tue, 19 May 2020 21:56:15 +0000 Subject: [PATCH 4/6] removeNamespacedAttributes: Avoid calling it twice as it is idempotent Signed-off-by: Niels Thykier --- scour/scour.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/scour/scour.py b/scour/scour.py index 78ae200..6bb1b8f 100644 --- a/scour/scour.py +++ b/scour/scour.py @@ -955,7 +955,6 @@ def removeUnreferencedIDs(referencedIDs, identifiedElements): def removeNamespacedAttributes(node, namespaces): - global _num_attributes_removed num = 0 if node.nodeType == Node.ELEMENT_NODE: # remove all namespace'd attributes from this element @@ -966,9 +965,8 @@ def removeNamespacedAttributes(node, namespaces): if attr is not None and attr.namespaceURI in namespaces: attrsToRemove.append(attr.nodeName) for attrName in attrsToRemove: - num += 1 - _num_attributes_removed += 1 node.removeAttribute(attrName) + num += len(attrsToRemove) # now recurse for children for child in node.childNodes: @@ -3656,8 +3654,8 @@ def scourString(in_string, options=None): if options.keep_editor_data is False: while removeNamespacedElements(doc.documentElement, unwanted_ns) > 0: pass - while removeNamespacedAttributes(doc.documentElement, unwanted_ns) > 0: - pass + _num_attributes_removed += removeNamespacedAttributes(doc.documentElement, + unwanted_ns) # remove the xmlns: declarations now xmlnsDeclsToRemove = [] From 045f1f0ad543e7bd30b724255161f21c568f1a8f Mon Sep 17 00:00:00 2001 From: Niels Thykier Date: Tue, 19 May 2020 21:59:02 +0000 Subject: [PATCH 5/6] removeNamespacedElements: Avoid calling it twice as it is idempotent Signed-off-by: Niels Thykier --- scour/scour.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/scour/scour.py b/scour/scour.py index 6bb1b8f..d03e5d0 100644 --- a/scour/scour.py +++ b/scour/scour.py @@ -975,7 +975,6 @@ def removeNamespacedAttributes(node, namespaces): def 
removeNamespacedElements(node, namespaces): - global _num_elements_removed num = 0 if node.nodeType == Node.ELEMENT_NODE: # remove all namespace'd child nodes from this element @@ -985,9 +984,8 @@ def removeNamespacedElements(node, namespaces): if child is not None and child.namespaceURI in namespaces: childrenToRemove.append(child) for child in childrenToRemove: - num += 1 - _num_elements_removed += 1 node.removeChild(child) + num += len(childrenToRemove) # now recurse for children for child in node.childNodes: @@ -3652,8 +3650,8 @@ def scourString(in_string, options=None): # on the first pass, so we do it multiple times # does it have to do with removal of children affecting the childlist? if options.keep_editor_data is False: - while removeNamespacedElements(doc.documentElement, unwanted_ns) > 0: - pass + _num_elements_removed += removeNamespacedElements(doc.documentElement, + unwanted_ns) _num_attributes_removed += removeNamespacedAttributes(doc.documentElement, unwanted_ns) From fd2daf44b4f4cbad899889426ca644fff6696c29 Mon Sep 17 00:00:00 2001 From: Niels Thykier Date: Thu, 21 May 2020 13:00:47 +0000 Subject: [PATCH 6/6] Avoid compiling "the same" regex multiple times Signed-off-by: Niels Thykier --- scour/scour.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/scour/scour.py b/scour/scour.py index d03e5d0..ec5441a 100644 --- a/scour/scour.py +++ b/scour/scour.py @@ -80,6 +80,9 @@ XML_ENTS_ESCAPE_APOS["'"] = '&apos;' XML_ENTS_ESCAPE_QUOT = XML_ENTS_NO_QUOTES.copy() XML_ENTS_ESCAPE_QUOT['"'] = '&quot;' +# Used to split values where "x y" or "x,y" or a mix of the two is allowed +RE_COMMA_WSP = re.compile(r"\s*[\s,]\s*") + NS = {'SVG': 'http://www.w3.org/2000/svg', 'XLINK': 'http://www.w3.org/1999/xlink', 'SODIPODI': 'http://sodipodi.sourceforge.net/DTD/sodipodi-0.dtd', @@ -2738,7 +2741,7 @@ def parseListOfPoints(s): # coordinate-pair = coordinate comma-or-wsp coordinate # coordinate = sign? integer # comma-wsp: (wsp+ comma? 
wsp*) | (comma wsp*) - ws_nums = re.split(r"\s*[\s,]\s*", s.strip()) + ws_nums = RE_COMMA_WSP.split(s.strip()) nums = [] # also, if 100-100 is found, split it into two also @@ -3351,7 +3354,7 @@ def properlySizeDoc(docElement, options): # else we have a statically sized image and we should try to remedy that # parse viewBox attribute - vbSep = re.split('[, ]+', docElement.getAttribute('viewBox')) + vbSep = RE_COMMA_WSP.split(docElement.getAttribute('viewBox')) # if we have a valid viewBox we need to check it vbWidth, vbHeight = 0, 0 if len(vbSep) == 4: @@ -3810,7 +3813,7 @@ def scourString(in_string, options=None): elem.setAttribute(attr, scourLength(elem.getAttribute(attr))) viewBox = doc.documentElement.getAttribute('viewBox') if viewBox: - lengths = re.split('[, ]+', viewBox) + lengths = RE_COMMA_WSP.split(viewBox) lengths = [scourUnitlessLength(length) for length in lengths] doc.documentElement.setAttribute('viewBox', ' '.join(lengths))