Perf optimized serialization (#247)

A series of commits that reduce the overhead of writing the SVG back out (most of that overhead is in serializeXML and the functions it calls).
2020-06-07 20:35:33 +02:00 · 2020-06-07 20:35:33 +02:00 · 4e489b7ea9
commit 4e489b7ea9
parent 47e8b15315 397ffc5529
2 changed files with 87 additions and 61 deletions
--- a/scour/scour.py
+++ b/scour/scour.py
@ -74,6 +74,12 @@ VER = __version__
 COPYRIGHT = u'Copyright Jeff Schiller, Louis Simard, 2010'
 XML_ENTS_NO_QUOTES = {'<': '&lt;', '>': '&gt;', '&': '&amp;'}
 XML_ENTS_ESCAPE_APOS = XML_ENTS_NO_QUOTES.copy()
 XML_ENTS_ESCAPE_APOS["'"] = '&apos;'
 XML_ENTS_ESCAPE_QUOT = XML_ENTS_NO_QUOTES.copy()
 XML_ENTS_ESCAPE_QUOT['"'] = '&quot;'
 NS = {'SVG':      'http://www.w3.org/2000/svg',
      'XLINK':    'http://www.w3.org/1999/xlink',
      'SODIPODI': 'http://sodipodi.sourceforge.net/DTD/sodipodi-0.dtd',
@ -562,7 +568,7 @@ def findReferencedElements(node, ids=None):
        # one stretch of text, please! (we could use node.normalize(), but
        # this actually modifies the node, and we don't want to keep
        # whitespace around if there's any)
-        stylesheet = "".join([child.nodeValue for child in node.childNodes])
+        stylesheet = "".join(child.nodeValue for child in node.childNodes)
        if stylesheet != '':
            cssRules = parseCssString(stylesheet)
            for rule in cssRules:
@ -853,7 +859,7 @@ def renameID(idFrom, idTo, identifiedElements, referringNodes):
                    # there's a CDATASection node surrounded by whitespace
                    # nodes
                    # (node.normalize() will NOT work here, it only acts on Text nodes)
-                    oldValue = "".join([child.nodeValue for child in node.childNodes])
+                    oldValue = "".join(child.nodeValue for child in node.childNodes)
                    # not going to reparse the whole thing
                    newValue = oldValue.replace('url(#' + idFrom + ')', 'url(#' + idTo + ')')
                    newValue = newValue.replace("url(#'" + idFrom + "')", 'url(#' + idTo + ')')
@ -1617,7 +1623,7 @@ def _getStyle(node):
 def _setStyle(node, styleMap):
    u"""Sets the style attribute of a node to the dictionary ``styleMap``."""
-    fixedStyle = ';'.join([prop + ':' + styleMap[prop] for prop in styleMap])
+    fixedStyle = ';'.join(prop + ':' + styleMap[prop] for prop in styleMap)
    if fixedStyle != '':
        node.setAttribute('style', fixedStyle)
    elif node.getAttribute('style'):
@ -2837,18 +2843,18 @@ def serializePath(pathObj, options):
    """
    # elliptical arc commands must have comma/wsp separating the coordinates
    # this fixes an issue outlined in Fix https://bugs.launchpad.net/scour/+bug/412754
-    return ''.join([cmd + scourCoordinates(data, options,
+    return ''.join(cmd + scourCoordinates(data, options,
-                                           control_points=controlPoints(cmd, data),
+                                          control_points=controlPoints(cmd, data),
-                                           flags=flags(cmd, data))
+                                          flags=flags(cmd, data))
-                    for cmd, data in pathObj])
+                   for cmd, data in pathObj)
 def serializeTransform(transformObj):
    """
       Reserializes the transform data with some cleanups.
    """
-    return ' '.join([command + '(' + ' '.join([scourUnitlessLength(number) for number in numbers]) + ')'
+    return ' '.join(command + '(' + ' '.join(scourUnitlessLength(number) for number in numbers) + ')'
-                     for command, numbers in transformObj])
+                    for command, numbers in transformObj)
 def scourCoordinates(data, options, force_whitespace=False, control_points=[], flags=[]):
@ -3404,29 +3410,70 @@ def remapNamespacePrefix(node, oldprefix, newprefix):
        remapNamespacePrefix(child, oldprefix, newprefix)
-def makeWellFormed(str, quote=''):
+def make_well_formed(text, quote_dict=None):
-    xml_ents = {'<': '&lt;', '>': '&gt;', '&': '&amp;'}
+    if quote_dict is None:
-    if quote:
+        quote_dict = XML_ENTS_NO_QUOTES
-        xml_ents[quote] = '&apos;' if (quote == "'") else "&quot;"
+    if not any(c in text for c in quote_dict):
-    return ''.join([xml_ents[c] if c in xml_ents else c for c in str])
+        # The quote-able characters are quite rare in SVG (they mostly only
        # occur in text elements in practice).  Therefore it makes sense to
        # optimize for this common case
        return text
    return ''.join(quote_dict[c] if c in quote_dict else c for c in text)
-def chooseQuoteCharacter(str):
+def choose_quote_character(value):
-    quotCount = str.count('"')
+    quot_count = value.count('"')
-    aposCount = str.count("'")
+    if quot_count == 0 or quot_count <= value.count("'"):
-    if quotCount > aposCount:
+        # Fewest "-symbols (if there are 0, we pick this to avoid spending
-        quote = "'"
+        # time counting the '-symbols as it won't matter)
        hasEmbeddedQuote = aposCount
    else:
        quote = '"'
-        hasEmbeddedQuote = quotCount
+        xml_ent = XML_ENTS_ESCAPE_QUOT
-    return (quote, hasEmbeddedQuote)
+    else:
        quote = "'"
        xml_ent = XML_ENTS_ESCAPE_APOS
    return quote, xml_ent
 TEXT_CONTENT_ELEMENTS = ['text', 'tspan', 'tref', 'textPath', 'altGlyph',
                         'flowDiv', 'flowPara', 'flowSpan', 'flowTref', 'flowLine']
 KNOWN_ATTRS = [
        # TODO: Maybe update with full list from https://www.w3.org/TR/SVG/attindex.html
        # (but should be kept intuitively ordered)
        'id', 'xml:id', 'class',
        'transform',
        'x', 'y', 'z', 'width', 'height', 'x1', 'x2', 'y1', 'y2',
        'dx', 'dy', 'rotate', 'startOffset', 'method', 'spacing',
        'cx', 'cy', 'r', 'rx', 'ry', 'fx', 'fy',
        'd', 'points',
    ] + sorted(svgAttributes) + [
        'style',
    ]
 KNOWN_ATTRS_ORDER_BY_NAME = defaultdict(lambda: len(KNOWN_ATTRS),
                                        {name: order for order, name in enumerate(KNOWN_ATTRS)})
 # use custom order for known attributes and alphabetical order for the rest
 def _attribute_sort_key_function(attribute):
    name = attribute.name
    order_value = KNOWN_ATTRS_ORDER_BY_NAME[name]
    return order_value, name
 def attributes_ordered_for_output(element):
    if not element.hasAttributes():
        return []
    attribute = element.attributes
    # The .item(i) call is painfully slow (bpo#40689). Therefore we ensure we
    # call it at most once per attribute.
    # - it would be many times faster to use `attribute.values()` but sadly
    #   that is an "experimental" interface.
    return sorted((attribute.item(i) for i in range(attribute.length)),
                  key=_attribute_sort_key_function)
 # hand-rolled serialization function that has the following benefits:
 # - pretty printing
 # - somewhat judicious use of whitespace
@ -3447,37 +3494,15 @@ def serializeXML(element, options, indent_depth=0, preserveWhitespace=False):
    outParts.extend([(indent_type * indent_depth), '<', element.nodeName])
    # now serialize the other attributes
-    known_attr = [
+    attrs = attributes_ordered_for_output(element)
-        # TODO: Maybe update with full list from https://www.w3.org/TR/SVG/attindex.html
+    for attr in attrs:
        # (but should be kept inuitively ordered)
        'id', 'xml:id', 'class',
        'transform',
        'x', 'y', 'z', 'width', 'height', 'x1', 'x2', 'y1', 'y2',
        'dx', 'dy', 'rotate', 'startOffset', 'method', 'spacing',
        'cx', 'cy', 'r', 'rx', 'ry', 'fx', 'fy',
        'd', 'points',
    ] + sorted(svgAttributes) + [
        'style',
    ]
    attrList = element.attributes
    attrName2Index = dict([(attrList.item(i).nodeName, i) for i in range(attrList.length)])
    # use custom order for known attributes and alphabetical order for the rest
    attrIndices = []
    for name in known_attr:
        if name in attrName2Index:
            attrIndices.append(attrName2Index[name])
            del attrName2Index[name]
    attrIndices += [attrName2Index[name] for name in sorted(attrName2Index)]
    for index in attrIndices:
        attr = attrList.item(index)
        attrValue = attr.nodeValue
-        (quote, hasEmbeddedQuote) = chooseQuoteCharacter(attrValue)
+        quote, xml_ent = choose_quote_character(attrValue)
-        attrValue = makeWellFormed(attrValue, quote if hasEmbeddedQuote else '')
+        attrValue = make_well_formed(attrValue, xml_ent)
        if attr.nodeName == 'style':
            # sort declarations
-            attrValue = ';'.join([p for p in sorted(attrValue.split(';'))])
+            attrValue = ';'.join(sorted(attrValue.split(';')))
        outParts.append(' ')
        # preserve xmlns: if it is a namespace prefix declaration
@ -3532,7 +3557,7 @@ def serializeXML(element, options, indent_depth=0, preserveWhitespace=False):
                            text_content = text_content.replace('  ', ' ')
                    else:
                        text_content = text_content.strip()
-                outParts.append(makeWellFormed(text_content))
+                outParts.append(make_well_formed(text_content))
            # CDATA node
            elif child.nodeType == Node.CDATA_SECTION_NODE:
                outParts.extend(['<![CDATA[', child.nodeValue, ']]>'])
--- a/test_scour.py
+++ b/test_scour.py
@ -30,7 +30,8 @@ import unittest
 import six
 from six.moves import map, range
-from scour.scour import makeWellFormed, parse_args, scourString, scourXmlFile, start, run
+from scour.scour import (make_well_formed, parse_args, scourString, scourXmlFile, start, run,
                         XML_ENTS_ESCAPE_APOS, XML_ENTS_ESCAPE_QUOT)
 from scour.svg_regex import svg_parser
 from scour import __version__
@ -1893,26 +1894,26 @@ class EnsureLineEndings(unittest.TestCase):
 class XmlEntities(unittest.TestCase):
    def runTest(self):
-        self.assertEqual(makeWellFormed('<>&'), '&lt;&gt;&amp;',
+        self.assertEqual(make_well_formed('<>&'), '&lt;&gt;&amp;',
                         'Incorrectly translated unquoted XML entities')
-        self.assertEqual(makeWellFormed('<>&', "'"), '&lt;&gt;&amp;',
+        self.assertEqual(make_well_formed('<>&', XML_ENTS_ESCAPE_APOS), '&lt;&gt;&amp;',
                         'Incorrectly translated single-quoted XML entities')
-        self.assertEqual(makeWellFormed('<>&', '"'), '&lt;&gt;&amp;',
+        self.assertEqual(make_well_formed('<>&', XML_ENTS_ESCAPE_QUOT), '&lt;&gt;&amp;',
                         'Incorrectly translated double-quoted XML entities')
-        self.assertEqual(makeWellFormed("'"), "'",
+        self.assertEqual(make_well_formed("'"), "'",
                         'Incorrectly translated unquoted single quote')
-        self.assertEqual(makeWellFormed('"'), '"',
+        self.assertEqual(make_well_formed('"'), '"',
                         'Incorrectly translated unquoted double quote')
-        self.assertEqual(makeWellFormed("'", '"'), "'",
+        self.assertEqual(make_well_formed("'", XML_ENTS_ESCAPE_QUOT), "'",
                         'Incorrectly translated double-quoted single quote')
-        self.assertEqual(makeWellFormed('"', "'"), '"',
+        self.assertEqual(make_well_formed('"', XML_ENTS_ESCAPE_APOS), '"',
                         'Incorrectly translated single-quoted double quote')
-        self.assertEqual(makeWellFormed("'", "'"), '&apos;',
+        self.assertEqual(make_well_formed("'", XML_ENTS_ESCAPE_APOS), '&apos;',
                         'Incorrectly translated single-quoted single quote')
-        self.assertEqual(makeWellFormed('"', '"'), '&quot;',
+        self.assertEqual(make_well_formed('"', XML_ENTS_ESCAPE_QUOT), '&quot;',
                         'Incorrectly translated double-quoted double quote')