Perf optimized serialization (#247)
A series of commits that reduce the overhead of outputting the SVG again (most of that overhead is in serializeXML and the functions it calls)
This commit is contained in:
commit
4e489b7ea9
2 changed files with 87 additions and 61 deletions
123
scour/scour.py
123
scour/scour.py
|
|
@ -74,6 +74,12 @@ VER = __version__
|
||||||
COPYRIGHT = u'Copyright Jeff Schiller, Louis Simard, 2010'
|
COPYRIGHT = u'Copyright Jeff Schiller, Louis Simard, 2010'
|
||||||
|
|
||||||
|
|
||||||
|
XML_ENTS_NO_QUOTES = {'<': '&lt;', '>': '&gt;', '&': '&amp;'}
|
||||||
|
XML_ENTS_ESCAPE_APOS = XML_ENTS_NO_QUOTES.copy()
|
||||||
|
XML_ENTS_ESCAPE_APOS["'"] = '&apos;'
|
||||||
|
XML_ENTS_ESCAPE_QUOT = XML_ENTS_NO_QUOTES.copy()
|
||||||
|
XML_ENTS_ESCAPE_QUOT['"'] = '&quot;'
|
||||||
|
|
||||||
NS = {'SVG': 'http://www.w3.org/2000/svg',
|
NS = {'SVG': 'http://www.w3.org/2000/svg',
|
||||||
'XLINK': 'http://www.w3.org/1999/xlink',
|
'XLINK': 'http://www.w3.org/1999/xlink',
|
||||||
'SODIPODI': 'http://sodipodi.sourceforge.net/DTD/sodipodi-0.dtd',
|
'SODIPODI': 'http://sodipodi.sourceforge.net/DTD/sodipodi-0.dtd',
|
||||||
|
|
@ -562,7 +568,7 @@ def findReferencedElements(node, ids=None):
|
||||||
# one stretch of text, please! (we could use node.normalize(), but
|
# one stretch of text, please! (we could use node.normalize(), but
|
||||||
# this actually modifies the node, and we don't want to keep
|
# this actually modifies the node, and we don't want to keep
|
||||||
# whitespace around if there's any)
|
# whitespace around if there's any)
|
||||||
stylesheet = "".join([child.nodeValue for child in node.childNodes])
|
stylesheet = "".join(child.nodeValue for child in node.childNodes)
|
||||||
if stylesheet != '':
|
if stylesheet != '':
|
||||||
cssRules = parseCssString(stylesheet)
|
cssRules = parseCssString(stylesheet)
|
||||||
for rule in cssRules:
|
for rule in cssRules:
|
||||||
|
|
@ -853,7 +859,7 @@ def renameID(idFrom, idTo, identifiedElements, referringNodes):
|
||||||
# there's a CDATASection node surrounded by whitespace
|
# there's a CDATASection node surrounded by whitespace
|
||||||
# nodes
|
# nodes
|
||||||
# (node.normalize() will NOT work here, it only acts on Text nodes)
|
# (node.normalize() will NOT work here, it only acts on Text nodes)
|
||||||
oldValue = "".join([child.nodeValue for child in node.childNodes])
|
oldValue = "".join(child.nodeValue for child in node.childNodes)
|
||||||
# not going to reparse the whole thing
|
# not going to reparse the whole thing
|
||||||
newValue = oldValue.replace('url(#' + idFrom + ')', 'url(#' + idTo + ')')
|
newValue = oldValue.replace('url(#' + idFrom + ')', 'url(#' + idTo + ')')
|
||||||
newValue = newValue.replace("url(#'" + idFrom + "')", 'url(#' + idTo + ')')
|
newValue = newValue.replace("url(#'" + idFrom + "')", 'url(#' + idTo + ')')
|
||||||
|
|
@ -1617,7 +1623,7 @@ def _getStyle(node):
|
||||||
|
|
||||||
def _setStyle(node, styleMap):
|
def _setStyle(node, styleMap):
|
||||||
u"""Sets the style attribute of a node to the dictionary ``styleMap``."""
|
u"""Sets the style attribute of a node to the dictionary ``styleMap``."""
|
||||||
fixedStyle = ';'.join([prop + ':' + styleMap[prop] for prop in styleMap])
|
fixedStyle = ';'.join(prop + ':' + styleMap[prop] for prop in styleMap)
|
||||||
if fixedStyle != '':
|
if fixedStyle != '':
|
||||||
node.setAttribute('style', fixedStyle)
|
node.setAttribute('style', fixedStyle)
|
||||||
elif node.getAttribute('style'):
|
elif node.getAttribute('style'):
|
||||||
|
|
@ -2837,18 +2843,18 @@ def serializePath(pathObj, options):
|
||||||
"""
|
"""
|
||||||
# elliptical arc commands must have comma/wsp separating the coordinates
|
# elliptical arc commands must have comma/wsp separating the coordinates
|
||||||
# this fixes an issue outlined in Fix https://bugs.launchpad.net/scour/+bug/412754
|
# this fixes an issue outlined in Fix https://bugs.launchpad.net/scour/+bug/412754
|
||||||
return ''.join([cmd + scourCoordinates(data, options,
|
return ''.join(cmd + scourCoordinates(data, options,
|
||||||
control_points=controlPoints(cmd, data),
|
control_points=controlPoints(cmd, data),
|
||||||
flags=flags(cmd, data))
|
flags=flags(cmd, data))
|
||||||
for cmd, data in pathObj])
|
for cmd, data in pathObj)
|
||||||
|
|
||||||
|
|
||||||
def serializeTransform(transformObj):
|
def serializeTransform(transformObj):
|
||||||
"""
|
"""
|
||||||
Reserializes the transform data with some cleanups.
|
Reserializes the transform data with some cleanups.
|
||||||
"""
|
"""
|
||||||
return ' '.join([command + '(' + ' '.join([scourUnitlessLength(number) for number in numbers]) + ')'
|
return ' '.join(command + '(' + ' '.join(scourUnitlessLength(number) for number in numbers) + ')'
|
||||||
for command, numbers in transformObj])
|
for command, numbers in transformObj)
|
||||||
|
|
||||||
|
|
||||||
def scourCoordinates(data, options, force_whitespace=False, control_points=[], flags=[]):
|
def scourCoordinates(data, options, force_whitespace=False, control_points=[], flags=[]):
|
||||||
|
|
@ -3404,29 +3410,70 @@ def remapNamespacePrefix(node, oldprefix, newprefix):
|
||||||
remapNamespacePrefix(child, oldprefix, newprefix)
|
remapNamespacePrefix(child, oldprefix, newprefix)
|
||||||
|
|
||||||
|
|
||||||
def makeWellFormed(str, quote=''):
|
def make_well_formed(text, quote_dict=None):
|
||||||
xml_ents = {'<': '&lt;', '>': '&gt;', '&': '&amp;'}
|
if quote_dict is None:
|
||||||
if quote:
|
quote_dict = XML_ENTS_NO_QUOTES
|
||||||
xml_ents[quote] = '&apos;' if (quote == "'") else "&quot;"
|
if not any(c in text for c in quote_dict):
|
||||||
return ''.join([xml_ents[c] if c in xml_ents else c for c in str])
|
# The quote-able characters are quite rare in SVG (they mostly only
|
||||||
|
# occur in text elements in practice). Therefore it makes sense to
|
||||||
|
# optimize for this common case
|
||||||
|
return text
|
||||||
|
return ''.join(quote_dict[c] if c in quote_dict else c for c in text)
|
||||||
|
|
||||||
|
|
||||||
def chooseQuoteCharacter(str):
|
def choose_quote_character(value):
|
||||||
quotCount = str.count('"')
|
quot_count = value.count('"')
|
||||||
aposCount = str.count("'")
|
if quot_count == 0 or quot_count <= value.count("'"):
|
||||||
if quotCount > aposCount:
|
# Fewest "-symbols (if there are 0, we pick this to avoid spending
|
||||||
quote = "'"
|
# time counting the '-symbols as it won't matter)
|
||||||
hasEmbeddedQuote = aposCount
|
|
||||||
else:
|
|
||||||
quote = '"'
|
quote = '"'
|
||||||
hasEmbeddedQuote = quotCount
|
xml_ent = XML_ENTS_ESCAPE_QUOT
|
||||||
return (quote, hasEmbeddedQuote)
|
else:
|
||||||
|
quote = "'"
|
||||||
|
xml_ent = XML_ENTS_ESCAPE_APOS
|
||||||
|
return quote, xml_ent
|
||||||
|
|
||||||
|
|
||||||
TEXT_CONTENT_ELEMENTS = ['text', 'tspan', 'tref', 'textPath', 'altGlyph',
|
TEXT_CONTENT_ELEMENTS = ['text', 'tspan', 'tref', 'textPath', 'altGlyph',
|
||||||
'flowDiv', 'flowPara', 'flowSpan', 'flowTref', 'flowLine']
|
'flowDiv', 'flowPara', 'flowSpan', 'flowTref', 'flowLine']
|
||||||
|
|
||||||
|
|
||||||
|
KNOWN_ATTRS = [
|
||||||
|
# TODO: Maybe update with full list from https://www.w3.org/TR/SVG/attindex.html
|
||||||
|
# (but should be kept intuitively ordered)
|
||||||
|
'id', 'xml:id', 'class',
|
||||||
|
'transform',
|
||||||
|
'x', 'y', 'z', 'width', 'height', 'x1', 'x2', 'y1', 'y2',
|
||||||
|
'dx', 'dy', 'rotate', 'startOffset', 'method', 'spacing',
|
||||||
|
'cx', 'cy', 'r', 'rx', 'ry', 'fx', 'fy',
|
||||||
|
'd', 'points',
|
||||||
|
] + sorted(svgAttributes) + [
|
||||||
|
'style',
|
||||||
|
]
|
||||||
|
|
||||||
|
KNOWN_ATTRS_ORDER_BY_NAME = defaultdict(lambda: len(KNOWN_ATTRS),
|
||||||
|
{name: order for order, name in enumerate(KNOWN_ATTRS)})
|
||||||
|
|
||||||
|
|
||||||
|
# use custom order for known attributes and alphabetical order for the rest
|
||||||
|
def _attribute_sort_key_function(attribute):
|
||||||
|
name = attribute.name
|
||||||
|
order_value = KNOWN_ATTRS_ORDER_BY_NAME[name]
|
||||||
|
return order_value, name
|
||||||
|
|
||||||
|
|
||||||
|
def attributes_ordered_for_output(element):
|
||||||
|
if not element.hasAttributes():
|
||||||
|
return []
|
||||||
|
attribute = element.attributes
|
||||||
|
# The .item(i) call is painfully slow (bpo#40689). Therefore we ensure we
|
||||||
|
# call it at most once per attribute.
|
||||||
|
# - it would be many times faster to use `attribute.values()` but sadly
|
||||||
|
# that is an "experimental" interface.
|
||||||
|
return sorted((attribute.item(i) for i in range(attribute.length)),
|
||||||
|
key=_attribute_sort_key_function)
|
||||||
|
|
||||||
|
|
||||||
# hand-rolled serialization function that has the following benefits:
|
# hand-rolled serialization function that has the following benefits:
|
||||||
# - pretty printing
|
# - pretty printing
|
||||||
# - somewhat judicious use of whitespace
|
# - somewhat judicious use of whitespace
|
||||||
|
|
@ -3447,37 +3494,15 @@ def serializeXML(element, options, indent_depth=0, preserveWhitespace=False):
|
||||||
outParts.extend([(indent_type * indent_depth), '<', element.nodeName])
|
outParts.extend([(indent_type * indent_depth), '<', element.nodeName])
|
||||||
|
|
||||||
# now serialize the other attributes
|
# now serialize the other attributes
|
||||||
known_attr = [
|
attrs = attributes_ordered_for_output(element)
|
||||||
# TODO: Maybe update with full list from https://www.w3.org/TR/SVG/attindex.html
|
for attr in attrs:
|
||||||
# (but should be kept inuitively ordered)
|
|
||||||
'id', 'xml:id', 'class',
|
|
||||||
'transform',
|
|
||||||
'x', 'y', 'z', 'width', 'height', 'x1', 'x2', 'y1', 'y2',
|
|
||||||
'dx', 'dy', 'rotate', 'startOffset', 'method', 'spacing',
|
|
||||||
'cx', 'cy', 'r', 'rx', 'ry', 'fx', 'fy',
|
|
||||||
'd', 'points',
|
|
||||||
] + sorted(svgAttributes) + [
|
|
||||||
'style',
|
|
||||||
]
|
|
||||||
attrList = element.attributes
|
|
||||||
attrName2Index = dict([(attrList.item(i).nodeName, i) for i in range(attrList.length)])
|
|
||||||
# use custom order for known attributes and alphabetical order for the rest
|
|
||||||
attrIndices = []
|
|
||||||
for name in known_attr:
|
|
||||||
if name in attrName2Index:
|
|
||||||
attrIndices.append(attrName2Index[name])
|
|
||||||
del attrName2Index[name]
|
|
||||||
attrIndices += [attrName2Index[name] for name in sorted(attrName2Index)]
|
|
||||||
for index in attrIndices:
|
|
||||||
attr = attrList.item(index)
|
|
||||||
|
|
||||||
attrValue = attr.nodeValue
|
attrValue = attr.nodeValue
|
||||||
(quote, hasEmbeddedQuote) = chooseQuoteCharacter(attrValue)
|
quote, xml_ent = choose_quote_character(attrValue)
|
||||||
attrValue = makeWellFormed(attrValue, quote if hasEmbeddedQuote else '')
|
attrValue = make_well_formed(attrValue, xml_ent)
|
||||||
|
|
||||||
if attr.nodeName == 'style':
|
if attr.nodeName == 'style':
|
||||||
# sort declarations
|
# sort declarations
|
||||||
attrValue = ';'.join([p for p in sorted(attrValue.split(';'))])
|
attrValue = ';'.join(sorted(attrValue.split(';')))
|
||||||
|
|
||||||
outParts.append(' ')
|
outParts.append(' ')
|
||||||
# preserve xmlns: if it is a namespace prefix declaration
|
# preserve xmlns: if it is a namespace prefix declaration
|
||||||
|
|
@ -3532,7 +3557,7 @@ def serializeXML(element, options, indent_depth=0, preserveWhitespace=False):
|
||||||
text_content = text_content.replace('  ', ' ')
|
text_content = text_content.replace('  ', ' ')
|
||||||
else:
|
else:
|
||||||
text_content = text_content.strip()
|
text_content = text_content.strip()
|
||||||
outParts.append(makeWellFormed(text_content))
|
outParts.append(make_well_formed(text_content))
|
||||||
# CDATA node
|
# CDATA node
|
||||||
elif child.nodeType == Node.CDATA_SECTION_NODE:
|
elif child.nodeType == Node.CDATA_SECTION_NODE:
|
||||||
outParts.extend(['<![CDATA[', child.nodeValue, ']]>'])
|
outParts.extend(['<![CDATA[', child.nodeValue, ']]>'])
|
||||||
|
|
|
||||||
|
|
@ -30,7 +30,8 @@ import unittest
|
||||||
import six
|
import six
|
||||||
from six.moves import map, range
|
from six.moves import map, range
|
||||||
|
|
||||||
from scour.scour import makeWellFormed, parse_args, scourString, scourXmlFile, start, run
|
from scour.scour import (make_well_formed, parse_args, scourString, scourXmlFile, start, run,
|
||||||
|
XML_ENTS_ESCAPE_APOS, XML_ENTS_ESCAPE_QUOT)
|
||||||
from scour.svg_regex import svg_parser
|
from scour.svg_regex import svg_parser
|
||||||
from scour import __version__
|
from scour import __version__
|
||||||
|
|
||||||
|
|
@ -1893,26 +1894,26 @@ class EnsureLineEndings(unittest.TestCase):
|
||||||
class XmlEntities(unittest.TestCase):
|
class XmlEntities(unittest.TestCase):
|
||||||
|
|
||||||
def runTest(self):
|
def runTest(self):
|
||||||
self.assertEqual(makeWellFormed('<>&'), '&lt;&gt;&amp;',
|
self.assertEqual(make_well_formed('<>&'), '&lt;&gt;&amp;',
|
||||||
'Incorrectly translated unquoted XML entities')
|
'Incorrectly translated unquoted XML entities')
|
||||||
self.assertEqual(makeWellFormed('<>&', "'"), '&lt;&gt;&amp;',
|
self.assertEqual(make_well_formed('<>&', XML_ENTS_ESCAPE_APOS), '&lt;&gt;&amp;',
|
||||||
'Incorrectly translated single-quoted XML entities')
|
'Incorrectly translated single-quoted XML entities')
|
||||||
self.assertEqual(makeWellFormed('<>&', '"'), '&lt;&gt;&amp;',
|
self.assertEqual(make_well_formed('<>&', XML_ENTS_ESCAPE_QUOT), '&lt;&gt;&amp;',
|
||||||
'Incorrectly translated double-quoted XML entities')
|
'Incorrectly translated double-quoted XML entities')
|
||||||
|
|
||||||
self.assertEqual(makeWellFormed("'"), "'",
|
self.assertEqual(make_well_formed("'"), "'",
|
||||||
'Incorrectly translated unquoted single quote')
|
'Incorrectly translated unquoted single quote')
|
||||||
self.assertEqual(makeWellFormed('"'), '"',
|
self.assertEqual(make_well_formed('"'), '"',
|
||||||
'Incorrectly translated unquoted double quote')
|
'Incorrectly translated unquoted double quote')
|
||||||
|
|
||||||
self.assertEqual(makeWellFormed("'", '"'), "'",
|
self.assertEqual(make_well_formed("'", XML_ENTS_ESCAPE_QUOT), "'",
|
||||||
'Incorrectly translated double-quoted single quote')
|
'Incorrectly translated double-quoted single quote')
|
||||||
self.assertEqual(makeWellFormed('"', "'"), '"',
|
self.assertEqual(make_well_formed('"', XML_ENTS_ESCAPE_APOS), '"',
|
||||||
'Incorrectly translated single-quoted double quote')
|
'Incorrectly translated single-quoted double quote')
|
||||||
|
|
||||||
self.assertEqual(makeWellFormed("'", "'"), '&apos;',
|
self.assertEqual(make_well_formed("'", XML_ENTS_ESCAPE_APOS), '&apos;',
|
||||||
'Incorrectly translated single-quoted single quote')
|
'Incorrectly translated single-quoted single quote')
|
||||||
self.assertEqual(makeWellFormed('"', '"'), '&quot;',
|
self.assertEqual(make_well_formed('"', XML_ENTS_ESCAPE_QUOT), '&quot;',
|
||||||
'Incorrectly translated double-quoted double quote')
|
'Incorrectly translated double-quoted double quote')
|
||||||
|
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue