From e1c2699f07ef205188cce679f34381cb81c59bbc Mon Sep 17 00:00:00 2001 From: Eduard Braun Date: Sun, 1 Jul 2018 20:16:51 +0200 Subject: [PATCH] Improve whitespace handling in text content elements SVG specifies special logic for handling whitespace, see https://www.w3.org/TR/SVG/text.html#WhiteSpace by implementing it we can even shave off some unneeded bytes here and there (e.g. consecutive spaces). Unfortunately handling of newlines by renderers is inconsistent: Sometimes they are replaced by a single space, sometimes they are removed in the output. As we can not know the expected behavior work around this by keeping newlines inside text content elements intact. Fixes #160. --- scour/scour.py | 25 ++++++++++++++++++------- 1 file changed, 18 insertions(+), 7 deletions(-) diff --git a/scour/scour.py b/scour/scour.py index 8ec0126..08027b8 100644 --- a/scour/scour.py +++ b/scour/scour.py @@ -3341,19 +3341,30 @@ def serializeXML(element, options, indent_depth=0, preserveWhitespace=False): for child in element.childNodes: # element node if child.nodeType == Node.ELEMENT_NODE: - if preserveWhitespace: + # do not indent inside text content elements as in SVG there's a difference between + # "text1\ntext2" and + # "text1\n text2" + # see https://www.w3.org/TR/SVG/text.html#WhiteSpace + if preserveWhitespace or element.nodeName in ['text', 'tspan', 'tref', 'textPath', 'altGlyph']: outParts.append(serializeXML(child, options, 0, preserveWhitespace)) else: outParts.extend([newline, serializeXML(child, options, indent_depth + 1, preserveWhitespace)]) onNewLine = True # text node elif child.nodeType == Node.TEXT_NODE: - # trim it only in the case of not being a child of an element - # where whitespace might be important - if preserveWhitespace: - outParts.append(makeWellFormed(child.nodeValue)) - else: - outParts.append(makeWellFormed(child.nodeValue.strip())) + text_content = child.nodeValue + if not preserveWhitespace: + # strip / consolidate whitespace according to spec, see + # https://www.w3.org/TR/SVG/text.html#WhiteSpace + # As a workaround for inconsistent handling of renderers keep newlines if they were in the original + if element.nodeName in ['text', 'tspan', 'tref', 'textPath', 'altGlyph']: + text_content = text_content.replace('\t', ' ') + text_content = text_content.strip(' ') + while ' ' in text_content: + text_content = text_content.replace(' ', ' ') + else: + text_content = text_content.strip() + outParts.append(makeWellFormed(text_content)) # CDATA node elif child.nodeType == Node.CDATA_SECTION_NODE: outParts.extend([''])