Improve whitespace handling in text content elements

SVG specifies special logic for handling whitespace (see https://www.w3.org/TR/SVG/text.html#WhiteSpace). By implementing it we can even shave off some unneeded bytes here and there (e.g. consecutive spaces). Unfortunately, handling of newlines by renderers is inconsistent: sometimes they are replaced by a single space, sometimes they are removed from the output. As we cannot know the expected behavior, work around this by keeping newlines inside text content elements intact. Fixes #160.
2018-07-01 20:16:51 +02:00 · 2018-07-01 20:16:51 +02:00 · e1c2699f07
commit e1c2699f07
parent 7d28f5e051
1 changed files with 18 additions and 7 deletions
--- a/scour/scour.py
+++ b/scour/scour.py
@@ -3341,19 +3341,30 @@ def serializeXML(element, options, indent_depth=0, preserveWhitespace=False):
        for child in element.childNodes:
            # element node
            if child.nodeType == Node.ELEMENT_NODE:
-                if preserveWhitespace:
+                # do not indent inside text content elements as in SVG there's a difference between
+                #    "text1\ntext2" and
+                #    "text1\n text2"
+                # see https://www.w3.org/TR/SVG/text.html#WhiteSpace
+                if preserveWhitespace or element.nodeName in ['text', 'tspan', 'tref', 'textPath', 'altGlyph']:
                    outParts.append(serializeXML(child, options, 0, preserveWhitespace))
                else:
                    outParts.extend([newline, serializeXML(child, options, indent_depth + 1, preserveWhitespace)])
                    onNewLine = True
            # text node
            elif child.nodeType == Node.TEXT_NODE:
-                # trim it only in the case of not being a child of an element
-                # where whitespace might be important
-                if preserveWhitespace:
-                    outParts.append(makeWellFormed(child.nodeValue))
+                text_content = child.nodeValue
+                if not preserveWhitespace:
+                    # strip / consolidate whitespace according to spec, see
+                    #    https://www.w3.org/TR/SVG/text.html#WhiteSpace
+                    # As a workaround for inconsistent handling of renderers keep newlines if they were in the original
+                    if element.nodeName in ['text', 'tspan', 'tref', 'textPath', 'altGlyph']:
+                        text_content = text_content.replace('\t', ' ')
+                        text_content = text_content.strip(' ')
+                        while '  ' in text_content:
+                            text_content = text_content.replace('  ', ' ')
                    else:
-                    outParts.append(makeWellFormed(child.nodeValue.strip()))
+                        text_content = text_content.strip()
+                outParts.append(makeWellFormed(text_content))
            # CDATA node
            elif child.nodeType == Node.CDATA_SECTION_NODE:
                outParts.extend(['<![CDATA[', child.nodeValue, ']]>'])