From e1c2699f07ef205188cce679f34381cb81c59bbc Mon Sep 17 00:00:00 2001
From: Eduard Braun <eduard.braun2@gmx.de>
Date: Sun, 1 Jul 2018 20:16:51 +0200
Subject: [PATCH] Improve whitespace handling in text content elements

SVG specifies special logic for handling whitespace, see
   https://www.w3.org/TR/SVG/text.html#WhiteSpace
by implementing it we can even shave off some unneeded bytes here
and there (e.g. consecutive spaces).

Unfortunately handling of newlines by renderers is inconsistent:
Sometimes they are replaced by a single space, sometimes they
are removed in the output.
As we can not know the expected behavior work around this by keeping
newlines inside text content elements intact.

Fixes #160.
---
 scour/scour.py | 25 ++++++++++++++++++-------
 1 file changed, 18 insertions(+), 7 deletions(-)

diff --git a/scour/scour.py b/scour/scour.py
index 8ec0126..08027b8 100644
--- a/scour/scour.py
+++ b/scour/scour.py
@@ -3341,19 +3341,30 @@ def serializeXML(element, options, indent_depth=0, preserveWhitespace=False):
         for child in element.childNodes:
             # element node
             if child.nodeType == Node.ELEMENT_NODE:
-                if preserveWhitespace:
+                # do not indent inside text content elements as in SVG there's a difference between
+                #    "text1\ntext2" and
+                #    "text1\n text2"
+                # see https://www.w3.org/TR/SVG/text.html#WhiteSpace
+                if preserveWhitespace or element.nodeName in ['text', 'tspan', 'tref', 'textPath', 'altGlyph']:
                     outParts.append(serializeXML(child, options, 0, preserveWhitespace))
                 else:
                     outParts.extend([newline, serializeXML(child, options, indent_depth + 1, preserveWhitespace)])
                     onNewLine = True
             # text node
             elif child.nodeType == Node.TEXT_NODE:
-                # trim it only in the case of not being a child of an element
-                # where whitespace might be important
-                if preserveWhitespace:
-                    outParts.append(makeWellFormed(child.nodeValue))
-                else:
-                    outParts.append(makeWellFormed(child.nodeValue.strip()))
+                text_content = child.nodeValue
+                if not preserveWhitespace:
+                    # strip / consolidate whitespace according to spec, see
+                    #    https://www.w3.org/TR/SVG/text.html#WhiteSpace
+                    # As a workaround for inconsistent handling of renderers keep newlines if they were in the original
+                    if element.nodeName in ['text', 'tspan', 'tref', 'textPath', 'altGlyph']:
+                        text_content = text_content.replace('\t', ' ')
+                        text_content = text_content.strip(' ')
+                        while '  ' in text_content:
+                            text_content = text_content.replace('  ', ' ')
+                    else:
+                        text_content = text_content.strip()
+                outParts.append(makeWellFormed(text_content))
             # CDATA node
             elif child.nodeType == Node.CDATA_SECTION_NODE:
                 outParts.extend(['<![CDATA[', child.nodeValue, ']]>'])