From 7d28f5e051c34f262c7881b00286e6923b742e9a Mon Sep 17 00:00:00 2001
From: Eduard Braun <eduard.braun2@gmx.de>
Date: Sun, 1 Jul 2018 19:24:22 +0200
Subject: [PATCH 1/5] Improve handling of newlines

Previously we added way to many and removed empty lines afterwards
(potentially destructive if xml:space="preserve")

Also adds proper indentation for comment nodes
---
 scour/scour.py | 16 +++-------------
 1 file changed, 3 insertions(+), 13 deletions(-)

diff --git a/scour/scour.py b/scour/scour.py
index 8feb15c..8ec0126 100644
--- a/scour/scour.py
+++ b/scour/scour.py
@@ -3334,8 +3334,6 @@ def serializeXML(element, options, indent_depth=0, preserveWhitespace=False):
     children = element.childNodes
     if children.length == 0:
         outParts.append('/>')
-        if indent_depth > 0:
-            outParts.append(newline)
     else:
         outParts.append('>')
 
@@ -3361,16 +3359,15 @@ def serializeXML(element, options, indent_depth=0, preserveWhitespace=False):
                 outParts.extend(['<![CDATA[', child.nodeValue, ']]>'])
             # Comment node
             elif child.nodeType == Node.COMMENT_NODE:
-                outParts.extend(['<!--', child.nodeValue, '-->'])
+                outParts.extend([newline, indent_type * (indent_depth+1), '<!--', child.nodeValue, '-->'])
             # TODO: entities, processing instructions, what else?
             else:  # ignore the rest
                 pass
 
         if onNewLine:
+            outParts.append(newline)
             outParts.append(indent_type * indent_depth)
         outParts.extend(['</', element.nodeName, '>'])
-        if indent_depth > 0:
-            outParts.append(newline)
 
     return "".join(outParts)
 
@@ -3632,13 +3629,6 @@ def scourString(in_string, options=None):
 #  out_string = doc.documentElement.toprettyxml(' ')
     out_string = serializeXML(doc.documentElement, options) + '\n'
 
-    # now strip out empty lines
-    lines = []
-    # Get rid of empty lines
-    for line in out_string.splitlines(True):
-        if line.strip():
-            lines.append(line)
-
     # return the string with its XML prolog and surrounding comments
     if options.strip_xml_prolog is False:
         total_output = '<?xml version="1.0" encoding="UTF-8"'
@@ -3650,7 +3640,7 @@ def scourString(in_string, options=None):
 
     for child in doc.childNodes:
         if child.nodeType == Node.ELEMENT_NODE:
-            total_output += "".join(lines)
+            total_output += out_string
         else:  # doctypes, entities, comments
             total_output += child.toxml() + '\n'
 

From e1c2699f07ef205188cce679f34381cb81c59bbc Mon Sep 17 00:00:00 2001
From: Eduard Braun <eduard.braun2@gmx.de>
Date: Sun, 1 Jul 2018 20:16:51 +0200
Subject: [PATCH 2/5] Improve whitespace handling in text content elements

SVG specifies special logic for handling whitespace, see
   https://www.w3.org/TR/SVG/text.html#WhiteSpace
by implementing it we can even shave off some unneeded bytes here
and there (e.g. consecutive spaces).

Unfortunately handling of newlines by renderers is inconsistent:
Sometimes they are replaced by a single space, sometimes they
are removed in the output.
As we can not know the expected behavior work around this by keeping
newlines inside text content elements intact.

Fixes #160.
---
 scour/scour.py | 25 ++++++++++++++++++-------
 1 file changed, 18 insertions(+), 7 deletions(-)

diff --git a/scour/scour.py b/scour/scour.py
index 8ec0126..08027b8 100644
--- a/scour/scour.py
+++ b/scour/scour.py
@@ -3341,19 +3341,30 @@ def serializeXML(element, options, indent_depth=0, preserveWhitespace=False):
         for child in element.childNodes:
             # element node
             if child.nodeType == Node.ELEMENT_NODE:
-                if preserveWhitespace:
+                # do not indent inside text content elements as in SVG there's a difference between
+                #    "text1\ntext2" and
+                #    "text1\n text2"
+                # see https://www.w3.org/TR/SVG/text.html#WhiteSpace
+                if preserveWhitespace or element.nodeName in ['text', 'tspan', 'tref', 'textPath', 'altGlyph']:
                     outParts.append(serializeXML(child, options, 0, preserveWhitespace))
                 else:
                     outParts.extend([newline, serializeXML(child, options, indent_depth + 1, preserveWhitespace)])
                     onNewLine = True
             # text node
             elif child.nodeType == Node.TEXT_NODE:
-                # trim it only in the case of not being a child of an element
-                # where whitespace might be important
-                if preserveWhitespace:
-                    outParts.append(makeWellFormed(child.nodeValue))
-                else:
-                    outParts.append(makeWellFormed(child.nodeValue.strip()))
+                text_content = child.nodeValue
+                if not preserveWhitespace:
+                    # strip / consolidate whitespace according to spec, see
+                    #    https://www.w3.org/TR/SVG/text.html#WhiteSpace
+                    # As a workaround for inconsistent handling of renderers keep newlines if they were in the original
+                    if element.nodeName in ['text', 'tspan', 'tref', 'textPath', 'altGlyph']:
+                        text_content = text_content.replace('\t', ' ')
+                        text_content = text_content.strip(' ')
+                        while '  ' in text_content:
+                            text_content = text_content.replace('  ', ' ')
+                    else:
+                        text_content = text_content.strip()
+                outParts.append(makeWellFormed(text_content))
             # CDATA node
             elif child.nodeType == Node.CDATA_SECTION_NODE:
                 outParts.extend(['<![CDATA[', child.nodeValue, ']]>'])

From 2200f8dc819c4a330b49050569fb8aa0b01ff574 Mon Sep 17 00:00:00 2001
From: Eduard Braun <eduard.braun2@gmx.de>
Date: Mon, 2 Jul 2018 01:05:54 +0200
Subject: [PATCH 3/5] temp

---
 scour/scour.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/scour/scour.py b/scour/scour.py
index 08027b8..000142d 100644
--- a/scour/scour.py
+++ b/scour/scour.py
@@ -3359,7 +3359,10 @@ def serializeXML(element, options, indent_depth=0, preserveWhitespace=False):
                     # As a workaround for inconsistent handling of renderers keep newlines if they were in the original
                     if element.nodeName in ['text', 'tspan', 'tref', 'textPath', 'altGlyph']:
                         text_content = text_content.replace('\t', ' ')
-                        text_content = text_content.strip(' ')
+                        if child == element.firstChild:
+                            text_content = text_content.lstrip()
+                        elif child == element.lastChild:
+                            text_content = text_content.rstrip()
                         while '  ' in text_content:
                             text_content = text_content.replace('  ', ' ')
                     else:

From 703122369ed0c6a4e4217e7a77a8bf47d2ec3671 Mon Sep 17 00:00:00 2001
From: Eduard Braun <eduard.braun2@gmx.de>
Date: Mon, 2 Jul 2018 22:14:14 +0200
Subject: [PATCH 4/5] Strip newlines from text nodes and be done with it

Follow the spec "blindly" as it turns out covering all the border
and getting reasonably styled output is just to cumbersome.
This way at least scour output is consistent and it also saves us
some bytes (a lot in some cases as we do not indent <tspan>s etc.
anymore)
---
 scour/scour.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scour/scour.py b/scour/scour.py
index 000142d..a46af61 100644
--- a/scour/scour.py
+++ b/scour/scour.py
@@ -3356,8 +3356,8 @@ def serializeXML(element, options, indent_depth=0, preserveWhitespace=False):
                 if not preserveWhitespace:
                     # strip / consolidate whitespace according to spec, see
                     #    https://www.w3.org/TR/SVG/text.html#WhiteSpace
-                    # As a workaround for inconsistent handling of renderers keep newlines if they were in the original
                     if element.nodeName in ['text', 'tspan', 'tref', 'textPath', 'altGlyph']:
+                        text_content = text_content.replace('\n', '')
                         text_content = text_content.replace('\t', ' ')
                         if child == element.firstChild:
                             text_content = text_content.lstrip()

From 651694a6c0547a57f6634ad4331fa2d551c87074 Mon Sep 17 00:00:00 2001
From: Eduard Braun <eduard.braun2@gmx.de>
Date: Tue, 3 Jul 2018 22:53:05 +0200
Subject: [PATCH 5/5] Add unittests for whitespace handling in text node

Also expand/fix the test for line endings
---
 testscour.py                       | 103 +++++++++++++++++++++--------
 unittests/newlines.svg             |  50 ++++++++++++++
 unittests/whitespace-important.svg |   4 --
 unittests/whitespace-nested.svg    |   4 --
 unittests/whitespace.svg           |  40 +++++++++++
 5 files changed, 166 insertions(+), 35 deletions(-)
 create mode 100644 unittests/newlines.svg
 delete mode 100644 unittests/whitespace-important.svg
 delete mode 100644 unittests/whitespace-nested.svg
 create mode 100644 unittests/whitespace.svg

diff --git a/testscour.py b/testscour.py
index cc6676e..04da38c 100755
--- a/testscour.py
+++ b/testscour.py
@@ -1744,34 +1744,83 @@ class DoNotRemoveGradientsWhenReferencedInStyleCss(unittest.TestCase):
                          'Gradients removed when referenced in CSS')
 
 
-class DoNotPrettyPrintWhenWhitespacePreserved(unittest.TestCase):
+class Whitespace(unittest.TestCase):
 
-    def runTest(self):
-        with open('unittests/whitespace-important.svg') as f:
-            s = scourString(f.read()).splitlines()
-        c = '''<?xml version="1.0" encoding="UTF-8"?>
-<svg xmlns="http://www.w3.org/2000/svg">
- <text xml:space="preserve">This is some <tspan font-style="italic">messed-up</tspan> markup</text>
-</svg>
-'''.splitlines()
-        for i in range(4):
-            self.assertEqual(s[i], c[i],
-                             'Whitespace not preserved for line ' + str(i))
+    def setUp(self):
+        self.doc = scourXmlFile('unittests/whitespace.svg')
 
+    def test_basic(self):
+        text = self.doc.getElementById('txt_a1')
+        self.assertIn('text1 text2', text.toxml(),
+                      'Multiple spaces not stripped from text element')
+        text = self.doc.getElementById('txt_a2')
+        self.assertIn('text1 text2', text.toxml(),
+                      'Tab not replaced with space in text element')
+        text = self.doc.getElementById('txt_a3')
+        self.assertIn('text1 text2', text.toxml(),
+                      'Multiple spaces not stripped from text element with xml:space="default"')
+        text = self.doc.getElementById('txt_a4')
+        self.assertIn('text1 text2', text.toxml(),
+                      'Tab not replaced with space in text element with xml:space="default"')
+        text = self.doc.getElementById('txt_a5')
+        self.assertIn('text1    text2', text.toxml(),
+                      'Multiple spaces not preserved in text element with xml:space="preserve"')
+        text = self.doc.getElementById('txt_a6')
+        self.assertIn('text1\ttext2', text.toxml(),
+                      'Tab not preserved in text element with xml:space="preserve"')
 
-class DoNotPrettyPrintWhenNestedWhitespacePreserved(unittest.TestCase):
+    def test_newlines(self):
+        text = self.doc.getElementById('txt_b1')
+        self.assertIn('text1 text2', text.toxml(),
+                      'Newline not replaced with space in text element')
+        text = self.doc.getElementById('txt_b2')
+        self.assertIn('text1 text2', text.toxml(),
+                      'Newline not replaced with space in text element with xml:space="default"')
+        text = self.doc.getElementById('txt_b3')
+        self.assertIn('text1\n       text2', text.toxml(),
+                      'Newline not preserved in text element with xml:space="preserve"')
 
-    def runTest(self):
-        with open('unittests/whitespace-nested.svg') as f:
-            s = scourString(f.read()).splitlines()
-        c = '''<?xml version="1.0" encoding="UTF-8"?>
-<svg xmlns="http://www.w3.org/2000/svg">
- <text xml:space="preserve"><tspan font-style="italic">Use <tspan font-style="bold">bold</tspan> text</tspan></text>
-</svg>
-'''.splitlines()
-        for i in range(4):
-            self.assertEqual(s[i], c[i],
-                             'Whitespace not preserved when nested for line ' + str(i))
+    def test_inheritance(self):
+        text = self.doc.getElementById('txt_c1')
+        self.assertIn('text1    text2', text.toxml(),
+                      '<tspan> does not inherit xml:space="preserve" of parent text element')
+        text = self.doc.getElementById('txt_c2')
+        self.assertIn('text1 text2', text.toxml(),
+                      'xml:space="default" of <tspan> does not overwrite xml:space="preserve" of parent text element')
+        text = self.doc.getElementById('txt_c3')
+        self.assertIn('text1    text2', text.toxml(),
+                      'xml:space="preserve" of <tspan> does not overwrite xml:space="default" of parent text element')
+        text = self.doc.getElementById('txt_c4')
+        self.assertIn('text1    text2', text.toxml(),
+                      '<text> does not inherit xml:space="preserve" of parent group')
+        text = self.doc.getElementById('txt_c5')
+        self.assertIn('text1 text2', text.toxml(),
+                      'xml:space="default" of text element does not overwrite xml:space="preserve" of parent group')
+        text = self.doc.getElementById('txt_c6')
+        self.assertIn('text1    text2', text.toxml(),
+                      'xml:space="preserve" of text element does not overwrite xml:space="default" of parent group')
+
+    def test_important_whitespace(self):
+        text = self.doc.getElementById('txt_d1')
+        self.assertIn('text1 text2', text.toxml(),
+                      'Newline with whitespace collapsed in text element')
+        text = self.doc.getElementById('txt_d2')
+        self.assertIn('text1 <tspan>tspan1</tspan> text2', text.toxml(),
+                      'Whitespace stripped from the middle of a text element')
+        text = self.doc.getElementById('txt_d3')
+        self.assertIn('text1 <tspan>tspan1 <tspan>tspan2</tspan> text2</tspan>', text.toxml(),
+                      'Whitespace stripped from the middle of a text element')
+
+    def test_incorrect_whitespace(self):
+        text = self.doc.getElementById('txt_e1')
+        self.assertIn('text1text2', text.toxml(),
+                      'Whitespace introduced in text element with newline')
+        text = self.doc.getElementById('txt_e2')
+        self.assertIn('text1<tspan>tspan</tspan>text2', text.toxml(),
+                      'Whitespace introduced in text element with <tspan>')
+        text = self.doc.getElementById('txt_e3')
+        self.assertIn('text1<tspan>tspan</tspan>text2', text.toxml(),
+                      'Whitespace introduced in text element with <tspan> and newlines')
 
 
 class GetAttrPrefixRight(unittest.TestCase):
@@ -1807,10 +1856,10 @@ class HandleEmptyStyleElement(unittest.TestCase):
 class EnsureLineEndings(unittest.TestCase):
 
     def runTest(self):
-        with open('unittests/whitespace-important.svg') as f:
+        with open('unittests/newlines.svg') as f:
             s = scourString(f.read())
-        self.assertEqual(len(s.splitlines()), 4,
-                         'Did not output line ending character correctly')
+        self.assertEqual(len(s.splitlines()), 24,
+                         'Did handle reading or outputting line ending characters correctly')
 
 
 class XmlEntities(unittest.TestCase):
diff --git a/unittests/newlines.svg b/unittests/newlines.svg
new file mode 100644
index 0000000..a909603
--- /dev/null
+++ b/unittests/newlines.svg
@@ -0,0 +1,50 @@
+<?xml version="1.0" encoding="UTF-8"?>
+
+	
+
+<svg xmlns="http://www.w3.org/2000/svg"
+
+>
+
+
+
+     <!-- this file has pretty messed up formatting -->       <rect width="100" height="100"/>	
+  <rect width="100" height="100"/>
+  <rect width="100" height="100"/>
+  
+  
+ 	
+  
+  
+  <rect width="100" height="100"/>
+                 <rect width="100" height="100"/>	
+  <rect width="100" height="100"/>            
+  
+  
+  
+  
+  
+  
+  <!-- we have mixed newline
+  characters, carriage returns  and both of them
+  as well as tabs 	   		and spaces        
+  -->
+  
+  <rect width="100" height="100"/><rect width="100" height="100"/>   <rect width="100" height="100"/>
+  
+  <rect width="100" height="100"/><rect width="100" height="100"/>   <rect width="100" height="100"/>
+  
+	
+  
+		
+  
+  <rect width="100" height="100"/>		  	<rect width="100" height="100"/>
+  
+  
+</svg>
+
+
+
+
+
+<!-- OMG, really? -->
\ No newline at end of file
diff --git a/unittests/whitespace-important.svg b/unittests/whitespace-important.svg
deleted file mode 100644
index 6918044..0000000
--- a/unittests/whitespace-important.svg
+++ /dev/null
@@ -1,4 +0,0 @@
-<?xml version="1.0" encoding="UTF-8" standalone="no"?>
-<svg xmlns="http://www.w3.org/2000/svg">
- <text xml:space="preserve">This is some <tspan font-style="italic">messed-up</tspan> markup</text>
-</svg>
diff --git a/unittests/whitespace-nested.svg b/unittests/whitespace-nested.svg
deleted file mode 100644
index 3b99356..0000000
--- a/unittests/whitespace-nested.svg
+++ /dev/null
@@ -1,4 +0,0 @@
-<?xml version="1.0" encoding="UTF-8" standalone="no"?>
-<svg xmlns="http://www.w3.org/2000/svg">
- <text xml:space="preserve"><tspan font-style="italic">Use <tspan font-style="bold">bold</tspan> text</tspan></text>
-</svg>
diff --git a/unittests/whitespace.svg b/unittests/whitespace.svg
new file mode 100644
index 0000000..2bb48a6
--- /dev/null
+++ b/unittests/whitespace.svg
@@ -0,0 +1,40 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<svg xmlns="http://www.w3.org/2000/svg">
+ <!-- basic tests -->
+ <text id="txt_a1">text1    text2</text> <!-- multiple spaces -->
+ <text id="txt_a2">text1	text2</text> <!-- tab -->
+ <text id="txt_a3" xml:space="default">text1    text2</text> <!-- multiple spaces -->
+ <text id="txt_a4" xml:space="default">text1	text2</text> <!-- tab -->
+ <text id="txt_a5" xml:space="preserve">text1    text2</text> <!-- multiple spaces -->
+ <text id="txt_a6" xml:space="preserve">text1	text2</text> <!-- tab -->
+
+ <!-- newlines -->
+ <text id="txt_b1">text1
+       text2</text>
+ <text id="txt_b2" xml:space="default">text1
+       text2</text>
+ <text id="txt_b3" xml:space="preserve">text1
+       text2</text>
+
+ <!-- inheritance -->
+ <text id="txt_c1" xml:space="preserve"><tspan>text1    text2</tspan></text>
+ <text id="txt_c2" xml:space="preserve"><tspan xml:space="default">text1    text2</tspan></text>
+ <text id="txt_c3" xml:space="default"><tspan xml:space="preserve">text1    text2</tspan></text>
+ <g xml:space="preserve"><text id="txt_c4">text1    text2</text></g>
+ <g xml:space="preserve"><text id="txt_c5" xml:space="default">text1    text2</text></g>
+ <g xml:space="default"><text id="txt_c6" xml:space="preserve">text1    text2</text></g>
+
+ <!-- important whitespace that must not be stripped -->
+ <text id="txt_d1">text1
+ text2</text>
+ <text id="txt_d2">text1 <tspan>tspan1</tspan> text2</text>
+ <text id="txt_d3">text1 <tspan>tspan1 <tspan>tspan2</tspan> text2</tspan></text>
+ 
+ <!-- whitespace must not be introduced -->
+ <text id="txt_e1">text1
+text2</text>
+ <text id="txt_e2">text1<tspan>tspan</tspan>text2</text>
+ <text id="txt_e3">text1
+<tspan>tspan</tspan>
+text2</text>
+</svg>