From 7d28f5e051c34f262c7881b00286e6923b742e9a Mon Sep 17 00:00:00 2001 From: Eduard Braun Date: Sun, 1 Jul 2018 19:24:22 +0200 Subject: [PATCH 1/5] Improve handling of newlines Previously we added way to many and removed empty lines afterwards (potentially destructive if xml:space="preserve") Also adds proper indentation for comment nodes --- scour/scour.py | 16 +++------------- 1 file changed, 3 insertions(+), 13 deletions(-) diff --git a/scour/scour.py b/scour/scour.py index 8feb15c..8ec0126 100644 --- a/scour/scour.py +++ b/scour/scour.py @@ -3334,8 +3334,6 @@ def serializeXML(element, options, indent_depth=0, preserveWhitespace=False): children = element.childNodes if children.length == 0: outParts.append('/>') - if indent_depth > 0: - outParts.append(newline) else: outParts.append('>') @@ -3361,16 +3359,15 @@ def serializeXML(element, options, indent_depth=0, preserveWhitespace=False): outParts.extend(['']) # Comment node elif child.nodeType == Node.COMMENT_NODE: - outParts.extend(['']) + outParts.extend([newline, indent_type * (indent_depth+1), '']) # TODO: entities, processing instructions, what else? else: # ignore the rest pass if onNewLine: + outParts.append(newline) outParts.append(indent_type * indent_depth) outParts.extend(['']) - if indent_depth > 0: - outParts.append(newline) return "".join(outParts) @@ -3632,13 +3629,6 @@ def scourString(in_string, options=None): # out_string = doc.documentElement.toprettyxml(' ') out_string = serializeXML(doc.documentElement, options) + '\n' - # now strip out empty lines - lines = [] - # Get rid of empty lines - for line in out_string.splitlines(True): - if line.strip(): - lines.append(line) - # return the string with its XML prolog and surrounding comments if options.strip_xml_prolog is False: total_output = ' Date: Sun, 1 Jul 2018 20:16:51 +0200 Subject: [PATCH 2/5] Improve whitespace handling in text content elements SVG specifies special logic for handling whitespace, see https://www.w3.org/TR/SVG/text.html#WhiteSpace by implementing it we can even shave off some unneeded bytes here and there (e.g. consecutive spaces). Unfortunately handling of newlines by renderers is inconsistent: Sometimes they are replaced by a single space, sometimes they are removed in the output. As we can not know the expected behavior work around this by keeping newlines inside text content elements intact. Fixes #160. --- scour/scour.py | 25 ++++++++++++++++++------- 1 file changed, 18 insertions(+), 7 deletions(-) diff --git a/scour/scour.py b/scour/scour.py index 8ec0126..08027b8 100644 --- a/scour/scour.py +++ b/scour/scour.py @@ -3341,19 +3341,30 @@ def serializeXML(element, options, indent_depth=0, preserveWhitespace=False): for child in element.childNodes: # element node if child.nodeType == Node.ELEMENT_NODE: - if preserveWhitespace: + # do not indent inside text content elements as in SVG there's a difference between + # "text1\ntext2" and + # "text1\n text2" + # see https://www.w3.org/TR/SVG/text.html#WhiteSpace + if preserveWhitespace or element.nodeName in ['text', 'tspan', 'tref', 'textPath', 'altGlyph']: outParts.append(serializeXML(child, options, 0, preserveWhitespace)) else: outParts.extend([newline, serializeXML(child, options, indent_depth + 1, preserveWhitespace)]) onNewLine = True # text node elif child.nodeType == Node.TEXT_NODE: - # trim it only in the case of not being a child of an element - # where whitespace might be important - if preserveWhitespace: - outParts.append(makeWellFormed(child.nodeValue)) - else: - outParts.append(makeWellFormed(child.nodeValue.strip())) + text_content = child.nodeValue + if not preserveWhitespace: + # strip / consolidate whitespace according to spec, see + # https://www.w3.org/TR/SVG/text.html#WhiteSpace + # As a workaround for inconsistent handling of renderers keep newlines if they were in the original + if element.nodeName in ['text', 'tspan', 'tref', 'textPath', 'altGlyph']: + text_content = text_content.replace('\t', ' ') + text_content = text_content.strip(' ') + while ' ' in text_content: + text_content = text_content.replace(' ', ' ') + else: + text_content = text_content.strip() + outParts.append(makeWellFormed(text_content)) # CDATA node elif child.nodeType == Node.CDATA_SECTION_NODE: outParts.extend(['']) From 2200f8dc819c4a330b49050569fb8aa0b01ff574 Mon Sep 17 00:00:00 2001 From: Eduard Braun Date: Mon, 2 Jul 2018 01:05:54 +0200 Subject: [PATCH 3/5] temp --- scour/scour.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/scour/scour.py b/scour/scour.py index 08027b8..000142d 100644 --- a/scour/scour.py +++ b/scour/scour.py @@ -3359,7 +3359,10 @@ def serializeXML(element, options, indent_depth=0, preserveWhitespace=False): # As a workaround for inconsistent handling of renderers keep newlines if they were in the original if element.nodeName in ['text', 'tspan', 'tref', 'textPath', 'altGlyph']: text_content = text_content.replace('\t', ' ') - text_content = text_content.strip(' ') + if child == element.firstChild: + text_content = text_content.lstrip() + elif child == element.lastChild: + text_content = text_content.rstrip() while ' ' in text_content: text_content = text_content.replace(' ', ' ') else: From 703122369ed0c6a4e4217e7a77a8bf47d2ec3671 Mon Sep 17 00:00:00 2001 From: Eduard Braun Date: Mon, 2 Jul 2018 22:14:14 +0200 Subject: [PATCH 4/5] Strip newlines from text nodes and be done with it Follow the spec "blindly" as it turns out covering all the border and getting reasonably styled output is just to cumbersome. This way at least scour output is consistent and it also saves us some bytes (a lot in some cases as we do not indent s etc. anymore) --- scour/scour.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scour/scour.py b/scour/scour.py index 000142d..a46af61 100644 --- a/scour/scour.py +++ b/scour/scour.py @@ -3356,8 +3356,8 @@ def serializeXML(element, options, indent_depth=0, preserveWhitespace=False): if not preserveWhitespace: # strip / consolidate whitespace according to spec, see # https://www.w3.org/TR/SVG/text.html#WhiteSpace - # As a workaround for inconsistent handling of renderers keep newlines if they were in the original if element.nodeName in ['text', 'tspan', 'tref', 'textPath', 'altGlyph']: + text_content = text_content.replace('\n', '') text_content = text_content.replace('\t', ' ') if child == element.firstChild: text_content = text_content.lstrip() From 651694a6c0547a57f6634ad4331fa2d551c87074 Mon Sep 17 00:00:00 2001 From: Eduard Braun Date: Tue, 3 Jul 2018 22:53:05 +0200 Subject: [PATCH 5/5] Add unittests for whitespace handling in text node Also expand/fix the test for line endings --- testscour.py | 103 +++++++++++++++++++++-------- unittests/newlines.svg | 50 ++++++++++++++ unittests/whitespace-important.svg | 4 -- unittests/whitespace-nested.svg | 4 -- unittests/whitespace.svg | 40 +++++++++++ 5 files changed, 166 insertions(+), 35 deletions(-) create mode 100644 unittests/newlines.svg delete mode 100644 unittests/whitespace-important.svg delete mode 100644 unittests/whitespace-nested.svg create mode 100644 unittests/whitespace.svg diff --git a/testscour.py b/testscour.py index cc6676e..04da38c 100755 --- a/testscour.py +++ b/testscour.py @@ -1744,34 +1744,83 @@ class DoNotRemoveGradientsWhenReferencedInStyleCss(unittest.TestCase): 'Gradients removed when referenced in CSS') -class DoNotPrettyPrintWhenWhitespacePreserved(unittest.TestCase): +class Whitespace(unittest.TestCase): - def runTest(self): - with open('unittests/whitespace-important.svg') as f: - s = scourString(f.read()).splitlines() - c = ''' - - This is some messed-up markup - -'''.splitlines() - for i in range(4): - self.assertEqual(s[i], c[i], - 'Whitespace not preserved for line ' + str(i)) + def setUp(self): + self.doc = scourXmlFile('unittests/whitespace.svg') + def test_basic(self): + text = self.doc.getElementById('txt_a1') + self.assertIn('text1 text2', text.toxml(), + 'Multiple spaces not stripped from text element') + text = self.doc.getElementById('txt_a2') + self.assertIn('text1 text2', text.toxml(), + 'Tab not replaced with space in text element') + text = self.doc.getElementById('txt_a3') + self.assertIn('text1 text2', text.toxml(), + 'Multiple spaces not stripped from text element with xml:space="default"') + text = self.doc.getElementById('txt_a4') + self.assertIn('text1 text2', text.toxml(), + 'Tab not replaced with space in text element with xml:space="default"') + text = self.doc.getElementById('txt_a5') + self.assertIn('text1 text2', text.toxml(), + 'Multiple spaces not preserved in text element with xml:space="preserve"') + text = self.doc.getElementById('txt_a6') + self.assertIn('text1\ttext2', text.toxml(), + 'Tab not preserved in text element with xml:space="preserve"') -class DoNotPrettyPrintWhenNestedWhitespacePreserved(unittest.TestCase): + def test_newlines(self): + text = self.doc.getElementById('txt_b1') + self.assertIn('text1 text2', text.toxml(), + 'Newline not replaced with space in text element') + text = self.doc.getElementById('txt_b2') + self.assertIn('text1 text2', text.toxml(), + 'Newline not replaced with space in text element with xml:space="default"') + text = self.doc.getElementById('txt_b3') + self.assertIn('text1\n text2', text.toxml(), + 'Newline not preserved in text element with xml:space="preserve"') - def runTest(self): - with open('unittests/whitespace-nested.svg') as f: - s = scourString(f.read()).splitlines() - c = ''' - - Use bold text - -'''.splitlines() - for i in range(4): - self.assertEqual(s[i], c[i], - 'Whitespace not preserved when nested for line ' + str(i)) + def test_inheritance(self): + text = self.doc.getElementById('txt_c1') + self.assertIn('text1 text2', text.toxml(), + ' does not inherit xml:space="preserve" of parent text element') + text = self.doc.getElementById('txt_c2') + self.assertIn('text1 text2', text.toxml(), + 'xml:space="default" of does not overwrite xml:space="preserve" of parent text element') + text = self.doc.getElementById('txt_c3') + self.assertIn('text1 text2', text.toxml(), + 'xml:space="preserve" of does not overwrite xml:space="default" of parent text element') + text = self.doc.getElementById('txt_c4') + self.assertIn('text1 text2', text.toxml(), + ' does not inherit xml:space="preserve" of parent group') + text = self.doc.getElementById('txt_c5') + self.assertIn('text1 text2', text.toxml(), + 'xml:space="default" of text element does not overwrite xml:space="preserve" of parent group') + text = self.doc.getElementById('txt_c6') + self.assertIn('text1 text2', text.toxml(), + 'xml:space="preserve" of text element does not overwrite xml:space="default" of parent group') + + def test_important_whitespace(self): + text = self.doc.getElementById('txt_d1') + self.assertIn('text1 text2', text.toxml(), + 'Newline with whitespace collapsed in text element') + text = self.doc.getElementById('txt_d2') + self.assertIn('text1 tspan1 text2', text.toxml(), + 'Whitespace stripped from the middle of a text element') + text = self.doc.getElementById('txt_d3') + self.assertIn('text1 tspan1 tspan2 text2', text.toxml(), + 'Whitespace stripped from the middle of a text element') + + def test_incorrect_whitespace(self): + text = self.doc.getElementById('txt_e1') + self.assertIn('text1text2', text.toxml(), + 'Whitespace introduced in text element with newline') + text = self.doc.getElementById('txt_e2') + self.assertIn('text1tspantext2', text.toxml(), + 'Whitespace introduced in text element with ') + text = self.doc.getElementById('txt_e3') + self.assertIn('text1tspantext2', text.toxml(), + 'Whitespace introduced in text element with and newlines') class GetAttrPrefixRight(unittest.TestCase): @@ -1807,10 +1856,10 @@ class HandleEmptyStyleElement(unittest.TestCase): class EnsureLineEndings(unittest.TestCase): def runTest(self): - with open('unittests/whitespace-important.svg') as f: + with open('unittests/newlines.svg') as f: s = scourString(f.read()) - self.assertEqual(len(s.splitlines()), 4, - 'Did not output line ending character correctly') + self.assertEqual(len(s.splitlines()), 24, + 'Did handle reading or outputting line ending characters correctly') class XmlEntities(unittest.TestCase): diff --git a/unittests/newlines.svg b/unittests/newlines.svg new file mode 100644 index 0000000..a909603 --- /dev/null +++ b/unittests/newlines.svg @@ -0,0 +1,50 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/unittests/whitespace-important.svg b/unittests/whitespace-important.svg deleted file mode 100644 index 6918044..0000000 --- a/unittests/whitespace-important.svg +++ /dev/null @@ -1,4 +0,0 @@ - - - This is some messed-up markup - diff --git a/unittests/whitespace-nested.svg b/unittests/whitespace-nested.svg deleted file mode 100644 index 3b99356..0000000 --- a/unittests/whitespace-nested.svg +++ /dev/null @@ -1,4 +0,0 @@ - - - Use bold text - diff --git a/unittests/whitespace.svg b/unittests/whitespace.svg new file mode 100644 index 0000000..2bb48a6 --- /dev/null +++ b/unittests/whitespace.svg @@ -0,0 +1,40 @@ + + + + text1 text2 + text1 text2 + text1 text2 + text1 text2 + text1 text2 + text1 text2 + + + text1 + text2 + text1 + text2 + text1 + text2 + + + text1 text2 + text1 text2 + text1 text2 + text1 text2 + text1 text2 + text1 text2 + + + text1 + text2 + text1 tspan1 text2 + text1 tspan1 tspan2 text2 + + + text1 +text2 + text1tspantext2 + text1 +tspan +text2 +