From 4eade6920145b5953a42e4bbb14deda81dafa090 Mon Sep 17 00:00:00 2001 From: Eduard Braun Date: Tue, 8 Dec 2015 23:38:06 +0100 Subject: [PATCH 1/3] Open input file in binary mode and let XML parser deal with encoding. Fixes #26 --- scour/scour.py | 4 ++-- testscour.py | 12 +++++++++--- unittests/utf8.svg | 20 +++++++++++++++++--- 3 files changed, 28 insertions(+), 8 deletions(-) diff --git a/scour/scour.py b/scour/scour.py index 335bd0e..a62ee58 100644 --- a/scour/scour.py +++ b/scour/scour.py @@ -3097,7 +3097,7 @@ def scourString(in_string, options=None): # input is a filename # returns the minidom doc representation of the SVG def scourXmlFile(filename, options=None): - with open(filename) as f: + with open(filename, "rb") as f: in_string = f.read() out_string = scourString(in_string, options) return xml.dom.minidom.parseString(out_string.encode('utf-8')) @@ -3235,7 +3235,7 @@ def parse_args(args=None, ignore_additional_args=False): _options_parser.error("Input filename is the same as output filename") if options.infilename: - infile = maybe_gziped_file(options.infilename) + infile = maybe_gziped_file(options.infilename, "rb") # GZ: could catch a raised IOError here and report else: # GZ: could sniff for gzip compression here diff --git a/testscour.py b/testscour.py index 7b29bdf..8912288 100755 --- a/testscour.py +++ b/testscour.py @@ -604,12 +604,18 @@ class ChangeQuadToShorthandInPath(unittest.TestCase): self.assertEqual(path.getAttribute('d'), 'm10 100q50-50 100 0t100 0', 'Did not change quadratic curves into shorthand curve segments in path') -class HandleNonAsciiUtf8(unittest.TestCase): +class HandleUTF8(unittest.TestCase): def runTest(self): doc = scour.scourXmlFile('unittests/utf8.svg') + text = u'Hello in many languages:\nar: أهلا\nbn: হ্যালো\nel: Χαίρετε\nen: Hello\nhi: नमस्ते\niw: שלום\nja: こんにちは\nkm: ជំរាបសួរ\nml: ഹലോ\nru: Здравствуйте\nur: ہیلو\nzh: 您好' desc = six.text_type(doc.getElementsByTagNameNS(SVGNS, 'desc')[0].firstChild.wholeText).strip() - 
self.assertEqual( desc, u'ú', - 'Did not handle non-ASCII characters' ) + self.assertEqual( desc, text, 'Did not handle international UTF8 characters' ) + desc = six.text_type(doc.getElementsByTagNameNS(SVGNS, 'desc')[1].firstChild.wholeText).strip() + self.assertEqual( desc, u'“”‘’–—…‐‒°©®™•½¼¾⅓⅔†‡µ¢£€«»♠♣♥♦¿�', 'Did not handle common UTF8 characters' ) + desc = six.text_type(doc.getElementsByTagNameNS(SVGNS, 'desc')[2].firstChild.wholeText).strip() + self.assertEqual( desc, u':-×÷±∞π∅≤≥≠≈∧∨∩∪∈∀∃∄∑∏←↑→↓↔↕↖↗↘↙↺↻⇒⇔', 'Did not handle mathematical UTF8 characters' ) + desc = six.text_type(doc.getElementsByTagNameNS(SVGNS, 'desc')[3].firstChild.wholeText).strip() + self.assertEqual( desc, u'⁰¹²³⁴⁵⁶⁷⁸⁹⁺⁻⁽⁾ⁿⁱ₀₁₂₃₄₅₆₇₈₉₊₋₌₍₎', 'Did not handle superscript/subscript UTF8 characters' ) class HandleSciNoInPathData(unittest.TestCase): def runTest(self): diff --git a/unittests/utf8.svg b/unittests/utf8.svg index 6c77d7a..dd63f12 100644 --- a/unittests/utf8.svg +++ b/unittests/utf8.svg @@ -1,5 +1,19 @@ - - ú + + Hello in many languages: +ar: أهلا +bn: হ্যালো +el: Χαίρετε +en: Hello +hi: नमस्ते +iw: שלום +ja: こんにちは +km: ជំរាបសួរ +ml: ഹലോ +ru: Здравствуйте +ur: ہیلو +zh: 您好 + “”‘’–—…‐‒°©®™•½¼¾⅓⅔†‡µ¢£€«»♠♣♥♦¿� + :-×÷±∞π∅≤≥≠≈∧∨∩∪∈∀∃∄∑∏←↑→↓↔↕↖↗↘↙↺↻⇒⇔ + ⁰¹²³⁴⁵⁶⁷⁸⁹⁺⁻⁽⁾ⁿⁱ₀₁₂₃₄₅₆₇₈₉₊₋₌₍₎ From 8984e550b07df5e490ebcac4194ee87fc6927844 Mon Sep 17 00:00:00 2001 From: Eduard Braun Date: Wed, 9 Dec 2015 00:30:16 +0100 Subject: [PATCH 2/3] Read from stdin in binary mode and let XML parser deal with encoding. Also write to stdout in binary mode as the output is already encoded. 
--- scour/scour.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/scour/scour.py b/scour/scour.py index a62ee58..cb2d386 100644 --- a/scour/scour.py +++ b/scour/scour.py @@ -3239,11 +3239,20 @@ def parse_args(args=None, ignore_additional_args=False): # GZ: could catch a raised IOError here and report else: # GZ: could sniff for gzip compression here - infile = sys.stdin + # + # open the binary buffer of stdin and let XML parser handle decoding + try: + infile = sys.stdin.buffer + except AttributeError: + infile = sys.stdin if options.outfilename: outfile = maybe_gziped_file(options.outfilename, "wb") else: - outfile = sys.stdout + # open the binary buffer of stdout as the output is already encoded + try: + outfile = sys.stdout.buffer + except AttributeError: + outfile = sys.stdout return options, [infile, outfile] From 946ca3ce4acc7a21337c696cd83b0ad4af05f40f Mon Sep 17 00:00:00 2001 From: Eduard Braun Date: Wed, 9 Dec 2015 21:31:16 +0100 Subject: [PATCH 3/3] Unittests: Add a test for proper decoding of ISO 8859-15 --- testscour.py | 10 ++++++++-- unittests/encoding-iso-8859-15.svg | 4 ++++ unittests/{utf8.svg => encoding-utf8.svg} | 0 3 files changed, 12 insertions(+), 2 deletions(-) create mode 100644 unittests/encoding-iso-8859-15.svg rename unittests/{utf8.svg => encoding-utf8.svg} (100%) diff --git a/testscour.py b/testscour.py index 8912288..a5c8185 100755 --- a/testscour.py +++ b/testscour.py @@ -604,9 +604,9 @@ class ChangeQuadToShorthandInPath(unittest.TestCase): self.assertEqual(path.getAttribute('d'), 'm10 100q50-50 100 0t100 0', 'Did not change quadratic curves into shorthand curve segments in path') -class HandleUTF8(unittest.TestCase): +class HandleEncodingUTF8(unittest.TestCase): def runTest(self): - doc = scour.scourXmlFile('unittests/utf8.svg') + doc = scour.scourXmlFile('unittests/encoding-utf8.svg') text = u'Hello in many languages:\nar: أهلا\nbn: হ্যালো\nel: Χαίρετε\nen: Hello\nhi: नमस्ते\niw: שלום\nja: 
こんにちは\nkm: ជំរាបសួរ\nml: ഹലോ\nru: Здравствуйте\nur: ہیلو\nzh: 您好' desc = six.text_type(doc.getElementsByTagNameNS(SVGNS, 'desc')[0].firstChild.wholeText).strip() self.assertEqual( desc, text, 'Did not handle international UTF8 characters' ) @@ -617,6 +617,12 @@ class HandleUTF8(unittest.TestCase): desc = six.text_type(doc.getElementsByTagNameNS(SVGNS, 'desc')[3].firstChild.wholeText).strip() self.assertEqual( desc, u'⁰¹²³⁴⁵⁶⁷⁸⁹⁺⁻⁽⁾ⁿⁱ₀₁₂₃₄₅₆₇₈₉₊₋₌₍₎', 'Did not handle superscript/subscript UTF8 characters' ) +class HandleEncodingISO_8859_15(unittest.TestCase): + def runTest(self): + doc = scour.scourXmlFile('unittests/encoding-iso-8859-15.svg') + desc = six.text_type(doc.getElementsByTagNameNS(SVGNS, 'desc')[0].firstChild.wholeText).strip() + self.assertEqual( desc, u'áèîäöüß€ŠšŽžŒœŸ', 'Did not handle ISO 8859-15 encoded characters' ) + class HandleSciNoInPathData(unittest.TestCase): def runTest(self): doc = scour.scourXmlFile('unittests/path-sn.svg') diff --git a/unittests/encoding-iso-8859-15.svg b/unittests/encoding-iso-8859-15.svg new file mode 100644 index 0000000..626aca4 --- /dev/null +++ b/unittests/encoding-iso-8859-15.svg @@ -0,0 +1,4 @@ + + + ߤ + diff --git a/unittests/utf8.svg b/unittests/encoding-utf8.svg similarity index 100% rename from unittests/utf8.svg rename to unittests/encoding-utf8.svg