From 4eade6920145b5953a42e4bbb14deda81dafa090 Mon Sep 17 00:00:00 2001
From: Eduard Braun <Eduard.Braun2@gmx.de>
Date: Tue, 8 Dec 2015 23:38:06 +0100
Subject: [PATCH 1/3] Open input file in binary mode and let XML parser deal
 with encoding. Fixes #26

---
 scour/scour.py     |  4 ++--
 testscour.py       | 12 +++++++++---
 unittests/utf8.svg | 20 +++++++++++++++++---
 3 files changed, 28 insertions(+), 8 deletions(-)
diff --git a/scour/scour.py b/scour/scour.py
index 335bd0e..a62ee58 100644
--- a/scour/scour.py
+++ b/scour/scour.py
@@ -3097,7 +3097,7 @@ def scourString(in_string, options=None):
 # input is a filename
 # returns the minidom doc representation of the SVG
 def scourXmlFile(filename, options=None):
-   with open(filename) as f:
+   with open(filename, "rb") as f:
       in_string = f.read()
    out_string = scourString(in_string, options)
    return xml.dom.minidom.parseString(out_string.encode('utf-8'))
@@ -3235,7 +3235,7 @@ def parse_args(args=None, ignore_additional_args=False):
       _options_parser.error("Input filename is the same as output filename")
 
    if options.infilename:
-      infile = maybe_gziped_file(options.infilename)
+      infile = maybe_gziped_file(options.infilename, "rb")
       # GZ: could catch a raised IOError here and report
    else:
       # GZ: could sniff for gzip compression here
diff --git a/testscour.py b/testscour.py
index 7b29bdf..8912288 100755
--- a/testscour.py
+++ b/testscour.py
@@ -604,12 +604,18 @@ class ChangeQuadToShorthandInPath(unittest.TestCase):
 		self.assertEqual(path.getAttribute('d'), 'm10 100q50-50 100 0t100 0',
 			'Did not change quadratic curves into shorthand curve segments in path')
 
-class HandleNonAsciiUtf8(unittest.TestCase):
+class HandleUTF8(unittest.TestCase):
 	def runTest(self):
 		doc = scour.scourXmlFile('unittests/utf8.svg')
+		text = u'Hello in many languages:\nar: أهلا\nbn: হ্যালো\nel: Χαίρετε\nen: Hello\nhi: नमस्ते\niw: שלום\nja: こんにちは\nkm: ជំរាបសួរ\nml: ഹലോ\nru: Здравствуйте\nur: ہیلو\nzh: 您好'
 		desc = six.text_type(doc.getElementsByTagNameNS(SVGNS, 'desc')[0].firstChild.wholeText).strip()
-		self.assertEqual( desc, u'ú',
-			'Did not handle non-ASCII characters' )
+		self.assertEqual( desc, text, 'Did not handle international UTF8 characters' )
+		desc = six.text_type(doc.getElementsByTagNameNS(SVGNS, 'desc')[1].firstChild.wholeText).strip()
+		self.assertEqual( desc, u'“”‘’–—…‐‒°©®™•½¼¾⅓⅔†‡µ¢£€«»♠♣♥♦¿�', 'Did not handle common UTF8 characters' )
+		desc = six.text_type(doc.getElementsByTagNameNS(SVGNS, 'desc')[2].firstChild.wholeText).strip()
+		self.assertEqual( desc, u':-×÷±∞π∅≤≥≠≈∧∨∩∪∈∀∃∄∑∏←↑→↓↔↕↖↗↘↙↺↻⇒⇔', 'Did not handle mathematical UTF8 characters' )
+		desc = six.text_type(doc.getElementsByTagNameNS(SVGNS, 'desc')[3].firstChild.wholeText).strip()
+		self.assertEqual( desc, u'⁰¹²³⁴⁵⁶⁷⁸⁹⁺⁻⁽⁾ⁿⁱ₀₁₂₃₄₅₆₇₈₉₊₋₌₍₎', 'Did not handle superscript/subscript UTF8 characters' )
 
 class HandleSciNoInPathData(unittest.TestCase):
 	def runTest(self):
diff --git a/unittests/utf8.svg b/unittests/utf8.svg
index 6c77d7a..dd63f12 100644
--- a/unittests/utf8.svg
+++ b/unittests/utf8.svg
@@ -1,5 +1,19 @@
 <?xml version="1.0" encoding="UTF-8" standalone="no"?>
-<svg xmlns:xlink="http://www.w3.org/1999/xlink" 
-   xmlns="http://www.w3.org/2000/svg">
-  <desc>ú</desc>
+<svg xmlns="http://www.w3.org/2000/svg">
+ <desc id="hello">Hello in many languages:
+ar: أهلا
+bn: হ্যালো
+el: Χαίρετε
+en: Hello
+hi: नमस्ते
+iw: שלום
+ja: こんにちは
+km: ជំរាបសួរ
+ml: ഹലോ
+ru: Здравствуйте
+ur: ہیلو
+zh: 您好</desc>
+ <desc id="common">“”‘’–—…‐‒°©®™•½¼¾⅓⅔†‡µ¢£€«»♠♣♥♦¿�</desc>
+ <desc id="math">:-×÷±∞π∅≤≥≠≈∧∨∩∪∈∀∃∄∑∏←↑→↓↔↕↖↗↘↙↺↻⇒⇔</desc>
+ <desc id="supersub">⁰¹²³⁴⁵⁶⁷⁸⁹⁺⁻⁽⁾ⁿⁱ₀₁₂₃₄₅₆₇₈₉₊₋₌₍₎</desc>
 </svg>

From 8984e550b07df5e490ebcac4194ee87fc6927844 Mon Sep 17 00:00:00 2001
From: Eduard Braun <Eduard.Braun2@gmx.de>
Date: Wed, 9 Dec 2015 00:30:16 +0100
Subject: [PATCH 2/3] Read from stdin in binary mode and let XML parser deal
 with encoding. Also write to stdout in binary mode as the output is already
 encoded.

---
 scour/scour.py | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

diff --git a/scour/scour.py b/scour/scour.py
index a62ee58..cb2d386 100644
--- a/scour/scour.py
+++ b/scour/scour.py
@@ -3239,11 +3239,20 @@ def parse_args(args=None, ignore_additional_args=False):
       # GZ: could catch a raised IOError here and report
    else:
       # GZ: could sniff for gzip compression here
-      infile = sys.stdin
+      #
+      # open the binary buffer of stdin and let XML parser handle decoding
+      try:
+        infile = sys.stdin.buffer
+      except AttributeError:
+        infile = sys.stdin
    if options.outfilename:
       outfile = maybe_gziped_file(options.outfilename, "wb")
    else:
-      outfile = sys.stdout
+      # open the binary buffer of stdout as the output is already encoded
+      try:
+         outfile = sys.stdout.buffer
+      except AttributeError:
+         outfile = sys.stdout
 
    return options, [infile, outfile]
 

From 946ca3ce4acc7a21337c696cd83b0ad4af05f40f Mon Sep 17 00:00:00 2001
From: Eduard Braun <Eduard.Braun2@gmx.de>
Date: Wed, 9 Dec 2015 21:31:16 +0100
Subject: [PATCH 3/3] Unittests: Add a test for proper decoding of ISO 8859-15

---
 testscour.py                              | 10 ++++++++--
 unittests/encoding-iso-8859-15.svg        |  4 ++++
 unittests/{utf8.svg => encoding-utf8.svg} |  0
 3 files changed, 12 insertions(+), 2 deletions(-)
 create mode 100644 unittests/encoding-iso-8859-15.svg
 rename unittests/{utf8.svg => encoding-utf8.svg} (100%)

diff --git a/testscour.py b/testscour.py
index 8912288..a5c8185 100755
--- a/testscour.py
+++ b/testscour.py
@@ -604,9 +604,9 @@ class ChangeQuadToShorthandInPath(unittest.TestCase):
 		self.assertEqual(path.getAttribute('d'), 'm10 100q50-50 100 0t100 0',
 			'Did not change quadratic curves into shorthand curve segments in path')
 
-class HandleUTF8(unittest.TestCase):
+class HandleEncodingUTF8(unittest.TestCase):
 	def runTest(self):
-		doc = scour.scourXmlFile('unittests/utf8.svg')
+		doc = scour.scourXmlFile('unittests/encoding-utf8.svg')
 		text = u'Hello in many languages:\nar: أهلا\nbn: হ্যালো\nel: Χαίρετε\nen: Hello\nhi: नमस्ते\niw: שלום\nja: こんにちは\nkm: ជំរាបសួរ\nml: ഹലോ\nru: Здравствуйте\nur: ہیلو\nzh: 您好'
 		desc = six.text_type(doc.getElementsByTagNameNS(SVGNS, 'desc')[0].firstChild.wholeText).strip()
 		self.assertEqual( desc, text, 'Did not handle international UTF8 characters' )
@@ -617,6 +617,12 @@ class HandleUTF8(unittest.TestCase):
 		desc = six.text_type(doc.getElementsByTagNameNS(SVGNS, 'desc')[3].firstChild.wholeText).strip()
 		self.assertEqual( desc, u'⁰¹²³⁴⁵⁶⁷⁸⁹⁺⁻⁽⁾ⁿⁱ₀₁₂₃₄₅₆₇₈₉₊₋₌₍₎', 'Did not handle superscript/subscript UTF8 characters' )
 
+class HandleEncodingISO_8859_15(unittest.TestCase):
+	def runTest(self):
+		doc = scour.scourXmlFile('unittests/encoding-iso-8859-15.svg')
+		desc = six.text_type(doc.getElementsByTagNameNS(SVGNS, 'desc')[0].firstChild.wholeText).strip()
+		self.assertEqual( desc, u'áèîäöüß€ŠšŽžŒœŸ', 'Did not handle ISO 8859-15 encoded characters' )
+
 class HandleSciNoInPathData(unittest.TestCase):
 	def runTest(self):
 		doc = scour.scourXmlFile('unittests/path-sn.svg')
diff --git a/unittests/encoding-iso-8859-15.svg b/unittests/encoding-iso-8859-15.svg
new file mode 100644
index 0000000..626aca4
--- /dev/null
+++ b/unittests/encoding-iso-8859-15.svg
@@ -0,0 +1,4 @@
+<?xml version="1.0" encoding="ISO-8859-15" standalone="no"?>
+<svg xmlns="http://www.w3.org/2000/svg">
+ <desc>������ߤ�������</desc>
+</svg>
diff --git a/unittests/utf8.svg b/unittests/encoding-utf8.svg
similarity index 100%
rename from unittests/utf8.svg
rename to unittests/encoding-utf8.svg