diff --git a/scour/scour.py b/scour/scour.py
index 335bd0e..cb2d386 100644
--- a/scour/scour.py
+++ b/scour/scour.py
@@ -3097,7 +3097,7 @@ def scourString(in_string, options=None):
# input is a filename
# returns the minidom doc representation of the SVG
def scourXmlFile(filename, options=None):
- with open(filename) as f:
+ with open(filename, "rb") as f:
in_string = f.read()
out_string = scourString(in_string, options)
return xml.dom.minidom.parseString(out_string.encode('utf-8'))
@@ -3235,15 +3235,24 @@ def parse_args(args=None, ignore_additional_args=False):
_options_parser.error("Input filename is the same as output filename")
if options.infilename:
- infile = maybe_gziped_file(options.infilename)
+ infile = maybe_gziped_file(options.infilename, "rb")
# GZ: could catch a raised IOError here and report
else:
# GZ: could sniff for gzip compression here
- infile = sys.stdin
+ #
+ # use the binary buffer of stdin and let the XML parser handle decoding
+ try:
+ infile = sys.stdin.buffer
+ except AttributeError:
+ infile = sys.stdin
if options.outfilename:
outfile = maybe_gziped_file(options.outfilename, "wb")
else:
- outfile = sys.stdout
+ # use the binary buffer of stdout, since the output is already encoded
+ try:
+ outfile = sys.stdout.buffer
+ except AttributeError:
+ outfile = sys.stdout
return options, [infile, outfile]
diff --git a/testscour.py b/testscour.py
index 7b29bdf..a5c8185 100755
--- a/testscour.py
+++ b/testscour.py
@@ -604,12 +604,24 @@ class ChangeQuadToShorthandInPath(unittest.TestCase):
self.assertEqual(path.getAttribute('d'), 'm10 100q50-50 100 0t100 0',
'Did not change quadratic curves into shorthand curve segments in path')
-class HandleNonAsciiUtf8(unittest.TestCase):
+class HandleEncodingUTF8(unittest.TestCase):
def runTest(self):
- doc = scour.scourXmlFile('unittests/utf8.svg')
+ doc = scour.scourXmlFile('unittests/encoding-utf8.svg')
+ text = u'Hello in many languages:\nar: أهلا\nbn: হ্যালো\nel: Χαίρετε\nen: Hello\nhi: नमस्ते\niw: שלום\nja: こんにちは\nkm: ជំរាបសួរ\nml: ഹലോ\nru: Здравствуйте\nur: ہیلو\nzh: 您好'
desc = six.text_type(doc.getElementsByTagNameNS(SVGNS, 'desc')[0].firstChild.wholeText).strip()
- self.assertEqual( desc, u'ú',
- 'Did not handle non-ASCII characters' )
+ self.assertEqual( desc, text, 'Did not handle international UTF8 characters' )
+ desc = six.text_type(doc.getElementsByTagNameNS(SVGNS, 'desc')[1].firstChild.wholeText).strip()
+ self.assertEqual( desc, u'“”‘’–—…‐‒°©®™•½¼¾⅓⅔†‡µ¢£€«»♠♣♥♦¿�', 'Did not handle common UTF8 characters' )
+ desc = six.text_type(doc.getElementsByTagNameNS(SVGNS, 'desc')[2].firstChild.wholeText).strip()
+ self.assertEqual( desc, u':-×÷±∞π∅≤≥≠≈∧∨∩∪∈∀∃∄∑∏←↑→↓↔↕↖↗↘↙↺↻⇒⇔', 'Did not handle mathematical UTF8 characters' )
+ desc = six.text_type(doc.getElementsByTagNameNS(SVGNS, 'desc')[3].firstChild.wholeText).strip()
+ self.assertEqual( desc, u'⁰¹²³⁴⁵⁶⁷⁸⁹⁺⁻⁽⁾ⁿⁱ₀₁₂₃₄₅₆₇₈₉₊₋₌₍₎', 'Did not handle superscript/subscript UTF8 characters' )
+
+class HandleEncodingISO_8859_15(unittest.TestCase):
+ def runTest(self):
+ doc = scour.scourXmlFile('unittests/encoding-iso-8859-15.svg')
+ desc = six.text_type(doc.getElementsByTagNameNS(SVGNS, 'desc')[0].firstChild.wholeText).strip()
+ self.assertEqual( desc, u'áèîäöüß€ŠšŽžŒœŸ', 'Did not handle ISO 8859-15 encoded characters' )
class HandleSciNoInPathData(unittest.TestCase):
def runTest(self):
diff --git a/unittests/encoding-iso-8859-15.svg b/unittests/encoding-iso-8859-15.svg
new file mode 100644
index 0000000..626aca4
--- /dev/null
+++ b/unittests/encoding-iso-8859-15.svg
@@ -0,0 +1,4 @@
+
+
diff --git a/unittests/encoding-utf8.svg b/unittests/encoding-utf8.svg
new file mode 100644
index 0000000..dd63f12
--- /dev/null
+++ b/unittests/encoding-utf8.svg
@@ -0,0 +1,19 @@
+
+
diff --git a/unittests/utf8.svg b/unittests/utf8.svg
deleted file mode 100644
index 6c77d7a..0000000
--- a/unittests/utf8.svg
+++ /dev/null
@@ -1,5 +0,0 @@
-
-