From 462460a5125c1d617dc9e68a8cae8ccd6fb4f7d3 Mon Sep 17 00:00:00 2001 From: Eduard Braun Date: Fri, 23 Sep 2016 00:20:14 +0200 Subject: [PATCH 1/4] Fix `embedRasters()` function. It was not Python 3 compatible and usually would not have worked with local files. --- scour/scour.py | 75 +++++++++++++++++++++++++++++++------------------- 1 file changed, 47 insertions(+), 28 deletions(-) diff --git a/scour/scour.py b/scour/scour.py index 12867da..2ae8230 100644 --- a/scour/scour.py +++ b/scour/scour.py @@ -60,7 +60,7 @@ from collections import namedtuple from decimal import Context, Decimal, InvalidOperation, getcontext import six -from six.moves import range +from six.moves import range, urllib from scour.svg_regex import svg_parser from scour.svg_transform import svg_transform_parser @@ -2939,47 +2939,65 @@ def removeComments(element): def embedRasters(element, options): import base64 - import urllib """ Converts raster references to inline images. NOTE: there are size limits to base64-encoding handling in browsers - """ + """ global _num_rasters_embedded href = element.getAttributeNS(NS['XLINK'], 'href') # if xlink:href is set, then grab the id if href != '' and len(href) > 1: - # find if href value has filename ext ext = os.path.splitext(os.path.basename(href))[1].lower()[1:] - # look for 'png', 'jpg', and 'gif' extensions - if ext == 'png' or ext == 'jpg' or ext == 'gif': + # only operate on files with 'png', 'jpg', and 'gif' file extensions + if ext in ['png', 'jpg', 'gif']: + # fix common issues with file paths + # TODO: should we warn the user instead of trying to correct those invalid URIs? + # convert backslashes to slashes + href_fixed = href.replace('\\', '/') + # absolute 'file:' URIs have to use three slashes (unless specifying a host which I've never seen) + href_fixed = re.sub('file:/+', 'file:///', href_fixed) - # file:// URLs denote files on the local system too - if href[:7] == 'file://': - href = href[7:] - # does the file exist? - if os.path.isfile(href): - # if this is not an absolute path, set path relative - # to script file based on input arg - infilename = '.' + # parse the URI to get scheme and path + # in principle it would make sense to work only with this ParseResult and call 'urlunparse()' in the end + # however 'urlunparse(urlparse(file:raster.png))' -> 'file:///raster.png' which is nonsense + parsed_href = urllib.parse.urlparse(href_fixed) + + # assume locations without protocol point to local files (and should use the 'file:' protocol) + if parsed_href.scheme == '': + parsed_href = parsed_href._replace(scheme='file') + if href_fixed[0] == '/': + href_fixed = 'file://' + href_fixed + else: + href_fixed = 'file:' + href_fixed + + # relative local paths are relative to the input file, therefore temporarily change the working dir + working_dir_old = None + if parsed_href.scheme == 'file' and parsed_href.path[0] != '/': if options.infilename: - infilename = options.infilename - href = os.path.join(os.path.dirname(infilename), href) + working_dir_old = os.getcwd() + working_dir_new = os.path.abspath(os.path.dirname(options.infilename)) + os.chdir(working_dir_new) - rasterdata = '' - # test if file exists locally - if os.path.isfile(href): - # open raster file as raw binary - raster = open(href, "rb") - rasterdata = raster.read() - elif href[:7] == 'http://': - webFile = urllib.urlopen(href) - rasterdata = webFile.read() - webFile.close() + # open/download the file + try: + file = urllib.request.urlopen(href_fixed) + rasterdata = file.read() + file.close() + except Exception as e: + print("WARNING: Could not open file '" + href + "' for embedding. " + "The raster image will be kept as a reference but might be invalid. " + "(Exception details: " + str(e) + ")", file=sys.stderr) + rasterdata = '' + finally: + # always restore initial working directory if we changed it above + if working_dir_old is not None: + os.chdir(working_dir_old) - # ... should we remove all images which don't resolve? + # TODO: should we remove all images which don't resolve? + # then we also have to consider unreachable remote locations (i.e. if there is no internet connection) if rasterdata != '': # base64-encode raster b64eRaster = base64.b64encode(rasterdata) @@ -2991,7 +3009,8 @@ def embedRasters(element, options): if ext == 'jpg': ext = 'jpeg' - element.setAttributeNS(NS['XLINK'], 'href', 'data:image/' + ext + ';base64,' + b64eRaster) + element.setAttributeNS(NS['XLINK'], 'href', + 'data:image/' + ext + ';base64,' + b64eRaster.decode()) _num_rasters_embedded += 1 del b64eRaster From 8cc97601c45d9efaa22c11e5961b3883132e312e Mon Sep 17 00:00:00 2001 From: Eduard Braun Date: Fri, 23 Sep 2016 22:32:32 +0200 Subject: [PATCH 2/4] scourXmlFile(): Set specified 'filename' as input filename so relative references will work --- scour/scour.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/scour/scour.py b/scour/scour.py index 2ae8230..e91d385 100644 --- a/scour/scour.py +++ b/scour/scour.py @@ -3519,10 +3519,17 @@ def scourString(in_string, options=None): # input is a filename # returns the minidom doc representation of the SVG def scourXmlFile(filename, options=None): + # we need to set infilename (otherwise relative references in the SVG won't work) + if options is None: + options = generateDefaultOptions() + options.infilename = filename + + # open the file and scour it with open(filename, "rb") as f: in_string = f.read() out_string = scourString(in_string, options) + # prepare the output xml.dom.minidom object doc = xml.dom.minidom.parseString(out_string.encode('utf-8')) # since minidom does not seem to parse DTDs properly From 902e112a96d1f522ec5d82d55a0cd62db3790eae Mon Sep 17 00:00:00 2001 From: Eduard Braun Date: Fri, 23 Sep 2016 22:33:01 +0200 Subject: [PATCH 3/4] Add unittests for embedding rasters (and --disable-embed-rasters) --- testscour.py | 57 +++++++++++++++++++++++++++++-- unittests/raster-formats.svg | 7 ++++ unittests/raster-paths-local.svg | 19 +++++++++++ unittests/raster-paths-remote.svg | 8 +++++ 4 files changed, 89 insertions(+), 2 deletions(-) create mode 100644 unittests/raster-formats.svg create mode 100644 unittests/raster-paths-local.svg create mode 100644 unittests/raster-paths-remote.svg diff --git a/testscour.py b/testscour.py index 08eda23..f14e901 100755 --- a/testscour.py +++ b/testscour.py @@ -2341,9 +2341,62 @@ class CommandLineUsage(unittest.TestCase): "Statistics output not as expected when '--verbose' option was used") +class EmbedRasters(unittest.TestCase): + + # quick way to ping a host using the OS 'ping' command and return the execution result + def _ping(host): + import os + import platform + + system = platform.system().lower() + ping_count = '-n' if system == 'windows' else '-c' + dev_null = 'NUL' if system == 'windows' else '/dev/null' + + return os.system('ping ' + ping_count + ' 1 ' + host + ' > ' + dev_null) + + def test_disable_embed_rasters(self): + doc = scourXmlFile('unittests/raster-formats.svg', + parse_args(['--disable-embed-rasters'])) + self.assertEqual(doc.getElementById('png').getAttribute('xlink:href'), 'raster.png', + "Raster image embedded when '--disable-embed-rasters' was specified") + + def test_raster_formats(self): + doc = scourXmlFile('unittests/raster-formats.svg') + self.assertEqual(doc.getElementById('png').getAttribute('xlink:href'), + '' + 'VBMVEUAAP//AAAA/wBmtfVOAAAACklEQVQI12NIAAAAYgBhGxZhsAAAAABJRU5ErkJggg==', + "Raster image (PNG) not correctly embedded.") + self.assertEqual(doc.getElementById('gif').getAttribute('xlink:href'), + '', + "Raster image (GIF) not correctly embedded.") + self.assertEqual(doc.getElementById('jpg').getAttribute('xlink:href'), + '' + '2wBDAAEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQH/' + '2wBDAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQH/' + 'wAARCAABAAMDAREAAhEBAxEB/8QAFAABAAAAAAAAAAAAAAAAAAAACv/EABoQAAEFAQAAAAAAAAAAAAAAAAgABQc3d7j/' + 'xAAVAQEBAAAAAAAAAAAAAAAAAAAHCv/EABwRAAEDBQAAAAAAAAAAAAAAAAgAB7gJODl2eP/aAAwDAQACEQMRAD8AMeaF' + '/u2aj5z1Fqp7oN4rxx2kn5cPuhV6LkzG7qOyYL2r/9k=', + "Raster image (JPG) not correctly embedded.") + + def test_raster_paths_local(self): + doc = scourXmlFile('unittests/raster-paths-local.svg') + images = doc.getElementsByTagName('image') + for image in images: + href = image.getAttribute('xlink:href') + self.assertTrue(href.startswith('data:image/'), + "Raster image from local path '" + href + "' not embedded.") + + @unittest.skipIf(_ping('raw.githubusercontent.com') != 0, "Remote server not reachable.") + def test_raster_paths_remote(self): + doc = scourXmlFile('unittests/raster-paths-remote.svg') + images = doc.getElementsByTagName('image') + for image in images: + href = image.getAttribute('xlink:href') + self.assertTrue(href.startswith('data:image/'), + "Raster image from remote path '" + href + "' not embedded.") + + # TODO: write tests for --enable-viewboxing -# TODO; write a test for embedding rasters -# TODO: write a test for --disable-embed-rasters # TODO: write tests for --keep-editor-data if __name__ == '__main__': diff --git a/unittests/raster-formats.svg b/unittests/raster-formats.svg new file mode 100644 index 0000000..c31b65a --- /dev/null +++ b/unittests/raster-formats.svg @@ -0,0 +1,7 @@ + + + Three different formats + + + + \ No newline at end of file diff --git a/unittests/raster-paths-local.svg b/unittests/raster-paths-local.svg new file mode 100644 index 0000000..9cc6ed9 --- /dev/null +++ b/unittests/raster-paths-local.svg @@ -0,0 +1,19 @@ + + + + Local files + + + + + + + + Local files (file: protocol) + + + + + + + \ No newline at end of file diff --git a/unittests/raster-paths-remote.svg b/unittests/raster-paths-remote.svg new file mode 100644 index 0000000..ede7783 --- /dev/null +++ b/unittests/raster-paths-remote.svg @@ -0,0 +1,8 @@ + + + + Files from internet + + + + \ No newline at end of file From de1441fd58294585cc0e3ad296572960d0b2aac5 Mon Sep 17 00:00:00 2001 From: Eduard Braun Date: Fri, 23 Sep 2016 23:16:19 +0200 Subject: [PATCH 4/4] Exclude (system specific) absolute paths from test file and add a unittest that creates/tests absolute paths on-the-fly --- testscour.py | 23 +++++++++++++++++++++++ unittests/raster-paths-local.svg | 4 ++-- 2 files changed, 25 insertions(+), 2 deletions(-) diff --git a/testscour.py b/testscour.py index f14e901..6098914 100755 --- a/testscour.py +++ b/testscour.py @@ -2386,6 +2386,29 @@ class EmbedRasters(unittest.TestCase): self.assertTrue(href.startswith('data:image/'), "Raster image from local path '" + href + "' not embedded.") + def test_raster_paths_local_absolute(self): + with open('unittests/raster-formats.svg', 'r') as f: + svg = f.read() + + # create a reference string by scouring the original file with relative links + options = ScourOptions + options.infilename = 'unittests/raster-formats.svg' + reference_svg = scourString(svg, options) + + # this will not always create formally valid paths but it'll check how robust our implementation is + # (the third path is invalid for sure because file: needs three slashes according to URI spec) + svg = svg.replace('raster.png', + '/' + os.path.abspath(os.path.dirname(__file__)) + '\\unittests\\raster.png') + svg = svg.replace('raster.gif', + 'file:///' + os.path.abspath(os.path.dirname(__file__)) + '/unittests/raster.gif') + svg = svg.replace('raster.jpg', + 'file:/' + os.path.abspath(os.path.dirname(__file__)) + '/unittests/raster.jpg') + + svg = scourString(svg) + + self.assertEqual(svg, reference_svg, + "Raster images from absolute local paths not properly embedded.") + @unittest.skipIf(_ping('raw.githubusercontent.com') != 0, "Remote server not reachable.") def test_raster_paths_remote(self): doc = scourXmlFile('unittests/raster-paths-remote.svg') diff --git a/unittests/raster-paths-local.svg b/unittests/raster-paths-local.svg index 9cc6ed9..61db8ab 100644 --- a/unittests/raster-paths-local.svg +++ b/unittests/raster-paths-local.svg @@ -6,7 +6,7 @@ - + Local files (file: protocol) @@ -14,6 +14,6 @@ - + \ No newline at end of file