From 82ce83acab7fbec7de24ca3af835c53b1652f00a Mon Sep 17 00:00:00 2001 From: Niels Thykier Date: Sun, 18 Mar 2018 11:50:45 +0000 Subject: [PATCH 1/2] Make embedRasters independent of options.inputfile This is a part of making scour able to support multiple input files. Signed-off-by: Niels Thykier --- scour/scour.py | 28 ++++++++++++++-------------- testscour.py | 9 +++++---- 2 files changed, 19 insertions(+), 18 deletions(-) diff --git a/scour/scour.py b/scour/scour.py index 1e990c4..605fb7d 100644 --- a/scour/scour.py +++ b/scour/scour.py @@ -3017,7 +3017,7 @@ def removeComments(element): return num -def embedRasters(element, options): +def embedRasters(element, references_relative_to, options): import base64 """ Converts raster references to inline images. @@ -3056,9 +3056,9 @@ def embedRasters(element, options): # relative local paths are relative to the input file, therefore temporarily change the working dir working_dir_old = None if parsed_href.scheme == 'file' and parsed_href.path[0] != '/': - if options.infilename: + if references_relative_to: + working_dir_new = os.path.abspath(references_relative_to) working_dir_old = os.getcwd() - working_dir_new = os.path.abspath(os.path.dirname(options.infilename)) os.chdir(working_dir_new) # open/download the file @@ -3308,7 +3308,7 @@ def serializeXML(element, options, indent_depth=0, preserveWhitespace=False): # this is the main method # input is a string representation of the input XML # returns a string representation of the output XML -def scourString(in_string, options=None): +def scourString(in_string, options=None, references_relative_to=None): # sanitize options (take missing attributes from defaults, discard unknown attributes) options = sanitizeOptions(options) @@ -3549,7 +3549,7 @@ def scourString(in_string, options=None): # convert rasters references to base64-encoded strings if options.embed_rasters: for elem in doc.documentElement.getElementsByTagName('image'): - embedRasters(elem, options) + embedRasters(elem, references_relative_to, options) # properly size the SVG document (ideally width/height should be 100% with a viewBox) if options.enable_viewboxing: @@ -3590,16 +3590,14 @@ def scourString(in_string, options=None): # used mostly by unit tests # input is a filename # returns the minidom doc representation of the SVG -def scourXmlFile(filename, options=None): +def scourXmlFile(filename, options=None, references_relative_to=None): # sanitize options (take missing attributes from defaults, discard unknown attributes) options = sanitizeOptions(options) - # we need to make sure infilename is set correctly (otherwise relative references in the SVG won't work) - options.ensure_value("infilename", filename) # open the file and scour it with open(filename, "rb") as f: in_string = f.read() - out_string = scourString(in_string, options) + out_string = scourString(in_string, options, references_relative_to) # prepare the output xml.dom.minidom object doc = xml.dom.minidom.parseString(out_string.encode('utf-8')) @@ -3814,9 +3812,11 @@ def maybe_gziped_file(filename, mode="r"): def getInOut(options): + references_relative_to = None if options.infilename: infile = maybe_gziped_file(options.infilename, "rb") # GZ: could catch a raised IOError here and report + references_relative_to = os.path.dirname(options.infilename) else: # GZ: could sniff for gzip compression here # @@ -3840,7 +3840,7 @@ def getInOut(options): # redirect informational output to stderr when SVG is output to stdout options.stdout = sys.stderr - return [infile, outfile] + return [infile, references_relative_to, outfile] def getReport(): @@ -3862,7 +3862,7 @@ def getReport(): ) -def start(options, input, output): +def start(options, input, output, references_relative_to=None): # sanitize options (take missing attributes from defaults, discard unknown attributes) options = sanitizeOptions(options) @@ -3870,7 +3870,7 @@ def start(options, input, output): # do the work in_string = input.read() - out_string = scourString(in_string, options).encode("UTF-8") + out_string = scourString(in_string, options, references_relative_to).encode("UTF-8") output.write(out_string) # Close input and output files (but do not attempt to close stdin/stdout!) @@ -3901,8 +3901,8 @@ def start(options, input, output): def run(): options = parse_args() - (input, output) = getInOut(options) - start(options, input, output) + (input, input_relative_to, output) = getInOut(options) + start(options, input, output, input_relative_to) if __name__ == '__main__': diff --git a/testscour.py b/testscour.py index 060b095..f6c9b57 100755 --- a/testscour.py +++ b/testscour.py @@ -2526,12 +2526,13 @@ class EmbedRasters(unittest.TestCase): def test_disable_embed_rasters(self): doc = scourXmlFile('unittests/raster-formats.svg', - parse_args(['--disable-embed-rasters'])) + parse_args(['--disable-embed-rasters']), + 'unittests') self.assertEqual(doc.getElementById('png').getAttribute('xlink:href'), 'raster.png', "Raster image embedded when '--disable-embed-rasters' was specified") def test_raster_formats(self): - doc = scourXmlFile('unittests/raster-formats.svg') + doc = scourXmlFile('unittests/raster-formats.svg', None, 'unittests') self.assertEqual(doc.getElementById('png').getAttribute('xlink:href'), '' 'VBMVEUAAP//AAAA/wBmtfVOAAAACklEQVQI12NIAAAAYgBhGxZhsAAAAABJRU5ErkJggg==', @@ -2549,7 +2550,7 @@ class EmbedRasters(unittest.TestCase): "Raster image (JPG) not correctly embedded.") def test_raster_paths_local(self): - doc = scourXmlFile('unittests/raster-paths-local.svg') + doc = scourXmlFile('unittests/raster-paths-local.svg', None, 'unittests') images = doc.getElementsByTagName('image') for image in images: href = image.getAttribute('xlink:href') @@ -2563,7 +2564,7 @@ class EmbedRasters(unittest.TestCase): # create a reference string by scouring the original file with relative links options = ScourOptions options.infilename = 'unittests/raster-formats.svg' - reference_svg = scourString(svg, options) + reference_svg = scourString(svg, options, 'unittests') # this will not always create formally valid paths but it'll check how robust our implementation is # (the third path is invalid for sure because file: needs three slashes according to URI spec) From c42dc6b793f89d83ad72ad0c678f56fc394073c1 Mon Sep 17 00:00:00 2001 From: Niels Thykier Date: Sun, 18 Mar 2018 11:15:46 +0000 Subject: [PATCH 2/2] Support multiple input files with -o being a directory This change makes it possible for scour to consume multiple input files in one command invocation. E.g. $ scour file1.svg file2.svgz ... output-directory # xargs friendly variant $ scour -o output-directory file1.svg file2.svgz ... This avoids most of the "startup" overhead in python and scour when many files are being processed. On about a 100 of (already scour'ed) gnuplot svg graphs, this change provides an almost 40% speed up compared to a shell alternative: # Original shell pipeline (~29s) # Note; for bash, rewriting this without the "basename"-call does # not seem to improve performance considerably. $ for FILE in input/[01]* ; do \ python3 -m scour.scour "$FILE" output/"$(basename "$FILE")" > /dev/null ; \ done # With this patch (~16s) $ python3 -m scour.scour input/[01]* output > /dev/null Signed-off-by: Niels Thykier --- scour/scour.py | 58 +++++++++++++++++++++++++++++++++----------------- testscour.py | 2 +- 2 files changed, 40 insertions(+), 20 deletions(-) diff --git a/scour/scour.py b/scour/scour.py index 605fb7d..9a7f4f4 100644 --- a/scour/scour.py +++ b/scour/scour.py @@ -3630,10 +3630,12 @@ class HeaderedFormatter(optparse.IndentedHelpFormatter): # GZ: would prefer this to be in a function or class scope, but tests etc need # access to the defaults anyway _options_parser = optparse.OptionParser( - usage="%prog [INPUT.SVG [OUTPUT.SVG]] [OPTIONS]", + usage="%prog [INPUT.SVG [[... INPUT.SVG] OUTPUT]] [OPTIONS]", description=("If the input/output files are not specified, stdin/stdout are used. " "If the input/output files are specified with a svgz extension, " - "then compressed SVG is assumed."), + "then compressed SVG is assumed. Multiple input files can be specified " + "when output is a directory. The ouput files will be the basename of " + "the input files."), formatter=HeaderedFormatter(max_help_position=33), version=VER) @@ -3648,11 +3650,11 @@ _options_parser.add_option("-v", "--verbose", action="store_true", dest="verbose", default=False, help="verbose output (statistics, etc.)") _options_parser.add_option("-i", - action="store", dest="infilename", metavar="INPUT.SVG", - help="alternative way to specify input filename") + action="append", dest="infilenames", metavar="INPUT.SVG", + help="alternative way to specify input filenames") _options_parser.add_option("-o", - action="store", dest="outfilename", metavar="OUTPUT.SVG", - help="alternative way to specify output filename") + action="store", dest="outfilename", metavar="OUTPUT", + help="alternative way to specify output (either a file or a directory)") _option_group_optimization = optparse.OptionGroup(_options_parser, "Optimization") _option_group_optimization.add_option("--set-precision", @@ -3766,10 +3768,11 @@ def parse_args(args=None, ignore_additional_args=False): options, rargs = _options_parser.parse_args(args) if rargs: - if not options.infilename: - options.infilename = rargs.pop(0) - if not options.outfilename and rargs: - options.outfilename = rargs.pop(0) + if not options.outfilename and len(rargs) > 1: + options.outfilename = rargs.pop() + if not options.infilenames: + options.infilenames = rargs + rargs = [] if not ignore_additional_args and rargs: _options_parser.error("Additional arguments not handled: %r, see --help" % rargs) if options.digits < 1: @@ -3782,8 +3785,15 @@ def parse_args(args=None, ignore_additional_args=False): _options_parser.error("Invalid value for --indent, see --help") if options.indent_depth < 0: _options_parser.error("Value for --nindent should be positive (or zero), see --help") - if options.infilename and options.outfilename and options.infilename == options.outfilename: - _options_parser.error("Input filename is the same as output filename") + if options.infilenames: + if len(options.infilenames) > 1: + if not options.outfilename or not os.path.isdir(options.outfilename): + _options_parser.error("Multiple input files requires an directory as output (-o)") + elif len(options.infilenames) == 0 and options.outfilename and options.infilenames[0] == options.outfilename: + _options_parser.error("Input filename is the same as output filename") + else: + if options.outfilename and os.path.isdir(options.outfilename): + _options_parser.error("Cannot use a directory as output when input is stdin") return options @@ -3811,12 +3821,12 @@ def maybe_gziped_file(filename, mode="r"): return open(filename, mode) -def getInOut(options): +def getInOut(input_file, options): references_relative_to = None - if options.infilename: - infile = maybe_gziped_file(options.infilename, "rb") + if input_file is not None: + infile = maybe_gziped_file(input_file, "rb") # GZ: could catch a raised IOError here and report - references_relative_to = os.path.dirname(options.infilename) + references_relative_to = os.path.dirname(input_file) else: # GZ: could sniff for gzip compression here # @@ -3830,7 +3840,11 @@ def getInOut(options): _options_parser.error("No input file specified, see --help for detailed usage information") if options.outfilename: - outfile = maybe_gziped_file(options.outfilename, "wb") + dest = options.outfilename + if os.path.isdir(dest): + assert input_file is not None + dest = os.path.join(dest, os.path.basename(input_file)) + outfile = maybe_gziped_file(dest, "wb") else: # open the binary buffer of stdout as the output is already encoded try: @@ -3901,8 +3915,14 @@ def start(options, input, output, references_relative_to=None): def run(): options = parse_args() - (input, input_relative_to, output) = getInOut(options) - start(options, input, output, input_relative_to) + input_files = options.infilenames if options.infilenames is not None else [] + if input_files and input_files[0] is not None: + for filename in input_files: + (input, input_relative_to, output) = getInOut(filename, options) + start(options, input, output, input_relative_to) + else: + (input, input_relative_to, output) = getInOut(None, options) + start(options, input, output, input_relative_to) if __name__ == '__main__': diff --git a/testscour.py b/testscour.py index f6c9b57..a71d9cd 100755 --- a/testscour.py +++ b/testscour.py @@ -2358,7 +2358,7 @@ class DoNotStripXmlSpaceAttribute(unittest.TestCase): class CommandLineUsage(unittest.TestCase): - USAGE_STRING = "Usage: scour [INPUT.SVG [OUTPUT.SVG]] [OPTIONS]" + USAGE_STRING = "Usage: scour [INPUT.SVG [[... INPUT.SVG] OUTPUT]] [OPTIONS]" MINIMAL_SVG = '\n' \ '\n' TEMP_SVG_FILE = 'testscour_temp.svg'