From bac229dd14bbad964486dd3c25d6d8355fa9f466 Mon Sep 17 00:00:00 2001 From: JSCHILL1 Date: Wed, 5 Aug 2009 22:53:31 -0500 Subject: [PATCH] Add hand-rolled XML serialization function to improve XML output. Added --indent option to choose indentation mechanism (space, tab, none) --- release-notes.html | 4 +- scour.py | 95 ++++++++++++++++++++++++++++++++++++++++++---- testscour.py | 11 ++++++ 3 files changed, 102 insertions(+), 8 deletions(-) diff --git a/release-notes.html b/release-notes.html index 7c97b0c..87805bf 100644 --- a/release-notes.html +++ b/release-notes.html @@ -13,10 +13,12 @@

Version 0.18

-

Aug 3rd, 2009

+

Aug 5th, 2009

diff --git a/scour.py b/scour.py index eeb32b0..72b4e85 100755 --- a/scour.py +++ b/scour.py @@ -51,13 +51,12 @@ # # -# Suggestion from Richard Hutch: -# * Put id attributes first in the serialization (or make the d attribute last) -# This would require my own serialization of the DOM objects (not impossible) - # Next Up: # + Remove some attributes that have default values # + Convert c/q path segments into shorthand equivalents where possible: +# + custom serialization of SVG that prints out id/xml:id first (suggestion by Richard Hutch) +# + --indent option to specify how indent should work: space, tab, none +# - option to remove metadata # - parse transform attribute # - if a has only one element in it, collapse the (ensure transform, etc are carried down) # - remove id if it matches the Inkscape-style of IDs (also provide a switch to disable this) @@ -1876,6 +1875,82 @@ def remapNamespacePrefix(node, oldprefix, newprefix): for child in node.childNodes : remapNamespacePrefix(child, oldprefix, newprefix) +# hand-rolled serialization function that has the following benefits: +# - pretty printing +# - somewhat judicious use of whitespace +# - ensure id attributes are first +def serializeXML(element, options, ind = 0): + indent = ind + I='' + if options.indent_type == 'tab': I='\t' + elif options.indent_type == 'space': I=' ' + + outString = (I * ind) + '<' + element.nodeName + + # always serialize the id or xml:id attributes first + if element.getAttribute('id') != '': + id = element.getAttribute('id') + quot = '"' + if id.find('"') != -1: + quot = "'" + outString += ' ' + 'id=' + quot + id + quot + if element.getAttribute('xml:id') != '': + id = element.getAttribute('xml:id') + quot = '"' + if id.find('"') != -1: + quot = "'" + outString += ' ' + 'xml:id=' + quot + id + quot + + # now serialize the other attributes + attrList = element.attributes + for num in range(attrList.length) : + attr = attrList.item(num) + if attr.nodeName == 'id' or attr.nodeName == 'xml:id': continue + # if the attribute value contains a double-quote, use single-quotes + quot = '"' + if attr.nodeValue.find('"') != -1: + quot = "'" + + outString += ' ' + attr.nodeName + '=' + quot + attr.nodeValue + quot + + # if no children, self-close + children = element.childNodes + if children.length > 0: + outString += '>' + + onNewLine = False + for child in element.childNodes: + # element node + if child.nodeType == 1: + outString += '\n' + serializeXML(child, options, indent + 1) + onNewLine = True + # text node + elif child.nodeType == 3: + # trim it only in the case of not being a child of an element + # where whitespace might be important + if element.nodeName in ["text", "tspan", "textPath", "tref", "title", "desc", "textArea"]: + outString += child.nodeValue + else: + outString += child.nodeValue.strip() + # CDATA node + elif child.nodeType == 4: + outString += '' + # Comment node + elif child.nodeType == 8: + outString += '' + # TODO: entities, processing instructions, what else? + else: # ignore the rest + pass + + if onNewLine: outString += (I * ind) + outString += '' + if indent > 0: outString += '\n' + else: + outString += '/>' + if indent > 0: outString += '\n' + + return outString + # this is the main method # input is a string representation of the input XML # returns a string representation of the output XML @@ -2004,7 +2079,6 @@ def scourString(in_string, options=None): elem.setAttribute(attr, scourLength(elem.getAttribute(attr))) # remove default values of attributes -# print doc.documentElement.toxml() numAttrsRemoved += removeDefaultAttributeValues(doc.documentElement, options) # convert rasters references to base64-encoded strings @@ -2018,8 +2092,9 @@ def scourString(in_string, options=None): # output the document as a pretty string with a single space for indent # NOTE: removed pretty printing because of this problem: # http://ronrothman.com/public/leftbraned/xml-dom-minidom-toprettyxml-and-silly-whitespace/ + # rolled our own serialize function here to save on space, put id first, customize indentation, etc # out_string = doc.documentElement.toprettyxml(' ') - out_string = doc.documentElement.toxml() + out_string = serializeXML(doc.documentElement, options) # now strip out empty lines lines = [] @@ -2096,6 +2171,9 @@ _options_parser.add_option("-i", action="store", dest="infilename", help=optparse.SUPPRESS_HELP) _options_parser.add_option("-o", action="store", dest="outfilename", help=optparse.SUPPRESS_HELP) +_options_parser.add_option("--indent", + action="store", type="string", dest="indent_type", default="space", + help="indentation of the output: none, space, tab (default: %default)") def maybe_gziped_file(filename, mode="r"): if os.path.splitext(filename)[1].lower() in (".svgz", ".gz"): @@ -2109,6 +2187,9 @@ def parse_args(args=None): _options_parser.error("Additional arguments not handled: %r, see --help" % rargs) if options.digits < 0: _options_parser.error("Can't have negative significant digits, see --help") + if not options.indent_type in ["tab", "space", "none"]: + _options_parser.error("Invalid value for --indent, see --help") + if options.infilename: infile = maybe_gziped_file(options.infilename) # GZ: could catch a raised IOError here and report @@ -2119,7 +2200,7 @@ def parse_args(args=None): outfile = maybe_gziped_file(options.outfilename, "w") else: outfile = sys.stdout - + return options, [infile, outfile] def getReport(): diff --git a/testscour.py b/testscour.py index d428711..ec0a22f 100755 --- a/testscour.py +++ b/testscour.py @@ -820,6 +820,17 @@ class RemoveDefaultGradFYValue(unittest.TestCase): self.assertEquals( g.getAttribute('fy'), '', 'fy matching cy not removed') +class CDATAInXml(unittest.TestCase): + def runTest(self): + self.assertEquals( scour.scourString(open('unittests/cdata.svg').read()), + ''' + + +''', + 'Improperly serialized the cdata unit tests') + # TODO; write a test for embedding rasters # TODO: write a test for --disable-embed-rasters # TODO: write tests for --keep-editor-data