From bac229dd14bbad964486dd3c25d6d8355fa9f466 Mon Sep 17 00:00:00 2001
From: JSCHILL1 <jschill1@Lithium-3.local>
Date: Wed, 5 Aug 2009 22:53:31 -0500
Subject: [PATCH] Add hand-rolled XML serialization function to improve XML
 output. Add --indent option to choose the indentation type (space, tab,
 none)

---
 release-notes.html |  4 +-
 scour.py           | 95 ++++++++++++++++++++++++++++++++++++++++++----
 testscour.py       | 11 ++++++
 3 files changed, 102 insertions(+), 8 deletions(-)
diff --git a/release-notes.html b/release-notes.html
index 7c97b0c..87805bf 100644
--- a/release-notes.html
+++ b/release-notes.html
@@ -13,10 +13,12 @@
 	<header>
 		<h2><a href="#0.18">Version 0.18</a></h2>
 	</header>
-	<p>Aug 3rd, 2009</p>
+	<p>Aug 5th, 2009</p>
 	<ul>
 		<li>Remove attributes of gradients if they contain default values</li>
 		<li>Reduce bezier/quadratic (c/q) segments to their shorthand equivalents (s/t)</li>
+		<li>Custom XML serialization such that id/xml:id is printed first (Thanks to Richard Hutch for the suggestion)</li>
+		<li>Added --indent option to specify indentation type (default='space', other options: 'none', 'tab')</li>
 	</ul>
 </section>
 
diff --git a/scour.py b/scour.py
index eeb32b0..72b4e85 100755
--- a/scour.py
+++ b/scour.py
@@ -51,13 +51,12 @@
 #      <rect />
 #    </g>
 
-# Suggestion from Richard Hutch:
-#  * Put id attributes first in the serialization (or make the d attribute last)
-#    This would require my own serialization of the DOM objects (not impossible)
-
 # Next Up:
 # + Remove some attributes that have default values
 # + Convert c/q path segments into shorthand equivalents where possible: 
+# + custom serialization of SVG that prints out id/xml:id first (suggestion by Richard Hutch)
+# + --indent option to specify how indent should work: space, tab, none
+# - option to remove metadata
 # - parse transform attribute
 # - if a <g> has only one element in it, collapse the <g> (ensure transform, etc are carried down)
 # - remove id if it matches the Inkscape-style of IDs (also provide a switch to disable this)
@@ -1876,6 +1875,82 @@ def remapNamespacePrefix(node, oldprefix, newprefix):
 	for child in node.childNodes :
 		remapNamespacePrefix(child, oldprefix, newprefix)	
 
+# returns text with the XML markup characters &, < and > replaced by entities
+def _escapeXMLText(text):
+	return text.replace('&', '&amp;').replace('<', '&lt;').replace('>', '&gt;')
+
+# returns ' name="value"' with & and < escaped in the value; single quotes are
+# used only when the value contains double quotes but no single quotes
+def _serializeAttribute(name, value):
+	value = value.replace('&', '&amp;').replace('<', '&lt;')
+	quot = '"'
+	if '"' in value:
+		if "'" in value:
+			value = value.replace('"', '&quot;')
+		else:
+			quot = "'"
+	return ' ' + name + '=' + quot + value + quot
+
+# hand-rolled serialization function that has the following benefits:
+# - pretty printing
+# - somewhat judicious use of whitespace
+# - ensure id attributes are first
+# - markup characters in attribute values and text are escaped
+# element is a DOM element node, options.indent_type selects the indent unit
+# ('tab', 'space', anything else means none) and ind is the nesting depth;
+# returns the serialized element as a string
+def serializeXML(element, options, ind = 0):
+	I = {'tab': '\t', 'space': ' '}.get(options.indent_type, '')
+	outParts = [I * ind, '<', element.nodeName]
+
+	# always serialize the id or xml:id attributes first
+	for name in ('id', 'xml:id'):
+		if element.getAttribute(name) != '':
+			outParts.append(_serializeAttribute(name, element.getAttribute(name)))
+
+	# now serialize the other attributes
+	attrList = element.attributes
+	for num in range(attrList.length):
+		attr = attrList.item(num)
+		if attr.nodeName in ('id', 'xml:id'): continue
+		outParts.append(_serializeAttribute(attr.nodeName, attr.nodeValue))
+
+	# every nested (non-root) element is followed by a newline
+	tail = ''
+	if ind > 0: tail = '\n'
+
+	# if no children, self-close
+	if element.childNodes.length == 0:
+		outParts.append('/>' + tail)
+		return ''.join(outParts)
+
+	outParts.append('>')
+	onNewLine = False
+	for child in element.childNodes:
+		# element node
+		if child.nodeType == 1:
+			outParts.append('\n' + serializeXML(child, options, ind + 1))
+			onNewLine = True
+		# text node
+		elif child.nodeType == 3:
+			# trim it unless the parent is an element where whitespace might be important
+			if element.nodeName in ["text", "tspan", "textPath", "tref", "title", "desc", "textArea"]:
+				outParts.append(_escapeXMLText(child.nodeValue))
+			else:
+				outParts.append(_escapeXMLText(child.nodeValue.strip()))
+		# CDATA node: content is emitted verbatim, never escaped
+		elif child.nodeType == 4:
+			outParts.append('<![CDATA[' + child.nodeValue + ']]>')
+		# Comment node
+		elif child.nodeType == 8:
+			outParts.append('<!--' + child.nodeValue + '-->')
+		# TODO: entities, processing instructions, what else?
+
+	# the closing tag is indented only when it sits on its own line
+	if onNewLine: outParts.append(I * ind)
+	outParts.append('</' + element.nodeName + '>' + tail)
+	return ''.join(outParts)
+	
 # this is the main method
 # input is a string representation of the input XML
 # returns a string representation of the output XML
@@ -2004,7 +2079,6 @@ def scourString(in_string, options=None):
 					elem.setAttribute(attr, scourLength(elem.getAttribute(attr)))
 
 	# remove default values of attributes
-#	print doc.documentElement.toxml()
 	numAttrsRemoved += removeDefaultAttributeValues(doc.documentElement, options)		
 	
 	# convert rasters references to base64-encoded strings 
@@ -2018,8 +2092,9 @@ def scourString(in_string, options=None):
 	# output the document as a pretty string with a single space for indent
 	# NOTE: removed pretty printing because of this problem:
 	# http://ronrothman.com/public/leftbraned/xml-dom-minidom-toprettyxml-and-silly-whitespace/
+	# rolled our own serialize function here to save on space, put id first, customize indentation, etc
 #	out_string = doc.documentElement.toprettyxml(' ')
-	out_string = doc.documentElement.toxml()
+	out_string = serializeXML(doc.documentElement, options)
 	
 	# now strip out empty lines
 	lines = []
@@ -2096,6 +2171,9 @@ _options_parser.add_option("-i",
 	action="store", dest="infilename", help=optparse.SUPPRESS_HELP)
 _options_parser.add_option("-o",
 	action="store", dest="outfilename", help=optparse.SUPPRESS_HELP)
+_options_parser.add_option("--indent",  # "choice" type: optparse itself rejects any value outside the list
+	action="store", type="choice", choices=["none", "space", "tab"], dest="indent_type", default="space",
+	help="indentation of the output: none, space, tab (default: %default)")
 
 def maybe_gziped_file(filename, mode="r"):
 	if os.path.splitext(filename)[1].lower() in (".svgz", ".gz"):
@@ -2109,6 +2187,9 @@ def parse_args(args=None):
 		_options_parser.error("Additional arguments not handled: %r, see --help" % rargs)
 	if options.digits < 0:
 		_options_parser.error("Can't have negative significant digits, see --help")
+	if not options.indent_type in ["tab", "space", "none"]:
+		_options_parser.error("Invalid value for --indent, see --help")
+
 	if options.infilename:
 		infile = maybe_gziped_file(options.infilename)
 		# GZ: could catch a raised IOError here and report
@@ -2119,7 +2200,7 @@ def parse_args(args=None):
 		outfile = maybe_gziped_file(options.outfilename, "w")
 	else:
 		outfile = sys.stdout
-
+		
 	return options, [infile, outfile]
 
 def getReport():
diff --git a/testscour.py b/testscour.py
index d428711..ec0a22f 100755
--- a/testscour.py
+++ b/testscour.py
@@ -820,6 +820,17 @@ class RemoveDefaultGradFYValue(unittest.TestCase):
 		self.assertEquals( g.getAttribute('fy'), '',
 			'fy matching cy not removed')
 
+class CDATAInXml(unittest.TestCase):  # checks how a CDATA section is serialized in scourString's output
+	def runTest(self):  # no options are passed to scourString; the whole output string is compared, whitespace included
+		self.assertEquals( scour.scourString(open('unittests/cdata.svg').read()), 
+			'''<?xml version="1.0" encoding="UTF-8" standalone="no"?>
+<svg xmlns="http://www.w3.org/2000/svg">
+ <script type="application/ecmascript"><![CDATA[
+  	alert('pb&j');
+ ]]></script>
+</svg>''',
+			'Improperly serialized the cdata unit tests')
+
 # TODO; write a test for embedding rasters
 # TODO: write a test for --disable-embed-rasters
 # TODO: write tests for --keep-editor-data