From 879300373f39319c76ef58bd10ce0e32ee55d22b Mon Sep 17 00:00:00 2001
From: JSCHILL1 <jschill1@Macintosh.local>
Date: Sun, 5 Apr 2009 13:46:03 -0500
Subject: [PATCH] Remove all inkscape and sodipodi attributes and elements

---
 scour.py | 100 +++++++++++++++++++++++++++++++++++++++++--------------
 1 file changed, 75 insertions(+), 25 deletions(-)

diff --git a/scour.py b/scour.py
index 8cf8dde..3b62456 100755
--- a/scour.py
+++ b/scour.py
@@ -35,8 +35,11 @@ APP = 'scour'
 VER = '0.02'
 COPYRIGHT = 'Copyright Jeff Schiller, 2009'
 
-SVGNS = 'http://www.w3.org/2000/svg'
-XLINKNS = 'http://www.w3.org/1999/xlink'
+NS = { 	'SVG': 		'http://www.w3.org/2000/svg', 
+		'XLINK': 	'http://www.w3.org/1999/xlink', 
+		'SODIPODI': 'http://sodipodi.sourceforge.net/DTD/sodipodi-0.dtd',
+		'INKSCAPE': 'http://www.inkscape.org/namespaces/inkscape'
+		}
 
 def printHeader():
 	print APP , VER
@@ -57,32 +60,32 @@ input = sys.stdin
 output = sys.stdout
 
 # if -i or -o is supplied, switch the stream to the file
-if( len(args) == 2):
-	if( args[0] == '-i' ):
+if len(args) == 2:
+	if args[0] == '-i' :
 		input = open(args[1], 'r')
-	elif( args[0] == '-o' ):
+	elif args[0] == '-o' :
 		output = open(args[1], 'w')
 	else:
 		printSyntaxAndQuit()
 
 # if both -o and -o are supplied, switch streams to the files
-elif( len(args) == 4 ):
-	if( args[0] == '-i' and args[2] == '-o' ):
+elif len(args) == 4 :
+	if args[0] == '-i' and args[2] == '-o' :
 		input = open(args[1], 'r')
 		output = open(args[3], 'w')
-	elif( args[0] == '-o' and args[2] == 'i' ):
+	elif args[0] == '-o' and args[2] == '-i' :
 		output = open(args[1], 'w')
 		input = open(args[3], 'r')
 	else:
 		printSyntaxAndQuit()
 
 # else invalid syntax
-elif( len(args) != 0 ):
+elif len(args) != 0 :
 	printSyntaxAndQuit()
 
 # if we are not sending to stdout, then print out app information
 bOutputReport = False
-if( output != sys.stdout ):
+if output != sys.stdout :
 	bOutputReport = True
 	printHeader()
 
@@ -92,13 +95,13 @@ doc = xml.dom.minidom.parse(input)
 # returns all elements with id attributes
 def findElementsWithId(node,elems={}):
 	id = node.getAttribute('id')
-	if( id != '' ):
+	if id != '' :
 		elems[id] = node
-	if( node.hasChildNodes() ):
+	if node.hasChildNodes() :
 		for child in node.childNodes:
 			# from http://www.w3.org/TR/DOM-Level-2-Core/idl-definitions.html
 			# we are only really interested in nodes of type Element (1)
-			if( child.nodeType == 1 ):
+			if child.nodeType == 1 :
 				findElementsWithId(child, elems)
 	return elems
 
@@ -107,13 +110,13 @@ def findElementsWithId(node,elems={}):
 def findReferencedElements(node,ids={}):
 	# TODO: error here (ids is not cleared upon next invocation), the
 	# input argument ids is clunky here (see below how it is called)
-	href = node.getAttributeNS(XLINKNS,'href')
+	href = node.getAttributeNS(NS['XLINK'],'href')
 	
 	# if xlink:href is set, then grab the id
-	if( href != '' and len(href) > 1 and href[0] == '#'):
+	if href != '' and len(href) > 1 and href[0] == '#':
 		# we remove the hash mark from the beginning of the id
 		id = href[1:]
-		if( ids.has_key(id) ):
+		if ids.has_key(id) :
 			ids[id] += 1
 		else:
 			ids[id] = 1
@@ -128,24 +131,25 @@ def findReferencedElements(node,ids={}):
 			
 	for style in styles:
 		propval = string.split(style,':')
-		if(len(propval) == 2):
+		if len(propval) == 2 :
 			prop = propval[0].strip()
 			val = propval[1].strip()
-			if( prop in referencingProps and val != '' and val[0:5] == 'url(#' ):
+			if prop in referencingProps and val != '' and val[0:5] == 'url(#' :
 				id = val[5:val.find(')')]
-				if( ids.has_key(id) ):
+				if ids.has_key(id) :
 					ids[id] += 1
 				else:
 					ids[id] = 1
 					
-	if( node.hasChildNodes() ):
+	if node.hasChildNodes() :
 		for child in node.childNodes:
-			if( child.nodeType == 1 ):
+			if child.nodeType == 1 :
 				findReferencedElements(child, ids)
 	return ids
 
 numIDsRemoved = 0
 numElemsRemoved = 0
+numAttrsRemoved = 0
 
 # removes the unreferenced ID attributes
 # returns the number of ID attributes removed
@@ -154,7 +158,7 @@ def removeUnreferencedIDs(referencedIDs, identifiedElements):
 	num = 0;
 	for id in identifiedElements.keys():
 		node = identifiedElements[id]
-		if( referencedIDs.has_key(id) == False ):
+		if referencedIDs.has_key(id) == False :
 			node.removeAttribute('id')
 			# now remove the element from our list of elements with ids
 			# not necessary if we're calculating the array again every time
@@ -166,15 +170,60 @@ def removeUnreferencedIDs(referencedIDs, identifiedElements):
 def vacuumDefs(doc):
 	global numElemsRemoved
 	num = 0
-	defs = doc.documentElement.getElementsByTagNameNS(SVGNS, 'defs')
+	defs = doc.documentElement.getElementsByTagNameNS(NS['SVG'], 'defs')
 	for aDef in defs:
 		for elem in aDef.childNodes:
-			if( elem.nodeType == 1 and elem.getAttribute('id') == '' ):
+			if elem.nodeType == 1 and elem.getAttribute('id') == '' :
 				aDef.removeChild(elem)
 				numElemsRemoved += 1
 				num += 1
 	return num
 
+# Strip every attribute whose namespaceURI is listed in `namespaces` from
+# `node` and all of its descendants; returns the total number removed.
+def removeNamespacedAttributes(node, namespaces):
+	global numAttrsRemoved
+	num = 0
+	if node.nodeType == 1 :
+		# snapshot the attributes first: removing from the live NamedNodeMap
+		# while indexing into it shifts the entries and skips some of them
+		attrList = node.attributes
+		for attr in [attrList.item(i) for i in range(attrList.length)]:
+			if attr is not None and attr.namespaceURI in namespaces:
+				num += 1
+				numAttrsRemoved += 1
+				node.removeAttribute(attr.nodeName)
+		
+		# now recurse for children, adding their removals to our total
+		for child in node.childNodes:
+			num += removeNamespacedAttributes(child, namespaces)
+	return num
+	
+# removes all elements in the given namespaces below node; returns the total
+def removeNamespacedElements(node, namespaces):
+	global numElemsRemoved
+	num = 0
+	if node.nodeType == 1 :
+		# copy first: childNodes is live, removing while walking it skips siblings
+		for child in list(node.childNodes):
+			if child.namespaceURI in namespaces:
+				num += 1
+				numElemsRemoved += 1
+				node.removeChild(child)
+		
+		# now recurse for the remaining children, totalling their removals
+		for child in node.childNodes:
+			num += removeNamespacedElements(child, namespaces)
+	return num
+
+# a single pass is not guaranteed to remove every inkscape/sodipodi element or
+# attribute, so repeat each removal until a pass reports nothing was removed
+while removeNamespacedElements( doc.documentElement, [ NS['SODIPODI'], NS['INKSCAPE'] ] ) > 0 :
+	pass
+	
+while removeNamespacedAttributes( doc.documentElement, [ NS['SODIPODI'], NS['INKSCAPE'] ] ) > 0 :
+	pass
+
 bContinueLooping = True
 while bContinueLooping:
 	identifiedElements = findElementsWithId(doc.documentElement, {})
@@ -191,4 +240,5 @@ output.close()
 # output some statistics if we are not using stdout
 if( bOutputReport):
 	print "Number of unreferenced id attributes removed:", numIDsRemoved 
-	print "Number of unreferenced elements removed:", numElemsRemoved
+	print "Number of elements removed:", numElemsRemoved
+	print "Number of attributes removed:", numAttrsRemoved