Start 0.07: move scour functionality into a module-level function and only call when __main__. Move tests around a bit. Add starter testscour.py

This commit is contained in:
JSCHILL1 2009-04-13 10:22:27 -05:00
parent b1f861d285
commit a1f6d9e8dd
14 changed files with 146 additions and 112 deletions

231
scour.py
View file

@ -1,6 +1,6 @@
#!/usr/local/bin/python
# Scour
# Version 0.06
# Version 0.07
#
# Copyright 2009 Jeff Schiller
#
@ -20,7 +20,7 @@
# Notes:
# rubys path-crunching ideas here: http://intertwingly.net/code/svgtidy/spec.rb
# rubys' path-crunching ideas here: http://intertwingly.net/code/svgtidy/spec.rb
# (and implemented here: http://intertwingly.net/code/svgtidy/svgtidy.rb )
# Yet more ideas here: http://wiki.inkscape.org/wiki/index.php/Save_Cleaned_SVG
@ -45,13 +45,10 @@
# * Process Transformations
# * Process quadratic Bezier curves
# * Collapse all group based transformations
# * Output Standard SVG
# * Use viewPort instead of document width/height
# Next Up:
# + Prevent error when stroke-width property value has a unit
# + Convert width/height into a viewBox where possible
# + Convert all referenced rasters into base64 encoded URLs if the files can be found
# + move all functionality into a module level function named 'scour' and only call it
# when being run as main (prepare for unit testing)
# - Removed duplicate gradient stops
# - Convert all colors to #RRGGBB format
# -
@ -70,7 +67,7 @@ import os.path
import urllib
APP = 'scour'
VER = '0.06'
VER = '0.07'
COPYRIGHT = 'Copyright Jeff Schiller, 2009'
NS = { 'SVG': 'http://www.w3.org/2000/svg',
@ -550,117 +547,131 @@ def properlySizeDoc(docElement):
docElement.removeAttribute('width')
docElement.removeAttribute('height')
# parse command-line arguments
args = sys.argv[1:]
# this is the main method
# input is a string representation of the input XML
# returns a string representation of the output XML
def scour(in_string):
global numAttrsRemoved
global numStylePropsFixed
global numElemsRemoved
doc = xml.dom.minidom.parseString(in_string)
# by default the input and output are the standard streams
input = sys.stdin
output = sys.stdout
# if -i or -o is supplied, switch the stream to the file
if len(args) == 2:
if args[0] == '-i' :
input = open(args[1], 'r')
elif args[0] == '-o' :
output = open(args[1], 'w')
else:
printSyntaxAndQuit()
# if both -o and -o are supplied, switch streams to the files
elif len(args) == 4 :
if args[0] == '-i' and args[2] == '-o' :
input = open(args[1], 'r')
output = open(args[3], 'w')
elif args[0] == '-o' and args[2] == 'i' :
output = open(args[1], 'w')
input = open(args[3], 'r')
else:
printSyntaxAndQuit()
# else invalid syntax
elif len(args) != 0 :
printSyntaxAndQuit()
# if we are not sending to stdout, then print out app information
bOutputReport = False
if output != sys.stdout :
bOutputReport = True
printHeader()
# build DOM in memory
doc = xml.dom.minidom.parse(input)
# for whatever reason this does not always remove all inkscape/sodipodi attributes/elements
# on the first pass, so we do it multiple times
# does it have to do with removal of children affecting the childlist?
while removeNamespacedElements( doc.documentElement, unwanted_ns ) > 0 :
pass
while removeNamespacedAttributes( doc.documentElement, unwanted_ns ) > 0 :
pass
# for whatever reason this does not always remove all inkscape/sodipodi attributes/elements
# on the first pass, so we do it multiple times
# does it have to do with removal of children affecting the childlist?
while removeNamespacedElements( doc.documentElement, unwanted_ns ) > 0 :
pass
while removeNamespacedAttributes( doc.documentElement, unwanted_ns ) > 0 :
pass
# remove the xmlns: declarations now
xmlnsDeclsToRemove = []
attrList = doc.documentElement.attributes
for num in range(attrList.length) :
if attrList.item(num).nodeValue in unwanted_ns :
xmlnsDeclsToRemove.append(attrList.item(num).nodeName)
# remove the xmlns: declarations now
xmlnsDeclsToRemove = []
attrList = doc.documentElement.attributes
for num in range(attrList.length) :
if attrList.item(num).nodeValue in unwanted_ns :
xmlnsDeclsToRemove.append(attrList.item(num).nodeName)
for attr in xmlnsDeclsToRemove :
doc.documentElement.removeAttribute(attr)
numAttrsRemoved += 1
for attr in xmlnsDeclsToRemove :
doc.documentElement.removeAttribute(attr)
numAttrsRemoved += 1
bContinueLooping = True
while bContinueLooping:
identifiedElements = findElementsWithId(doc.documentElement, {})
referencedIDs = findReferencedElements(doc.documentElement, {})
bContinueLooping = ((removeUnreferencedIDs(referencedIDs, identifiedElements) + vacuumDefs(doc)) > 0)
bContinueLooping = True
while bContinueLooping:
identifiedElements = findElementsWithId(doc.documentElement, {})
referencedIDs = findReferencedElements(doc.documentElement, {})
bContinueLooping = ((removeUnreferencedIDs(referencedIDs, identifiedElements) + vacuumDefs(doc)) > 0)
# repair style (remove unnecessary style properties and change them into XML attributes)
numStylePropsFixed = repairStyle(doc.documentElement)
# repair style (remove unnecessary style properties and change them into XML attributes)
numStylePropsFixed = repairStyle(doc.documentElement)
# remove empty defs, metadata, g
# NOTE: these elements will be removed even if they have (invalid) text nodes
elemsToRemove = []
for tag in ['defs', 'metadata', 'g'] :
for elem in doc.documentElement.getElementsByTagNameNS(NS['SVG'], tag) :
removeElem = not elem.hasChildNodes()
if removeElem == False :
for child in elem.childNodes :
if child.nodeType in [1, 4, 8] :
break
else:
removeElem = True
if removeElem :
elem.parentNode.removeChild(elem)
numElemsRemoved += 1
# remove empty defs, metadata, g
# NOTE: these elements will be removed even if they have (invalid) text nodes
elemsToRemove = []
for tag in ['defs', 'metadata', 'g'] :
for elem in doc.documentElement.getElementsByTagNameNS(NS['SVG'], tag) :
removeElem = not elem.hasChildNodes()
if removeElem == False :
for child in elem.childNodes :
if child.nodeType in [1, 4, 8] :
break
else:
removeElem = True
if removeElem :
elem.parentNode.removeChild(elem)
numElemsRemoved += 1
# clean path data
for elem in doc.documentElement.getElementsByTagNameNS(NS['SVG'], 'path') :
cleanPath(elem)
# clean path data
for elem in doc.documentElement.getElementsByTagNameNS(NS['SVG'], 'path') :
cleanPath(elem)
# convert rasters refereces to base64-encoded strings
for elem in doc.documentElement.getElementsByTagNameNS(NS['SVG'], 'image') :
embedRasters(elem)
# convert rasters refereces to base64-encoded strings
for elem in doc.documentElement.getElementsByTagNameNS(NS['SVG'], 'image') :
embedRasters(elem)
# properly size the SVG document (ideally width/height should be 100% with a viewBox)
properlySizeDoc(doc.documentElement)
# properly size the SVG document (ideally width/height should be 100% with a viewBox)
properlySizeDoc(doc.documentElement)
# output the document
doc.documentElement.writexml(output)
# output the document
out_string = doc.documentElement.toxml()
return out_string
if __name__ == '__main__':
# Close input and output files
input.close()
output.close()
# parse command-line arguments
args = sys.argv[1:]
# output some statistics if we are not using stdout
if bOutputReport :
print " Number of unreferenced id attributes removed:", numIDsRemoved
print " Number of elements removed:", numElemsRemoved
print " Number of attributes removed:", numAttrsRemoved
print " Number of style properties fixed:", numStylePropsFixed
print " Number of raster images embedded inline:", numRastersEmbedded
oldsize = os.path.getsize(input.name)
newsize = os.path.getsize(output.name)
#sizediff = (min(oldsize, newsize) / max(oldsize, newsize)) * 100;
sizediff = (newsize / oldsize);
print " Original file size:", oldsize, "kb; new file size:", newsize, "kb (" + str(sizediff)[:5] + "x)"
# by default the input and output are the standard streams
input = sys.stdin
output = sys.stdout
# if -i or -o is supplied, switch the stream to the file
if len(args) == 2:
if args[0] == '-i' :
input = open(args[1], 'r')
elif args[0] == '-o' :
output = open(args[1], 'w')
else:
printSyntaxAndQuit()
# if both -o and -o are supplied, switch streams to the files
elif len(args) == 4 :
if args[0] == '-i' and args[2] == '-o' :
input = open(args[1], 'r')
output = open(args[3], 'w')
elif args[0] == '-o' and args[2] == 'i' :
output = open(args[1], 'w')
input = open(args[3], 'r')
else:
printSyntaxAndQuit()
# else invalid syntax
elif len(args) != 0 :
printSyntaxAndQuit()
# if we are not sending to stdout, then print out app information
bOutputReport = False
if output != sys.stdout :
bOutputReport = True
printHeader()
# do the work
in_string = input.read()
out_string = scour(in_string)
output.write(out_string)
# Close input and output files
input.close()
output.close()
# output some statistics if we are not using stdout
if bOutputReport :
print " Number of unreferenced id attributes removed:", numIDsRemoved
print " Number of elements removed:", numElemsRemoved
print " Number of attributes removed:", numAttrsRemoved
print " Number of style properties fixed:", numStylePropsFixed
print " Number of raster images embedded inline:", numRastersEmbedded
oldsize = os.path.getsize(input.name)
newsize = os.path.getsize(output.name)
#sizediff = (min(oldsize, newsize) / max(oldsize, newsize)) * 100;
sizediff = (newsize / oldsize);
print " Original file size:", oldsize, "kb; new file size:", newsize, "kb (" + str(sizediff)[:5] + "x)"