diff --git a/fulltests/gimp.svg b/fulltests/gimp.svg
new file mode 100644
index 0000000..435665f
--- /dev/null
+++ b/fulltests/gimp.svg
@@ -0,0 +1,199 @@
+
+
+
\ No newline at end of file
diff --git a/fulltests/header.svg b/fulltests/header.svg
deleted file mode 100644
index 67fd2d8..0000000
--- a/fulltests/header.svg
+++ /dev/null
@@ -1,3662 +0,0 @@
-
-
-
-
diff --git a/scour.py b/scour.py
index 5a15dfe..c33017d 100755
--- a/scour.py
+++ b/scour.py
@@ -52,16 +52,6 @@
# - Reduce #RRGGBB format to #RGB format when possible
# https://bugs.edge.launchpad.net/ubuntu/+source/human-icon-theme/+bug/361667/
-# Some notes to not forget:
-# - removing empty nested groups also potentially loses some semantic information
-# (i.e. the following button:
-#
-#
-#
-#
-# will be flattened)
-
-
# necessary to get true division
from __future__ import division
@@ -74,6 +64,8 @@ import math
import base64
import os.path
import urllib
+import svg_regex
+from svg_regex import svg_parser
APP = 'scour'
VER = '0.10'
@@ -610,9 +602,41 @@ def repairStyle(node):
return num
-# does nothing at the moment but waste time
+# Normalizes the path data of an element: the 'd' attribute is parsed and
+# then written back in reserialized form.
def cleanPath(element) :
- path = element.getAttribute('d')
+    # parse the raw path data into a list of (command, data) tuples
+    path = svg_parser.parse(element.getAttribute('d'))
+    # no per-coordinate cleanup exists yet, so the parsed data goes straight
+    # back out through the serializer
+    element.setAttribute('d', serializePath(path))
+
+# Yields the individual values of one parsed argument group, in order.
+# A group may be a bare number, a coordinate pair, or a tuple that nests
+# pairs, numbers and flags (as produced for curves and arcs).
+def _flattenPathData(data):
+    if isinstance(data, (tuple, list)):
+        for item in data:
+            for value in _flattenPathData(item):
+                yield value
+    else:
+        yield data
+
+# Returns the plain-decimal text for a single path value.
+def _formatPathNumber(value):
+    # if the value can be an integer without loss of precision, go for it
+    # (this also writes the boolean arc flags as 0 or 1)
+    if int(value) == value:
+        return str(int(value))
+    text = str(value)
+    if 'e' in text or 'E' in text:
+        # str() fell back to scientific notation; expand it to plain digits
+        from decimal import Decimal
+        text = format(Decimal(text), 'f')
+    return text
+
+# Reserializes parsed path data with some cleanups:
+# - removes scientific notation (exponents)
+# - removes trailing zeros after the decimal
+# - removes extraneous whitespace
+# - adds commas between all values in a subcommand
+# pathObj is a list of (command, dataset) tuples where dataset is either
+# None or a list of argument groups.  Every group is followed by one space.
+def serializePath(pathObj):
+    pieces = []
+    for (cmd, dataset) in pathObj:
+        pieces.append(cmd)
+        # a command without arguments carries None instead of a list
+        if dataset is not None:
+            for data in dataset:
+                # nested groups (curves, arcs) are flattened so that no
+                # tuple syntax can leak into the attribute value
+                values = [_formatPathNumber(v) for v in _flattenPathData(data)]
+                pieces.append(','.join(values))
+                pieces.append(' ')
+    return ''.join(pieces)
# converts raster references to inline images
# NOTE: there are size limits to base64-encoding handling in browsers
diff --git a/svg_regex.py b/svg_regex.py
new file mode 100644
index 0000000..f4e5dc0
--- /dev/null
+++ b/svg_regex.py
@@ -0,0 +1,280 @@
+# This software is OSI Certified Open Source Software.
+# OSI Certified is a certification mark of the Open Source Initiative.
+#
+# Copyright (c) 2006, Enthought, Inc.
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+# * Neither the name of Enthought, Inc. nor the names of its contributors may
+# be used to endorse or promote products derived from this software without
+# specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+# ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+# ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+""" Small hand-written recursive descent parser for SVG path data.
+
+
+In [1]: from svg_regex import svg_parser
+
+In [3]: svg_parser.parse('M 10,20 30,40V50 60 70')
+Out[3]: [('M', [(10.0, 20.0), (30.0, 40.0)]), ('V', [50.0, 60.0, 70.0])]
+
+In [4]: svg_parser.parse('M 0.6051.5') # An edge case
+Out[4]: [('M', [(0.60509999999999997, 0.5)])]
+
+In [5]: svg_parser.parse('M 100-200') # Another edge case
+Out[5]: [('M', [(100.0, -200.0)])]
+"""
+
+import re
+
+
+# Sentinel marking the end of the token stream.
+class _EOF(object):
+    """ Type of the end-of-input marker; it prints as 'EOF'. """
+
+    def __repr__(self):
+        return 'EOF'
+
+
+# The single instance that tokens are compared against.
+EOF = _EOF()
+
+# Token patterns as (name, regex) pairs.  The order matters: 'float' has to
+# be listed before 'int' so that "1.5" is not split into "1" and ".5".
+lexicon = [
+    # either a number containing a decimal point with an optional exponent,
+    # or a plain digit sequence followed by a mandatory exponent ("1e5")
+    ('float', r'[-\+]?(?:(?:(?:[0-9]*\.[0-9]+)|(?:[0-9]+\.))(?:[Ee][-\+]?[0-9]+)?|[0-9]+[Ee][-\+]?[0-9]+)'),
+    ('int', r'[-\+]?[0-9]+'),
+    ('command', r'[AaCcHhLlMmQqSsTtVvZz]'),
+]
+
+
+class Lexer(object):
+    """ Break SVG path data into tokens.
+
+    The SVG spec requires that tokens are greedy. This lexer relies on Python's
+    regexes defaulting to greediness.
+
+    This style of implementation was inspired by this article:
+
+        http://www.gooli.org/blog/a-simple-lexer-in-python/
+    """
+    def __init__(self, lexicon):
+        """ Compile `lexicon`, a sequence of (name, regex) pairs, into a single
+        alternation in which every entry is a named group.
+        """
+        self.lexicon = lexicon
+        named_groups = ['(?P<%s>%s)' % (name, regex) for name, regex in lexicon]
+        self.regex_string = '|'.join(named_groups)
+        self.regex = re.compile(self.regex_string)
+
+    def lex(self, text):
+        """ Yield (token_type, str_data) tokens.
+
+        Text that matches no lexicon entry is skipped.  The last token will be
+        (EOF, None) where EOF is the singleton object defined in this module.
+        """
+        for match in self.regex.finditer(text):
+            captured = match.groupdict()
+            # report the first lexicon entry whose group took part in the match
+            for kind, _ in self.lexicon:
+                value = captured[kind]
+                if value is None:
+                    continue
+                yield (kind, value)
+                break
+        yield (EOF, None)
+
+# Module-level lexer built from the lexicon defined in this module.
+svg_lexer = Lexer(lexicon)
+
+
+class SVGPathParser(object):
+    """ Parse SVG path data into a list of commands.
+
+    Each distinct command will take the form of a tuple (command, data). The
+    `command` is just the character string that starts the command group in the
+    data, so 'M' for absolute moveto, 'm' for relative moveto, 'Z' for
+    closepath, etc. The kind of data it carries with it depends on the command.
+    For 'Z' (closepath), it's just None. The others are lists of individual
+    argument groups. Multiple elements in these lists usually mean to repeat the
+    command. The notable exception is 'M' (moveto) where only the first element
+    is truly a moveto. The remainder are implicit linetos.
+
+    See the SVG documentation for the interpretation of the individual elements
+    for each command.
+
+    The main method is `parse(text)`. It can only consume actual strings, not
+    filelike objects or iterators.
+
+    Every `rule_*` method takes `next`, a callable returning the following
+    token, and `token`, the current (type, text) token.  It returns a tuple
+    (result, token) where `token` is the first token the rule did not consume.
+    """
+
+    def __init__(self, lexer=svg_lexer):
+        self.lexer = lexer
+
+        # Maps each command letter to the rule that parses its arguments.
+        self.command_dispatch = {
+            'Z': self.rule_closepath,
+            'z': self.rule_closepath,
+            'M': self.rule_moveto_or_lineto,
+            'm': self.rule_moveto_or_lineto,
+            'L': self.rule_moveto_or_lineto,
+            'l': self.rule_moveto_or_lineto,
+            'H': self.rule_orthogonal_lineto,
+            'h': self.rule_orthogonal_lineto,
+            'V': self.rule_orthogonal_lineto,
+            'v': self.rule_orthogonal_lineto,
+            'C': self.rule_curveto3,
+            'c': self.rule_curveto3,
+            'S': self.rule_curveto2,
+            's': self.rule_curveto2,
+            'Q': self.rule_curveto2,
+            'q': self.rule_curveto2,
+            'T': self.rule_curveto1,
+            't': self.rule_curveto1,
+            'A': self.rule_elliptical_arc,
+            'a': self.rule_elliptical_arc,
+        }
+
+        # Token types that are accepted wherever a number is expected.
+        self.number_tokens = set(['int', 'float'])
+
+    def parse(self, text):
+        """ Parse a string of SVG path data.
+
+        Returns the list of (command, data) tuples described in the class
+        docstring.  Raises SyntaxError when a command is missing or a command
+        is not followed by the arguments it requires.
+        """
+        tokens = iter(self.lexer.lex(text))
+        try:
+            # Python 2 iterators expose the advance method as next()
+            advance = tokens.next
+        except AttributeError:
+            # Python 3 renamed it to __next__()
+            advance = tokens.__next__
+        token = advance()
+        return self.rule_svg_path(advance, token)
+
+    def rule_svg_path(self, next, token):
+        """ Parse command groups until the EOF token; return the list of them. """
+        commands = []
+        while token[0] is not EOF:
+            if token[0] != 'command':
+                raise SyntaxError("expecting a command; got %r" % (token,))
+            rule = self.command_dispatch[token[1]]
+            command_group, token = rule(next, token)
+            commands.append(command_group)
+        return commands
+
+    def rule_closepath(self, next, token):
+        """ Closepath takes no arguments, so its data is None. """
+        command = token[1]
+        token = next()
+        return (command, None), token
+
+    def rule_moveto_or_lineto(self, next, token):
+        """ Data is a list of (x, y) pairs. """
+        return self._rule_repeated(next, token, self.rule_coordinate_pair)
+
+    def rule_orthogonal_lineto(self, next, token):
+        """ Data is a list of single coordinates. """
+        return self._rule_repeated(next, token, self.rule_coordinate)
+
+    def rule_curveto3(self, next, token):
+        """ Data is a list of (pair1, pair2, pair3) tuples. """
+        return self._rule_repeated(
+            next, token, lambda n, t: self._rule_pair_tuple(n, t, 3))
+
+    def rule_curveto2(self, next, token):
+        """ Data is a list of (pair1, pair2) tuples. """
+        return self._rule_repeated(
+            next, token, lambda n, t: self._rule_pair_tuple(n, t, 2))
+
+    def rule_curveto1(self, next, token):
+        """ Data is a list of (x, y) pairs. """
+        return self._rule_repeated(next, token, self.rule_coordinate_pair)
+
+    def rule_elliptical_arc(self, next, token):
+        """ Data is a list of
+        ((rx, ry), axis_rotation, large_arc_flag, sweep_flag, (x, y)) tuples
+        in which both flags are booleans.
+        """
+        return self._rule_repeated(next, token, self._rule_arc_arguments)
+
+    def rule_coordinate(self, next, token):
+        """ Consume one number and return it as a float. """
+        if token[0] not in self.number_tokens:
+            raise SyntaxError("expecting a number; got %r" % (token,))
+        x = float(token[1])
+        token = next()
+        return x, token
+
+    def rule_coordinate_pair(self, next, token):
+        """ Consume two numbers and return them as an (x, y) tuple of floats. """
+        # Inline these since this rule is so common.
+        if token[0] not in self.number_tokens:
+            raise SyntaxError("expecting a number; got %r" % (token,))
+        x = float(token[1])
+        token = next()
+        if token[0] not in self.number_tokens:
+            raise SyntaxError("expecting a number; got %r" % (token,))
+        y = float(token[1])
+        token = next()
+        return (x,y), token
+
+    def _rule_repeated(self, next, token, group_rule):
+        """ Consume the command token, then apply `group_rule` for as long as
+        the current token is a number, collecting one argument group per call.
+        """
+        command = token[1]
+        token = next()
+        groups = []
+        while token[0] in self.number_tokens:
+            group, token = group_rule(next, token)
+            groups.append(group)
+        return (command, groups), token
+
+    def _rule_pair_tuple(self, next, token, count):
+        """ Consume `count` coordinate pairs and return them as one tuple. """
+        pairs = []
+        for _ in range(count):
+            pair, token = self.rule_coordinate_pair(next, token)
+            pairs.append(pair)
+        return tuple(pairs), token
+
+    def _rule_arc_arguments(self, next, token):
+        """ Consume the seven values that make up one elliptical arc. """
+        rx, token = self._rule_nonnegative_number(next, token)
+        ry, token = self._rule_nonnegative_number(next, token)
+        axis_rotation, token = self.rule_coordinate(next, token)
+        large_arc_flag, token = self._rule_flag(next, token)
+        sweep_flag, token = self._rule_flag(next, token)
+        end_point, token = self.rule_coordinate_pair(next, token)
+        return ((rx, ry), axis_rotation, large_arc_flag, sweep_flag, end_point), token
+
+    def _rule_nonnegative_number(self, next, token):
+        """ Consume one number that must not be negative (an arc radius). """
+        if token[0] not in self.number_tokens:
+            raise SyntaxError("expecting a number; got %r" % (token,))
+        value = float(token[1])
+        if value < 0.0:
+            raise SyntaxError("expecting a nonnegative number; got %r" % (token,))
+        return value, next()
+
+    def _rule_flag(self, next, token):
+        """ Consume one arc flag, which has to be the text '0' or '1'. """
+        if token[1] not in ('0', '1'):
+            raise SyntaxError("expecting a boolean flag; got %r" % (token,))
+        return bool(int(token[1])), next()
+
+
+# Ready-to-use parser instance created with the default lexer.
+svg_parser = SVGPathParser()
diff --git a/unittests/path-simple-triangle.svg b/unittests/path-simple-triangle.svg
new file mode 100644
index 0000000..1153720
--- /dev/null
+++ b/unittests/path-simple-triangle.svg
@@ -0,0 +1,7 @@
+