From d598c2db5c653b252f43a77f11b3b3deb9502d9a Mon Sep 17 00:00:00 2001
From: Niels Thykier
Date: Sat, 10 Mar 2018 19:55:17 +0000
Subject: [PATCH] Update the Lexer to accept expected tokens from the parser

The SVG spec allows paths to omit whitespace in some cases and expects
parsers to handle this gracefully. In particular, a parser must greedily
match as much of the expected token as possible, but stop as soon as the
token no longer matches.

The latter part is where the SVG standard gets interesting. An
elliptical arc command (i.e. A or a) accepts, among other argument
sequences:

    ..., number, flag, flag, number, ...

where a flag is defined as "0" or "1" (exactly one character). Given
those tokens and the following input:

    1 010

the spec requires scour to parse it as "1" (number), "0" (flag),
"1" (flag) and "0" (number).

It might be tempting to just include "flag" in the default
tokenization. Unfortunately, this falls apart quickly if you want to
follow the spec: if the input is "100" and the lexer has no hint about
the next token, it could parse it as any of:

 * Three flags
 * Two flags and a one-digit number (in that order)
 * A flag and a two-digit number (in that order)
 * A three-digit number

Therefore, to support this, the SVGPathParser must in some cases give
the Lexer a hint about which token it expects next. This turns out to
be easy because "lex" is already a generator function, which can be
turned into a "coroutine" simply by replacing "next(x)" with
"x.send(value)".
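
As a rough illustration of the idea (a toy lexer with made-up names,
not scour's actual API; the real change is in the diff below), a
generator can be driven with such hints via send():

    import re

    def lex(text):
        default = re.compile(r'(?P<number>[-+]?[0-9]+)')
        rules = {'flag': re.compile(r'(?P<flag>[01])'), 'number': default}
        pattern, pos = default, 0
        while True:
            m = pattern.search(text, pos)
            if not m:
                break
            pos = m.end()
            # Whatever the caller sends selects the rule for the *next* token.
            hint = yield (m.lastgroup, m.group())
            pattern = rules[hint] if hint else default

    gen = lex('1 010')
    print(gen.send(None))    # prime the generator  -> ('number', '1')
    print(gen.send('flag'))  # ask for a flag       -> ('flag', '0')
    print(gen.send('flag'))  # ask for another flag -> ('flag', '1')
    print(gen.send(None))    # back to the defaults -> ('number', '0')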

Signed-off-by: Niels Thykier
---
 scour/svg_regex.py | 93 ++++++++++++++++++++++++++++++----------
 1 file changed, 62 insertions(+), 31 deletions(-)

diff --git a/scour/svg_regex.py b/scour/svg_regex.py
index d4dfe3a..7ab2d58 100644
--- a/scour/svg_regex.py
+++ b/scour/svg_regex.py
@@ -45,8 +45,6 @@ from __future__ import absolute_import
 
 import re
 from decimal import Decimal, getcontext
-from functools import partial
-
 
 
 # Sentinel.
@@ -59,10 +57,18 @@ class _EOF(object):
 
 EOF = _EOF()
 
+# default tokens
+# (name, default-token, regex pattern)
 lexicon = [
-    ('float', r'[-+]?(?:(?:[0-9]*\.[0-9]+)|(?:[0-9]+\.?))(?:[Ee][-+]?[0-9]+)?'),
-    ('int', r'[-+]?[0-9]+'),
-    ('command', r'[AaCcHhLlMmQqSsTtVvZz]'),
+    ('float', True, r'[-+]?(?:(?:[0-9]*\.[0-9]+)|(?:[0-9]+\.?))(?:[Ee][-+]?[0-9]+)?'),
+    ('int', True, r'[-+]?[0-9]+'),
+    ('command', True, r'[AaCcHhLlMmQqSsTtVvZz]'),
+    # The "flag" token is defined as a single 0 or single 1.  We
+    # cannot parse this as a float or an int because those tokens
+    # might consume multiple digits (e.g. "int, 11" instead of "flag,
+    # 1" + "flag, 1") and the spec allows SVGs to omit the space after
+    # a flag here.
+    ('flag', False, r'[01]'),
 ]
 
 
@@ -80,22 +86,47 @@ class Lexer(object):
     def __init__(self, lexicon):
         self.lexicon = lexicon
         parts = []
-        for name, regex in lexicon:
-            parts.append('(?P<%s>%s)' % (name, regex))
+        for name, is_default, regex in lexicon:
+            if is_default:
+                parts.append('(?P<%s>%s)' % (name, regex))
+        self.all_token_names = [x for x, _, _ in lexicon]
+        self.single_token_rules = {x: re.compile('(?P<%s>%s)' % (x, y)) for x, _, y in lexicon}
         self.regex_string = '|'.join(parts)
         self.regex = re.compile(self.regex_string)
 
     def lex(self, text):
-        """ Yield (token_type, str_data) tokens.
+        """Coroutine that yields (token_type, str_data) tokens.
+
+        The parser can send a token name defined in the lexicon if the
+        default token rules are not useful.
 
         The last token will be (EOF, None) where EOF is the singleton
        object defined in this module.
+
         """
-        for match in self.regex.finditer(text):
-            for name, _ in self.lexicon:
-                m = match.group(name)
+        offset = 0
+        current_pattern = self.regex
+
+        while True:
+            match = current_pattern.search(text, offset)
+            if not match:
+                break
+            offset = match.end()
+            for name in self.all_token_names:
+                try:
+                    m = match.group(name)
+                except IndexError:
+                    # Thrown if "name" is not defined in the pattern.
+                    # This happens when the parser requests a
+                    # non-default token, as the default token names are
+                    # tried before non-default ones.
+                    continue
                 if m is not None:
-                    yield (name, m)
+                    pattern_request = (yield (name, m))
+                    if pattern_request is None:
+                        current_pattern = self.regex
+                    else:
+                        current_pattern = self.single_token_rules[pattern_request]
                     break
         yield (EOF, None)
 
@@ -155,8 +186,8 @@ class SVGPathParser(object):
         """ Parse a string of SVG data.
         """
         gen = self.lexer.lex(text)
-        next_val_fn = partial(next, *(gen,))
-        token = next_val_fn()
+        next_val_fn = gen.send
+        token = next_val_fn(None)
         return self.rule_svg_path(next_val_fn, token)
 
     def rule_svg_path(self, next_val_fn, token):
@@ -171,12 +202,12 @@
 
     def rule_closepath(self, next_val_fn, token):
         command = token[1]
-        token = next_val_fn()
+        token = next_val_fn(None)
         return (command, []), token
 
     def rule_moveto_or_lineto(self, next_val_fn, token):
         command = token[1]
-        token = next_val_fn()
+        token = next_val_fn(None)
         coordinates = []
         while token[0] in self.number_tokens:
             pair, token = self.rule_coordinate_pair(next_val_fn, token)
@@ -185,7 +216,7 @@
 
     def rule_orthogonal_lineto(self, next_val_fn, token):
         command = token[1]
-        token = next_val_fn()
+        token = next_val_fn(None)
         coordinates = []
         while token[0] in self.number_tokens:
             coord, token = self.rule_coordinate(next_val_fn, token)
@@ -194,7 +225,7 @@
 
     def rule_curveto3(self, next_val_fn, token):
         command = token[1]
-        token = next_val_fn()
+        token = next_val_fn(None)
         coordinates = []
         while token[0] in self.number_tokens:
             pair1, token = self.rule_coordinate_pair(next_val_fn, token)
@@ -207,7 +238,7 @@
 
     def rule_curveto2(self, next_val_fn, token):
         command = token[1]
-        token = next_val_fn()
+        token = next_val_fn(None)
         coordinates = []
         while token[0] in self.number_tokens:
             pair1, token = self.rule_coordinate_pair(next_val_fn, token)
@@ -218,7 +249,7 @@
 
     def rule_curveto1(self, next_val_fn, token):
         command = token[1]
-        token = next_val_fn()
+        token = next_val_fn(None)
         coordinates = []
         while token[0] in self.number_tokens:
             pair1, token = self.rule_coordinate_pair(next_val_fn, token)
@@ -227,46 +258,46 @@
 
     def rule_elliptical_arc(self, next_val_fn, token):
         command = token[1]
-        token = next_val_fn()
+        token = next_val_fn(None)
 
         arguments = []
         while token[0] in self.number_tokens:
             rx = Decimal(token[1]) * 1
             if rx < Decimal("0.0"):
                 raise SyntaxError("expecting a nonnegative number; got %r" % (token,))
-            token = next_val_fn()
+            token = next_val_fn(None)
 
             if token[0] not in self.number_tokens:
                 raise SyntaxError("expecting a number; got %r" % (token,))
             ry = Decimal(token[1]) * 1
             if ry < Decimal("0.0"):
                 raise SyntaxError("expecting a nonnegative number; got %r" % (token,))
-            token = next_val_fn()
+            token = next_val_fn(None)
 
             if token[0] not in self.number_tokens:
                 raise SyntaxError("expecting a number; got %r" % (token,))
             axis_rotation = Decimal(token[1]) * 1
-            token = next_val_fn()
+            token = next_val_fn('flag')
 
             if token[1] not in ('0', '1'):
                 raise SyntaxError("expecting a boolean flag; got %r" % (token,))
             large_arc_flag = Decimal(token[1]) * 1
-            token = next_val_fn()
+            token = next_val_fn('flag')
 
             if token[1] not in ('0', '1'):
                 raise SyntaxError("expecting a boolean flag; got %r" % (token,))
             sweep_flag = Decimal(token[1]) * 1
-            token = next_val_fn()
+            token = next_val_fn(None)
 
             if token[0] not in self.number_tokens:
                 raise SyntaxError("expecting a number; got %r" % (token,))
             x = Decimal(token[1]) * 1
-            token = next_val_fn()
+            token = next_val_fn(None)
 
             if token[0] not in self.number_tokens:
                 raise SyntaxError("expecting a number; got %r" % (token,))
             y = Decimal(token[1]) * 1
-            token = next_val_fn()
+            token = next_val_fn(None)
 
             arguments.extend([rx, ry, axis_rotation, large_arc_flag, sweep_flag, x, y])
         return (command, arguments), token
@@ -275,7 +306,7 @@ class SVGPathParser(object):
         if token[0] not in self.number_tokens:
             raise SyntaxError("expecting a number; got %r" % (token,))
         x = getcontext().create_decimal(token[1])
-        token = next_val_fn()
+        token = next_val_fn(None)
         return x, token
 
     def rule_coordinate_pair(self, next_val_fn, token):
@@ -283,11 +314,11 @@ class SVGPathParser(object):
         if token[0] not in self.number_tokens:
             raise SyntaxError("expecting a number; got %r" % (token,))
         x = getcontext().create_decimal(token[1])
-        token = next_val_fn()
+        token = next_val_fn(None)
         if token[0] not in self.number_tokens:
             raise SyntaxError("expecting a number; got %r" % (token,))
         y = getcontext().create_decimal(token[1])
-        token = next_val_fn()
+        token = next_val_fn(None)
         return [x, y], token
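
For reference, a minimal way to exercise the new behaviour end to end
(this assumes the module-level svg_parser instance that scour.svg_regex
exposes; if that name differs, construct SVGPathParser() directly):

    from scour.svg_regex import svg_parser

    # After the rotation argument, "1 010,10" is read as large_arc_flag=1,
    # sweep_flag=0, x=10 and y=10, even though the flags are not separated
    # from the following number by whitespace.
    print(svg_parser.parse('M0,0 A1,1 0 1 010,10'))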