Update the Lexer to accept expected tokens from the parser

The SVG spec allows paths to omit whitespace in some cases and expects
parsers to gracefully handle this.  In particular, a parser must
greedily match as much of the expected token as possible, but stop as
soon as the token no longer matches.  The latter part is where the SVG
standard gets interesting.

An elliptical arc command (i.e. A or a) accepts, among other
arguments, the sequence:
  ..., number, flag, flag, number, ...

Here, "flag" is defined as "0" or "1" (exactly one character).  Given
those tokens and the following input:

   1 010

The spec requires scour to parse that as:

  "1" (number), "0" (flag), "1" (flag) and "0" (number).

It might be tempting to just include "flag" in the default
tokenization.  Unfortunately, this falls apart pretty quickly if you
want to follow the spec.  E.g. given 100 as input and no hint about
the next token, the lexer could legitimately read it as any of the
following (see the sketch after the list):

 * Three flags
 * Two flags and a one-digit number (in that order)
 * A flag and a two-digit number (in that order)
 * A three-digit number
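
A quick illustration with plain "re" (hypothetical patterns, not
scour's code): whichever order the default rules are listed in, the
combined pattern always produces the same, fixed answer, so it can
never pick the context-dependent reading the spec asks for:

  import re

  # "int" listed before "flag", as a default-lexicon rule order.
  int_first = re.compile(r'(?P<int>[-+]?[0-9]+)|(?P<flag>[01])')
  [(m.lastgroup, m.group()) for m in int_first.finditer('100')]
  # -> [('int', '100')]                              always one number

  # "flag" listed before "int".
  flag_first = re.compile(r'(?P<flag>[01])|(?P<int>[-+]?[0-9]+)')
  [(m.lastgroup, m.group()) for m in flag_first.finditer('100')]
  # -> [('flag', '1'), ('flag', '0'), ('flag', '0')]  always three flags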

Therefore, to support this, the SVGPathParser must provide the Lexer
with a hint about what it is expecting in some cases.  This turns out
to be easy to do by exploiting the fact that "lex" is a generator
function and can be converted into a "coroutine" simply by replacing
"next(x)" with "x.send(value)".

Signed-off-by: Niels Thykier <niels@thykier.net>

@@ -45,8 +45,6 @@ from __future__ import absolute_import
import re
from decimal import Decimal, getcontext
from functools import partial
# Sentinel.
@@ -59,10 +57,18 @@ class _EOF(object):
EOF = _EOF()
# default tokens
# (name, default-token, regex pattern)
lexicon = [
('float', r'[-+]?(?:(?:[0-9]*\.[0-9]+)|(?:[0-9]+\.?))(?:[Ee][-+]?[0-9]+)?'),
('int', r'[-+]?[0-9]+'),
('command', r'[AaCcHhLlMmQqSsTtVvZz]'),
('float', True, r'[-+]?(?:(?:[0-9]*\.[0-9]+)|(?:[0-9]+\.?))(?:[Ee][-+]?[0-9]+)?'),
('int', True, r'[-+]?[0-9]+'),
('command', True, r'[AaCcHhLlMmQqSsTtVvZz]'),
# The "flag" token is defined as a single 0 or single 1. We
# cannot parse this as a float or an int because those tokens
# might consume multiple digits (e.g. "int, 11" instead of "flag,
# 1" + "flag, 1") and the spec allows SVGs to omit the space after
# a flag here.
('flag', False, r'[01]'),
]
@@ -80,22 +86,47 @@ class Lexer(object):
def __init__(self, lexicon):
self.lexicon = lexicon
parts = []
for name, regex in lexicon:
for name, is_default, regex in lexicon:
if is_default:
parts.append('(?P<%s>%s)' % (name, regex))
self.all_token_names = [x for x, _, _ in lexicon]
self.single_token_rules = {x: re.compile('(?P<%s>%s)' % (x, y)) for x, _, y in lexicon}
self.regex_string = '|'.join(parts)
self.regex = re.compile(self.regex_string)
def lex(self, text):
""" Yield (token_type, str_data) tokens.
"""Coroutine that yields (token_type, str_data) tokens.
The parser can send a token name defined in the lexicon if the
default token rules are not useful.
The last token will be (EOF, None) where EOF is the singleton object
defined in this module.
"""
for match in self.regex.finditer(text):
for name, _ in self.lexicon:
offset = 0
current_pattern = self.regex
while True:
match = current_pattern.search(text, offset)
if not match:
break
offset = match.end()
for name in self.all_token_names:
try:
m = match.group(name)
except IndexError:
# Thrown if "name" is not defined in the pattern.
# This happens when the parser requests a
# non-default token, as the default token names are
# tried before non-default ones.
continue
if m is not None:
yield (name, m)
pattern_request = (yield (name, m))
if pattern_request is None:
current_pattern = self.regex
else:
current_pattern = self.single_token_rules[pattern_request]
break
yield (EOF, None)
@@ -155,8 +186,8 @@ class SVGPathParser(object):
""" Parse a string of SVG <path> data.
"""
gen = self.lexer.lex(text)
next_val_fn = partial(next, *(gen,))
token = next_val_fn()
next_val_fn = gen.send
token = next_val_fn(None)
return self.rule_svg_path(next_val_fn, token)
def rule_svg_path(self, next_val_fn, token):
@@ -171,12 +202,12 @@ class SVGPathParser(object):
def rule_closepath(self, next_val_fn, token):
command = token[1]
token = next_val_fn()
token = next_val_fn(None)
return (command, []), token
def rule_moveto_or_lineto(self, next_val_fn, token):
command = token[1]
token = next_val_fn()
token = next_val_fn(None)
coordinates = []
while token[0] in self.number_tokens:
pair, token = self.rule_coordinate_pair(next_val_fn, token)
@@ -185,7 +216,7 @@ class SVGPathParser(object):
def rule_orthogonal_lineto(self, next_val_fn, token):
command = token[1]
token = next_val_fn()
token = next_val_fn(None)
coordinates = []
while token[0] in self.number_tokens:
coord, token = self.rule_coordinate(next_val_fn, token)
@@ -194,7 +225,7 @@ class SVGPathParser(object):
def rule_curveto3(self, next_val_fn, token):
command = token[1]
token = next_val_fn()
token = next_val_fn(None)
coordinates = []
while token[0] in self.number_tokens:
pair1, token = self.rule_coordinate_pair(next_val_fn, token)
@@ -207,7 +238,7 @@ class SVGPathParser(object):
def rule_curveto2(self, next_val_fn, token):
command = token[1]
token = next_val_fn()
token = next_val_fn(None)
coordinates = []
while token[0] in self.number_tokens:
pair1, token = self.rule_coordinate_pair(next_val_fn, token)
@@ -218,7 +249,7 @@ class SVGPathParser(object):
def rule_curveto1(self, next_val_fn, token):
command = token[1]
token = next_val_fn()
token = next_val_fn(None)
coordinates = []
while token[0] in self.number_tokens:
pair1, token = self.rule_coordinate_pair(next_val_fn, token)
@@ -227,46 +258,46 @@ class SVGPathParser(object):
def rule_elliptical_arc(self, next_val_fn, token):
command = token[1]
token = next_val_fn()
token = next_val_fn(None)
arguments = []
while token[0] in self.number_tokens:
rx = Decimal(token[1]) * 1
if rx < Decimal("0.0"):
raise SyntaxError("expecting a nonnegative number; got %r" % (token,))
token = next_val_fn()
token = next_val_fn(None)
if token[0] not in self.number_tokens:
raise SyntaxError("expecting a number; got %r" % (token,))
ry = Decimal(token[1]) * 1
if ry < Decimal("0.0"):
raise SyntaxError("expecting a nonnegative number; got %r" % (token,))
token = next_val_fn()
token = next_val_fn(None)
if token[0] not in self.number_tokens:
raise SyntaxError("expecting a number; got %r" % (token,))
axis_rotation = Decimal(token[1]) * 1
token = next_val_fn()
token = next_val_fn('flag')
if token[1] not in ('0', '1'):
raise SyntaxError("expecting a boolean flag; got %r" % (token,))
large_arc_flag = Decimal(token[1]) * 1
token = next_val_fn()
token = next_val_fn('flag')
if token[1] not in ('0', '1'):
raise SyntaxError("expecting a boolean flag; got %r" % (token,))
sweep_flag = Decimal(token[1]) * 1
token = next_val_fn()
token = next_val_fn(None)
if token[0] not in self.number_tokens:
raise SyntaxError("expecting a number; got %r" % (token,))
x = Decimal(token[1]) * 1
token = next_val_fn()
token = next_val_fn(None)
if token[0] not in self.number_tokens:
raise SyntaxError("expecting a number; got %r" % (token,))
y = Decimal(token[1]) * 1
token = next_val_fn()
token = next_val_fn(None)
arguments.extend([rx, ry, axis_rotation, large_arc_flag, sweep_flag, x, y])
return (command, arguments), token
@@ -275,7 +306,7 @@ class SVGPathParser(object):
if token[0] not in self.number_tokens:
raise SyntaxError("expecting a number; got %r" % (token,))
x = getcontext().create_decimal(token[1])
token = next_val_fn()
token = next_val_fn(None)
return x, token
def rule_coordinate_pair(self, next_val_fn, token):
@@ -283,11 +314,11 @@ class SVGPathParser(object):
if token[0] not in self.number_tokens:
raise SyntaxError("expecting a number; got %r" % (token,))
x = getcontext().create_decimal(token[1])
token = next_val_fn()
token = next_val_fn(None)
if token[0] not in self.number_tokens:
raise SyntaxError("expecting a number; got %r" % (token,))
y = getcontext().create_decimal(token[1])
token = next_val_fn()
token = next_val_fn(None)
return [x, y], token