From ada3a4cfcf287827165f52e0c35dc63608743d35 Mon Sep 17 00:00:00 2001 From: guido Date: Sat, 10 Feb 2007 15:20:21 +0100 Subject: [PATCH] [svn r38386] Fixed some more nasty problems after running py.test --apigen on pypy: problem recognizing """foo "bar" baz""" as a single line string (hack unfortunately), unicode problems. Also added some code to debug the tokenizer: you can now run the color.py script with a Python file as arg to tokenize the file. --HG-- branch : trunk --- py/apigen/source/color.py | 38 +++++++++++++++++++++----- py/apigen/source/testing/test_color.py | 5 ++++ 2 files changed, 36 insertions(+), 7 deletions(-) diff --git a/py/apigen/source/color.py b/py/apigen/source/color.py index a9e2214e1..2338f4887 100644 --- a/py/apigen/source/color.py +++ b/py/apigen/source/color.py @@ -44,10 +44,11 @@ class Tokenizer(object): very naive tokenizer, state is recorded for multi-line strings, etc. """ - _re_word = re.compile('[\w_]+') - _re_space = re.compile('\s+') - _re_number = re.compile('[\d\.]*\d[\d\.]*l?', re.I) - _re_rest = re.compile('[^\w\s\d\'"]+') # XXX cheating a bit with the quotes + _re_word = re.compile('[\w_]+', re.U) + _re_space = re.compile('\s+', re.U) + _re_number = re.compile('[\d\.]*\d[\d\.]*l?', re.I | re.U) + # XXX cheating a bit with the quotes + _re_rest = re.compile('[^\w\s\d\'"]+', re.U) # these will be filled using the schema _re_strings_full = None @@ -68,13 +69,14 @@ class Tokenizer(object): re.compile(r'%s[^\\%s]+(\\.[^\\%s]*)*%s' % (d, d, d, d))) self._re_strings_empty.append(re.compile('%s%s' % (d, d))) for d in schema.multiline_string: - self._re_strings_multiline.append((re.compile('%s.*' % (d,), re.S), + self._re_strings_multiline.append((re.compile('(%s).*' % (d,), + re.S), re.compile('.*?%s' % (d,)))) if schema.linejoin: j = schema.linejoin for d in schema.string: self._re_strings_multiline.append( - (re.compile('%s.*%s$' % (d, j)), + (re.compile('(%s).*%s$' % (d, j)), re.compile('.*?%s' % (d,)))) # no multi-line comments in Python... phew :) self._re_comments = [] @@ -123,7 +125,15 @@ class Tokenizer(object): if m: s = m.group(0) data = '' - self._inside_multiline = end + # XXX take care of a problem which is hard to fix with regexps: + # '''foo 'bar' baz''' will not match single-line strings + # (because [^"""] matches just a single " already), so let's + # try to catch it here... (quite Python specific issue!) + endm = end.match(s[len(m.group(1)):]) + if endm: # see if it ends here already + s = m.group(1) + endm.group(0) + else: + self._inside_multiline = end token = Token(s, 'string') break return data, token @@ -184,4 +194,18 @@ class Tokenizer(object): return data[len(s):], Token(s, 'unknown') return data, None +if __name__ == '__main__': + import py, sys + if len(sys.argv) != 2: + print 'usage: %s ' + print ' tokenizes the file and prints the tokens per line' + sys.exit(1) + t = Tokenizer(PythonSchema) + p = py.path.local(sys.argv[1]) + assert p.ext == '.py' + for line in p.read().split('\n'): + print repr(line) + print 't in multiline mode:', not not t._inside_multiline + tokens = t.tokenize(line) + print list(tokens) diff --git a/py/apigen/source/testing/test_color.py b/py/apigen/source/testing/test_color.py index 412b8ac7c..931a7279d 100644 --- a/py/apigen/source/testing/test_color.py +++ b/py/apigen/source/testing/test_color.py @@ -90,3 +90,8 @@ class TestTokenizer(object): assert self.tokens('."foo"') == [Token('.', type='unknown'), Token('"foo"', type='string')] + def test_something_strange(self): + t = Tokenizer(PythonSchema) + tokens = list(t.tokenize('"""foo "bar" baz"""')) + assert not t._inside_multiline +