From ada3a4cfcf287827165f52e0c35dc63608743d35 Mon Sep 17 00:00:00 2001
From: guido <none@none>
Date: Sat, 10 Feb 2007 15:20:21 +0100
Subject: [PATCH] [svn r38386] Fixed some more nasty problems after running
 py.test --apigen on pypy: problem recognizing """foo "bar" baz""" as a single
 line string (hack unfortunately), unicode problems. Also added some code to
 debug the tokenizer: you can now run the color.py script with a Python file
 as arg to tokenize the file.

--HG--
branch : trunk
---
 py/apigen/source/color.py              | 38 +++++++++++++++++++++-----
 py/apigen/source/testing/test_color.py |  5 ++++
 2 files changed, 36 insertions(+), 7 deletions(-)
diff --git a/py/apigen/source/color.py b/py/apigen/source/color.py
index a9e2214e1..2338f4887 100644
--- a/py/apigen/source/color.py
+++ b/py/apigen/source/color.py
@@ -44,10 +44,11 @@ class Tokenizer(object):
         very naive tokenizer, state is recorded for multi-line strings, etc.
     """
 
-    _re_word = re.compile('[\w_]+')
-    _re_space = re.compile('\s+')
-    _re_number = re.compile('[\d\.]*\d[\d\.]*l?', re.I)
-    _re_rest = re.compile('[^\w\s\d\'"]+') # XXX cheating a bit with the quotes
+    _re_word = re.compile('[\w_]+', re.U)
+    _re_space = re.compile('\s+', re.U)
+    _re_number = re.compile('[\d\.]*\d[\d\.]*l?', re.I | re.U)
+    # XXX cheating a bit with the quotes
+    _re_rest = re.compile('[^\w\s\d\'"]+', re.U)
 
     # these will be filled using the schema
     _re_strings_full = None
@@ -68,13 +69,14 @@ class Tokenizer(object):
                 re.compile(r'%s[^\\%s]+(\\.[^\\%s]*)*%s' % (d, d, d, d)))
             self._re_strings_empty.append(re.compile('%s%s' % (d, d)))
         for d in schema.multiline_string:
-            self._re_strings_multiline.append((re.compile('%s.*' % (d,), re.S),
+            self._re_strings_multiline.append((re.compile('(%s).*' % (d,),
+                                                          re.S),
                                                re.compile('.*?%s' % (d,))))
         if schema.linejoin:
             j = schema.linejoin
             for d in schema.string:
                 self._re_strings_multiline.append(
-                    (re.compile('%s.*%s$' % (d, j)),
+                    (re.compile('(%s).*%s$' % (d, j)),
                      re.compile('.*?%s' % (d,))))
         # no multi-line comments in Python... phew :)
         self._re_comments = []
@@ -123,7 +125,15 @@ class Tokenizer(object):
             if m:
                 s = m.group(0)
                 data = ''
-                self._inside_multiline = end
+                # XXX take care of a problem which is hard to fix with regexps:
+                # '''foo 'bar' baz''' will not match single-line strings
+                # (because [^"""] matches just a single " already), so let's
+                # try to catch it here... (quite Python specific issue!)
+                endm = end.match(s[len(m.group(1)):])
+                if endm: # see if it ends here already
+                    s = m.group(1) + endm.group(0)
+                else:
+                    self._inside_multiline = end
                 token = Token(s, 'string')
                 break
         return data, token
@@ -184,4 +194,18 @@ class Tokenizer(object):
             return data[len(s):], Token(s, 'unknown')
         return data, None
 
+if __name__ == '__main__':  # debugging aid: print the tokens of a .py file
+    import py, sys
+    if len(sys.argv) != 2:
+        print 'usage: %s <filename>' % (sys.argv[0],)  # fill in script name
+        print '  tokenizes the file and prints the tokens per line'
+        sys.exit(1)
+    t = Tokenizer(PythonSchema)
+    p = py.path.local(sys.argv[1])
+    assert p.ext == '.py'
+    for line in p.read().split('\n'):  # one tokenizer is reused for all lines
+        print repr(line)
+        print 't in multiline mode:', not not t._inside_multiline
+        tokens = t.tokenize(line)
+        print list(tokens)
 
diff --git a/py/apigen/source/testing/test_color.py b/py/apigen/source/testing/test_color.py
index 412b8ac7c..931a7279d 100644
--- a/py/apigen/source/testing/test_color.py
+++ b/py/apigen/source/testing/test_color.py
@@ -90,3 +90,8 @@ class TestTokenizer(object):
         assert self.tokens('."foo"') == [Token('.', type='unknown'),
                                          Token('"foo"', type='string')]
 
+    def test_something_strange(self):  # inner quotes, string on one line
+        t = Tokenizer(PythonSchema)
+        tokens = list(t.tokenize('"""foo "bar" baz"""'))  # consume all tokens
+        assert not t._inside_multiline  # must not be left in multi-line mode
+