"""JsLex: a lexer for Javascript"""
# Originally from https://bitbucket.org/ned/jslex
import re


class Tok(object):
    """
    A specification for a token class.

    `name` is the token name reported by the lexer, `regex` is the pattern
    source that recognizes it, and `next` (optional) is the name of the lexer
    state to switch to after the token has been produced.
    """
    # Class-wide counter: gives every Tok a distinct id, which the Lexer uses
    # to build a unique named group per token.
    num = 0

    def __init__(self, name, regex, next=None):
        self.id = Tok.num
        Tok.num += 1
        self.name = name
        self.regex = regex
        self.next = next


def literals(choices, prefix="", suffix=""):
    """
    Create a regex from a space-separated list of literal `choices`.

    If provided, `prefix` and `suffix` will be attached to each choice
    individually.

    """
    return "|".join(prefix + re.escape(c) + suffix for c in choices.split())


class Lexer(object):
    """
    A generic multi-state regex-based lexer.

    `states` maps a state name to an ordered list of `Tok` rules; `first` is
    the name of the initial state.  Each state's rules are combined into one
    alternation, so earlier rules win over later ones.
    """

    def __init__(self, states, first):
        self.regexes = {}
        self.toks = {}

        for state, rules in states.items():
            parts = []
            for tok in rules:
                # One named group per token lets lex() recover which rule
                # matched from the match object alone.
                groupid = "t%d" % tok.id
                self.toks[groupid] = tok
                parts.append("(?P<%s>%s)" % (groupid, tok.regex))
            self.regexes[state] = re.compile(
                "|".join(parts), re.MULTILINE | re.VERBOSE)

        self.state = first

    def lex(self, text):
        """
        Lexically analyze `text`.

        Yields pairs (`name`, `tokentext`).

        The current state is kept on the instance and updated as tokens with
        a `next` state are produced.
        """
        pos = 0
        end = len(text)
        while pos < end:
            # Scan from the current offset instead of re-slicing the text, so
            # each state switch does not copy the remaining input.
            for match in self.regexes[self.state].finditer(text, pos):
                # Every rule is wrapped in its own named group, so the last
                # group to close is the rule that matched.
                tok = self.toks[match.lastgroup]
                pos = match.end()
                yield (tok.name, match.group(match.lastgroup))
                if tok.next:
                    # The active regex changes with the state: restart the
                    # scan from the new offset with the new state's rules.
                    self.state = tok.next
                    break
            else:
                # The scan ran to the end of the text without a state change,
                # so there is nothing left to tokenize.
                break


class JsLexer(Lexer):
    """
    A Javascript lexer

    >>> lexer = JsLexer()
    >>> list(lexer.lex("a = 1"))
    [('id', 'a'), ('ws', ' '), ('punct', '='), ('ws', ' '), ('dnum', '1')]

    This doesn't properly handle non-Ascii characters in the Javascript source.
    """

    # Because these tokens are matched as alternatives in a regex, longer
    # possibilities must appear in the list before shorter ones, for example,
    # '>>' before '>'.
    #
    # Note that we don't have to detect malformed Javascript, only properly
    # lex correct Javascript, so much of this is simplified.

    # Details of Javascript lexical structure are taken from
    # http://www.ecma-international.org/publications/files/ECMA-ST/ECMA-262.pdf

    # A useful explanation of automatic semicolon insertion is at
    # http://inimino.org/~inimino/blog/javascript_semicolons

    # Rules shared by both states, tried before the state-specific slash rule.
    both_before = [
        Tok("comment", r"/\*(.|\n)*?\*/"),
        Tok("linecomment", r"//.*?$"),
        Tok("ws", r"\s+"),
        Tok("keyword", literals("""
                           break case catch class const continue debugger
                           default delete do else enum export extends
                           finally for function if import in instanceof
                           new return super switch this throw try typeof
                           var void while with
                           """, suffix=r"\b"), next='reg'),
        Tok("reserved", literals("null true false", suffix=r"\b"), next='div'),
        # Whitespace inside a character class is significant even in verbose
        # mode, so the classes below are written without padding.
        Tok("id", r"""
                  ([a-zA-Z_$]|\\u[0-9a-fA-F]{4})        # first char
                  ([a-zA-Z_$0-9]|\\u[0-9a-fA-F]{4})*    # rest chars
                  """, next='div'),
        Tok("hnum", r"0[xX][0-9a-fA-F]+", next='div'),
        # Like every other numeric literal, an octal literal is an operand,
        # so a slash that follows it is a division operator.
        Tok("onum", r"0[0-7]+", next='div'),
        Tok("dnum", r"""
                    (   (0|[1-9][0-9]*)     # DecimalIntegerLiteral
                        \.                  # dot
                        [0-9]*              # DecimalDigits-opt
                        ([eE][-+]?[0-9]+)?  # ExponentPart-opt
                    |
                        \.                  # dot
                        [0-9]+              # DecimalDigits
                        ([eE][-+]?[0-9]+)?  # ExponentPart-opt
                    |
                        (0|[1-9][0-9]*)     # DecimalIntegerLiteral
                        ([eE][-+]?[0-9]+)?  # ExponentPart-opt
                    )
                    """, next='div'),
        Tok("punct", literals("""
                         >>>= === !== >>> <<= >>= <= >= == != << >> &&
                         || += -= *= %= &= |= ^=
                         """), next="reg"),
        Tok("punct", literals("++ -- ) ]"), next='div'),
        Tok("punct", literals("{ } ( [ . ; , < > + - * % & | ^ ! ~ ? : ="), next='reg'),
        Tok("string", r'"([^"\\]|(\\(.|\n)))*?"', next='div'),
        Tok("string", r"'([^'\\]|(\\(.|\n)))*?'", next='div'),
    ]

    # Catch-all rule, tried last in both states.
    both_after = [
        Tok("other", r"."),
    ]

    states = {
        'div':  # slash will mean division
            both_before + [
                Tok("punct", literals("/= /"), next='reg'),
            ] + both_after,

        'reg':  # slash will mean regex
            both_before + [
                Tok("regex",
                    r"""
                    /                       # opening slash
                    # First character is..
                    (   [^*\\/[]            # anything but * \ / or [
                    |   \\.                 # or an escape sequence
                    |   \[                  # or a class, which has
                            (   [^\]\\]     #   anything but \ or ]
                            |   \\.         #   or an escape sequence
                            )*              #   many times
                        \]
                    )
                    # Following characters are same, except for excluding a star
                    (   [^\\/[]             # anything but \ / or [
                    |   \\.                 # or an escape sequence
                    |   \[                  # or a class, which has
                            (   [^\]\\]     #   anything but \ or ]
                            |   \\.         #   or an escape sequence
                            )*              #   many times
                        \]
                    )*                      # many times
                    /                       # closing slash
                    [a-zA-Z0-9]*            # trailing flags
                    """, next='div'),
            ] + both_after,
    }

    def __init__(self):
        # A program starts where an operand is expected, so a leading slash
        # opens a regex literal.
        super(JsLexer, self).__init__(self.states, 'reg')


def prepare_js_for_gettext(js):
    """
    Convert the Javascript source `js` into something resembling C for
    xgettext.

    What actually happens is that all the regex literals are replaced with
    "REGEX", single-quoted strings are rewritten as double-quoted strings,
    and backslashes in identifiers are replaced with "U".  The converted
    source is returned as a string.
    """
    def escape_quotes(m):
        """Used in a regex to properly escape double quotes."""
        s = m.group(0)
        if s == '"':
            return r'\"'
        return s

    lexer = JsLexer()
    c = []
    for name, tok in lexer.lex(js):
        if name == 'regex':
            # C doesn't grok regexes, and they aren't needed for gettext,
            # so just output a string instead.
            tok = '"REGEX"'
        elif name == 'string':
            # C doesn't have single-quoted strings, so make all strings
            # double-quoted.
            if tok.startswith("'"):
                # Escape sequences are matched as a unit and kept as they
                # are, so only bare double quotes get a backslash added.
                guts = re.sub(r"\\.|.", escape_quotes, tok[1:-1])
                tok = '"' + guts + '"'
        elif name == 'id':
            # C can't deal with Unicode escapes in identifiers.  We don't
            # need them for gettext anyway, so replace them with something
            # innocuous
            tok = tok.replace("\\", "U")
        c.append(tok)
    return ''.join(c)
JavascriptExtractorTests(ExtractorTests): po_contents = open(self.PO_FILE, 'r').read() self.assertMsgId('This literal should be included.', po_contents) self.assertMsgId('This one as well.', po_contents) - + self.assertMsgId(r'He said, \"hello\".', po_contents) + self.assertMsgId("okkkk", po_contents) + self.assertMsgId("TEXT", po_contents) + self.assertMsgId("It's at http://example.com", po_contents) + self.assertMsgId("String", po_contents) + self.assertMsgId("/* but this one will be too */ 'cause there is no way of telling...", po_contents) + self.assertMsgId("foo", po_contents) + self.assertMsgId("bar", po_contents) + self.assertMsgId("baz", po_contents) + self.assertMsgId("quz", po_contents) + self.assertMsgId("foobar", po_contents) class IgnoredExtractorTests(ExtractorTests): diff --git a/tests/regressiontests/i18n/commands/javascript.js b/tests/regressiontests/i18n/commands/javascript.js index bc5ec87957..fa059d70f4 100644 --- a/tests/regressiontests/i18n/commands/javascript.js +++ b/tests/regressiontests/i18n/commands/javascript.js @@ -1,4 +1,47 @@ // ' gettext('This literal should be included.') -// ' -gettext('This one as well.') +x = y; // ' +gettext("This one as well.") + +/** (from ticket 7704) + * ***************************** + * AddModule main / window + * @constructor + * @class MyDesktop.AddModule + * ***************************** + */ + +gettext('He said, \"hello".') + +// from ticket 14045 +function mfunc() { + var val = 0; + return val ? 1 : 0; +} +gettext('okkkk'); +print mysub(); + +// from ticket 15495 +/* / ' */ gettext("TEXT"); + +gettext("It's at http://example.com") + +// also from ticket 15495 +gettext("String"); // This comment won't be caught by pythonize_re and it contains "'" which is a string start in Perl +/* + * This one will be removed by the patch + */ +gettext("/* but this one will be too */ 'cause there is no way of telling..."); +f(/* ... if it's different from this one */); + +// from ticket 15331 +gettext("foo"); +true ? 
true : false; +gettext("bar"); +true ? true : false; +gettext("baz"); +true ? true : false; // ? +gettext("quz"); +"?"; +gettext("foobar"); + diff --git a/tests/regressiontests/utils/jslex.py b/tests/regressiontests/utils/jslex.py new file mode 100644 index 0000000000..08e8386831 --- /dev/null +++ b/tests/regressiontests/utils/jslex.py @@ -0,0 +1,217 @@ +"""Tests for jslex.""" +# encoding: utf-8 +# originally from https://bitbucket.org/ned/jslex + +import difflib +from django.test import TestCase +from django.utils.jslex import JsLexer, prepare_js_for_gettext + +class JsTokensTest(TestCase): + LEX_CASES = [ + # ids + ("a ABC $ _ a123", ["id a", "id ABC", "id $", "id _", "id a123"]), + (r"\u1234 abc\u0020 \u0065_\u0067", [r"id \u1234", r"id abc\u0020", r"id \u0065_\u0067"]), + # numbers + ("123 1.234 0.123e-3 0 1E+40 1e1 .123", ["dnum 123", "dnum 1.234", "dnum 0.123e-3", "dnum 0", "dnum 1E+40", "dnum 1e1", "dnum .123"]), + ("0x1 0xabCD 0XABcd", ["hnum 0x1", "hnum 0xabCD", "hnum 0XABcd"]), + ("010 0377 090", ["onum 010", "onum 0377", "dnum 0", "dnum 90"]), + ("0xa123ghi", ["hnum 0xa123", "id ghi"]), + # keywords + ("function Function FUNCTION", ["keyword function", "id Function", "id FUNCTION"]), + ("const constructor in inherits", ["keyword const", "id constructor", "keyword in", "id inherits"]), + ("true true_enough", ["reserved true", "id true_enough"]), + # strings + (''' 'hello' "hello" ''', ["string 'hello'", 'string "hello"']), + (r""" 'don\'t' "don\"t" '"' "'" '\'' "\"" """, + [r"""string 'don\'t'""", r'''string "don\"t"''', r"""string '"'""", r'''string "'"''', r"""string '\''""", r'''string "\""''']), + (ur'"ƃuıxǝ⅂ ʇdıɹɔsɐʌɐſ\""', [ur'string "ƃuıxǝ⅂ ʇdıɹɔsɐʌɐſ\""']), + # comments + ("a//b", ["id a", "linecomment //b"]), + ("/****/a/=2//hello", ["comment /****/", "id a", "punct /=", "dnum 2", "linecomment //hello"]), + ("/*\n * Header\n */\na=1;", ["comment /*\n * Header\n */", "id a", "punct =", "dnum 1", "punct ;"]), + # punctuation + ("a+++b", ["id a", 
"punct ++", "punct +", "id b"]), + # regex + (r"a=/a*/,1", ["id a", "punct =", "regex /a*/", "punct ,", "dnum 1"]), + (r"a=/a*[^/]+/,1", ["id a", "punct =", "regex /a*[^/]+/", "punct ,", "dnum 1"]), + (r"a=/a*\[^/,1", ["id a", "punct =", r"regex /a*\[^/", "punct ,", "dnum 1"]), + (r"a=/\//,1", ["id a", "punct =", r"regex /\//", "punct ,", "dnum 1"]), + + # next two are from http://www.mozilla.org/js/language/js20-2002-04/rationale/syntax.html#regular-expressions + ("""for (var x = a in foo && "" || mot ? z:/x:3;x<5;y"', "punct ||", "id mot", "punct ?", "id z", + "punct :", "regex /x:3;x<5;y" || mot ? z/x:3;x<5;y"', "punct ||", "id mot", "punct ?", "id z", + "punct /", "id x", "punct :", "dnum 3", "punct ;", "id x", "punct <", "dnum 5", + "punct ;", "id y", "punct <", "regex /g/i", "punct )", "punct {", + "id xyz", "punct (", "id x", "punct ++", "punct )", "punct ;", "punct }"]), + + # Various "illegal" regexes that are valid according to the std. + (r"""/????/, /++++/, /[----]/ """, ["regex /????/", "punct ,", "regex /++++/", "punct ,", "regex /[----]/"]), + + # Stress cases from http://stackoverflow.com/questions/5533925/what-javascript-constructs-does-jslex-incorrectly-lex/5573409#5573409 + (r"""/\[/""", [r"""regex /\[/"""]), + (r"""/[i]/""", [r"""regex /[i]/"""]), + (r"""/[\]]/""", [r"""regex /[\]]/"""]), + (r"""/a[\]]/""", [r"""regex /a[\]]/"""]), + (r"""/a[\]]b/""", [r"""regex /a[\]]b/"""]), + (r"""/[\]/]/gi""", [r"""regex /[\]/]/gi"""]), + (r"""/\[[^\]]+\]/gi""", [r"""regex /\[[^\]]+\]/gi"""]), + (""" + rexl.re = { + NAME: /^(?!\d)(?:\w)+|^"(?:[^"]|"")+"/, + UNQUOTED_LITERAL: /^@(?:(?!\d)(?:\w|\:)+|^"(?:[^"]|"")+")\[[^\]]+\]/, + QUOTED_LITERAL: /^'(?:[^']|'')*'/, + NUMERIC_LITERAL: /^[0-9]+(?:\.[0-9]*(?:[eE][-+][0-9]+)?)?/, + SYMBOL: /^(?:==|=|<>|<=|<|>=|>|!~~|!~|~~|~|!==|!=|!~=|!~|!|&|\||\.|\:|,|\(|\)|\[|\]|\{|\}|\?|\:|;|@|\^|\/\+|\/|\*|\+|-)/ + }; + """, + ["id rexl", "punct .", "id re", "punct =", "punct {", + "id NAME", "punct :", r"""regex 
/^(?!\d)(?:\w)+|^"(?:[^"]|"")+"/""", "punct ,", + "id UNQUOTED_LITERAL", "punct :", r"""regex /^@(?:(?!\d)(?:\w|\:)+|^"(?:[^"]|"")+")\[[^\]]+\]/""", "punct ,", + "id QUOTED_LITERAL", "punct :", r"""regex /^'(?:[^']|'')*'/""", "punct ,", + "id NUMERIC_LITERAL", "punct :", r"""regex /^[0-9]+(?:\.[0-9]*(?:[eE][-+][0-9]+)?)?/""", "punct ,", + "id SYMBOL", "punct :", r"""regex /^(?:==|=|<>|<=|<|>=|>|!~~|!~|~~|~|!==|!=|!~=|!~|!|&|\||\.|\:|,|\(|\)|\[|\]|\{|\}|\?|\:|;|@|\^|\/\+|\/|\*|\+|-)/""", + "punct }", "punct ;" + ]), + + (""" + rexl.re = { + NAME: /^(?!\d)(?:\w)+|^"(?:[^"]|"")+"/, + UNQUOTED_LITERAL: /^@(?:(?!\d)(?:\w|\:)+|^"(?:[^"]|"")+")\[[^\]]+\]/, + QUOTED_LITERAL: /^'(?:[^']|'')*'/, + NUMERIC_LITERAL: /^[0-9]+(?:\.[0-9]*(?:[eE][-+][0-9]+)?)?/, + SYMBOL: /^(?:==|=|<>|<=|<|>=|>|!~~|!~|~~|~|!==|!=|!~=|!~|!|&|\||\.|\:|,|\(|\)|\[|\]|\{|\}|\?|\:|;|@|\^|\/\+|\/|\*|\+|-)/ + }; + str = '"'; + """, + ["id rexl", "punct .", "id re", "punct =", "punct {", + "id NAME", "punct :", r"""regex /^(?!\d)(?:\w)+|^"(?:[^"]|"")+"/""", "punct ,", + "id UNQUOTED_LITERAL", "punct :", r"""regex /^@(?:(?!\d)(?:\w|\:)+|^"(?:[^"]|"")+")\[[^\]]+\]/""", "punct ,", + "id QUOTED_LITERAL", "punct :", r"""regex /^'(?:[^']|'')*'/""", "punct ,", + "id NUMERIC_LITERAL", "punct :", r"""regex /^[0-9]+(?:\.[0-9]*(?:[eE][-+][0-9]+)?)?/""", "punct ,", + "id SYMBOL", "punct :", r"""regex /^(?:==|=|<>|<=|<|>=|>|!~~|!~|~~|~|!==|!=|!~=|!~|!|&|\||\.|\:|,|\(|\)|\[|\]|\{|\}|\?|\:|;|@|\^|\/\+|\/|\*|\+|-)/""", + "punct }", "punct ;", + "id str", "punct =", """string '"'""", "punct ;", + ]), + + (r""" this._js = "e.str(\"" + this.value.replace(/\\/g, "\\\\").replace(/"/g, "\\\"") + "\")"; """, + ["keyword this", "punct .", "id _js", "punct =", r'''string "e.str(\""''', "punct +", "keyword this", "punct .", + "id value", "punct .", "id replace", "punct (", r"regex /\\/g", "punct ,", r'string "\\\\"', "punct )", + "punct .", "id replace", "punct (", r'regex /"/g', "punct ,", r'string "\\\""', "punct )", "punct +", + 
def make_function(input, toks):
    """
    Build a test method checking that lexing `input` produces `toks`.

    Whitespace tokens are left out of the comparison; every other token is
    rendered as "<name> <text>" before being compared with `toks`.
    """
    def test_func(self):
        rendered = []
        for name, tok in JsLexer().lex(input):
            if name == 'ws':
                continue
            rendered.append("%s %s" % (name, tok))
        self.assertListEqual(rendered, toks)
    return test_func


# Attach one generated test method per entry of LEX_CASES.
for i, (input, toks) in enumerate(JsTokensTest.LEX_CASES):
    setattr(JsTokensTest, "test_case_%d" % i, make_function(input, toks))
class JsToCForGettextTest(TestCase):
    """Holder for the test methods generated from GETTEXT_CASES below."""


def make_function(js, c):
    """
    Build a test method checking that `prepare_js_for_gettext` turns the
    Javascript source `js` into the expected output `c`.
    """
    def test_func(self):
        converted = prepare_js_for_gettext(js)
        self.assertMultiLineEqual(converted, c)
    return test_func


# Attach one generated test method per (javascript, expected) pair.
for i, pair in enumerate(GETTEXT_CASES):
    setattr(JsToCForGettextTest, "test_case_%d" % i,
            make_function(pair[0], pair[1]))