Merge pull request #426 from sharwell/fix-175

Support grammar files encoded with UTF-8 with a byte order mark
2014-01-17 10:22:47 -08:00 · 2014-01-17 10:22:47 -08:00 · 9b85e9c636
parent 10f45abbe4 da0864a2f8
commit 9b85e9c636
1 changed file with 11 additions and 1 deletion
--- a/tool/src/org/antlr/v4/parse/ANTLRLexer.g
+++ b/tool/src/org/antlr/v4/parse/ANTLRLexer.g
@@ -541,7 +541,8 @@ NameStartChar
            |   '\u2C00'..'\u2FEF'
            |   '\u3001'..'\uD7FF'
            |   '\uF900'..'\uFDCF'
-            |   '\uFDF0'..'\uFFFD'
+            |   '\uFDF0'..'\uFEFE'
+            |   '\uFF00'..'\uFFFD'
            ; // ignores | ['\u10000-'\uEFFFF] ;

 // ----------------------------
@@ -757,6 +758,15 @@ WSNLCHARS
    : ' ' | '\t' | '\f' | '\n' | '\r'
    ;

+// Lets ANTLR 4 parse grammar files saved as UTF-8 with a byte order mark:
+// every U+FEFF matched by this rule is discarded via skip(). No other token
+// in this grammar uses that character (the NameStartChar ranges exclude
+// \uFEFF), so skipping all instances of it is safe. This rule will not break
+// usage of \uFEFF inside a LEXER_CHAR_SET or STRING_LITERAL.
+UnicodeBOM
+    :   '\uFEFF' {skip();}
+    ;
+
 // -----------------
 // Illegal Character
 //