Merge pull request #426 from sharwell/fix-175

Support grammar files encoded with UTF-8 with a byte order mark
This commit is contained in:
Terence Parr 2014-01-17 10:22:47 -08:00
commit 9b85e9c636
1 changed file with 11 additions and 1 deletion

View File

@ -541,7 +541,8 @@ NameStartChar
| '\u2C00'..'\u2FEF'
| '\u3001'..'\uD7FF'
| '\uF900'..'\uFDCF'
| '\uFDF0'..'\uFFFD'
| '\uFDF0'..'\uFEFE'
| '\uFF00'..'\uFFFD'
; // ignores | ['\u10000'-'\uEFFFF'] ;
// ----------------------------
@ -757,6 +758,15 @@ WSNLCHARS
: ' ' | '\t' | '\f' | '\n' | '\r'
;
// Byte order mark handling.
//
// Lets ANTLR 4 read grammar files that are encoded as UTF-8 with a leading
// byte order mark. No other token in this grammar is made of the character
// \uFEFF, so each instance matched by this rule is simply discarded with
// skip(). A \uFEFF that appears inside a LEXER_CHAR_SET or a STRING_LITERAL
// is not affected by this rule.
UnicodeBOM
    :   '\uFEFF'
        { skip(); }
    ;
// -----------------
// Illegal Character
//