From da0864a2f81116a3e5e32ef8b833bdb8c031b4b1 Mon Sep 17 00:00:00 2001
From: Sam Harwell
Date: Fri, 17 Jan 2014 08:30:04 -0600
Subject: [PATCH] Support grammar files encoded with UTF-8 with a byte order mark (fixes #175)

---
 tool/src/org/antlr/v4/parse/ANTLRLexer.g | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/tool/src/org/antlr/v4/parse/ANTLRLexer.g b/tool/src/org/antlr/v4/parse/ANTLRLexer.g
index dd190b5eb..975c8d21b 100644
--- a/tool/src/org/antlr/v4/parse/ANTLRLexer.g
+++ b/tool/src/org/antlr/v4/parse/ANTLRLexer.g
@@ -541,7 +541,8 @@ NameStartChar
     | '\u2C00'..'\u2FEF'
     | '\u3001'..'\uD7FF'
     | '\uF900'..'\uFDCF'
-    | '\uFDF0'..'\uFFFD'
+    | '\uFDF0'..'\uFEFE'
+    | '\uFF00'..'\uFFFD'
     ; // ignores | ['\u10000-'\uEFFFF] ;
 
 // ----------------------------
@@ -757,6 +758,15 @@ WSNLCHARS
     : ' ' | '\t' | '\f' | '\n' | '\r'
     ;
 
+// This rule allows ANTLR 4 to parse grammars using the UTF-8 encoding with a
+// byte order mark (U+FEFF). NameStartChar leaves a gap at \uFEFF and the
+// character doesn't appear as a token anywhere else in the grammar, so we can
+// simply skip all instances of it. This rule will not break usage of \uFEFF
+// inside a LEXER_CHAR_SET or STRING_LITERAL.
+UnicodeBOM
+    : '\uFEFF' {skip();}
+    ;
+
 // -----------------
 // Illegal Character
 //