forked from jasder/antlr
Added ranges, escapes to [a-z] notation in lexer
[git-p4: depot-paths = "//depot/code/antlr4/main/": change = 9897]
This commit is contained in:
parent
89ad588023
commit
15d537ce6e
14
CHANGES.txt
14
CHANGES.txt
|
@ -1,5 +1,19 @@
|
|||
ANTLR v4 Honey Badger early access
|
||||
|
||||
Jan 22, 2012
|
||||
|
||||
* Added ranges, escapes to [a-z] notation in lexer:
|
||||
|
||||
a-z is the inclusive range
|
||||
escape characters with special meaning: t, r, n, b, f, \, ', " (for example, \t)
|
||||
\uXXXX is the Unicode character whose four hex digits are XXXX
|
||||
\- is the - character
|
||||
\] is the ] character
|
||||
|
||||
Missing final range value gives just first char.
|
||||
Inverted ranges give nothing
|
||||
Bad escape sequence gives nothing
|
||||
|
||||
Jan 21, 2012
|
||||
|
||||
* Added modeNames to gen'd lexers
|
||||
|
|
|
@ -1,2 +1,2 @@
|
|||
lexer grammar E;
|
||||
I : 'z' -> popMode ;
|
||||
I : [\-\]] {System.out.println("I");} ;
|
||||
|
|
|
@ -205,16 +205,57 @@ public class LexerATNFactory extends ParserATNFactory {
|
|||
return new Handle(left, right);
|
||||
}
|
||||
|
||||
/** [Aa] char sets */
|
||||
/** [Aa\t \u1234a-z\]\-] char sets */
|
||||
@Override
|
||||
public Handle charSetLiteral(GrammarAST charSetAST) {
|
||||
ATNState left = newState(charSetAST);
|
||||
ATNState right = newState(charSetAST);
|
||||
IntervalSet set = new IntervalSet();
|
||||
String cset = '"'+charSetAST.getText()+'"';
|
||||
|
||||
IntervalSet set = new IntervalSet();
|
||||
// int n = cset.length();
|
||||
// int i = 0;
|
||||
// while ( i < n ) {
|
||||
// if ( (i+2)<n && cset.charAt(i+1)=='-' ) { // range x-y
|
||||
// int x = cset.charAt(i);
|
||||
// int y = cset.charAt(i+2);
|
||||
// if ( y=='\\' ) { i = i+3; continue; } // x-\
|
||||
// if ( x<=y ) set.add(x,y);
|
||||
// }
|
||||
// else if ( cset.charAt(i)=='\\' ) {
|
||||
// int end = i+2;
|
||||
// if ( (i+1)>=n ) break; // ignore spurious \ on end
|
||||
// if ( cset.charAt(i+1) == 'u' ) end = i+6;
|
||||
// if ( end>n ) break;
|
||||
// int c = CharSupport.getCharValueFromCharInGrammarLiteral(cset.substring(i,end));
|
||||
// set.add(c);
|
||||
// i = end;
|
||||
// }
|
||||
// else {
|
||||
// set.add(cset.charAt(i));
|
||||
// i++;
|
||||
// }
|
||||
// }
|
||||
|
||||
// unescape all valid escape char like \n, leaving escaped dashes as '\-'
|
||||
// so we can avoid seeing them as '-' range ops.
|
||||
String chars = CharSupport.getStringFromGrammarStringLiteral(cset);
|
||||
for (int i=0; i<chars.length(); i++) {
|
||||
set.add((int)chars.charAt(i));
|
||||
// now make x-y become set of char
|
||||
int n = chars.length();
|
||||
for (int i=0; i< n; i++) {
|
||||
int c = chars.charAt(i);
|
||||
if ( c=='\\' && (i+1)<n && chars.charAt(i+1)=='-' ) { // \-
|
||||
set.add('-');
|
||||
i++;
|
||||
}
|
||||
else if ( (i+2)<n && chars.charAt(i+1)=='-' ) { // range x-y
|
||||
int x = c;
|
||||
int y = chars.charAt(i+2);
|
||||
if ( x<=y ) set.add(x,y);
|
||||
}
|
||||
else {
|
||||
set.add(c);
|
||||
}
|
||||
}
|
||||
left.addTransition(new SetTransition(right, set));
|
||||
charSetAST.atnState = left;
|
||||
|
|
|
@ -95,69 +95,58 @@ public class CharSupport {
|
|||
* Return -1 if not single char.
|
||||
*/
|
||||
public static int getCharValueFromGrammarCharLiteral(String literal) {
|
||||
switch ( literal.length() ) {
|
||||
case 3 :
|
||||
if ( literal==null || literal.length()<3 ) return -1;
|
||||
return getCharValueFromCharInGrammarLiteral(literal.substring(1,literal.length()-1));
|
||||
}
|
||||
|
||||
/** Given char x or \t or \u1234 return the char value;
|
||||
* Unnecessary escapes like '\{' yield -1.
|
||||
*/
|
||||
public static int getCharValueFromCharInGrammarLiteral(String cstr) {
|
||||
switch ( cstr.length() ) {
|
||||
case 1 :
|
||||
// 'x'
|
||||
return literal.charAt(1); // no escape char
|
||||
case 4 :
|
||||
if ( literal.charAt(1)!='\\' ) return -1;
|
||||
return cstr.charAt(0); // no escape char
|
||||
case 2 :
|
||||
if ( cstr.charAt(0)!='\\' ) return -1;
|
||||
// '\x' (antlr lexer will catch invalid char)
|
||||
if ( Character.isDigit(literal.charAt(2)) ) {
|
||||
// ErrorManager.error(ErrorManager.MSG_SYNTAX_ERROR,
|
||||
// "invalid char literal: "+literal);
|
||||
return -1;
|
||||
}
|
||||
int escChar = literal.charAt(2);
|
||||
if ( Character.isDigit(cstr.charAt(1)) ) return -1;
|
||||
int escChar = cstr.charAt(1);
|
||||
int charVal = ANTLRLiteralEscapedCharValue[escChar];
|
||||
if ( charVal==0 ) {
|
||||
// Unnecessary escapes like '\{' should just yield {
|
||||
return escChar;
|
||||
}
|
||||
if ( charVal==0 ) return -1;
|
||||
return charVal;
|
||||
case 8 :
|
||||
case 6 :
|
||||
// '\u1234'
|
||||
String unicodeChars = literal.substring(3,literal.length()-1);
|
||||
if ( !cstr.startsWith("\\u") ) return -1;
|
||||
String unicodeChars = cstr.substring(2, cstr.length());
|
||||
return Integer.parseInt(unicodeChars, 16);
|
||||
default :
|
||||
// ErrorManager.error(ErrorManager.MSG_SYNTAX_ERROR,
|
||||
// "invalid char literal: "+literal);
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
|
||||
public static String getStringFromGrammarStringLiteral(String literal) {
|
||||
StringBuilder buf = new StringBuilder();
|
||||
int n = literal.length();
|
||||
int i = 1; // skip first quote
|
||||
while ( i < (n-1) ) { // scan all but last quote
|
||||
switch ( literal.charAt(i) ) {
|
||||
case '\\' :
|
||||
i++;
|
||||
if ( literal.charAt(i)=='u' ) { // '\u1234'
|
||||
i++;
|
||||
String unicodeChars = literal.substring(i,i+4);
|
||||
int h = Integer.parseInt(unicodeChars, 16);
|
||||
buf.append((char)h);
|
||||
i += 4;
|
||||
}
|
||||
else {
|
||||
char escChar = literal.charAt(i);
|
||||
int charVal = ANTLRLiteralEscapedCharValue[escChar];
|
||||
if ( charVal==0 ) buf.append(escChar); // Unnecessary escapes like '\{' should just yield {
|
||||
else buf.append((char)charVal);
|
||||
i++;
|
||||
}
|
||||
break;
|
||||
default :
|
||||
buf.append(literal.charAt(i));
|
||||
i++;
|
||||
break;
|
||||
int n = literal.length()-1; // skip last quote
|
||||
while ( i < n ) { // scan all but last quote
|
||||
int end = i+1;
|
||||
if ( literal.charAt(i) == '\\' ) {
|
||||
end = i+2;
|
||||
if ( (i+1)>=n ) break; // ignore spurious \ on end
|
||||
if ( literal.charAt(i+1) == 'u' ) end = i+6;
|
||||
}
|
||||
if ( end>n ) break;
|
||||
String esc = literal.substring(i, end);
|
||||
int c = getCharValueFromCharInGrammarLiteral(esc);
|
||||
if ( c==-1 ) { buf.append(esc); }
|
||||
else buf.append((char)c);
|
||||
i = end;
|
||||
}
|
||||
return buf.toString();
|
||||
}
|
||||
|
||||
public static final String capitalize(String s) {
|
||||
public static String capitalize(String s) {
|
||||
return Character.toUpperCase(s.charAt(0)) + s.substring(1);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -198,5 +198,108 @@ public class TestLexerExec extends BaseTest {
|
|||
assertEquals(expecting, found);
|
||||
}
|
||||
|
||||
/** Lexes "34", CR, LF, space, "34" with a WS rule written as a single-character set (space, newline escape, and the Unicode escape for U+000D) and expects only the two I tokens plus EOF, i.e. every whitespace character is skipped. */
@Test public void testCharSet() throws Exception {
|
||||
String grammar =
|
||||
"lexer grammar L;\n"+
|
||||
"I : '0'..'9'+ {System.out.println(\"I\");} ;\n"+
|
||||
"WS : [ \\n\\u000D] -> skip ;";
|
||||
String found = execLexer("L.g", grammar, "L", "34\r\n 34");
|
||||
String expecting =
|
||||
"I\n" +
|
||||
"I\n" +
|
||||
"[@0,0:1='34',<3>,1:0]\n" +
|
||||
"[@1,5:6='34',<3>,2:1]\n" +
|
||||
"[@2,7:6='<EOF>',<-1>,2:3]\n";
|
||||
assertEquals(expecting, found);
|
||||
}
|
||||
|
||||
/** Same input and expected tokens as testCharSet, but the WS char set carries a + suffix so one WS match may consume a run of whitespace characters. */
@Test public void testCharSetPlus() throws Exception {
|
||||
String grammar =
|
||||
"lexer grammar L;\n"+
|
||||
"I : '0'..'9'+ {System.out.println(\"I\");} ;\n"+
|
||||
"WS : [ \\n\\u000D]+ -> skip ;";
|
||||
String found = execLexer("L.g", grammar, "L", "34\r\n 34");
|
||||
String expecting =
|
||||
"I\n" +
|
||||
"I\n" +
|
||||
"[@0,0:1='34',<3>,1:0]\n" +
|
||||
"[@1,5:6='34',<3>,2:1]\n" +
|
||||
"[@2,7:6='<EOF>',<-1>,2:3]\n";
|
||||
assertEquals(expecting, found);
|
||||
}
|
||||
|
||||
/**
 * Exercises x-y ranges inside char sets: I is [0-9]+, ID is a letter followed by
 * letters or digits. Expects two I tokens and two ID tokens from the input, with
 * all whitespace skipped. Note that the WS set mixes grammar-level escapes (the
 * doubled backslashes in the Java literal) with a raw carriage return that the
 * Java string inserts directly into the grammar text.
 */
@Test public void testCharSetRange() throws Exception {
|
||||
String grammar =
|
||||
"lexer grammar L;\n"+
|
||||
"I : [0-9]+ {System.out.println(\"I\");} ;\n"+
|
||||
"ID : [a-zA-Z] [a-zA-Z0-9]* {System.out.println(\"ID\");} ;\n"+
|
||||
"WS : [ \\n\\u0009\r]+ -> skip ;";
|
||||
String found = execLexer("L.g", grammar, "L", "34\r 34 a2 abc \n ");
|
||||
String expecting =
|
||||
"I\n" +
|
||||
"I\n" +
|
||||
"ID\n" +
|
||||
"ID\n" +
|
||||
"[@0,0:1='34',<3>,1:0]\n" +
|
||||
"[@1,4:5='34',<3>,1:4]\n" +
|
||||
"[@2,7:8='a2',<4>,1:7]\n" +
|
||||
"[@3,10:12='abc',<4>,1:10]\n" +
|
||||
"[@4,18:17='<EOF>',<-1>,2:3]\n";
|
||||
assertEquals(expecting, found);
|
||||
}
|
||||
|
||||
/** Uses a char set whose range has no end value ([0-]) and expects the input "00" to still be matched as a single I token, with the trailing CR/LF skipped by WS. */
@Test public void testCharSetWithMissingEndRange() throws Exception {
|
||||
String grammar =
|
||||
"lexer grammar L;\n"+
|
||||
"I : [0-]+ {System.out.println(\"I\");} ;\n"+
|
||||
"WS : [ \\n\\u000D]+ -> skip ;";
|
||||
String found = execLexer("L.g", grammar, "L", "00\r\n");
|
||||
String expecting =
|
||||
"I\n" +
|
||||
"[@0,0:1='00',<3>,1:0]\n" +
|
||||
"[@1,4:3='<EOF>',<-1>,2:0]\n";
|
||||
assertEquals(expecting, found);
|
||||
}
|
||||
|
||||
/** The WS char set ends with an incomplete Unicode escape (backslash-u with no hex digits); expects the space in that set to still be skipped so the input yields one I token and EOF. */
@Test public void testCharSetWithMissingEscapeChar() throws Exception {
|
||||
String grammar =
|
||||
"lexer grammar L;\n"+
|
||||
"I : [0-9]+ {System.out.println(\"I\");} ;\n"+
|
||||
"WS : [ \\u]+ -> skip ;";
|
||||
String found = execLexer("L.g", grammar, "L", "34 ");
|
||||
String expecting =
|
||||
"I\n" +
|
||||
"[@0,0:1='34',<3>,1:0]\n" +
|
||||
"[@1,3:2='<EOF>',<-1>,1:3]\n";
|
||||
assertEquals(expecting, found);
|
||||
}
|
||||
|
||||
/** Checks that an escaped dash and an escaped right bracket inside a char set are taken as the literal characters: the input "- ] " must produce two DASHBRACK tokens, one for '-' and one for ']'. */
@Test public void testCharSetWithEscapedChar() throws Exception {
|
||||
String grammar =
|
||||
"lexer grammar L;\n"+
|
||||
"DASHBRACK : [\\-\\]]+ {System.out.println(\"DASHBRACK\");} ;\n"+
|
||||
"WS : [ \\u]+ -> skip ;";
|
||||
String found = execLexer("L.g", grammar, "L", "- ] ");
|
||||
String expecting =
|
||||
"DASHBRACK\n" +
|
||||
"DASHBRACK\n" +
|
||||
"[@0,0:0='-',<3>,1:0]\n" +
|
||||
"[@1,2:2=']',<3>,1:2]\n" +
|
||||
"[@2,4:3='<EOF>',<-1>,1:4]\n";
|
||||
assertEquals(expecting, found);
|
||||
}
|
||||
|
||||
/** Uses a char set containing a reversed range (z-a) followed by 9 and expects the input "9" to still be matched as token A. */
@Test public void testCharSetWithReversedRange() throws Exception {
|
||||
String grammar =
|
||||
"lexer grammar L;\n"+
|
||||
"A : [z-a9]+ {System.out.println(\"A\");} ;\n"+
|
||||
"WS : [ \\u]+ -> skip ;";
|
||||
String found = execLexer("L.g", grammar, "L", "9");
|
||||
String expecting =
|
||||
"A\n" +
|
||||
"[@0,0:0='9',<3>,1:0]\n" +
|
||||
"[@1,1:0='<EOF>',<-1>,1:1]\n";
|
||||
assertEquals(expecting, found);
|
||||
}
|
||||
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue