Added ranges, escapes to [a-z] notation in lexer

[git-p4: depot-paths = "//depot/code/antlr4/main/": change = 9897]
This commit is contained in:
parrt 2012-01-22 11:37:15 -08:00
parent 89ad588023
commit 15d537ce6e
5 changed files with 196 additions and 49 deletions

View File

@ -1,5 +1,19 @@
ANTLR v4 Honey Badger early access
Jan 22, 2012
* Added ranges, escapes to [a-z] notation in lexer:
a-z is the inclusive range
escape sequences with special meaning: a backslash followed by one of t r n b f \ ' " (for example \t)
\uXXXX is the Unicode character whose code is given by the four hex digits XXXX
\- is the - character
\] is the ] character
Missing final range value gives just first char.
Inverted ranges give nothing
Bad escape sequence gives nothing
Jan 21, 2012
* Added modeNames to gen'd lexers

View File

@ -1,2 +1,2 @@
// Lexer-only grammar named E.
lexer grammar E;
// NOTE(review): rule I is defined twice below; the two lines look like the
// before and after sides of this change rather than one valid grammar.
// Matches the literal 'z' and issues the popMode lexer command.
I : 'z' -> popMode ;
// Matches one character from a set holding an escaped '-' and an escaped ']',
// then runs an action that prints "I".
I : [\-\]] {System.out.println("I");} ;

View File

@ -205,16 +205,57 @@ public class LexerATNFactory extends ParserATNFactory {
return new Handle(left, right);
}
/** Build the ATN fragment for a lexer char set written in bracket notation,
 *  e.g. [Aa\t \u1234a-z\]\-].
 *
 *  Visible steps: two states are created for the AST node, the set text is
 *  unescaped and converted into an IntervalSet, and that set is attached as
 *  a SetTransition from the left state to the right state. The method
 *  continues past the end of this excerpt.
 *
 *  NOTE(review): this excerpt is rendered from a diff without +/- markers,
 *  so a few statements below appear twice or unterminated; they look like
 *  the before and after sides of the change.
 */
@Override
public Handle charSetLiteral(GrammarAST charSetAST) {
// entry and exit states of the fragment
ATNState left = newState(charSetAST);
ATNState right = newState(charSetAST);
// NOTE(review): `set` is declared here and again two lines below.
IntervalSet set = new IntervalSet();
// wrap the set text in double quotes: the string-literal unescaper used
// below skips the first and last character of its argument
String cset = '"'+charSetAST.getText()+'"';
IntervalSet set = new IntervalSet();
// NOTE(review): the commented-out block that follows appears to be an
// earlier draft that handled ranges and escapes in a single pass.
// int n = cset.length();
// int i = 0;
// while ( i < n ) {
// if ( (i+2)<n && cset.charAt(i+1)=='-' ) { // range x-y
// int x = cset.charAt(i);
// int y = cset.charAt(i+2);
// if ( y=='\\' ) { i = i+3; continue; } // x-\
// if ( x<=y ) set.add(x,y);
// }
// else if ( cset.charAt(i)=='\\' ) {
// int end = i+2;
// if ( (i+1)>=n ) break; // ignore spurious \ on end
// if ( cset.charAt(i+1) == 'u' ) end = i+6;
// if ( end>n ) break;
// int c = CharSupport.getCharValueFromCharInGrammarLiteral(cset.substring(i,end));
// set.add(c);
// i = end;
// }
// else {
// set.add(cset.charAt(i));
// i++;
// }
// }
// unescape all valid escape char like \n, leaving escaped dashes as '\-'
// so we can avoid seeing them as '-' range ops.
String chars = CharSupport.getStringFromGrammarStringLiteral(cset);
// NOTE(review): the next two lines open a per-character loop that is never
// closed in this excerpt; they look like the pre-change implementation.
for (int i=0; i<chars.length(); i++) {
set.add((int)chars.charAt(i));
// now make x-y become set of char
int n = chars.length();
for (int i=0; i< n; i++) {
int c = chars.charAt(i);
// an escaped dash is a literal '-'; the extra i++ consumes the dash itself
if ( c=='\\' && (i+1)<n && chars.charAt(i+1)=='-' ) { // \-
set.add('-');
i++;
}
// a dash with a character on each side is an inclusive range; when x>y
// nothing is added by this branch
// NOTE(review): this branch does not advance i past '-' and y, so the next
// iterations visit them again; unless they start another range they reach
// the else branch and are added as single characters. Confirm whether
// `i += 2` was intended here.
else if ( (i+2)<n && chars.charAt(i+1)=='-' ) { // range x-y
int x = c;
int y = chars.charAt(i+2);
if ( x<=y ) set.add(x,y);
}
// anything else is added as a single character (this includes a '-' that
// has no character after it)
else {
set.add(c);
}
}
// one transition labelled with the whole set connects the two states
left.addTransition(new SetTransition(right, set));
// record the fragment's entry state on the AST node
charSetAST.atnState = left;

View File

@ -95,69 +95,58 @@ public class CharSupport {
* Return -1 if not single char.
*/
public static int getCharValueFromGrammarCharLiteral(String literal) {
switch ( literal.length() ) {
case 3 :
if ( literal==null || literal.length()<3 ) return -1;
return getCharValueFromCharInGrammarLiteral(literal.substring(1,literal.length()-1));
}
/** Given char x or \t or \u1234 (written without surrounding quotes)
 *  return the char value.
 *
 *  The form is selected by the length of {@code cstr}: one char is a plain
 *  character, two chars are a backslash escape looked up in
 *  ANTLRLiteralEscapedCharValue, six chars are a backslash-u escape with
 *  four hex digits.
 *
 *  @param cstr text of exactly one character as written in the grammar
 *  @return the character value, or -1 for null, for any other length, for
 *          an escape followed by a digit, for unnecessary escapes like
 *          '\{', and for a unicode escape whose digits are not all hex
 */
public static int getCharValueFromCharInGrammarLiteral(String cstr) {
    if ( cstr==null ) return -1;
    switch ( cstr.length() ) {
        case 1 :
            // 'x'
            return cstr.charAt(0); // no escape char
        case 2 :
            if ( cstr.charAt(0)!='\\' ) return -1;
            // '\x'  (antlr lexer will catch invalid char)
            if ( Character.isDigit(cstr.charAt(1)) ) return -1;
            int escChar = cstr.charAt(1);
            // The table is declared outside this method; a character beyond
            // its end cannot be a known escape, so report it as invalid
            // instead of indexing out of bounds.
            if ( escChar>=ANTLRLiteralEscapedCharValue.length ) return -1;
            int charVal = ANTLRLiteralEscapedCharValue[escChar];
            if ( charVal==0 ) return -1; // no special meaning registered
            return charVal;
        case 6 :
            // '\u1234'
            if ( !cstr.startsWith("\\u") ) return -1;
            // Accumulate the four digits by hand: Integer.parseInt would
            // throw on a non-hex digit and would accept a leading sign.
            int value = 0;
            for (int i=2; i<6; i++) {
                int digit = Character.digit(cstr.charAt(i), 16);
                if ( digit<0 ) return -1;
                value = (value<<4) | digit;
            }
            return value;
        default :
            return -1;
    }
}
public static String getStringFromGrammarStringLiteral(String literal) {
StringBuilder buf = new StringBuilder();
int n = literal.length();
int i = 1; // skip first quote
while ( i < (n-1) ) { // scan all but last quote
switch ( literal.charAt(i) ) {
case '\\' :
i++;
if ( literal.charAt(i)=='u' ) { // '\u1234'
i++;
String unicodeChars = literal.substring(i,i+4);
int h = Integer.parseInt(unicodeChars, 16);
buf.append((char)h);
i += 4;
}
else {
char escChar = literal.charAt(i);
int charVal = ANTLRLiteralEscapedCharValue[escChar];
if ( charVal==0 ) buf.append(escChar); // Unnecessary escapes like '\{' should just yield {
else buf.append((char)charVal);
i++;
}
break;
default :
buf.append(literal.charAt(i));
i++;
break;
int n = literal.length()-1; // skip last quote
while ( i < n ) { // scan all but last quote
int end = i+1;
if ( literal.charAt(i) == '\\' ) {
end = i+2;
if ( (i+1)>=n ) break; // ignore spurious \ on end
if ( literal.charAt(i+1) == 'u' ) end = i+6;
}
if ( end>n ) break;
String esc = literal.substring(i, end);
int c = getCharValueFromCharInGrammarLiteral(esc);
if ( c==-1 ) { buf.append(esc); }
else buf.append((char)c);
i = end;
}
return buf.toString();
}
public static final String capitalize(String s) {
public static String capitalize(String s) {
return Character.toUpperCase(s.charAt(0)) + s.substring(1);
}
}

View File

@ -198,5 +198,108 @@ public class TestLexerExec extends BaseTest {
assertEquals(expecting, found);
}
/** Bracket set holding a space, an escaped newline and a unicode escape for
 *  U+000D. WS has no trailing +, so each skipped token is one character.
 */
@Test public void testCharSet() throws Exception {
    String expected =
        "I\n" +
        "I\n" +
        "[@0,0:1='34',<3>,1:0]\n" +
        "[@1,5:6='34',<3>,2:1]\n" +
        "[@2,7:6='<EOF>',<-1>,2:3]\n";

    StringBuilder lexerGrammar = new StringBuilder();
    lexerGrammar.append("lexer grammar L;\n");
    lexerGrammar.append("I : '0'..'9'+ {System.out.println(\"I\");} ;\n");
    lexerGrammar.append("WS : [ \\n\\u000D] -> skip ;");

    // input: two integers separated by CR, LF and a space
    String actual = execLexer("L.g", lexerGrammar.toString(), "L", "34\r\n 34");
    assertEquals(expected, actual);
}
/** Same set as the single-character WS test, but with a trailing + so one
 *  skipped token can span the whole run of whitespace.
 */
@Test public void testCharSetPlus() throws Exception {
    String expected =
        "I\n" +
        "I\n" +
        "[@0,0:1='34',<3>,1:0]\n" +
        "[@1,5:6='34',<3>,2:1]\n" +
        "[@2,7:6='<EOF>',<-1>,2:3]\n";

    String header = "lexer grammar L;\n";
    String intRule = "I : '0'..'9'+ {System.out.println(\"I\");} ;\n";
    String wsRule = "WS : [ \\n\\u000D]+ -> skip ;";

    String actual = execLexer("L.g", header + intRule + wsRule, "L", "34\r\n 34");
    assertEquals(expected, actual);
}
/** Ranges inside bracket sets: digits for I, letters then letters-or-digits
 *  for ID. The WS set mixes grammar-level escapes with a raw carriage
 *  return (the last escape in that string is resolved by Java, not ANTLR).
 */
@Test public void testCharSetRange() throws Exception {
    String expected =
        "I\n" +
        "I\n" +
        "ID\n" +
        "ID\n" +
        "[@0,0:1='34',<3>,1:0]\n" +
        "[@1,4:5='34',<3>,1:4]\n" +
        "[@2,7:8='a2',<4>,1:7]\n" +
        "[@3,10:12='abc',<4>,1:10]\n" +
        "[@4,18:17='<EOF>',<-1>,2:3]\n";

    StringBuilder lexerGrammar = new StringBuilder("lexer grammar L;\n");
    lexerGrammar.append("I : [0-9]+ {System.out.println(\"I\");} ;\n");
    lexerGrammar.append("ID : [a-zA-Z] [a-zA-Z0-9]* {System.out.println(\"ID\");} ;\n");
    lexerGrammar.append("WS : [ \\n\\u0009\r]+ -> skip ;");

    String input = "34\r 34 a2 abc \n   ";
    String actual = execLexer("L.g", lexerGrammar.toString(), "L", input);
    assertEquals(expected, actual);
}
/** A range with no upper bound ("0-" directly before the closing bracket)
 *  must still let rule I match the character 0.
 */
@Test public void testCharSetWithMissingEndRange() throws Exception {
    String expected =
        "I\n" +
        "[@0,0:1='00',<3>,1:0]\n" +
        "[@1,4:3='<EOF>',<-1>,2:0]\n";

    String[] rules = {
        "lexer grammar L;\n",
        "I : [0-]+ {System.out.println(\"I\");} ;\n",
        "WS : [ \\n\\u000D]+ -> skip ;"
    };
    String lexerGrammar = rules[0] + rules[1] + rules[2];

    String actual = execLexer("L.g", lexerGrammar, "L", "00\r\n");
    assertEquals(expected, actual);
}
/** The WS set ends in a backslash-u escape that has no hex digits; the
 *  space in front of it must still be recognised and skipped.
 */
@Test public void testCharSetWithMissingEscapeChar() throws Exception {
    String expected =
        "I\n" +
        "[@0,0:1='34',<3>,1:0]\n" +
        "[@1,3:2='<EOF>',<-1>,1:3]\n";

    String lexerGrammar = "lexer grammar L;\n";
    lexerGrammar += "I : [0-9]+ {System.out.println(\"I\");} ;\n";
    lexerGrammar += "WS : [ \\u]+ -> skip ;";

    String actual = execLexer("L.g", lexerGrammar, "L", "34 ");
    assertEquals(expected, actual);
}
/** An escaped dash and an escaped closing bracket inside a set are matched
 *  as the literal characters - and ].
 */
@Test public void testCharSetWithEscapedChar() throws Exception {
    String expected =
        "DASHBRACK\n" +
        "DASHBRACK\n" +
        "[@0,0:0='-',<3>,1:0]\n" +
        "[@1,2:2=']',<3>,1:2]\n" +
        "[@2,4:3='<EOF>',<-1>,1:4]\n";

    StringBuilder lexerGrammar = new StringBuilder();
    lexerGrammar.append("lexer grammar L;\n");
    lexerGrammar.append("DASHBRACK : [\\-\\]]+ {System.out.println(\"DASHBRACK\");} ;\n");
    lexerGrammar.append("WS : [ \\u]+ -> skip ;");

    // input: a dash and a bracket, each followed by a space
    String actual = execLexer("L.g", lexerGrammar.toString(), "L", "- ] ");
    assertEquals(expected, actual);
}
/** The range in rule A is written high-to-low (z before a); the test only
 *  checks that the 9 listed after it is still matched.
 */
@Test public void testCharSetWithReversedRange() throws Exception {
    String expected =
        "A\n" +
        "[@0,0:0='9',<3>,1:0]\n" +
        "[@1,1:0='<EOF>',<-1>,1:1]\n";

    String header = "lexer grammar L;\n";
    String ruleA = "A : [z-a9]+ {System.out.println(\"A\");} ;\n";
    String ruleWs = "WS : [ \\u]+ -> skip ;";

    String actual = execLexer("L.g", header + ruleA + ruleWs, "L", "9");
    assertEquals(expected, actual);
}
}