Added ranges, escapes to [a-z] notation in lexer

[git-p4: depot-paths = "//depot/code/antlr4/main/": change = 9897]
2012-01-22 11:37:15 -08:00 · 2012-01-22 11:37:15 -08:00 · 15d537ce6e
parent 89ad588023
commit 15d537ce6e
5 changed files with 196 additions and 49 deletions
--- a/CHANGES.txt
+++ b/CHANGES.txt
@ -1,5 +1,19 @@
 ANTLR v4 Honey Badger early access

+Jan 22, 2012
+
+* Added ranges, escapes to [a-z] notation in lexer:
+
+    a-z is the inclusive range
+    escape characters with special meaning: \ followed by one of t r n b f \ ' " (for example, \t)
+    \uXXXX is the Unicode character given by four hex digits
+    \- is the - character
+    \] is the ] character
+
+    Missing final range value gives just first char.
+    Inverted ranges give nothing
+    Bad escape sequence gives nothing
+
 Jan 21, 2012

 * Added modeNames to gen'd lexers
--- a/tool/playground/E.g
+++ b/tool/playground/E.g
@ -1,2 +1,2 @@
 lexer grammar E;
-I : 'z' -> popMode ;
+I : [\-\]] {System.out.println("I");} ;
--- a/tool/src/org/antlr/v4/automata/LexerATNFactory.java
+++ b/tool/src/org/antlr/v4/automata/LexerATNFactory.java
@ -205,16 +205,57 @@ public class LexerATNFactory extends ParserATNFactory {
 		return new Handle(left, right);
 	}

-	/** [Aa] char sets */
+	/** [Aa\t \u1234a-z\]\-] char sets */
 	@Override
 	public Handle charSetLiteral(GrammarAST charSetAST) {
 		ATNState left = newState(charSetAST);
 		ATNState right = newState(charSetAST);
-		IntervalSet set = new IntervalSet();
 		String cset = '"'+charSetAST.getText()+'"';
+
+		IntervalSet set = new IntervalSet();
+//		int n = cset.length();
+//		int i = 0;
+//		while ( i < n ) {
+//			if ( (i+2)<n && cset.charAt(i+1)=='-' ) { // range x-y
+//				int x = cset.charAt(i);
+//				int y = cset.charAt(i+2);
+//				if ( y=='\\' ) { i = i+3; continue; } // x-\
+//				if ( x<=y ) set.add(x,y);
+//			}
+//			else if ( cset.charAt(i)=='\\' ) {
+//				int end = i+2;
+//				if ( (i+1)>=n ) break; // ignore spurious \ on end
+//				if ( cset.charAt(i+1) == 'u' ) end = i+6;
+//				if ( end>n ) break;
+//				int c = CharSupport.getCharValueFromCharInGrammarLiteral(cset.substring(i,end));
+//				set.add(c);
+//				i = end;
+//			}
+//			else {
+//				set.add(cset.charAt(i));
+//				i++;
+//			}
+//		}
+
+		// Unescape all valid escape chars like \n, leaving escaped dashes as '\-'
+		// so that they are not mistaken for '-' range operators.
 		String chars = CharSupport.getStringFromGrammarStringLiteral(cset);
-		for (int i=0; i<chars.length(); i++) {
-			set.add((int)chars.charAt(i));
+		// now expand each x-y range into the set of chars it covers
+		int n = chars.length();
+		for (int i=0; i< n; i++) {
+			int c = chars.charAt(i);
+			if ( c=='\\' && (i+1)<n && chars.charAt(i+1)=='-' ) { // \-
+				set.add('-');
+				i++;
+			}
+			else if ( (i+2)<n && chars.charAt(i+1)=='-' ) { // range x-y
+				int x = c;
+				int y = chars.charAt(i+2);
+				if ( x<=y ) set.add(x,y);
+			}
+			else {
+				set.add(c);
+			}
 		}
 		left.addTransition(new SetTransition(right, set));
 		charSetAST.atnState = left;
--- a/tool/src/org/antlr/v4/misc/CharSupport.java
+++ b/tool/src/org/antlr/v4/misc/CharSupport.java
@ -95,69 +95,58 @@ public class CharSupport {
 	 *  Return -1 if not single char.
 	 */
 	public static int getCharValueFromGrammarCharLiteral(String literal) {
-		switch ( literal.length() ) {
-			case 3 :
+		if ( literal==null || literal.length()<3 ) return -1;
+		return getCharValueFromCharInGrammarLiteral(literal.substring(1,literal.length()-1));
+	}
+
+	/** Given char x or \t or \u1234 return the char value;
+	 *  Unnecessary escapes like '\{' yield -1.
+	 */
+	public static int getCharValueFromCharInGrammarLiteral(String cstr) {
+		switch ( cstr.length() ) {
+			case 1 :
 				// 'x'
-				return literal.charAt(1); // no escape char
-			case 4 :
-				if ( literal.charAt(1)!='\\' ) return -1;
+				return cstr.charAt(0); // no escape char
+			case 2 :
+				if ( cstr.charAt(0)!='\\' ) return -1;
 				// '\x'  (antlr lexer will catch invalid char)
-				if ( Character.isDigit(literal.charAt(2)) ) {
-//					ErrorManager.error(ErrorManager.MSG_SYNTAX_ERROR,
-//									   "invalid char literal: "+literal);
-					return -1;
-				}
-				int escChar = literal.charAt(2);
+				if ( Character.isDigit(cstr.charAt(1)) ) return -1;
+				int escChar = cstr.charAt(1);
 				int charVal = ANTLRLiteralEscapedCharValue[escChar];
-				if ( charVal==0 ) {
-					// Unnecessary escapes like '\{' should just yield {
-					return escChar;
-				}
+				if ( charVal==0 ) return -1;
 				return charVal;
-			case 8 :
+			case 6 :
 				// '\u1234'
-				String unicodeChars = literal.substring(3,literal.length()-1);
+				if ( !cstr.startsWith("\\u") ) return -1;
+				String unicodeChars = cstr.substring(2, cstr.length());
 				return Integer.parseInt(unicodeChars, 16);
 			default :
-//				ErrorManager.error(ErrorManager.MSG_SYNTAX_ERROR,
-//								   "invalid char literal: "+literal);
 				return -1;
 		}
 	}

 	public static String getStringFromGrammarStringLiteral(String literal) {
 		StringBuilder buf = new StringBuilder();
-		int n = literal.length();
 		int i = 1; // skip first quote
-		while ( i < (n-1) ) { // scan all but last quote
-			switch ( literal.charAt(i) ) {
-				case '\\' :
-					i++;
-					if ( literal.charAt(i)=='u' ) { // '\u1234'
-						i++;
-						String unicodeChars = literal.substring(i,i+4);
-						int h = Integer.parseInt(unicodeChars, 16);
-						buf.append((char)h);
-						i += 4;
-					}
-					else {
-						char escChar = literal.charAt(i);
-						int charVal = ANTLRLiteralEscapedCharValue[escChar];
-						if ( charVal==0 ) buf.append(escChar); // Unnecessary escapes like '\{' should just yield {
-						else buf.append((char)charVal);
-						i++;
-					}
-					break;
-				default :
-					buf.append(literal.charAt(i));
-					i++;
-					break;
+		int n = literal.length()-1; // skip last quote
+		while ( i < n ) { // scan all but last quote
+			int end = i+1;
+			if ( literal.charAt(i) == '\\' ) {
+				end = i+2;
+				if ( (i+1)>=n ) break; // ignore spurious \ on end
+				if ( literal.charAt(i+1) == 'u' ) end = i+6;
 			}
+			if ( end>n ) break;
+			String esc = literal.substring(i, end);
+			int c = getCharValueFromCharInGrammarLiteral(esc);
+			if ( c==-1 ) { buf.append(esc); }
+			else buf.append((char)c);
+			i = end;
 		}
 		return buf.toString();
 	}

-	public static final String capitalize(String s) {
+	public static String capitalize(String s) {
 		return Character.toUpperCase(s.charAt(0)) + s.substring(1);
 	}
 }
--- a/tool/test/org/antlr/v4/test/TestLexerExec.java
+++ b/tool/test/org/antlr/v4/test/TestLexerExec.java
@ -198,5 +198,108 @@ public class TestLexerExec extends BaseTest {
 		assertEquals(expecting, found);
 	}

+	@Test public void testCharSet() throws Exception {
+		String grammar =
+			"lexer grammar L;\n"+
+			"I : '0'..'9'+ {System.out.println(\"I\");} ;\n"+
+			"WS : [ \\n\\u000D] -> skip ;";
+		String found = execLexer("L.g", grammar, "L", "34\r\n 34");
+		String expecting =
+			"I\n" +
+			"I\n" +
+			"[@0,0:1='34',<3>,1:0]\n" +
+			"[@1,5:6='34',<3>,2:1]\n" +
+			"[@2,7:6='<EOF>',<-1>,2:3]\n";
+		assertEquals(expecting, found);
+	}
+
+	@Test public void testCharSetPlus() throws Exception {
+		String grammar =
+			"lexer grammar L;\n"+
+			"I : '0'..'9'+ {System.out.println(\"I\");} ;\n"+
+			"WS : [ \\n\\u000D]+ -> skip ;";
+		String found = execLexer("L.g", grammar, "L", "34\r\n 34");
+		String expecting =
+			"I\n" +
+			"I\n" +
+			"[@0,0:1='34',<3>,1:0]\n" +
+			"[@1,5:6='34',<3>,2:1]\n" +
+			"[@2,7:6='<EOF>',<-1>,2:3]\n";
+		assertEquals(expecting, found);
+	}
+
+	@Test public void testCharSetRange() throws Exception {
+		String grammar =
+			"lexer grammar L;\n"+
+			"I : [0-9]+ {System.out.println(\"I\");} ;\n"+
+			"ID : [a-zA-Z] [a-zA-Z0-9]* {System.out.println(\"ID\");} ;\n"+
+			"WS : [ \\n\\u0009\r]+ -> skip ;";
+		String found = execLexer("L.g", grammar, "L", "34\r 34 a2 abc \n   ");
+		String expecting =
+			"I\n" +
+			"I\n" +
+			"ID\n" +
+			"ID\n" +
+			"[@0,0:1='34',<3>,1:0]\n" +
+			"[@1,4:5='34',<3>,1:4]\n" +
+			"[@2,7:8='a2',<4>,1:7]\n" +
+			"[@3,10:12='abc',<4>,1:10]\n" +
+			"[@4,18:17='<EOF>',<-1>,2:3]\n";
+		assertEquals(expecting, found);
+	}
+
+	@Test public void testCharSetWithMissingEndRange() throws Exception {
+		String grammar =
+			"lexer grammar L;\n"+
+			"I : [0-]+ {System.out.println(\"I\");} ;\n"+
+			"WS : [ \\n\\u000D]+ -> skip ;";
+		String found = execLexer("L.g", grammar, "L", "00\r\n");
+		String expecting =
+			"I\n" +
+			"[@0,0:1='00',<3>,1:0]\n" +
+			"[@1,4:3='<EOF>',<-1>,2:0]\n";
+		assertEquals(expecting, found);
+	}
+
+	@Test public void testCharSetWithMissingEscapeChar() throws Exception {
+		String grammar =
+			"lexer grammar L;\n"+
+			"I : [0-9]+ {System.out.println(\"I\");} ;\n"+
+			"WS : [ \\u]+ -> skip ;";
+		String found = execLexer("L.g", grammar, "L", "34 ");
+		String expecting =
+			"I\n" +
+			"[@0,0:1='34',<3>,1:0]\n" +
+			"[@1,3:2='<EOF>',<-1>,1:3]\n";
+		assertEquals(expecting, found);
+	}
+
+	@Test public void testCharSetWithEscapedChar() throws Exception {
+		String grammar =
+			"lexer grammar L;\n"+
+			"DASHBRACK : [\\-\\]]+ {System.out.println(\"DASHBRACK\");} ;\n"+
+			"WS : [ \\u]+ -> skip ;";
+		String found = execLexer("L.g", grammar, "L", "- ] ");
+		String expecting =
+			"DASHBRACK\n" +
+			"DASHBRACK\n" +
+			"[@0,0:0='-',<3>,1:0]\n" +
+			"[@1,2:2=']',<3>,1:2]\n" +
+			"[@2,4:3='<EOF>',<-1>,1:4]\n";
+		assertEquals(expecting, found);
+	}
+
+	@Test public void testCharSetWithReversedRange() throws Exception {
+		String grammar =
+			"lexer grammar L;\n"+
+			"A : [z-a9]+ {System.out.println(\"A\");} ;\n"+
+			"WS : [ \\u]+ -> skip ;";
+		String found = execLexer("L.g", grammar, "L", "9");
+		String expecting =
+			"A\n" +
+			"[@0,0:0='9',<3>,1:0]\n" +
+			"[@1,1:0='<EOF>',<-1>,1:1]\n";
+		assertEquals(expecting, found);
+	}

 }