Merge pull request #1521 from parrt/KvanTTT-character_issues

Kvan ttt character issues
2016-12-15 11:56:34 -08:00 · 2016-12-15 11:56:34 -08:00 · 9f948c5453
parent 05e39fab8e 160825a86f
commit 9f948c5453
9 changed files with 246 additions and 82 deletions
--- a/runtime-testsuite/test/org/antlr/v4/test/runtime/descriptors/LexerExecDescriptors.java
+++ b/runtime-testsuite/test/org/antlr/v4/test/runtime/descriptors/LexerExecDescriptors.java
@ -302,30 +302,6 @@ public class LexerExecDescriptors {

 	}

-	public static class CharSetWithReversedRange extends BaseLexerTestDescriptor {
-		public String input = "9";
-		/**
-		A
-		[@0,0:0='9',<1>,1:0]
-		[@1,1:0='<EOF>',<-1>,1:1]
-		 */
-		@CommentHasStringValue
-		public String output;
-
-		public String errors = null;
-		public String startRule = "";
-		public String grammarName = "L";
-
-		/**
-		 lexer grammar L;
-		 A : [z-a9]+ {<writeln("\"A\"")>} ;
-		 WS : [ \n]+ -> skip ;
-		 */
-		@CommentHasStringValue
-		public String grammar;
-
-	}
-
 	public static class EOFByItself extends BaseLexerTestDescriptor {
 		public String input = "";
 		/**
--- a/tool-testsuite/test/org/antlr/v4/test/tool/TestSymbolIssues.java
+++ b/tool-testsuite/test/org/antlr/v4/test/tool/TestSymbolIssues.java
@ -320,4 +320,21 @@ public class TestSymbolIssues extends BaseJavaToolTest {

 		testErrors(test, false);
 	}
+
+	/**
+	 * Checks that characters listed more than once in a lexer set produce a
+	 * {@code CHARACTERS_COLLISION_IN_SET} warning, both for bracketed char sets
+	 * and for alternatives built from literals and ranges.
+	 */
+	@Test public void testCharsCollision() throws Exception {
+		String grammar =
+				"lexer grammar L;\n" +
+				"TOKEN_RANGE:      [aa-f];\n" +
+				"TOKEN_RANGE_2:    [A-FD-J];\n" +
+				"TOKEN_RANGE_3:    'Z' | 'K'..'R' | 'O'..'V';\n" +
+				"TOKEN_RANGE_4:    'g'..'l' | [g-l];\n";             // Handling in ATNOptimizer.
+
+		// The warning for TOKEN_RANGE_4 is expected without a line:column
+		// position (hence "L.g4:::").
+		String expected =
+				"warning(" + ErrorType.CHARACTERS_COLLISION_IN_SET.code + "): L.g4:2:18: chars \"a-f\" used multiple times in set [aa-f]\n" +
+				"warning(" + ErrorType.CHARACTERS_COLLISION_IN_SET.code + "): L.g4:3:18: chars \"D-J\" used multiple times in set [A-FD-J]\n" +
+				"warning(" + ErrorType.CHARACTERS_COLLISION_IN_SET.code + "): L.g4:4:13: chars \"O-V\" used multiple times in set 'Z' | 'K'..'R' | 'O'..'V'\n" +
+				"warning(" + ErrorType.CHARACTERS_COLLISION_IN_SET.code + "): L.g4::: chars \"g-l\" used multiple times in set [g-l]\n";
+
+		String[] pair = new String[] {
+				grammar,
+				expected
+		};
+
+		testErrors(pair, false);
+	}
 }
--- a/tool-testsuite/test/org/antlr/v4/test/tool/TestToolSyntaxErrors.java
+++ b/tool-testsuite/test/org/antlr/v4/test/tool/TestToolSyntaxErrors.java
@ -487,11 +487,39 @@ public class TestToolSyntaxErrors extends BaseJavaToolTest {
 			"Error3: '';\n" +
 			"NotError: ' ';";
 		String expected =
-			"error(" + ErrorType.EMPTY_STRINGS_NOT_ALLOWED.code + "): T.g4:2:8: string literals cannot be empty\n" +
-			"error(" + ErrorType.EMPTY_STRINGS_NOT_ALLOWED.code + "): T.g4:2:16: string literals cannot be empty\n" +
-			"error(" + ErrorType.EMPTY_STRINGS_NOT_ALLOWED.code + "): T.g4:3:8: string literals cannot be empty\n" +
-			"error(" + ErrorType.EMPTY_STRINGS_NOT_ALLOWED.code + "): T.g4:4:15: string literals cannot be empty\n" +
-			"error(" + ErrorType.EMPTY_STRINGS_NOT_ALLOWED.code + "): T.g4:5:8: string literals cannot be empty\n";
+			"error(" + ErrorType.EMPTY_STRINGS_AND_SETS_NOT_ALLOWED.code + "): T.g4:2:8: string literals and sets cannot be empty: ''\n" +
+			"error(" + ErrorType.EMPTY_STRINGS_AND_SETS_NOT_ALLOWED.code + "): T.g4:2:16: string literals and sets cannot be empty: ''\n" +
+			"error(" + ErrorType.EMPTY_STRINGS_AND_SETS_NOT_ALLOWED.code + "): T.g4:3:8: string literals and sets cannot be empty: ''\n" +
+			"error(" + ErrorType.EMPTY_STRINGS_AND_SETS_NOT_ALLOWED.code + "): T.g4:4:15: string literals and sets cannot be empty: ''\n" +
+			"error(" + ErrorType.EMPTY_STRINGS_AND_SETS_NOT_ALLOWED.code + "): T.g4:5:8: string literals and sets cannot be empty: ''\n";
+
+		String[] pair = new String[] {
+				grammar,
+				expected
+		};
+
+		super.testErrors(pair, true);
+	}
+
+	@Test public void testInvalidCharSetAndRange() {
+		String grammar =
+				"lexer grammar Test;\n" +
+				"INVALID_RANGE:         'GH'..'LM';\n" +
+				"INVALID_RANGE_2:       'F'..'A' | 'Z';\n" +
+				"VALID_STRING_LITERALS: '\\u1234' | '\\t' | [\\-\\]];\n" +
+				"INVALID_CHAR_SET:      [f-az][];\n" +
+				"INVALID_CHAR_SET_2:    [\\u24\\uA2][\\u24];\n" +  //https://github.com/antlr/antlr4/issues/1077
+				"INVALID_CHAR_SET_3:    [\\t\\{];";
+
+		String expected =
+				"error(" + ErrorType.INVALID_LITERAL_IN_LEXER_SET.code + "): Test.g4:2:23: multi-character literals are not allowed in lexer sets: 'GH'\n" +
+				"error(" + ErrorType.INVALID_LITERAL_IN_LEXER_SET.code + "): Test.g4:2:29: multi-character literals are not allowed in lexer sets: 'LM'\n" +
+				"error(" + ErrorType.EMPTY_STRINGS_AND_SETS_NOT_ALLOWED.code + "): Test.g4:3:26: string literals and sets cannot be empty: 'F'..'A'\n" +
+				"error(" + ErrorType.EMPTY_STRINGS_AND_SETS_NOT_ALLOWED.code + "): Test.g4:5:23: string literals and sets cannot be empty: [f-a]\n" +
+				"error(" + ErrorType.EMPTY_STRINGS_AND_SETS_NOT_ALLOWED.code + "): Test.g4:5:29: string literals and sets cannot be empty: []\n" +
+				"error(" + ErrorType.INVALID_ESCAPE_SEQUENCE.code + "): Test.g4:6:23: invalid escape sequence\n" +
+				"error(" + ErrorType.INVALID_ESCAPE_SEQUENCE.code + "): Test.g4:6:33: invalid escape sequence\n" +
+				"error(" + ErrorType.INVALID_ESCAPE_SEQUENCE.code + "): Test.g4:7:23: invalid escape sequence\n";

 		String[] pair = new String[] {
 				grammar,
--- a/tool/src/org/antlr/v4/automata/ATNOptimizer.java
+++ b/tool/src/org/antlr/v4/automata/ATNOptimizer.java
@ -18,6 +18,7 @@ import org.antlr.v4.runtime.atn.SetTransition;
 import org.antlr.v4.runtime.atn.Transition;
 import org.antlr.v4.runtime.misc.Interval;
 import org.antlr.v4.runtime.misc.IntervalSet;
+import org.antlr.v4.tool.ErrorType;
 import org.antlr.v4.tool.Grammar;
 import org.antlr.v4.tool.Rule;

@ -94,20 +95,34 @@ public class ATNOptimizer {
 					Transition matchTransition = decision.transition(j).target.transition(0);
 					if (matchTransition instanceof NotSetTransition) {
 						throw new UnsupportedOperationException("Not yet implemented.");
-					} else {
-						matchSet.addAll(matchTransition.label());
 					}
+					IntervalSet set = matchTransition.label();
+					int minElem = set.getMinElement();
+					int maxElem = set.getMaxElement();
+					for (int k = minElem; k <= maxElem; k++) {
+						if (matchSet.contains(k)) {
+							char setMin = (char) set.getMinElement();
+							char setMax = (char) set.getMaxElement();
+							// TODO: Token is missing (i.e. position in source will not be displayed).
+							g.tool.errMgr.grammarError(ErrorType.CHARACTERS_COLLISION_IN_SET, g.fileName,
+							                           null, (char) minElem + "-" + (char) maxElem, "[" + setMin + "-" + setMax + "]");
+							break;
+						}
+					}
+					matchSet.addAll(set);
 				}

 				Transition newTransition;
 				if (matchSet.getIntervals().size() == 1) {
 					if (matchSet.size() == 1) {
 						newTransition = new AtomTransition(blockEndState, matchSet.getMinElement());
-					} else {
+					}
+					else {
 						Interval matchInterval = matchSet.getIntervals().get(0);
 						newTransition = new RangeTransition(blockEndState, matchInterval.a, matchInterval.b);
 					}
-				} else {
+				}
+				else {
 					newTransition = new SetTransition(blockEndState, matchSet);
 				}

--- a/tool/src/org/antlr/v4/automata/LexerATNFactory.java
+++ b/tool/src/org/antlr/v4/automata/LexerATNFactory.java
@ -39,6 +39,7 @@ import org.antlr.v4.tool.LexerGrammar;
 import org.antlr.v4.tool.Rule;
 import org.antlr.v4.tool.ast.ActionAST;
 import org.antlr.v4.tool.ast.GrammarAST;
+import org.antlr.v4.tool.ast.RangeAST;
 import org.antlr.v4.tool.ast.TerminalAST;
 import org.stringtemplate.v4.ST;
 import org.stringtemplate.v4.STGroup;
@ -253,6 +254,7 @@ public class LexerATNFactory extends ParserATNFactory {
 		ATNState right = newState(b);
 		int t1 = CharSupport.getCharValueFromGrammarCharLiteral(a.getText());
 		int t2 = CharSupport.getCharValueFromGrammarCharLiteral(b.getText());
+		checkRange(a, b, t1, t2);
 		left.addTransition(new  RangeTransition(right, t1, t2));
 		a.atnState = left;
 		b.atnState = left;
@ -268,20 +270,23 @@ public class LexerATNFactory extends ParserATNFactory {
 			if ( t.getType()==ANTLRParser.RANGE ) {
 				int a = CharSupport.getCharValueFromGrammarCharLiteral(t.getChild(0).getText());
 				int b = CharSupport.getCharValueFromGrammarCharLiteral(t.getChild(1).getText());
+				if (checkRange((GrammarAST) t.getChild(0), (GrammarAST) t.getChild(1), a, b)) {
+					checkSetCollision(associatedAST, set, a, b);
 					set.add(a,b);
 				}
+			}
 			else if ( t.getType()==ANTLRParser.LEXER_CHAR_SET ) {
 				set.addAll(getSetFromCharSetLiteral(t));
 			}
 			else if ( t.getType()==ANTLRParser.STRING_LITERAL ) {
 				int c = CharSupport.getCharValueFromGrammarCharLiteral(t.getText());
 				if ( c != -1 ) {
+					checkSetCollision(associatedAST, set, c);
 					set.add(c);
 				}
 				else {
 					g.tool.errMgr.grammarError(ErrorType.INVALID_LITERAL_IN_LEXER_SET,
 											   g.fileName, t.getToken(), t.getText());
-
 				}
 			}
 			else if ( t.getType()==ANTLRParser.TOKEN_REF ) {
@ -307,6 +312,27 @@ public class LexerATNFactory extends ParserATNFactory {
 		return new Handle(left, right);
 	}

+	/**
+	 * Validates the two endpoints of a lexer range {@code a..b} and reports every
+	 * problem found through the grammar's error manager.
+	 *
+	 * <p>An endpoint whose value is {@code -1} is reported as
+	 * {@link ErrorType#INVALID_LITERAL_IN_LEXER_SET} against that endpoint's token;
+	 * both endpoints are checked so that both errors are reported. When both
+	 * endpoints are valid but {@code rightValue < leftValue}, the range matches
+	 * nothing and is reported as
+	 * {@link ErrorType#EMPTY_STRINGS_AND_SETS_NOT_ALLOWED} against the token of
+	 * {@code leftNode}'s parent.</p>
+	 *
+	 * @param leftNode   AST node of the lower endpoint literal
+	 * @param rightNode  AST node of the upper endpoint literal
+	 * @param leftValue  char value of the lower endpoint, or {@code -1} if invalid
+	 * @param rightValue char value of the upper endpoint, or {@code -1} if invalid
+	 * @return {@code true} only when no error was reported, i.e. the range is
+	 *         usable by the caller
+	 */
+	protected boolean checkRange(GrammarAST leftNode, GrammarAST rightNode, int leftValue, int rightValue) {
+		boolean result = true;
+		if (leftValue == -1) {
+			result = false;
+			g.tool.errMgr.grammarError(ErrorType.INVALID_LITERAL_IN_LEXER_SET,
+					g.fileName, leftNode.getToken(), leftNode.getText());
+		}
+		if (rightValue == -1) {
+			result = false;
+			g.tool.errMgr.grammarError(ErrorType.INVALID_LITERAL_IN_LEXER_SET,
+					g.fileName, rightNode.getToken(), rightNode.getText());
+		}
+		if (!result) return false;
+
+		if (rightValue < leftValue) {
+			g.tool.errMgr.grammarError(ErrorType.EMPTY_STRINGS_AND_SETS_NOT_ALLOWED,
+					g.fileName, leftNode.parent.getToken(), leftNode.getText() + ".." + rightNode.getText());
+			// A reversed range has just been rejected; do not tell callers it is usable.
+			return false;
+		}
+		return true;
+	}
+
 	/** For a lexer, a string is a sequence of char to match.  That is,
 	 *  "fog" is treated as 'f' 'o' 'g' not as a single transition in
 	 *  the DFA.  Machine== o-'f'-&gt;o-'o'-&gt;o-'g'-&gt;o and has n+1 states
@ -315,11 +341,18 @@ public class LexerATNFactory extends ParserATNFactory {
 	@Override
 	public Handle stringLiteral(TerminalAST stringLiteralAST) {
 		String chars = stringLiteralAST.getText();
-		chars = CharSupport.getStringFromGrammarStringLiteral(chars);
-		int n = chars.length();
 		ATNState left = newState(stringLiteralAST);
+		ATNState right;
+		chars = CharSupport.getStringFromGrammarStringLiteral(chars);
+		if (chars == null) {
+			g.tool.errMgr.grammarError(ErrorType.INVALID_ESCAPE_SEQUENCE,
+					g.fileName, stringLiteralAST.getToken());
+			return new Handle(left, left);
+		}
+
+		int n = chars.length();
 		ATNState prev = left;
-		ATNState right = null;
+		right = null;
 		for (int i = 0; i < n; i++) {
 			right = newState(stringLiteralAST);
 			prev.addTransition(new AtomTransition(right, chars.charAt(i)));
@ -346,30 +379,86 @@ public class LexerATNFactory extends ParserATNFactory {
 		String cset = '"' + chars + '"';
 		IntervalSet set = new IntervalSet();

+		if (chars.length() == 0) {
+			g.tool.errMgr.grammarError(ErrorType.EMPTY_STRINGS_AND_SETS_NOT_ALLOWED,
+					g.fileName, charSetAST.getToken(), "[]");
+			return set;
+		}
 		// unescape all valid escape char like \n, leaving escaped dashes as '\-'
 		// so we can avoid seeing them as '-' range ops.
 		chars = CharSupport.getStringFromGrammarStringLiteral(cset);
-		// now make x-y become set of char
+		if (chars == null) {
+			g.tool.errMgr.grammarError(ErrorType.INVALID_ESCAPE_SEQUENCE,
+			                           g.fileName, charSetAST.getToken());
+			return set;
+		}
 		int n = chars.length();
+		// now make x-y become set of char
 		for (int i = 0; i < n; i++) {
 			int c = chars.charAt(i);
-			if ( c=='\\' && (i+1)<n && chars.charAt(i+1)=='-' ) { // \-
+			if (c == '\\' && i+1 < n && chars.charAt(i+1) == '-') { // \-
+				checkSetCollision(charSetAST, set, '-');
 				set.add('-');
 				i++;
 			}
-			else if ( (i+2)<n && chars.charAt(i+1)=='-' ) { // range x-y
+			else if (i+2 < n && chars.charAt(i+1) == '-') { // range x-y
 				int x = c;
 				int y = chars.charAt(i+2);
-				if ( x<=y ) set.add(x,y);
+				if (x <= y) {
+					checkSetCollision(charSetAST, set, x, y);
+					set.add(x,y);
+				}
+				else {
+					g.tool.errMgr.grammarError(ErrorType.EMPTY_STRINGS_AND_SETS_NOT_ALLOWED,
+					                           g.fileName, charSetAST.getToken(), "[" + (char) x + "-" + (char) y + "]");
+				}
 				i += 2;
 			}
 			else {
+				checkSetCollision(charSetAST, set, c);
 				set.add(c);
 			}
 		}
 		return set;
 	}

+	/**
+	 * Warns when the single char {@code el} is already present in {@code set}.
+	 * Handled as the one-element range {@code el..el} so that both overloads
+	 * describe the offending set the same way.
+	 *
+	 * @param ast the set's AST node; supplies the token and the set text for the warning
+	 * @param set the chars collected so far for this set
+	 * @param el  the char about to be added
+	 */
+	protected void checkSetCollision(GrammarAST ast, IntervalSet set, int el) {
+		checkSetCollision(ast, set, el, el);
+	}
+
+	/**
+	 * Warns with {@link ErrorType#CHARACTERS_COLLISION_IN_SET} when any char of
+	 * the range {@code a..b} is already present in {@code set}. At most one
+	 * warning is issued per call; it names the whole range (or the single char
+	 * when {@code a == b}), not just the overlapping part.
+	 *
+	 * @param ast the set's AST node; supplies the token and the set text for the warning
+	 * @param set the chars collected so far for this set
+	 * @param a   first char of the range about to be added
+	 * @param b   last char of the range about to be added (nothing is checked if {@code b < a})
+	 */
+	protected void checkSetCollision(GrammarAST ast, IntervalSet set, int a, int b) {
+		for (int i = a; i <= b; i++) {
+			if (set.contains(i)) {
+				String charsText = a == b ? String.valueOf((char)a) : (char)a + "-" + (char)b;
+				g.tool.errMgr.grammarError(ErrorType.CHARACTERS_COLLISION_IN_SET, g.fileName, ast.getToken(),
+						charsText, collisionSetText(ast));
+				break;
+			}
+		}
+	}
+
+	/**
+	 * Builds the set text shown in a collision warning: the node's own text when
+	 * it has no children, otherwise its children joined with {@code " | "}, with
+	 * range children rendered as {@code left..right}.
+	 */
+	private String collisionSetText(GrammarAST ast) {
+		// Also guards a non-null but empty child list, which has nothing to join.
+		if (ast.getChildren() == null || ast.getChildCount() == 0) {
+			return ast.getText();
+		}
+		StringBuilder sb = new StringBuilder();
+		for (Object child : ast.getChildren()) {
+			if (sb.length() > 0) {
+				sb.append(" | ");
+			}
+			if (child instanceof RangeAST) {
+				sb.append(((RangeAST) child).getChild(0).getText());
+				sb.append("..");
+				sb.append(((RangeAST) child).getChild(1).getText());
+			}
+			else {
+				sb.append(((GrammarAST)child).getText());
+			}
+		}
+		return sb.toString();
+	}
+
 	@Override
 	public Handle tokenRef(TerminalAST node) {
 		// Ref to EOF in lexer yields char transition on -1
--- a/tool/src/org/antlr/v4/misc/CharSupport.java
+++ b/tool/src/org/antlr/v4/misc/CharSupport.java
@ -28,6 +28,8 @@ public class CharSupport {
 		ANTLRLiteralEscapedCharValue['\\'] = '\\';
 		ANTLRLiteralEscapedCharValue['\''] = '\'';
 		ANTLRLiteralEscapedCharValue['"'] = '"';
+		ANTLRLiteralEscapedCharValue['-'] = '-';
+		ANTLRLiteralEscapedCharValue[']'] = ']';
 		ANTLRLiteralCharValueEscape['\n'] = "\\n";
 		ANTLRLiteralCharValueEscape['\r'] = "\\r";
 		ANTLRLiteralCharValueEscape['\t'] = "\\t";
@ -76,6 +78,36 @@ public class CharSupport {
 		return getCharValueFromCharInGrammarLiteral(literal.substring(1,literal.length()-1));
 	}

+	/**
+	 * Converts a quoted grammar literal into the string it denotes.
+	 *
+	 * <p>The first and last characters of {@code literal} (the quotes) are
+	 * skipped. Everything in between is consumed one unit at a time: a plain
+	 * char, a two-char backslash escape, or a six-char {@code \}{@code uXXXX}
+	 * escape. Each unit is resolved with
+	 * {@link #getCharValueFromCharInGrammarLiteral}.</p>
+	 *
+	 * @param literal the literal including its surrounding quotes
+	 * @return the decoded string, or {@code null} if any escape sequence is
+	 *         truncated, has a non-hex digit, or is otherwise not recognized
+	 */
+	public static String getStringFromGrammarStringLiteral(String literal) {
+		final int closingQuote = literal.length() - 1;
+		StringBuilder decoded = new StringBuilder();
+		int pos = 1; // step over the opening quote
+		while ( pos < closingQuote ) {
+			int stop = pos + 1; // exclusive end of the current unit
+			if ( literal.charAt(pos) == '\\' ) {
+				stop = pos + 2;
+				boolean unicodeEscape = pos + 1 < closingQuote && literal.charAt(pos + 1) == 'u';
+				if ( unicodeEscape ) {
+					stop = pos + 6;
+					// exactly four hex digits must follow the 'u'
+					for (int k = pos + 2; k < stop; k++) {
+						if ( k > closingQuote ) return null; // invalid escape sequence.
+						char digit = literal.charAt(k);
+						boolean hex = Character.isDigit(digit)
+								|| (digit >= 'a' && digit <= 'f')
+								|| (digit >= 'A' && digit <= 'F');
+						if ( !hex ) return null; // invalid escape sequence.
+					}
+				}
+			}
+			if ( stop > closingQuote ) return null; // invalid escape sequence.
+			int value = getCharValueFromCharInGrammarLiteral(literal.substring(pos, stop));
+			if ( value == -1 ) return null; // invalid escape sequence.
+			decoded.append((char)value);
+			pos = stop;
+		}
+		return decoded.toString();
+	}
+
 	/** Given char x or \t or \u1234 return the char value;
 	 *  Unnecessary escapes like '\{' yield -1.
 	 */
@ -96,33 +128,18 @@ public class CharSupport {
 				// '\u1234'
 				if ( !cstr.startsWith("\\u") ) return -1;
 				String unicodeChars = cstr.substring(2, cstr.length());
-				return Integer.parseInt(unicodeChars, 16);
+				int result = -1;
+				try {
+					result = Integer.parseInt(unicodeChars, 16);
+				}
+				catch (NumberFormatException e) {
+				}
+				return result;
 			default:
 				return -1;
 		}
 	}

-	public static String getStringFromGrammarStringLiteral(String literal) {
-		StringBuilder buf = new StringBuilder();
-		int i = 1; // skip first quote
-		int n = literal.length()-1; // skip last quote
-		while ( i < n ) { // scan all but last quote
-			int end = i+1;
-			if ( literal.charAt(i) == '\\' ) {
-				end = i+2;
-				if ( (i+1)>=n ) break; // ignore spurious \ on end
-				if ( literal.charAt(i+1) == 'u' ) end = i+6;
-			}
-			if ( end>n ) break;
-			String esc = literal.substring(i, end);
-			int c = getCharValueFromCharInGrammarLiteral(esc);
-			if ( c==-1 ) { buf.append(esc); }
-			else buf.append((char)c);
-			i = end;
-		}
-		return buf.toString();
-	}
-
 	public static String capitalize(String s) {
 		return Character.toUpperCase(s.charAt(0)) + s.substring(1);
 	}
--- a/tool/src/org/antlr/v4/semantics/BasicSemanticChecks.java
+++ b/tool/src/org/antlr/v4/semantics/BasicSemanticChecks.java
@ -471,7 +471,7 @@ public class BasicSemanticChecks extends GrammarTreeVisitor {
 	protected void enterTerminal(GrammarAST tree) {
 		String text = tree.getText();
 		if (text.equals("''")) {
-			g.tool.errMgr.grammarError(ErrorType.EMPTY_STRINGS_NOT_ALLOWED, g.fileName, tree.token);
+			g.tool.errMgr.grammarError(ErrorType.EMPTY_STRINGS_AND_SETS_NOT_ALLOWED, g.fileName, tree.token, "''");
 		}
 	}

--- a/tool/src/org/antlr/v4/tool/ErrorType.java
+++ b/tool/src/org/antlr/v4/tool/ErrorType.java
@ -982,11 +982,15 @@ public enum ErrorType {
 	 *
 	 * <p>empty strings not allowed</p>
 	 *
-	 * <pre>A: '''test''';</pre>
-	 * <pre>B: '';</pre>
-	 * <pre>C: 'test' '';</pre>
+	 * <pre>
+	 * A: '''test''';
+	 * B: '';
+	 * C: 'test' '';
+	 * D: [];
+	 * E: [f-a];
+	 * </pre>
 	 */
-	EMPTY_STRINGS_NOT_ALLOWED(174, "string literals cannot be empty", ErrorSeverity.ERROR),
+	EMPTY_STRINGS_AND_SETS_NOT_ALLOWED(174, "string literals and sets cannot be empty: <arg>", ErrorSeverity.ERROR),
 	/**
 	 * Compiler Error 175.
 	 *
@ -1027,6 +1031,19 @@ public enum ErrorType {
 	* <p>T00: 'a00' -> skip, more;</p>
 	 */
 	INCOMPATIBLE_COMMANDS(179, "incompatible commands <arg> and <arg2>", ErrorSeverity.WARNING),
+	/**
+	 * Compiler Warning 180.
+	 *
+	 * <p>chars "c-m" used multiple times in set [a-fc-m]</p>
+	 *
+	 * <pre>
+	 * A:    [aa-z];   // warning
+	 * B:    [a-fc-m]; // warning
+	 * </pre>
+	 *
+	 * TODO: Does not work with fragment rules.
+	 */
+	CHARACTERS_COLLISION_IN_SET(180, "chars \"<arg>\" used multiple times in set <arg2>", ErrorSeverity.WARNING),

 	/*
 	 * Backward incompatibility errors
--- a/tool/src/org/antlr/v4/tool/ast/GrammarASTWithOptions.java
+++ b/tool/src/org/antlr/v4/tool/ast/GrammarASTWithOptions.java
@ -8,6 +8,7 @@ package org.antlr.v4.tool.ast;

 import org.antlr.runtime.Token;
 import org.antlr.v4.misc.CharSupport;
+import org.antlr.v4.tool.ErrorType;

 import java.util.Collections;
 import java.util.HashMap;
@ -41,6 +42,10 @@ public abstract class GrammarASTWithOptions extends GrammarAST {
 			String v = value.getText();
 			if ( v.startsWith("'") || v.startsWith("\"") ) {
 				v = CharSupport.getStringFromGrammarStringLiteral(v);
+				if (v == null) {
+					g.tool.errMgr.grammarError(ErrorType.INVALID_ESCAPE_SEQUENCE, g.fileName, value.getToken());
+					v = "";
+				}
 			}
 			return v;
 		}