diff --git a/runtime-testsuite/test/org/antlr/v4/test/runtime/descriptors/LexerExecDescriptors.java b/runtime-testsuite/test/org/antlr/v4/test/runtime/descriptors/LexerExecDescriptors.java index 46fc4812f..4b76f4286 100644 --- a/runtime-testsuite/test/org/antlr/v4/test/runtime/descriptors/LexerExecDescriptors.java +++ b/runtime-testsuite/test/org/antlr/v4/test/runtime/descriptors/LexerExecDescriptors.java @@ -302,30 +302,6 @@ public class LexerExecDescriptors { } - public static class CharSetWithReversedRange extends BaseLexerTestDescriptor { - public String input = "9"; - /** - A - [@0,0:0='9',<1>,1:0] - [@1,1:0='',<-1>,1:1] - */ - @CommentHasStringValue - public String output; - - public String errors = null; - public String startRule = ""; - public String grammarName = "L"; - - /** - lexer grammar L; - A : [z-a9]+ {} ; - WS : [ \n]+ -> skip ; - */ - @CommentHasStringValue - public String grammar; - - } - public static class EOFByItself extends BaseLexerTestDescriptor { public String input = ""; /** diff --git a/tool-testsuite/test/org/antlr/v4/test/tool/TestSymbolIssues.java b/tool-testsuite/test/org/antlr/v4/test/tool/TestSymbolIssues.java index 1905d2382..6d56775b2 100644 --- a/tool-testsuite/test/org/antlr/v4/test/tool/TestSymbolIssues.java +++ b/tool-testsuite/test/org/antlr/v4/test/tool/TestSymbolIssues.java @@ -320,4 +320,21 @@ public class TestSymbolIssues extends BaseJavaToolTest { testErrors(test, false); } + + @Test public void testCharsCollision() throws Exception { + String[] test = { + "lexer grammar L;\n" + + "TOKEN_RANGE: [aa-f];\n" + + "TOKEN_RANGE_2: [A-FD-J];\n" + + "TOKEN_RANGE_3: 'Z' | 'K'..'R' | 'O'..'V';\n" + + "TOKEN_RANGE_4: 'g'..'l' | [g-l];\n", // Handling in ATNOptimizer. 
+ + "warning(" + ErrorType.CHARACTERS_COLLISION_IN_SET.code + "): L.g4:2:18: chars \"a-f\" used multiple times in set [aa-f]\n" + + "warning(" + ErrorType.CHARACTERS_COLLISION_IN_SET.code + "): L.g4:3:18: chars \"D-J\" used multiple times in set [A-FD-J]\n" + + "warning(" + ErrorType.CHARACTERS_COLLISION_IN_SET.code + "): L.g4:4:13: chars \"O-V\" used multiple times in set 'Z' | 'K'..'R' | 'O'..'V'\n" + + "warning(" + ErrorType.CHARACTERS_COLLISION_IN_SET.code + "): L.g4::: chars \"g-l\" used multiple times in set [g-l]\n" + }; + + testErrors(test, false); + } } diff --git a/tool-testsuite/test/org/antlr/v4/test/tool/TestToolSyntaxErrors.java b/tool-testsuite/test/org/antlr/v4/test/tool/TestToolSyntaxErrors.java index e54697a73..639c63804 100644 --- a/tool-testsuite/test/org/antlr/v4/test/tool/TestToolSyntaxErrors.java +++ b/tool-testsuite/test/org/antlr/v4/test/tool/TestToolSyntaxErrors.java @@ -487,11 +487,39 @@ public class TestToolSyntaxErrors extends BaseJavaToolTest { "Error3: '';\n" + "NotError: ' ';"; String expected = - "error(" + ErrorType.EMPTY_STRINGS_NOT_ALLOWED.code + "): T.g4:2:8: string literals cannot be empty\n" + - "error(" + ErrorType.EMPTY_STRINGS_NOT_ALLOWED.code + "): T.g4:2:16: string literals cannot be empty\n" + - "error(" + ErrorType.EMPTY_STRINGS_NOT_ALLOWED.code + "): T.g4:3:8: string literals cannot be empty\n" + - "error(" + ErrorType.EMPTY_STRINGS_NOT_ALLOWED.code + "): T.g4:4:15: string literals cannot be empty\n" + - "error(" + ErrorType.EMPTY_STRINGS_NOT_ALLOWED.code + "): T.g4:5:8: string literals cannot be empty\n"; + "error(" + ErrorType.EMPTY_STRINGS_AND_SETS_NOT_ALLOWED.code + "): T.g4:2:8: string literals and sets cannot be empty: ''\n" + + "error(" + ErrorType.EMPTY_STRINGS_AND_SETS_NOT_ALLOWED.code + "): T.g4:2:16: string literals and sets cannot be empty: ''\n" + + "error(" + ErrorType.EMPTY_STRINGS_AND_SETS_NOT_ALLOWED.code + "): T.g4:3:8: string literals and sets cannot be empty: ''\n" + + "error(" + 
ErrorType.EMPTY_STRINGS_AND_SETS_NOT_ALLOWED.code + "): T.g4:4:15: string literals and sets cannot be empty: ''\n" + + "error(" + ErrorType.EMPTY_STRINGS_AND_SETS_NOT_ALLOWED.code + "): T.g4:5:8: string literals and sets cannot be empty: ''\n"; + + String[] pair = new String[] { + grammar, + expected + }; + + super.testErrors(pair, true); + } + + @Test public void testInvalidCharSetAndRange() { + String grammar = + "lexer grammar Test;\n" + + "INVALID_RANGE: 'GH'..'LM';\n" + + "INVALID_RANGE_2: 'F'..'A' | 'Z';\n" + + "VALID_STRING_LITERALS: '\\u1234' | '\\t' | [\\-\\]];\n" + + "INVALID_CHAR_SET: [f-az][];\n" + + "INVALID_CHAR_SET_2: [\\u24\\uA2][\\u24];\n" + //https://github.com/antlr/antlr4/issues/1077 + "INVALID_CHAR_SET_3: [\\t\\{];"; + + String expected = + "error(" + ErrorType.INVALID_LITERAL_IN_LEXER_SET.code + "): Test.g4:2:23: multi-character literals are not allowed in lexer sets: 'GH'\n" + + "error(" + ErrorType.INVALID_LITERAL_IN_LEXER_SET.code + "): Test.g4:2:29: multi-character literals are not allowed in lexer sets: 'LM'\n" + + "error(" + ErrorType.EMPTY_STRINGS_AND_SETS_NOT_ALLOWED.code + "): Test.g4:3:26: string literals and sets cannot be empty: 'F'..'A'\n" + + "error(" + ErrorType.EMPTY_STRINGS_AND_SETS_NOT_ALLOWED.code + "): Test.g4:5:23: string literals and sets cannot be empty: [f-a]\n" + + "error(" + ErrorType.EMPTY_STRINGS_AND_SETS_NOT_ALLOWED.code + "): Test.g4:5:29: string literals and sets cannot be empty: []\n" + + "error(" + ErrorType.INVALID_ESCAPE_SEQUENCE.code + "): Test.g4:6:23: invalid escape sequence\n" + + "error(" + ErrorType.INVALID_ESCAPE_SEQUENCE.code + "): Test.g4:6:33: invalid escape sequence\n" + + "error(" + ErrorType.INVALID_ESCAPE_SEQUENCE.code + "): Test.g4:7:23: invalid escape sequence\n"; String[] pair = new String[] { grammar, diff --git a/tool/src/org/antlr/v4/automata/ATNOptimizer.java b/tool/src/org/antlr/v4/automata/ATNOptimizer.java index a2eda1fb8..36e5009f5 100644 --- 
a/tool/src/org/antlr/v4/automata/ATNOptimizer.java +++ b/tool/src/org/antlr/v4/automata/ATNOptimizer.java @@ -18,6 +18,7 @@ import org.antlr.v4.runtime.atn.SetTransition; import org.antlr.v4.runtime.atn.Transition; import org.antlr.v4.runtime.misc.Interval; import org.antlr.v4.runtime.misc.IntervalSet; +import org.antlr.v4.tool.ErrorType; import org.antlr.v4.tool.Grammar; import org.antlr.v4.tool.Rule; @@ -94,20 +95,34 @@ public class ATNOptimizer { Transition matchTransition = decision.transition(j).target.transition(0); if (matchTransition instanceof NotSetTransition) { throw new UnsupportedOperationException("Not yet implemented."); - } else { - matchSet.addAll(matchTransition.label()); } + IntervalSet set = matchTransition.label(); + int minElem = set.getMinElement(); + int maxElem = set.getMaxElement(); + for (int k = minElem; k <= maxElem; k++) { + if (matchSet.contains(k)) { + char setMin = (char) set.getMinElement(); + char setMax = (char) set.getMaxElement(); + // TODO: Token is missing (i.e. position in source will not be displayed). 
+ g.tool.errMgr.grammarError(ErrorType.CHARACTERS_COLLISION_IN_SET, g.fileName, + null, (char) minElem + "-" + (char) maxElem, "[" + setMin + "-" + setMax + "]"); + break; + } + } + matchSet.addAll(set); } Transition newTransition; if (matchSet.getIntervals().size() == 1) { if (matchSet.size() == 1) { newTransition = new AtomTransition(blockEndState, matchSet.getMinElement()); - } else { + } + else { Interval matchInterval = matchSet.getIntervals().get(0); newTransition = new RangeTransition(blockEndState, matchInterval.a, matchInterval.b); } - } else { + } + else { newTransition = new SetTransition(blockEndState, matchSet); } diff --git a/tool/src/org/antlr/v4/automata/LexerATNFactory.java b/tool/src/org/antlr/v4/automata/LexerATNFactory.java index 7d10300c6..b596df553 100644 --- a/tool/src/org/antlr/v4/automata/LexerATNFactory.java +++ b/tool/src/org/antlr/v4/automata/LexerATNFactory.java @@ -39,6 +39,7 @@ import org.antlr.v4.tool.LexerGrammar; import org.antlr.v4.tool.Rule; import org.antlr.v4.tool.ast.ActionAST; import org.antlr.v4.tool.ast.GrammarAST; +import org.antlr.v4.tool.ast.RangeAST; import org.antlr.v4.tool.ast.TerminalAST; import org.stringtemplate.v4.ST; import org.stringtemplate.v4.STGroup; @@ -253,6 +254,7 @@ public class LexerATNFactory extends ParserATNFactory { ATNState right = newState(b); int t1 = CharSupport.getCharValueFromGrammarCharLiteral(a.getText()); int t2 = CharSupport.getCharValueFromGrammarCharLiteral(b.getText()); + checkRange(a, b, t1, t2); left.addTransition(new RangeTransition(right, t1, t2)); a.atnState = left; b.atnState = left; @@ -268,7 +270,10 @@ public class LexerATNFactory extends ParserATNFactory { if ( t.getType()==ANTLRParser.RANGE ) { int a = CharSupport.getCharValueFromGrammarCharLiteral(t.getChild(0).getText()); int b = CharSupport.getCharValueFromGrammarCharLiteral(t.getChild(1).getText()); - set.add(a, b); + if (checkRange((GrammarAST) t.getChild(0), (GrammarAST) t.getChild(1), a, b)) { + 
checkSetCollision(associatedAST, set, a, b); + set.add(a,b); + } } else if ( t.getType()==ANTLRParser.LEXER_CHAR_SET ) { set.addAll(getSetFromCharSetLiteral(t)); @@ -276,12 +281,12 @@ public class LexerATNFactory extends ParserATNFactory { else if ( t.getType()==ANTLRParser.STRING_LITERAL ) { int c = CharSupport.getCharValueFromGrammarCharLiteral(t.getText()); if ( c != -1 ) { + checkSetCollision(associatedAST, set, c); set.add(c); } else { g.tool.errMgr.grammarError(ErrorType.INVALID_LITERAL_IN_LEXER_SET, g.fileName, t.getToken(), t.getText()); - } } else if ( t.getType()==ANTLRParser.TOKEN_REF ) { @@ -307,6 +312,27 @@ public class LexerATNFactory extends ParserATNFactory { return new Handle(left, right); } + protected boolean checkRange(GrammarAST leftNode, GrammarAST rightNode, int leftValue, int rightValue) { + boolean result = true; + if (leftValue == -1) { + result = false; + g.tool.errMgr.grammarError(ErrorType.INVALID_LITERAL_IN_LEXER_SET, + g.fileName, leftNode.getToken(), leftNode.getText()); + } + if (rightValue == -1) { + result = false; + g.tool.errMgr.grammarError(ErrorType.INVALID_LITERAL_IN_LEXER_SET, + g.fileName, rightNode.getToken(), rightNode.getText()); + } + if (!result) return result; + + if (rightValue < leftValue) { + g.tool.errMgr.grammarError(ErrorType.EMPTY_STRINGS_AND_SETS_NOT_ALLOWED, + g.fileName, leftNode.parent.getToken(), leftNode.getText() + ".." + rightNode.getText()); + } + return result; + } + /** For a lexer, a string is a sequence of char to match. That is, * "fog" is treated as 'f' 'o' 'g' not as a single transition in * the DFA. 
Machine== o-'f'->o-'o'->o-'g'->o and has n+1 states @@ -315,12 +341,19 @@ public class LexerATNFactory extends ParserATNFactory { @Override public Handle stringLiteral(TerminalAST stringLiteralAST) { String chars = stringLiteralAST.getText(); - chars = CharSupport.getStringFromGrammarStringLiteral(chars); - int n = chars.length(); ATNState left = newState(stringLiteralAST); + ATNState right; + chars = CharSupport.getStringFromGrammarStringLiteral(chars); + if (chars == null) { + g.tool.errMgr.grammarError(ErrorType.INVALID_ESCAPE_SEQUENCE, + g.fileName, stringLiteralAST.getToken()); + return new Handle(left, left); + } + + int n = chars.length(); ATNState prev = left; - ATNState right = null; - for (int i=0; i=n ) break; // ignore spurious \ on end - if ( literal.charAt(i+1) == 'u' ) end = i+6; + if ( i+1 < n && literal.charAt(i+1) == 'u' ) { + for (end = i + 2; end < i + 6; end++) { + if ( end>n ) return null; // invalid escape sequence. + char charAt = literal.charAt(end); + if (!Character.isDigit(charAt) && !(charAt >= 'a' && charAt <= 'f') && !(charAt >= 'A' && charAt <= 'F')) { + return null; // invalid escape sequence. + } + } + } } - if ( end>n ) break; + if ( end>n ) return null; // invalid escape sequence. String esc = literal.substring(i, end); int c = getCharValueFromCharInGrammarLiteral(esc); - if ( c==-1 ) { buf.append(esc); } + if ( c==-1 ) { + return null; // invalid escape sequence. + } else buf.append((char)c); i = end; } return buf.toString(); } + /** Given char x or \t or \u1234 return the char value; + * Unnecessary escapes like '\{' yield -1. 
+ */ + public static int getCharValueFromCharInGrammarLiteral(String cstr) { + switch ( cstr.length() ) { + case 1: + // 'x' + return cstr.charAt(0); // no escape char + case 2: + if ( cstr.charAt(0)!='\\' ) return -1; + // '\x' (antlr lexer will catch invalid char) + if ( Character.isDigit(cstr.charAt(1)) ) return -1; + int escChar = cstr.charAt(1); + int charVal = ANTLRLiteralEscapedCharValue[escChar]; + if ( charVal==0 ) return -1; + return charVal; + case 6: + // '\u1234' + if ( !cstr.startsWith("\\u") ) return -1; + String unicodeChars = cstr.substring(2, cstr.length()); + int result = -1; + try { + result = Integer.parseInt(unicodeChars, 16); + } + catch (NumberFormatException e) { + } + return result; + default: + return -1; + } + } + public static String capitalize(String s) { return Character.toUpperCase(s.charAt(0)) + s.substring(1); } diff --git a/tool/src/org/antlr/v4/semantics/BasicSemanticChecks.java b/tool/src/org/antlr/v4/semantics/BasicSemanticChecks.java index 5d47d932a..4f5e792d6 100644 --- a/tool/src/org/antlr/v4/semantics/BasicSemanticChecks.java +++ b/tool/src/org/antlr/v4/semantics/BasicSemanticChecks.java @@ -471,7 +471,7 @@ public class BasicSemanticChecks extends GrammarTreeVisitor { protected void enterTerminal(GrammarAST tree) { String text = tree.getText(); if (text.equals("''")) { - g.tool.errMgr.grammarError(ErrorType.EMPTY_STRINGS_NOT_ALLOWED, g.fileName, tree.token); + g.tool.errMgr.grammarError(ErrorType.EMPTY_STRINGS_AND_SETS_NOT_ALLOWED, g.fileName, tree.token, "''"); } } diff --git a/tool/src/org/antlr/v4/tool/ErrorType.java b/tool/src/org/antlr/v4/tool/ErrorType.java index 7352033c4..c4a3861b5 100644 --- a/tool/src/org/antlr/v4/tool/ErrorType.java +++ b/tool/src/org/antlr/v4/tool/ErrorType.java @@ -982,11 +982,15 @@ public enum ErrorType { * *

empty strings not allowed

* - *
A: '''test''';
- *
B: '';
- *
C: 'test' '';
+ *
+	 * A: '''test''';
+	 * B: '';
+	 * C: 'test' '';
+	 * D: [];
+	 * E: [f-a];
+	 * 
*/ - EMPTY_STRINGS_NOT_ALLOWED(174, "string literals cannot be empty", ErrorSeverity.ERROR), + EMPTY_STRINGS_AND_SETS_NOT_ALLOWED(174, "string literals and sets cannot be empty: ", ErrorSeverity.ERROR), /** * Compiler Error 175. * @@ -1027,6 +1031,19 @@ public enum ErrorType { *

T00: 'a00' -> skip, more;

*/ INCOMPATIBLE_COMMANDS(179, "incompatible commands and ", ErrorSeverity.WARNING), + /** + * Compiler Warning 180. + * + *

chars "c-m" used multiple times in set [a-fc-m]

+ * + *
+	 * A:    [aa-z];   // warning
+	 * B:    [a-fc-m]; // warning
+	 * 
+ * + * TODO: Does not work with fragment rules. + */ + CHARACTERS_COLLISION_IN_SET(180, "chars \"\" used multiple times in set ", ErrorSeverity.WARNING), /* * Backward incompatibility errors diff --git a/tool/src/org/antlr/v4/tool/ast/GrammarASTWithOptions.java b/tool/src/org/antlr/v4/tool/ast/GrammarASTWithOptions.java index 6ec0a81f7..f6b5bcad5 100644 --- a/tool/src/org/antlr/v4/tool/ast/GrammarASTWithOptions.java +++ b/tool/src/org/antlr/v4/tool/ast/GrammarASTWithOptions.java @@ -8,6 +8,7 @@ package org.antlr.v4.tool.ast; import org.antlr.runtime.Token; import org.antlr.v4.misc.CharSupport; +import org.antlr.v4.tool.ErrorType; import java.util.Collections; import java.util.HashMap; @@ -41,6 +42,10 @@ public abstract class GrammarASTWithOptions extends GrammarAST { String v = value.getText(); if ( v.startsWith("'") || v.startsWith("\"") ) { v = CharSupport.getStringFromGrammarStringLiteral(v); + if (v == null) { + g.tool.errMgr.grammarError(ErrorType.INVALID_ESCAPE_SEQUENCE, g.fileName, value.getToken()); + v = ""; + } } return v; }