Merge pull request #1521 from parrt/KvanTTT-character_issues

Kvan ttt character issues
This commit is contained in:
Terence Parr 2016-12-15 11:56:34 -08:00 committed by GitHub
commit 9f948c5453
9 changed files with 246 additions and 82 deletions

View File

@ -302,30 +302,6 @@ public class LexerExecDescriptors {
}
// Lexer test descriptor for a character set that contains a reversed range.
// The grammar's rule A is [z-a9]+ and the input is the single character "9";
// the expected output is the action's "A" line, one token of type 1 covering
// '9', and EOF, with no errors expected (errors is null).
// NOTE(review): the comment blocks directly above the @CommentHasStringValue
// fields appear to carry those fields' values -- do not reformat them;
// confirm against the annotation's processor.
public static class CharSetWithReversedRange extends BaseLexerTestDescriptor {
// Text fed to the generated lexer.
public String input = "9";
/**
A
[@0,0:0='9',<1>,1:0]
[@1,1:0='<EOF>',<-1>,1:1]
*/
@CommentHasStringValue
public String output;
// No error output is expected for this case.
public String errors = null;
// Empty start rule: this descriptor only exercises the lexer grammar below.
public String startRule = "";
public String grammarName = "L";
/**
lexer grammar L;
A : [z-a9]+ {<writeln("\"A\"")>} ;
WS : [ \n]+ -> skip ;
*/
@CommentHasStringValue
public String grammar;
}
public static class EOFByItself extends BaseLexerTestDescriptor {
public String input = "";
/**

View File

@ -320,4 +320,21 @@ public class TestSymbolIssues extends BaseJavaToolTest {
testErrors(test, false);
}
/**
 * Checks that characters appearing more than once in a lexer set are
 * reported with {@code CHARACTERS_COLLISION_IN_SET}, for bracketed sets,
 * for alternatives of literals/ranges, and for a range mixed with a set.
 */
@Test public void testCharsCollision() throws Exception {
	String grammar =
		"lexer grammar L;\n" +
		"TOKEN_RANGE: [aa-f];\n" +
		"TOKEN_RANGE_2: [A-FD-J];\n" +
		"TOKEN_RANGE_3: 'Z' | 'K'..'R' | 'O'..'V';\n" +
		"TOKEN_RANGE_4: 'g'..'l' | [g-l];\n"; // Handling in ATNOptimizer.

	// Every expected line shares the same severity, code and file name.
	String warningAt = "warning(" + ErrorType.CHARACTERS_COLLISION_IN_SET.code + "): L.g4:";
	String expected =
		warningAt + "2:18: chars \"a-f\" used multiple times in set [aa-f]\n" +
		warningAt + "3:18: chars \"D-J\" used multiple times in set [A-FD-J]\n" +
		warningAt + "4:13: chars \"O-V\" used multiple times in set 'Z' | 'K'..'R' | 'O'..'V'\n" +
		// The last warning carries no line/column, hence the empty position fields.
		warningAt + ":: chars \"g-l\" used multiple times in set [g-l]\n";

	String[] test = { grammar, expected };
	testErrors(test, false);
}
}

View File

@ -487,11 +487,39 @@ public class TestToolSyntaxErrors extends BaseJavaToolTest {
"Error3: '';\n" +
"NotError: ' ';";
String expected =
"error(" + ErrorType.EMPTY_STRINGS_NOT_ALLOWED.code + "): T.g4:2:8: string literals cannot be empty\n" +
"error(" + ErrorType.EMPTY_STRINGS_NOT_ALLOWED.code + "): T.g4:2:16: string literals cannot be empty\n" +
"error(" + ErrorType.EMPTY_STRINGS_NOT_ALLOWED.code + "): T.g4:3:8: string literals cannot be empty\n" +
"error(" + ErrorType.EMPTY_STRINGS_NOT_ALLOWED.code + "): T.g4:4:15: string literals cannot be empty\n" +
"error(" + ErrorType.EMPTY_STRINGS_NOT_ALLOWED.code + "): T.g4:5:8: string literals cannot be empty\n";
"error(" + ErrorType.EMPTY_STRINGS_AND_SETS_NOT_ALLOWED.code + "): T.g4:2:8: string literals and sets cannot be empty: ''\n" +
"error(" + ErrorType.EMPTY_STRINGS_AND_SETS_NOT_ALLOWED.code + "): T.g4:2:16: string literals and sets cannot be empty: ''\n" +
"error(" + ErrorType.EMPTY_STRINGS_AND_SETS_NOT_ALLOWED.code + "): T.g4:3:8: string literals and sets cannot be empty: ''\n" +
"error(" + ErrorType.EMPTY_STRINGS_AND_SETS_NOT_ALLOWED.code + "): T.g4:4:15: string literals and sets cannot be empty: ''\n" +
"error(" + ErrorType.EMPTY_STRINGS_AND_SETS_NOT_ALLOWED.code + "): T.g4:5:8: string literals and sets cannot be empty: ''\n";
String[] pair = new String[] {
grammar,
expected
};
super.testErrors(pair, true);
}
@Test public void testInvalidCharSetAndRange() {
String grammar =
"lexer grammar Test;\n" +
"INVALID_RANGE: 'GH'..'LM';\n" +
"INVALID_RANGE_2: 'F'..'A' | 'Z';\n" +
"VALID_STRING_LITERALS: '\\u1234' | '\\t' | [\\-\\]];\n" +
"INVALID_CHAR_SET: [f-az][];\n" +
"INVALID_CHAR_SET_2: [\\u24\\uA2][\\u24];\n" + //https://github.com/antlr/antlr4/issues/1077
"INVALID_CHAR_SET_3: [\\t\\{];";
String expected =
"error(" + ErrorType.INVALID_LITERAL_IN_LEXER_SET.code + "): Test.g4:2:23: multi-character literals are not allowed in lexer sets: 'GH'\n" +
"error(" + ErrorType.INVALID_LITERAL_IN_LEXER_SET.code + "): Test.g4:2:29: multi-character literals are not allowed in lexer sets: 'LM'\n" +
"error(" + ErrorType.EMPTY_STRINGS_AND_SETS_NOT_ALLOWED.code + "): Test.g4:3:26: string literals and sets cannot be empty: 'F'..'A'\n" +
"error(" + ErrorType.EMPTY_STRINGS_AND_SETS_NOT_ALLOWED.code + "): Test.g4:5:23: string literals and sets cannot be empty: [f-a]\n" +
"error(" + ErrorType.EMPTY_STRINGS_AND_SETS_NOT_ALLOWED.code + "): Test.g4:5:29: string literals and sets cannot be empty: []\n" +
"error(" + ErrorType.INVALID_ESCAPE_SEQUENCE.code + "): Test.g4:6:23: invalid escape sequence\n" +
"error(" + ErrorType.INVALID_ESCAPE_SEQUENCE.code + "): Test.g4:6:33: invalid escape sequence\n" +
"error(" + ErrorType.INVALID_ESCAPE_SEQUENCE.code + "): Test.g4:7:23: invalid escape sequence\n";
String[] pair = new String[] {
grammar,

View File

@ -18,6 +18,7 @@ import org.antlr.v4.runtime.atn.SetTransition;
import org.antlr.v4.runtime.atn.Transition;
import org.antlr.v4.runtime.misc.Interval;
import org.antlr.v4.runtime.misc.IntervalSet;
import org.antlr.v4.tool.ErrorType;
import org.antlr.v4.tool.Grammar;
import org.antlr.v4.tool.Rule;
@ -94,20 +95,34 @@ public class ATNOptimizer {
Transition matchTransition = decision.transition(j).target.transition(0);
if (matchTransition instanceof NotSetTransition) {
throw new UnsupportedOperationException("Not yet implemented.");
} else {
matchSet.addAll(matchTransition.label());
}
IntervalSet set = matchTransition.label();
int minElem = set.getMinElement();
int maxElem = set.getMaxElement();
for (int k = minElem; k <= maxElem; k++) {
if (matchSet.contains(k)) {
char setMin = (char) set.getMinElement();
char setMax = (char) set.getMaxElement();
// TODO: Token is missing (i.e. position in source will not be displayed).
g.tool.errMgr.grammarError(ErrorType.CHARACTERS_COLLISION_IN_SET, g.fileName,
null, (char) minElem + "-" + (char) maxElem, "[" + setMin + "-" + setMax + "]");
break;
}
}
matchSet.addAll(set);
}
Transition newTransition;
if (matchSet.getIntervals().size() == 1) {
if (matchSet.size() == 1) {
newTransition = new AtomTransition(blockEndState, matchSet.getMinElement());
} else {
}
else {
Interval matchInterval = matchSet.getIntervals().get(0);
newTransition = new RangeTransition(blockEndState, matchInterval.a, matchInterval.b);
}
} else {
}
else {
newTransition = new SetTransition(blockEndState, matchSet);
}

View File

@ -39,6 +39,7 @@ import org.antlr.v4.tool.LexerGrammar;
import org.antlr.v4.tool.Rule;
import org.antlr.v4.tool.ast.ActionAST;
import org.antlr.v4.tool.ast.GrammarAST;
import org.antlr.v4.tool.ast.RangeAST;
import org.antlr.v4.tool.ast.TerminalAST;
import org.stringtemplate.v4.ST;
import org.stringtemplate.v4.STGroup;
@ -253,6 +254,7 @@ public class LexerATNFactory extends ParserATNFactory {
ATNState right = newState(b);
int t1 = CharSupport.getCharValueFromGrammarCharLiteral(a.getText());
int t2 = CharSupport.getCharValueFromGrammarCharLiteral(b.getText());
checkRange(a, b, t1, t2);
left.addTransition(new RangeTransition(right, t1, t2));
a.atnState = left;
b.atnState = left;
@ -268,7 +270,10 @@ public class LexerATNFactory extends ParserATNFactory {
if ( t.getType()==ANTLRParser.RANGE ) {
int a = CharSupport.getCharValueFromGrammarCharLiteral(t.getChild(0).getText());
int b = CharSupport.getCharValueFromGrammarCharLiteral(t.getChild(1).getText());
set.add(a, b);
if (checkRange((GrammarAST) t.getChild(0), (GrammarAST) t.getChild(1), a, b)) {
checkSetCollision(associatedAST, set, a, b);
set.add(a,b);
}
}
else if ( t.getType()==ANTLRParser.LEXER_CHAR_SET ) {
set.addAll(getSetFromCharSetLiteral(t));
@ -276,12 +281,12 @@ public class LexerATNFactory extends ParserATNFactory {
else if ( t.getType()==ANTLRParser.STRING_LITERAL ) {
int c = CharSupport.getCharValueFromGrammarCharLiteral(t.getText());
if ( c != -1 ) {
checkSetCollision(associatedAST, set, c);
set.add(c);
}
else {
g.tool.errMgr.grammarError(ErrorType.INVALID_LITERAL_IN_LEXER_SET,
g.fileName, t.getToken(), t.getText());
}
}
else if ( t.getType()==ANTLRParser.TOKEN_REF ) {
@ -307,6 +312,27 @@ public class LexerATNFactory extends ParserATNFactory {
return new Handle(left, right);
}
/**
 * Validates the endpoints of a lexer character range {@code left..right}
 * and reports any problem through the tool's error manager.
 *
 * <p>An endpoint value of {@code -1} is reported as
 * {@code INVALID_LITERAL_IN_LEXER_SET} against that endpoint's token; both
 * endpoints are checked so that each bad literal gets its own message.
 * When both endpoints are valid but {@code rightValue < leftValue}, the
 * range matches nothing and {@code EMPTY_STRINGS_AND_SETS_NOT_ALLOWED} is
 * reported against the parent node's token.</p>
 *
 * @param leftNode   AST node of the lower bound (used for location and text)
 * @param rightNode  AST node of the upper bound (used for location and text)
 * @param leftValue  character value of the lower bound, or -1 if invalid
 * @param rightValue character value of the upper bound, or -1 if invalid
 * @return {@code true} only if both endpoints are valid and the range is
 *         non-empty; {@code false} whenever an error was reported
 */
protected boolean checkRange(GrammarAST leftNode, GrammarAST rightNode, int leftValue, int rightValue) {
	boolean endpointsValid = true;
	if (leftValue == -1) {
		endpointsValid = false;
		g.tool.errMgr.grammarError(ErrorType.INVALID_LITERAL_IN_LEXER_SET,
			g.fileName, leftNode.getToken(), leftNode.getText());
	}
	if (rightValue == -1) {
		endpointsValid = false;
		g.tool.errMgr.grammarError(ErrorType.INVALID_LITERAL_IN_LEXER_SET,
			g.fileName, rightNode.getToken(), rightNode.getText());
	}
	if (!endpointsValid) return false;

	if (rightValue < leftValue) {
		g.tool.errMgr.grammarError(ErrorType.EMPTY_STRINGS_AND_SETS_NOT_ALLOWED,
			g.fileName, leftNode.parent.getToken(), leftNode.getText() + ".." + rightNode.getText());
		// A reversed range is unusable: tell the caller not to add it.
		return false;
	}
	return true;
}
/** For a lexer, a string is a sequence of char to match. That is,
* "fog" is treated as 'f' 'o' 'g' not as a single transition in
* the DFA. Machine== o-'f'-&gt;o-'o'-&gt;o-'g'-&gt;o and has n+1 states
@ -315,12 +341,19 @@ public class LexerATNFactory extends ParserATNFactory {
@Override
public Handle stringLiteral(TerminalAST stringLiteralAST) {
String chars = stringLiteralAST.getText();
chars = CharSupport.getStringFromGrammarStringLiteral(chars);
int n = chars.length();
ATNState left = newState(stringLiteralAST);
ATNState right;
chars = CharSupport.getStringFromGrammarStringLiteral(chars);
if (chars == null) {
g.tool.errMgr.grammarError(ErrorType.INVALID_ESCAPE_SEQUENCE,
g.fileName, stringLiteralAST.getToken());
return new Handle(left, left);
}
int n = chars.length();
ATNState prev = left;
ATNState right = null;
for (int i=0; i<n; i++) {
right = null;
for (int i = 0; i < n; i++) {
right = newState(stringLiteralAST);
prev.addTransition(new AtomTransition(right, chars.charAt(i)));
prev = right;
@ -342,34 +375,90 @@ public class LexerATNFactory extends ParserATNFactory {
public IntervalSet getSetFromCharSetLiteral(GrammarAST charSetAST) {
String chars = charSetAST.getText();
chars = chars.substring(1, chars.length()-1);
String cset = '"'+ chars +'"';
chars = chars.substring(1, chars.length() - 1);
String cset = '"' + chars + '"';
IntervalSet set = new IntervalSet();
if (chars.length() == 0) {
g.tool.errMgr.grammarError(ErrorType.EMPTY_STRINGS_AND_SETS_NOT_ALLOWED,
g.fileName, charSetAST.getToken(), "[]");
return set;
}
// unescape all valid escape char like \n, leaving escaped dashes as '\-'
// so we can avoid seeing them as '-' range ops.
chars = CharSupport.getStringFromGrammarStringLiteral(cset);
// now make x-y become set of char
if (chars == null) {
g.tool.errMgr.grammarError(ErrorType.INVALID_ESCAPE_SEQUENCE,
g.fileName, charSetAST.getToken());
return set;
}
int n = chars.length();
for (int i=0; i< n; i++) {
// now make x-y become set of char
for (int i = 0; i < n; i++) {
int c = chars.charAt(i);
if ( c=='\\' && (i+1)<n && chars.charAt(i+1)=='-' ) { // \-
if (c == '\\' && i+1 < n && chars.charAt(i+1) == '-') { // \-
checkSetCollision(charSetAST, set, '-');
set.add('-');
i++;
}
else if ( (i+2)<n && chars.charAt(i+1)=='-' ) { // range x-y
else if (i+2 < n && chars.charAt(i+1) == '-') { // range x-y
int x = c;
int y = chars.charAt(i+2);
if ( x<=y ) set.add(x,y);
i+=2;
if (x <= y) {
checkSetCollision(charSetAST, set, x, y);
set.add(x,y);
}
else {
g.tool.errMgr.grammarError(ErrorType.EMPTY_STRINGS_AND_SETS_NOT_ALLOWED,
g.fileName, charSetAST.getToken(), "[" + (char) x + "-" + (char) y + "]");
}
i += 2;
}
else {
checkSetCollision(charSetAST, set, c);
set.add(c);
}
}
return set;
}
/**
 * Reports {@code CHARACTERS_COLLISION_IN_SET} when the single character
 * {@code el} is already a member of {@code set}. Call this before adding
 * {@code el} so the check sees only previously collected characters.
 *
 * @param ast node whose token locates the message and whose text names the set
 * @param set characters collected so far
 * @param el  character about to be added
 */
protected void checkSetCollision(GrammarAST ast, IntervalSet set, int el) {
	if (!set.contains(el)) {
		return;
	}
	char repeated = (char) el;
	g.tool.errMgr.grammarError(ErrorType.CHARACTERS_COLLISION_IN_SET, g.fileName, ast.getToken(),
		repeated, ast.getText());
}
/**
 * Reports {@code CHARACTERS_COLLISION_IN_SET} when any character in the
 * inclusive range {@code a..b} is already a member of {@code set}. At most
 * one message is issued per call, and it always names the whole range
 * {@code a-b} rather than just the overlapping part.
 *
 * @param ast node whose token locates the message and which is rendered as the set text
 * @param set characters collected so far
 * @param a   first character of the range about to be added
 * @param b   last character of the range about to be added
 */
protected void checkSetCollision(GrammarAST ast, IntervalSet set, int a, int b) {
	for (int i = a; i <= b; i++) {
		if (set.contains(i)) {
			g.tool.errMgr.grammarError(ErrorType.CHARACTERS_COLLISION_IN_SET, g.fileName, ast.getToken(),
				(char)a + "-" + (char)b, collisionSetText(ast));
			return; // one warning per range is enough
		}
	}
}

/**
 * Renders the set for a collision message: a node without children is shown
 * by its own text; otherwise the children are joined with {@code " | "},
 * with range children written as {@code left..right}.
 */
private String collisionSetText(GrammarAST ast) {
	java.util.List<?> children = ast.getChildren();
	// An empty child list is treated like a missing one instead of failing
	// while trimming a trailing separator that was never appended.
	if (children == null || children.isEmpty()) {
		return ast.getText();
	}
	StringBuilder sb = new StringBuilder();
	boolean first = true;
	for (Object child : children) {
		if (!first) {
			sb.append(" | ");
		}
		first = false;
		if (child instanceof RangeAST) {
			RangeAST range = (RangeAST) child;
			sb.append(range.getChild(0).getText());
			sb.append("..");
			sb.append(range.getChild(1).getText());
		}
		else {
			sb.append(((GrammarAST)child).getText());
		}
	}
	return sb.toString();
}
@Override
public Handle tokenRef(TerminalAST node) {
// Ref to EOF in lexer yields char transition on -1

View File

@ -28,6 +28,8 @@ public class CharSupport {
ANTLRLiteralEscapedCharValue['\\'] = '\\';
ANTLRLiteralEscapedCharValue['\''] = '\'';
ANTLRLiteralEscapedCharValue['"'] = '"';
ANTLRLiteralEscapedCharValue['-'] = '-';
ANTLRLiteralEscapedCharValue[']'] = ']';
ANTLRLiteralCharValueEscape['\n'] = "\\n";
ANTLRLiteralCharValueEscape['\r'] = "\\r";
ANTLRLiteralCharValueEscape['\t'] = "\\t";
@ -76,32 +78,6 @@ public class CharSupport {
return getCharValueFromCharInGrammarLiteral(literal.substring(1,literal.length()-1));
}
/** Given char x or \t or \u1234 return the char value;
 * Unnecessary escapes like '\{' yield -1.
 *
 * <p>The argument is the literal's content without the surrounding quotes.
 * Three shapes are recognized: a single character, a two-character
 * backslash escape looked up in {@code ANTLRLiteralEscapedCharValue}, and
 * a six-character {@code \}{@code uXXXX} escape with exactly four hex
 * digits. Everything else yields -1; this method never throws for
 * malformed input.</p>
 *
 * @param cstr literal content without quotes
 * @return the character value, or -1 if {@code cstr} is not a valid
 *         single-character literal
 */
public static int getCharValueFromCharInGrammarLiteral(String cstr) {
	switch ( cstr.length() ) {
		case 1 :
			// 'x'
			return cstr.charAt(0); // no escape char
		case 2 : {
			if ( cstr.charAt(0)!='\\' ) return -1;
			// '\x' (antlr lexer will catch invalid char)
			char escChar = cstr.charAt(1);
			if ( Character.isDigit(escChar) ) return -1;
			// Characters beyond the lookup table cannot be valid escapes.
			if ( escChar>=ANTLRLiteralEscapedCharValue.length ) return -1;
			int charVal = ANTLRLiteralEscapedCharValue[escChar];
			if ( charVal==0 ) return -1;
			return charVal;
		}
		case 6 : {
			// '\u1234'
			if ( !cstr.startsWith("\\u") ) return -1;
			// Decode digit by digit so that non-hex characters and a leading
			// sign are rejected with -1 rather than raising an exception.
			int value = 0;
			for (int i = 2; i < 6; i++) {
				int digit = Character.digit(cstr.charAt(i), 16);
				if ( digit<0 ) return -1;
				value = (value << 4) | digit;
			}
			return value;
		}
		default :
			return -1;
	}
}
public static String getStringFromGrammarStringLiteral(String literal) {
StringBuilder buf = new StringBuilder();
int i = 1; // skip first quote
@ -110,19 +86,60 @@ public class CharSupport {
int end = i+1;
if ( literal.charAt(i) == '\\' ) {
end = i+2;
if ( (i+1)>=n ) break; // ignore spurious \ on end
if ( literal.charAt(i+1) == 'u' ) end = i+6;
if ( i+1 < n && literal.charAt(i+1) == 'u' ) {
for (end = i + 2; end < i + 6; end++) {
if ( end>n ) return null; // invalid escape sequence.
char charAt = literal.charAt(end);
if (!Character.isDigit(charAt) && !(charAt >= 'a' && charAt <= 'f') && !(charAt >= 'A' && charAt <= 'F')) {
return null; // invalid escape sequence.
}
}
}
}
if ( end>n ) break;
if ( end>n ) return null; // invalid escape sequence.
String esc = literal.substring(i, end);
int c = getCharValueFromCharInGrammarLiteral(esc);
if ( c==-1 ) { buf.append(esc); }
if ( c==-1 ) {
return null; // invalid escape sequence.
}
else buf.append((char)c);
i = end;
}
return buf.toString();
}
/** Given char x or \t or \u1234 return the char value;
* Unnecessary escapes like '\{' yield -1.
*/
public static int getCharValueFromCharInGrammarLiteral(String cstr) {
switch ( cstr.length() ) {
case 1:
// 'x'
return cstr.charAt(0); // no escape char
case 2:
if ( cstr.charAt(0)!='\\' ) return -1;
// '\x' (antlr lexer will catch invalid char)
if ( Character.isDigit(cstr.charAt(1)) ) return -1;
int escChar = cstr.charAt(1);
int charVal = ANTLRLiteralEscapedCharValue[escChar];
if ( charVal==0 ) return -1;
return charVal;
case 6:
// '\u1234'
if ( !cstr.startsWith("\\u") ) return -1;
String unicodeChars = cstr.substring(2, cstr.length());
int result = -1;
try {
result = Integer.parseInt(unicodeChars, 16);
}
catch (NumberFormatException e) {
}
return result;
default:
return -1;
}
}
public static String capitalize(String s) {
return Character.toUpperCase(s.charAt(0)) + s.substring(1);
}

View File

@ -471,7 +471,7 @@ public class BasicSemanticChecks extends GrammarTreeVisitor {
protected void enterTerminal(GrammarAST tree) {
String text = tree.getText();
if (text.equals("''")) {
g.tool.errMgr.grammarError(ErrorType.EMPTY_STRINGS_NOT_ALLOWED, g.fileName, tree.token);
g.tool.errMgr.grammarError(ErrorType.EMPTY_STRINGS_AND_SETS_NOT_ALLOWED, g.fileName, tree.token, "''");
}
}

View File

@ -982,11 +982,15 @@ public enum ErrorType {
*
* <p>string literals and sets cannot be empty</p>
*
* <pre>A: '''test''';</pre>
* <pre>B: '';</pre>
* <pre>C: 'test' '';</pre>
* <pre>
* A: '''test''';
* B: '';
* C: 'test' '';
* D: [];
* E: [f-a];
* </pre>
*/
EMPTY_STRINGS_NOT_ALLOWED(174, "string literals cannot be empty", ErrorSeverity.ERROR),
EMPTY_STRINGS_AND_SETS_NOT_ALLOWED(174, "string literals and sets cannot be empty: <arg>", ErrorSeverity.ERROR),
/**
* Compiler Error 175.
*
@ -1027,6 +1031,19 @@ public enum ErrorType {
* <p>T00: 'a00' -> skip, more;</p>
*/
INCOMPATIBLE_COMMANDS(179, "incompatible commands <arg> and <arg2>", ErrorSeverity.WARNING),
/**
* Compiler Warning 180.
*
* <p>chars "a-f" used multiple times in set [a-fc-m]</p>
*
* <pre>
* A: [aa-z]; // warning
* B: [a-fc-m]; // warning
* </pre>
*
* TODO: Does not work with fragment rules.
*/
CHARACTERS_COLLISION_IN_SET(180, "chars \"<arg>\" used multiple times in set <arg2>", ErrorSeverity.WARNING),
/*
* Backward incompatibility errors

View File

@ -8,6 +8,7 @@ package org.antlr.v4.tool.ast;
import org.antlr.runtime.Token;
import org.antlr.v4.misc.CharSupport;
import org.antlr.v4.tool.ErrorType;
import java.util.Collections;
import java.util.HashMap;
@ -41,6 +42,10 @@ public abstract class GrammarASTWithOptions extends GrammarAST {
String v = value.getText();
if ( v.startsWith("'") || v.startsWith("\"") ) {
v = CharSupport.getStringFromGrammarStringLiteral(v);
if (v == null) {
g.tool.errMgr.grammarError(ErrorType.INVALID_ESCAPE_SEQUENCE, g.fileName, value.getToken());
v = "";
}
}
return v;
}