Fixes #1815. Report which escape sequence is invalid. Match \x for any x, but report an error; this prevents \x from being treated as a sequence of two characters. Update unit tests.

This commit is contained in:
parrt 2017-04-06 10:26:03 -07:00
parent 8af0080103
commit 80aa7907a4
7 changed files with 100 additions and 83 deletions

View File

@ -8,7 +8,6 @@ package org.antlr.v4.test.tool;
import org.antlr.v4.misc.EscapeSequenceParsing;
import org.antlr.v4.runtime.misc.IntervalSet;
import org.junit.Test;
import static org.antlr.v4.misc.EscapeSequenceParsing.Result;
@ -18,113 +17,113 @@ public class TestEscapeSequenceParsing {
@Test
public void testParseEmpty() {
assertEquals(
EscapeSequenceParsing.Result.INVALID,
EscapeSequenceParsing.parseEscape("", 0));
EscapeSequenceParsing.Result.Type.INVALID,
EscapeSequenceParsing.parseEscape("", 0).type);
}
@Test
public void testParseJustBackslash() {
assertEquals(
EscapeSequenceParsing.Result.INVALID,
EscapeSequenceParsing.parseEscape("\\", 0));
EscapeSequenceParsing.Result.Type.INVALID,
EscapeSequenceParsing.parseEscape("\\", 0).type);
}
@Test
public void testParseInvalidEscape() {
assertEquals(
EscapeSequenceParsing.Result.INVALID,
EscapeSequenceParsing.parseEscape("\\z", 0));
EscapeSequenceParsing.Result.Type.INVALID,
EscapeSequenceParsing.parseEscape("\\z", 0).type);
}
@Test
public void testParseNewline() {
assertEquals(
new Result(Result.Type.CODE_POINT, '\n', IntervalSet.EMPTY_SET, 2),
new Result(Result.Type.CODE_POINT, '\n', IntervalSet.EMPTY_SET, 0,2),
EscapeSequenceParsing.parseEscape("\\n", 0));
}
@Test
public void testParseTab() {
assertEquals(
new Result(Result.Type.CODE_POINT, '\t', IntervalSet.EMPTY_SET, 2),
new Result(Result.Type.CODE_POINT, '\t', IntervalSet.EMPTY_SET, 0,2),
EscapeSequenceParsing.parseEscape("\\t", 0));
}
@Test
public void testParseUnicodeTooShort() {
assertEquals(
EscapeSequenceParsing.Result.INVALID,
EscapeSequenceParsing.parseEscape("\\uABC", 0));
EscapeSequenceParsing.Result.Type.INVALID,
EscapeSequenceParsing.parseEscape("\\uABC", 0).type);
}
@Test
public void testParseUnicodeBMP() {
assertEquals(
new Result(Result.Type.CODE_POINT, 0xABCD, IntervalSet.EMPTY_SET, 6),
new Result(Result.Type.CODE_POINT, 0xABCD, IntervalSet.EMPTY_SET, 0,6),
EscapeSequenceParsing.parseEscape("\\uABCD", 0));
}
@Test
public void testParseUnicodeSMPTooShort() {
assertEquals(
EscapeSequenceParsing.Result.INVALID,
EscapeSequenceParsing.parseEscape("\\u{}", 0));
EscapeSequenceParsing.Result.Type.INVALID,
EscapeSequenceParsing.parseEscape("\\u{}", 0).type);
}
@Test
public void testParseUnicodeSMPMissingCloseBrace() {
assertEquals(
EscapeSequenceParsing.Result.INVALID,
EscapeSequenceParsing.parseEscape("\\u{12345", 0));
EscapeSequenceParsing.Result.Type.INVALID,
EscapeSequenceParsing.parseEscape("\\u{12345", 0).type);
}
@Test
public void testParseUnicodeTooBig() {
assertEquals(
EscapeSequenceParsing.Result.INVALID,
EscapeSequenceParsing.parseEscape("\\u{110000}", 0));
EscapeSequenceParsing.Result.Type.INVALID,
EscapeSequenceParsing.parseEscape("\\u{110000}", 0).type);
}
@Test
public void testParseUnicodeSMP() {
assertEquals(
new Result(Result.Type.CODE_POINT, 0x10ABCD, IntervalSet.EMPTY_SET, 10),
new Result(Result.Type.CODE_POINT, 0x10ABCD, IntervalSet.EMPTY_SET, 0,10),
EscapeSequenceParsing.parseEscape("\\u{10ABCD}", 0));
}
@Test
public void testParseUnicodePropertyTooShort() {
assertEquals(
EscapeSequenceParsing.Result.INVALID,
EscapeSequenceParsing.parseEscape("\\p{}", 0));
EscapeSequenceParsing.Result.Type.INVALID,
EscapeSequenceParsing.parseEscape("\\p{}", 0).type);
}
@Test
public void testParseUnicodePropertyMissingCloseBrace() {
assertEquals(
EscapeSequenceParsing.Result.INVALID,
EscapeSequenceParsing.parseEscape("\\p{1234", 0));
EscapeSequenceParsing.Result.Type.INVALID,
EscapeSequenceParsing.parseEscape("\\p{1234", 0).type);
}
@Test
public void testParseUnicodeProperty() {
assertEquals(
new Result(Result.Type.PROPERTY, -1, IntervalSet.of(66560, 66639), 11),
new Result(Result.Type.PROPERTY, -1, IntervalSet.of(66560, 66639), 0,11),
EscapeSequenceParsing.parseEscape("\\p{Deseret}", 0));
}
@Test
public void testParseUnicodePropertyInvertedTooShort() {
assertEquals(
EscapeSequenceParsing.Result.INVALID,
EscapeSequenceParsing.parseEscape("\\P{}", 0));
EscapeSequenceParsing.Result.Type.INVALID,
EscapeSequenceParsing.parseEscape("\\P{}", 0).type);
}
@Test
public void testParseUnicodePropertyInvertedMissingCloseBrace() {
assertEquals(
EscapeSequenceParsing.Result.INVALID,
EscapeSequenceParsing.parseEscape("\\P{Deseret", 0));
EscapeSequenceParsing.Result.Type.INVALID,
EscapeSequenceParsing.parseEscape("\\P{Deseret", 0).type);
}
@Test
@ -132,7 +131,7 @@ public class TestEscapeSequenceParsing {
IntervalSet expected = IntervalSet.of(0, 66559);
expected.add(66640, Character.MAX_CODE_POINT);
assertEquals(
new Result(Result.Type.PROPERTY, -1, expected, 11),
new Result(Result.Type.PROPERTY, -1, expected, 0, 11),
EscapeSequenceParsing.parseEscape("\\P{Deseret}", 0));
}
}

View File

@ -462,9 +462,9 @@ public class TestToolSyntaxErrors extends BaseJavaToolTest {
"lexer grammar A;\n" +
"RULE : 'Foo \\uAABG \\x \\u';\n";
String expected =
"warning(" + ErrorType.INVALID_ESCAPE_SEQUENCE.code + "): A.g4:2:12: invalid escape sequence\n" +
"warning(" + ErrorType.INVALID_ESCAPE_SEQUENCE.code + "): A.g4:2:19: invalid escape sequence\n" +
"warning(" + ErrorType.INVALID_ESCAPE_SEQUENCE.code + "): A.g4:2:22: invalid escape sequence\n";
"warning(" + ErrorType.INVALID_ESCAPE_SEQUENCE.code + "): A.g4:2:12: invalid escape sequence \\uAABG\n" +
"warning(" + ErrorType.INVALID_ESCAPE_SEQUENCE.code + "): A.g4:2:19: invalid escape sequence \\x\n" +
"warning(" + ErrorType.INVALID_ESCAPE_SEQUENCE.code + "): A.g4:2:22: invalid escape sequence \\u\n";
String[] pair = new String[] {
grammar,
@ -516,13 +516,13 @@ public class TestToolSyntaxErrors extends BaseJavaToolTest {
"VALID_CHAR_SET: [`\\-=\\]];";
String expected =
"warning(" + ErrorType.INVALID_ESCAPE_SEQUENCE.code + "): Test.g4:2:31: invalid escape sequence\n" +
"warning(" + ErrorType.INVALID_ESCAPE_SEQUENCE.code + "): Test.g4:2:38: invalid escape sequence\n" +
"warning(" + ErrorType.INVALID_ESCAPE_SEQUENCE.code + "): Test.g4:2:45: invalid escape sequence\n" +
"warning(" + ErrorType.INVALID_ESCAPE_SEQUENCE.code + "): Test.g4:2:31: invalid escape sequence \\\"\n" +
"warning(" + ErrorType.INVALID_ESCAPE_SEQUENCE.code + "): Test.g4:2:38: invalid escape sequence \\]\n" +
"warning(" + ErrorType.INVALID_ESCAPE_SEQUENCE.code + "): Test.g4:2:45: invalid escape sequence \\u24\n" +
"error(" + ErrorType.INVALID_LITERAL_IN_LEXER_SET.code + "): Test.g4:3:30: multi-character literals are not allowed in lexer sets: 'GH'\n" +
"error(" + ErrorType.INVALID_LITERAL_IN_LEXER_SET.code + "): Test.g4:3:36: multi-character literals are not allowed in lexer sets: 'LM'\n" +
"warning(" + ErrorType.INVALID_ESCAPE_SEQUENCE.code + "): Test.g4:4:30: invalid escape sequence\n" +
"warning(" + ErrorType.INVALID_ESCAPE_SEQUENCE.code + "): Test.g4:4:40: invalid escape sequence\n" +
"warning(" + ErrorType.INVALID_ESCAPE_SEQUENCE.code + "): Test.g4:4:30: invalid escape sequence \\u24\\u\n" +
"warning(" + ErrorType.INVALID_ESCAPE_SEQUENCE.code + "): Test.g4:4:40: invalid escape sequence \\{\n" +
"error(" + ErrorType.EMPTY_STRINGS_AND_SETS_NOT_ALLOWED.code + "): Test.g4:5:33: string literals and sets cannot be empty: 'F'..'A'\n" +
"error(" + ErrorType.EMPTY_STRINGS_AND_SETS_NOT_ALLOWED.code + "): Test.g4:6:30: string literals and sets cannot be empty: 'f'..'a'\n" +
"error(" + ErrorType.EMPTY_STRINGS_AND_SETS_NOT_ALLOWED.code + "): Test.g4:6:36: string literals and sets cannot be empty: []\n";
@ -552,14 +552,14 @@ public class TestToolSyntaxErrors extends BaseJavaToolTest {
"INVERTED_UNICODE_PROPERTY_NOT_ALLOWED_IN_RANGE: [\\P{Uppercase_Letter}-\\P{Number}];\n";
String expected =
"warning(" + ErrorType.INVALID_ESCAPE_SEQUENCE.code + "): Test.g4:2:32: invalid escape sequence\n" +
"warning(" + ErrorType.INVALID_ESCAPE_SEQUENCE.code + "): Test.g4:3:41: invalid escape sequence\n" +
"warning(" + ErrorType.INVALID_ESCAPE_SEQUENCE.code + "): Test.g4:4:35: invalid escape sequence\n" +
"warning(" + ErrorType.INVALID_ESCAPE_SEQUENCE.code + "): Test.g4:5:32: invalid escape sequence\n" +
"warning(" + ErrorType.INVALID_ESCAPE_SEQUENCE.code + "): Test.g4:6:41: invalid escape sequence\n" +
"warning(" + ErrorType.INVALID_ESCAPE_SEQUENCE.code + "): Test.g4:7:41: invalid escape sequence\n" +
"warning(" + ErrorType.INVALID_ESCAPE_SEQUENCE.code + "): Test.g4:8:34: invalid escape sequence\n" +
"warning(" + ErrorType.INVALID_ESCAPE_SEQUENCE.code + "): Test.g4:9:43: invalid escape sequence\n" +
"warning(" + ErrorType.INVALID_ESCAPE_SEQUENCE.code + "): Test.g4:2:32: invalid escape sequence \\u{}\n"+
"warning(" + ErrorType.INVALID_ESCAPE_SEQUENCE.code + "): Test.g4:3:41: invalid escape sequence \\u{\n"+
"warning(" + ErrorType.INVALID_ESCAPE_SEQUENCE.code + "): Test.g4:4:35: invalid escape sequence \\u{110\n"+
"warning(" + ErrorType.INVALID_ESCAPE_SEQUENCE.code + "): Test.g4:5:32: invalid escape sequence \\p{}\n"+
"warning(" + ErrorType.INVALID_ESCAPE_SEQUENCE.code + "): Test.g4:6:41: invalid escape sequence \\p{\n"+
"warning(" + ErrorType.INVALID_ESCAPE_SEQUENCE.code + "): Test.g4:7:41: invalid escape sequence \\P{}\n"+
"warning(" + ErrorType.INVALID_ESCAPE_SEQUENCE.code + "): Test.g4:8:34: invalid escape sequence \\p{NotAProperty}\n"+
"warning(" + ErrorType.INVALID_ESCAPE_SEQUENCE.code + "): Test.g4:9:43: invalid escape sequence \\P{NotAProperty}\n"+
"error(" + ErrorType.UNICODE_PROPERTY_NOT_ALLOWED_IN_RANGE.code + "): Test.g4:10:39: unicode property escapes not allowed in lexer charset range: [\\p{Uppercase_Letter}-\\p{Lowercase_Letter}]\n" +
"error(" + ErrorType.UNICODE_PROPERTY_NOT_ALLOWED_IN_RANGE.code + "): Test.g4:11:41: unicode property escapes not allowed in lexer charset range: [\\p{Letter}-Z]\n" +
"error(" + ErrorType.UNICODE_PROPERTY_NOT_ALLOWED_IN_RANGE.code + "): Test.g4:12:41: unicode property escapes not allowed in lexer charset range: [A-\\p{Number}]\n" +

View File

@ -349,7 +349,7 @@ public class LexerATNFactory extends ParserATNFactory {
chars = CharSupport.getStringFromGrammarStringLiteral(chars);
if (chars == null) {
g.tool.errMgr.grammarError(ErrorType.INVALID_ESCAPE_SEQUENCE,
g.fileName, stringLiteralAST.getToken());
g.fileName, stringLiteralAST.getToken(), chars);
return new Handle(left, left);
}
@ -462,8 +462,10 @@ public class LexerATNFactory extends ParserATNFactory {
EscapeSequenceParsing.parseEscape(chars, i);
switch (escapeParseResult.type) {
case INVALID:
String invalid = chars.substring(escapeParseResult.startOffset,
escapeParseResult.startOffset+escapeParseResult.parseLength);
g.tool.errMgr.grammarError(ErrorType.INVALID_ESCAPE_SEQUENCE,
g.fileName, charSetAST.getToken(), charSetAST.getText());
g.fileName, charSetAST.getToken(), invalid);
state = CharSetParseState.ERROR;
break;
case CODE_POINT:

View File

@ -29,17 +29,17 @@ public abstract class EscapeSequenceParsing {
PROPERTY
};
public static Result INVALID = new Result(Type.INVALID, -1, IntervalSet.EMPTY_SET, -1);
public final Type type;
public final int codePoint;
public final IntervalSet propertyIntervalSet;
public final int startOffset;
public final int parseLength;
public Result(Type type, int codePoint, IntervalSet propertyIntervalSet, int parseLength) {
public Result(Type type, int codePoint, IntervalSet propertyIntervalSet, int startOffset, int parseLength) {
this.type = type;
this.codePoint = codePoint;
this.propertyIntervalSet = propertyIntervalSet;
this.startOffset = startOffset;
this.parseLength = parseLength;
}
@ -78,12 +78,12 @@ public abstract class EscapeSequenceParsing {
/**
* Parses a single escape sequence starting at {@code startOff}.
*
* Returns {@link Result#INVALID} if no valid escape sequence was found, a Result otherwise.
* Returns a Result whose type is INVALID if no valid escape sequence was found; otherwise a Result describing the parsed escape.
*/
public static Result parseEscape(String s, int startOff) {
int offset = startOff;
if (offset + 2 > s.length() || s.codePointAt(offset) != '\\') {
return Result.INVALID;
return invalid(startOff, s.length()-1);
}
// Move past backslash
offset++;
@ -93,21 +93,21 @@ public abstract class EscapeSequenceParsing {
if (escaped == 'u') {
// \\u{1} is the shortest we support
if (offset + 3 > s.length()) {
return Result.INVALID;
return invalid(startOff, s.length()-1);
}
int hexStartOffset;
int hexEndOffset;
int hexEndOffset; // appears to be exclusive
if (s.codePointAt(offset) == '{') {
hexStartOffset = offset + 1;
hexEndOffset = s.indexOf('}', hexStartOffset);
if (hexEndOffset == -1) {
return Result.INVALID;
return invalid(startOff, s.length()-1);
}
offset = hexEndOffset + 1;
}
else {
if (offset + 4 > s.length()) {
return Result.INVALID;
return invalid(startOff, s.length()-1);
}
hexStartOffset = offset;
hexEndOffset = offset + 4;
@ -115,28 +115,32 @@ public abstract class EscapeSequenceParsing {
}
int codePointValue = CharSupport.parseHexValue(s, hexStartOffset, hexEndOffset);
if (codePointValue == -1 || codePointValue > Character.MAX_CODE_POINT) {
return Result.INVALID;
return invalid(startOff, startOff+6-1);
}
return new Result(
Result.Type.CODE_POINT,
codePointValue,
IntervalSet.EMPTY_SET,
startOff,
offset - startOff);
}
else if (escaped == 'p' || escaped == 'P') {
// \p{L} is the shortest we support
if (offset + 3 > s.length() || s.codePointAt(offset) != '{') {
return Result.INVALID;
if (offset + 3 > s.length()) {
return invalid(startOff, s.length()-1);
}
if (s.codePointAt(offset) != '{') {
return invalid(startOff, offset);
}
int openBraceOffset = offset;
int closeBraceOffset = s.indexOf('}', openBraceOffset);
if (closeBraceOffset == -1) {
return Result.INVALID;
return invalid(startOff, s.length()-1);
}
String propertyName = s.substring(openBraceOffset + 1, closeBraceOffset);
IntervalSet propertyIntervalSet = UnicodeData.getPropertyCodePoints(propertyName);
if (propertyIntervalSet == null) {
return Result.INVALID;
return invalid(startOff, closeBraceOffset);
}
offset = closeBraceOffset + 1;
if (escaped == 'P') {
@ -146,13 +150,14 @@ public abstract class EscapeSequenceParsing {
Result.Type.PROPERTY,
-1,
propertyIntervalSet,
startOff,
offset - startOff);
}
else if (escaped < CharSupport.ANTLRLiteralEscapedCharValue.length) {
int codePoint = CharSupport.ANTLRLiteralEscapedCharValue[escaped];
if (codePoint == 0) {
if (escaped != ']' && escaped != '-') { // escape ']' and '-' only in char sets.
return Result.INVALID;
return invalid(startOff, startOff+1);
}
else {
codePoint = escaped;
@ -162,10 +167,20 @@ public abstract class EscapeSequenceParsing {
Result.Type.CODE_POINT,
codePoint,
IntervalSet.EMPTY_SET,
startOff,
offset - startOff);
}
else {
return Result.INVALID;
return invalid(startOff,s.length()-1);
}
}
/**
 * Builds the INVALID result covering the span {@code start..stop} of the input.
 *
 * Both bounds are inclusive, so the recorded parse length is
 * {@code stop - start + 1}. The code point is set to 0 and the property
 * interval set to {@link IntervalSet#EMPTY_SET}.
 *
 * @param start offset of the first character of the invalid span
 * @param stop offset of the last character of the invalid span (inclusive)
 * @return a Result of type INVALID starting at {@code start}
 */
private static Result invalid(int start, int stop) {
	// Inclusive bounds, so add one when converting to a length.
	final int length = stop - start + 1;
	return new Result(Result.Type.INVALID, 0, IntervalSet.EMPTY_SET, start, length);
}
}

View File

@ -146,6 +146,7 @@ tokens { SEMPRED; TOKEN_REF; RULE_REF; LEXER_CHAR_SET; ARG_ACTION; }
*/
package org.antlr.v4.parse;
import org.antlr.v4.tool.*;
import org.antlr.v4.runtime.misc.Interval;
}
@ -643,27 +644,23 @@ fragment
ESC_SEQ
: '\\'
(
// The standard escaped character set such as tab, newline,
// etc.
//
// The standard escaped character set such as tab, newline, etc...
'b'|'t'|'n'|'f'|'r'|'\''|'\\'
| // A Java style Unicode escape sequence
//
UNICODE_ESC
| // A Swift/Hack style Unicode escape sequence
//
UNICODE_EXTENDED_ESC
| // An illegal escape sequence
//
~('b'|'t'|'n'|'f'|'r'|'\''|'\\'|'u') // \x for any invalid x (make sure to match char here)
{
Token t = new CommonToken(input, state.type, state.channel, getCharIndex()-1, getCharIndex());
Token t = new CommonToken(input, state.type, state.channel, getCharIndex()-2, getCharIndex()-1);
t.setText(t.getText());
t.setLine(input.getLine());
t.setCharPositionInLine(input.getCharPositionInLine()-1);
grammarError(ErrorType.INVALID_ESCAPE_SEQUENCE, t);
t.setCharPositionInLine(input.getCharPositionInLine()-2);
grammarError(ErrorType.INVALID_ESCAPE_SEQUENCE, t, input.substring(getCharIndex()-2,getCharIndex()-1));
if ( state.text==null ) {
setText(input.substring(state.tokenStartCharIndex, getCharIndex()-2));
}
@ -673,7 +670,6 @@ ESC_SEQ
fragment
UNICODE_ESC
@init {
// Flag to tell us whether we have a valid number of
@ -717,14 +713,19 @@ UNICODE_ESC
// Now check the digit count and issue an error if we need to
//
{
if (hCount != 4) {
Token t = new CommonToken(input, state.type, state.channel, getCharIndex()-3-hCount, getCharIndex()-1);
t.setText(t.getText());
t.setLine(input.getLine());
t.setCharPositionInLine(input.getCharPositionInLine()-hCount-2);
grammarError(ErrorType.INVALID_ESCAPE_SEQUENCE, t);
if (hCount < 4) {
Interval badRange = Interval.of(getCharIndex()-2-hCount, getCharIndex());
String lastChar = input.substring(badRange.b, badRange.b);
if ( lastChar.codePointAt(0)=='\'' ) {
badRange.b--;
}
String bad = input.substring(badRange.a, badRange.b);
Token t = new CommonToken(input, state.type, state.channel, badRange.a, badRange.b);
t.setLine(input.getLine());
t.setCharPositionInLine(input.getCharPositionInLine()-hCount-2);
grammarError(ErrorType.INVALID_ESCAPE_SEQUENCE, t, bad);
if ( state.text==null ) {
setText(input.substring(state.tokenStartCharIndex, getCharIndex()-hCount-3));
setText(bad);
}
}
}
@ -746,7 +747,7 @@ UNICODE_EXTENDED_ESC
t.setText(t.getText());
t.setLine(input.getLine());
t.setCharPositionInLine(input.getCharPositionInLine()-numDigits);
grammarError(ErrorType.INVALID_ESCAPE_SEQUENCE, t);
grammarError(ErrorType.INVALID_ESCAPE_SEQUENCE, t, input.substring(state.tokenStartCharIndex,getCharIndex()-1));
if ( state.text==null ) {
setText(input.substring(state.tokenStartCharIndex, getCharIndex()-numDigits-3));
}

View File

@ -824,7 +824,7 @@ public enum ErrorType {
*
* @since 4.2.1
*/
INVALID_ESCAPE_SEQUENCE(156, "invalid escape sequence", ErrorSeverity.WARNING),
INVALID_ESCAPE_SEQUENCE(156, "invalid escape sequence <arg>", ErrorSeverity.WARNING),
/**
* Compiler Warning 157.
*

View File

@ -43,7 +43,7 @@ public abstract class GrammarASTWithOptions extends GrammarAST {
if ( v.startsWith("'") || v.startsWith("\"") ) {
v = CharSupport.getStringFromGrammarStringLiteral(v);
if (v == null) {
g.tool.errMgr.grammarError(ErrorType.INVALID_ESCAPE_SEQUENCE, g.fileName, value.getToken());
g.tool.errMgr.grammarError(ErrorType.INVALID_ESCAPE_SEQUENCE, g.fileName, value.getToken(), value.getText());
v = "";
}
}