New \p{Letter} Unicode property escape

This commit is contained in:
Ben Hamilton 2017-03-01 14:05:39 -08:00
parent d11968d993
commit ca03e6ab5e
4 changed files with 341 additions and 30 deletions

View File

@ -115,6 +115,129 @@ public class TestATNConstruction extends BaseJavaToolTest {
"s4->RuleStop_A_2\n";
checkTokensRule(g, null, expecting);
}
/** A char set listing three adjacent characters becomes one set transition over 97..99. */
@Test public void testCharSet() throws Exception {
    String grammarText =
        "lexer grammar P;\n"+
        "A : [abc] ;";
    LexerGrammar lexerGrammar = new LexerGrammar(grammarText);

    StringBuilder expectedAtn = new StringBuilder();
    expectedAtn.append("s0->RuleStart_A_1\n");
    expectedAtn.append("RuleStart_A_1->s3\n");
    expectedAtn.append("s3-{97..99}->s4\n");
    expectedAtn.append("s4->RuleStop_A_2\n");

    checkTokensRule(lexerGrammar, null, expectedAtn.toString());
}
/** A char set written as the range a-c becomes one set transition over 97..99. */
@Test public void testCharSetRange() throws Exception {
    String grammarText =
        "lexer grammar P;\n"+
        "A : [a-c] ;";
    LexerGrammar lexerGrammar = new LexerGrammar(grammarText);

    StringBuilder expectedAtn = new StringBuilder();
    expectedAtn.append("s0->RuleStart_A_1\n");
    expectedAtn.append("RuleStart_A_1->s3\n");
    expectedAtn.append("s3-{97..99}->s4\n");
    expectedAtn.append("s4->RuleStop_A_2\n");

    checkTokensRule(lexerGrammar, null, expectedAtn.toString());
}
/** A four-hex-digit escape for U+ABCD inside a char set yields a transition on code point 43981. */
@Test public void testCharSetUnicodeBMPEscape() throws Exception {
    String grammarText =
        "lexer grammar P;\n"+
        "A : [\\uABCD] ;";
    LexerGrammar lexerGrammar = new LexerGrammar(grammarText);

    StringBuilder expectedAtn = new StringBuilder();
    expectedAtn.append("s0->RuleStart_A_1\n");
    expectedAtn.append("RuleStart_A_1->s3\n");
    expectedAtn.append("s3-43981->s4\n");
    expectedAtn.append("s4->RuleStop_A_2\n");

    checkTokensRule(lexerGrammar, null, expectedAtn.toString());
}
/** A plain range followed by a range between two four-hex-digit escapes (U+ABCD to U+ABFF). */
@Test public void testCharSetUnicodeBMPEscapeRange() throws Exception {
    String grammarText =
        "lexer grammar P;\n"+
        "A : [a-c\\uABCD-\\uABFF] ;";
    LexerGrammar lexerGrammar = new LexerGrammar(grammarText);

    StringBuilder expectedAtn = new StringBuilder();
    expectedAtn.append("s0->RuleStart_A_1\n");
    expectedAtn.append("RuleStart_A_1->s3\n");
    expectedAtn.append("s3-{97..99, 43981..44031}->s4\n");
    expectedAtn.append("s4->RuleStop_A_2\n");

    checkTokensRule(lexerGrammar, null, expectedAtn.toString());
}
/** A braced extended escape for U+10ABCD inside a char set yields a transition on code point 1092557. */
@Test public void testCharSetUnicodeSMPEscape() throws Exception {
    String grammarText =
        "lexer grammar P;\n"+
        "A : [\\u{10ABCD}] ;";
    LexerGrammar lexerGrammar = new LexerGrammar(grammarText);

    StringBuilder expectedAtn = new StringBuilder();
    expectedAtn.append("s0->RuleStart_A_1\n");
    expectedAtn.append("RuleStart_A_1->s3\n");
    expectedAtn.append("s3-1092557->s4\n");
    expectedAtn.append("s4->RuleStop_A_2\n");

    checkTokensRule(lexerGrammar, null, expectedAtn.toString());
}
/** A plain range followed by a range between two braced extended escapes (U+10ABCD to U+10ABFF). */
@Test public void testCharSetUnicodeSMPEscapeRange() throws Exception {
    String grammarText =
        "lexer grammar P;\n"+
        "A : [a-c\\u{10ABCD}-\\u{10ABFF}] ;";
    LexerGrammar lexerGrammar = new LexerGrammar(grammarText);

    StringBuilder expectedAtn = new StringBuilder();
    expectedAtn.append("s0->RuleStart_A_1\n");
    expectedAtn.append("RuleStart_A_1->s3\n");
    expectedAtn.append("s3-{97..99, 1092557..1092607}->s4\n");
    expectedAtn.append("s4->RuleStop_A_2\n");

    checkTokensRule(lexerGrammar, null, expectedAtn.toString());
}
/**
 * A single property escape inside a char set expands to that property's code points.
 * Gothic is used because the script is long dead, so its code point range is
 * unlikely to change and break the expectation below.
 */
@Test public void testCharSetUnicodePropertyEscape() throws Exception {
    String grammarText =
        "lexer grammar P;\n"+
        "A : [\\p{Gothic}] ;";
    LexerGrammar lexerGrammar = new LexerGrammar(grammarText);

    StringBuilder expectedAtn = new StringBuilder();
    expectedAtn.append("s0->RuleStart_A_1\n");
    expectedAtn.append("RuleStart_A_1->s3\n");
    expectedAtn.append("s3-{66352..66378}->s4\n");
    expectedAtn.append("s4->RuleStop_A_2\n");

    checkTokensRule(lexerGrammar, null, expectedAtn.toString());
}
/** An inverted property escape (capital P) expands to every code point outside the Gothic range. */
@Test public void testCharSetUnicodePropertyInvertEscape() throws Exception {
    String grammarText =
        "lexer grammar P;\n"+
        "A : [\\P{Gothic}] ;";
    LexerGrammar lexerGrammar = new LexerGrammar(grammarText);

    StringBuilder expectedAtn = new StringBuilder();
    expectedAtn.append("s0->RuleStart_A_1\n");
    expectedAtn.append("RuleStart_A_1->s3\n");
    expectedAtn.append("s3-{0..66351, 66379..1114111}->s4\n");
    expectedAtn.append("s4->RuleStop_A_2\n");

    checkTokensRule(lexerGrammar, null, expectedAtn.toString());
}
/**
 * Two property escapes in one char set produce the union of both properties.
 * As with Gothic, the Mahajani script is chosen in the hope that its range stays stable.
 */
@Test public void testCharSetUnicodeMultiplePropertyEscape() throws Exception {
    String grammarText =
        "lexer grammar P;\n"+
        "A : [\\p{Gothic}\\p{Mahajani}] ;";
    LexerGrammar lexerGrammar = new LexerGrammar(grammarText);

    StringBuilder expectedAtn = new StringBuilder();
    expectedAtn.append("s0->RuleStart_A_1\n");
    expectedAtn.append("RuleStart_A_1->s3\n");
    expectedAtn.append("s3-{66352..66378, 69968..70006}->s4\n");
    expectedAtn.append("s4->RuleStop_A_2\n");

    checkTokensRule(lexerGrammar, null, expectedAtn.toString());
}
/** Two overlapping properties in one char set are merged into a single interval set. */
@Test public void testCharSetUnicodePropertyOverlap() throws Exception {
    String grammarText =
        "lexer grammar P;\n"+
        "A : [\\p{ASCII_Hex_Digit}\\p{Hex_Digit}] ;";
    LexerGrammar lexerGrammar = new LexerGrammar(grammarText);

    StringBuilder expectedAtn = new StringBuilder();
    expectedAtn.append("s0->RuleStart_A_1\n");
    expectedAtn.append("RuleStart_A_1->s3\n");
    expectedAtn.append("s3-{48..57, 65..70, 97..102, 65296..65305, 65313..65318, 65345..65350}->s4\n");
    expectedAtn.append("s4->RuleStop_A_2\n");

    checkTokensRule(lexerGrammar, null, expectedAtn.toString());
}
@Test public void testRangeOrRange() throws Exception {
LexerGrammar g = new LexerGrammar(
"lexer grammar P;\n"+

View File

@ -529,6 +529,44 @@ public class TestToolSyntaxErrors extends BaseJavaToolTest {
super.testErrors(pair, true);
}
/**
 * Feeds a lexer grammar full of malformed char-set escapes to the tool and checks the
 * reported errors: eight invalid-escape errors (grammar lines 2-9) followed by four
 * property-in-range errors (grammar lines 10-13).
 */
@Test public void testInvalidUnicodeEscapesInCharSet() {
    String grammar =
        "lexer grammar Test;\n" +
        "INVALID_EXTENDED_UNICODE_EMPTY: [\\u{}];\n" +
        "INVALID_EXTENDED_UNICODE_NOT_TERMINATED: [\\u{];\n" +
        "INVALID_EXTENDED_UNICODE_TOO_LONG: [\\u{110000}];\n" +
        "INVALID_UNICODE_PROPERTY_EMPTY: [\\p{}];\n" +
        "INVALID_UNICODE_PROPERTY_NOT_TERMINATED: [\\p{];\n" +
        "INVALID_INVERTED_UNICODE_PROPERTY_EMPTY: [\\P{}];\n" +
        "INVALID_UNICODE_PROPERTY_UNKNOWN: [\\p{NotAProperty}];\n" +
        "INVALID_INVERTED_UNICODE_PROPERTY_UNKNOWN: [\\P{NotAProperty}];\n" +
        "UNICODE_PROPERTY_NOT_ALLOWED_IN_RANGE: [\\p{Uppercase_Letter}-\\p{Lowercase_Letter}];\n" +
        "UNICODE_PROPERTY_NOT_ALLOWED_IN_RANGE_2: [\\p{Letter}-Z];\n" +
        "UNICODE_PROPERTY_NOT_ALLOWED_IN_RANGE_3: [A-\\p{Number}];\n" +
        "INVERTED_UNICODE_PROPERTY_NOT_ALLOWED_IN_RANGE: [\\P{Uppercase_Letter}-\\P{Number}];\n";

    // Shared pieces of the two message shapes; only the position and argument vary per line.
    String escapeHead = "error(" + ErrorType.INVALID_ESCAPE_SEQUENCE.code + "): Test.g4:";
    String escapeTail = ": invalid escape sequence\n";
    String rangeHead = "error(" + ErrorType.UNICODE_PROPERTY_NOT_ALLOWED_IN_RANGE.code + "): Test.g4:";
    String rangeMiddle = ": unicode property escapes not allowed in lexer charset range: ";

    StringBuilder expected = new StringBuilder();
    expected.append(escapeHead).append("2:32").append(escapeTail);
    expected.append(escapeHead).append("3:41").append(escapeTail);
    expected.append(escapeHead).append("4:35").append(escapeTail);
    expected.append(escapeHead).append("5:32").append(escapeTail);
    expected.append(escapeHead).append("6:41").append(escapeTail);
    expected.append(escapeHead).append("7:41").append(escapeTail);
    expected.append(escapeHead).append("8:34").append(escapeTail);
    expected.append(escapeHead).append("9:43").append(escapeTail);
    expected.append(rangeHead).append("10:39").append(rangeMiddle)
        .append("[\\p{Uppercase_Letter}-\\p{Lowercase_Letter}]\n");
    expected.append(rangeHead).append("11:41").append(rangeMiddle)
        .append("[\\p{Letter}-Z]\n");
    expected.append(rangeHead).append("12:41").append(rangeMiddle)
        .append("[A-\\p{Number}]\n");
    expected.append(rangeHead).append("13:48").append(rangeMiddle)
        .append("[\\P{Uppercase_Letter}-\\P{Number}]\n");

    super.testErrors(new String[] { grammar, expected.toString() }, true);
}
/**
* This test ensures the {@link ErrorType#UNRECOGNIZED_ASSOC_OPTION} warning
* is produced as described in the documentation.

View File

@ -10,6 +10,7 @@ import org.antlr.runtime.CommonToken;
import org.antlr.runtime.Token;
import org.antlr.v4.codegen.CodeGenerator;
import org.antlr.v4.misc.CharSupport;
import org.antlr.v4.misc.EscapeSequenceParsing;
import org.antlr.v4.parse.ANTLRParser;
import org.antlr.v4.runtime.IntStream;
import org.antlr.v4.runtime.Lexer;
@ -49,6 +50,7 @@ import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import java.util.Set;
public class LexerATNFactory extends ParserATNFactory {
@ -365,7 +367,7 @@ public class LexerATNFactory extends ParserATNFactory {
return new Handle(left, right);
}
/** [Aa\t \u1234a-z\]\-] char sets */
/** [Aa\t \u1234a-z\]\p{Letter}\-] char sets */
@Override
public Handle charSetLiteral(GrammarAST charSetAST) {
ATNState left = newState(charSetAST);
@ -376,10 +378,68 @@ public class LexerATNFactory extends ParserATNFactory {
return new Handle(left, right);
}
/**
 * Immutable snapshot of where the char-set parser is: what kind of element was
 * seen last (if any), whether a range operator is pending, and the value of
 * that last element.
 */
private static class CharSetParseState {
    /** Kind of the most recently parsed char-set element. */
    enum Mode {
        NONE,
        ERROR,
        PREV_CODE_POINT,
        PREV_PROPERTY
    }

    /** State with nothing pending. */
    public static final CharSetParseState NONE = new CharSetParseState(Mode.NONE, false, -1, IntervalSet.EMPTY_SET);
    /** State entered after an error has been reported. */
    public static final CharSetParseState ERROR = new CharSetParseState(Mode.ERROR, false, -1, IntervalSet.EMPTY_SET);

    public final Mode mode;
    public final boolean inRange;
    public final int prevCodePoint;
    public final IntervalSet prevProperty;

    public CharSetParseState(
            Mode mode,
            boolean inRange,
            int prevCodePoint,
            IntervalSet prevProperty) {
        this.mode = mode;
        this.inRange = inRange;
        this.prevCodePoint = prevCodePoint;
        this.prevProperty = prevProperty;
    }

    @Override
    public String toString() {
        return String.format(
            "%s mode=%s inRange=%s prevCodePoint=%d prevProperty=%s",
            super.toString(),
            mode,
            inRange,
            prevCodePoint,
            prevProperty);
    }

    @Override
    public boolean equals(Object other) {
        if (this == other) {
            return true;
        }
        if (!(other instanceof CharSetParseState)) {
            return false;
        }
        CharSetParseState rhs = (CharSetParseState) other;
        // Enum and primitive fields compare directly; only the interval set needs a null-safe equals.
        if (mode != rhs.mode) {
            return false;
        }
        if (inRange != rhs.inRange) {
            return false;
        }
        if (prevCodePoint != rhs.prevCodePoint) {
            return false;
        }
        return Objects.equals(prevProperty, rhs.prevProperty);
    }

    @Override
    public int hashCode() {
        return Objects.hash(mode, inRange, prevCodePoint, prevProperty);
    }
}
public IntervalSet getSetFromCharSetLiteral(GrammarAST charSetAST) {
String chars = charSetAST.getText();
chars = chars.substring(1, chars.length() - 1);
String cset = '"' + chars + '"';
IntervalSet set = new IntervalSet();
if (chars.length() == 0) {
@ -387,46 +447,122 @@ public class LexerATNFactory extends ParserATNFactory {
g.fileName, charSetAST.getToken(), "[]");
return set;
}
// unescape all valid escape char like \n, leaving escaped dashes as '\-'
// so we can avoid seeing them as '-' range ops.
chars = CharSupport.getStringFromGrammarStringLiteral(cset);
if (chars == null) {
g.tool.errMgr.grammarError(ErrorType.INVALID_ESCAPE_SEQUENCE,
g.fileName, charSetAST.getToken());
return set;
}
CharSetParseState state = CharSetParseState.NONE;
int n = chars.length();
// now make x-y become set of char
for (int i = 0; i < n; ) {
if (state.mode == CharSetParseState.Mode.ERROR) {
return new IntervalSet();
}
int c = chars.codePointAt(i);
int offset = Character.charCount(c);
if (c == '\\' && i+offset < n && chars.codePointAt(i+offset) == '-') { // \-
checkSetCollision(charSetAST, set, '-');
set.add('-');
offset++;
}
else if (i+offset+1 < n && chars.codePointAt(i+offset) == '-') { // range x-y
int x = c;
int y = chars.codePointAt(i+offset+1);
if (x <= y) {
checkSetCollision(charSetAST, set, x, y);
set.add(x,y);
if (c == '\\') {
EscapeSequenceParsing.Result escapeParseResult =
EscapeSequenceParsing.parseEscape(chars, i);
switch (escapeParseResult.type) {
case INVALID:
g.tool.errMgr.grammarError(ErrorType.INVALID_ESCAPE_SEQUENCE,
g.fileName, charSetAST.getToken(), charSetAST.getText());
state = CharSetParseState.ERROR;
break;
case CODE_POINT:
state = applyPrevStateAndMoveToCodePoint(charSetAST, set, state, escapeParseResult.codePoint);
break;
case PROPERTY:
state = applyPrevStateAndMoveToProperty(charSetAST, set, state, escapeParseResult.propertyIntervalSet);
break;
}
else {
g.tool.errMgr.grammarError(ErrorType.EMPTY_STRINGS_AND_SETS_NOT_ALLOWED,
g.fileName, charSetAST.getToken(), CharSupport.toRange(x, y, CharSupport.ToRangeMode.BRACKETED));
offset = escapeParseResult.parseLength;
} else if (c == '-' && !state.inRange) {
if (state.mode == CharSetParseState.Mode.PREV_PROPERTY) {
g.tool.errMgr.grammarError(ErrorType.UNICODE_PROPERTY_NOT_ALLOWED_IN_RANGE,
g.fileName, charSetAST.getToken(), charSetAST.getText());
state = CharSetParseState.ERROR;
} else {
state = new CharSetParseState(state.mode, true, state.prevCodePoint, state.prevProperty);
}
offset += Character.charCount(y) + 1;
}
else {
checkSetCollision(charSetAST, set, c);
set.add(c);
} else {
state = applyPrevStateAndMoveToCodePoint(charSetAST, set, state, c);
}
i += offset;
}
if (state.mode == CharSetParseState.Mode.ERROR) {
return new IntervalSet();
}
// Whether or not we were in a range, we'll add the last code point found to the set.
// If the range wasn't terminated, we'll treat it as a standalone codepoint.
applyPrevState(charSetAST, set, state);
if (state.inRange) {
// Unterminated range; add a literal hyphen to the set.
checkSetCollision(charSetAST, set, '-');
set.add('-');
}
return set;
}
/**
 * Consumes one code point of a char set: folds whatever the previous state was
 * holding into {@code set} and returns the state to continue parsing from.
 *
 * <ul>
 * <li>Range pending with a recorded start code point: the range
 *     {@code prevCodePoint..codePoint} is added (an error is reported when the
 *     start is greater than the end) and the state resets to
 *     {@link CharSetParseState#NONE}.</li>
 * <li>Range pending without a start code point (the {@code -} was the first
 *     thing in the set, or directly followed a completed range): the hyphen is
 *     added as a literal and {@code codePoint} becomes the pending code point.
 *     Without this, the range would start at the {@code -1} placeholder.</li>
 * <li>No range pending: the previous element is added and {@code codePoint}
 *     becomes the pending code point.</li>
 * </ul>
 *
 * @param charSetAST the char set literal being parsed, used for error reporting
 * @param set the interval set being accumulated
 * @param state the state before {@code codePoint} was seen
 * @param codePoint the code point just parsed
 * @return the state after consuming {@code codePoint}
 */
private CharSetParseState applyPrevStateAndMoveToCodePoint(
        GrammarAST charSetAST,
        IntervalSet set,
        CharSetParseState state,
        int codePoint) {
    if (state.inRange && state.mode != CharSetParseState.Mode.PREV_CODE_POINT) {
        // A '-' with no start code point is not a range operator: flush anything
        // pending, keep the hyphen as a literal, and remember the new code point.
        applyPrevState(charSetAST, set, state);
        checkSetCollision(charSetAST, set, '-');
        set.add('-');
        return new CharSetParseState(
            CharSetParseState.Mode.PREV_CODE_POINT,
            false,
            codePoint,
            IntervalSet.EMPTY_SET);
    }
    if (state.inRange) {
        if (state.prevCodePoint > codePoint) {
            g.tool.errMgr.grammarError(
                ErrorType.EMPTY_STRINGS_AND_SETS_NOT_ALLOWED,
                g.fileName,
                charSetAST.getToken(),
                CharSupport.toRange(state.prevCodePoint, codePoint, CharSupport.ToRangeMode.BRACKETED));
        }
        checkSetCollision(charSetAST, set, state.prevCodePoint, codePoint);
        set.add(state.prevCodePoint, codePoint);
        state = CharSetParseState.NONE;
    } else {
        applyPrevState(charSetAST, set, state);
        state = new CharSetParseState(
            CharSetParseState.Mode.PREV_CODE_POINT,
            false,
            codePoint,
            IntervalSet.EMPTY_SET);
    }
    return state;
}
/**
 * Consumes one Unicode property escape of a char set.
 *
 * <p>When no range is pending, the previous element is folded into {@code set}
 * and the property becomes the pending element. When a range is pending, a
 * {@code UNICODE_PROPERTY_NOT_ALLOWED_IN_RANGE} error is reported and the
 * error state is returned.</p>
 *
 * @param charSetAST the char set literal being parsed, used for error reporting
 * @param set the interval set being accumulated
 * @param state the state before the property escape was seen
 * @param property the code points covered by the property escape
 * @return the state after consuming the property escape
 */
private CharSetParseState applyPrevStateAndMoveToProperty(
        GrammarAST charSetAST,
        IntervalSet set,
        CharSetParseState state,
        IntervalSet property) {
    if (!state.inRange) {
        applyPrevState(charSetAST, set, state);
        return new CharSetParseState(
            CharSetParseState.Mode.PREV_PROPERTY,
            false,
            -1,
            property);
    }

    // A property stands for a whole set of code points, so it cannot close a range.
    g.tool.errMgr.grammarError(ErrorType.UNICODE_PROPERTY_NOT_ALLOWED_IN_RANGE,
        g.fileName, charSetAST.getToken(), charSetAST.getText());
    return CharSetParseState.ERROR;
}
/**
 * Adds the element held by {@code state} to {@code set}: a pending code point
 * is collision-checked and added, a pending property is added wholesale, and
 * the NONE and ERROR modes contribute nothing.
 *
 * @param charSetAST the char set literal being parsed, used for collision reporting
 * @param set the interval set being accumulated
 * @param state the state whose pending element should be applied
 */
private void applyPrevState(GrammarAST charSetAST, IntervalSet set, CharSetParseState state) {
    switch (state.mode) {
        case PREV_PROPERTY:
            set.addAll(state.prevProperty);
            return;
        case PREV_CODE_POINT: {
            int pending = state.prevCodePoint;
            checkSetCollision(charSetAST, set, pending);
            set.add(pending);
            return;
        }
        default:
            // NONE and ERROR hold nothing to apply.
            return;
    }
}
protected void checkSetCollision(GrammarAST ast, IntervalSet set, int el) {
if (set.contains(el)) {
g.tool.errMgr.grammarError(ErrorType.CHARACTERS_COLLISION_IN_SET, g.fileName, ast.getToken(),

View File

@ -1060,6 +1060,20 @@ public enum ErrorType {
*/
TOKEN_RANGE_IN_PARSER(181, "token ranges not allowed in parser: <arg>..<arg2>", ErrorSeverity.ERROR),
/**
* Compiler Error 182.
*
* <p>Unicode property escapes cannot be used as an endpoint of a lexer
* charset range. The message argument is the text of the offending
* charset.</p>
*
* <pre>
* A: [\p{Letter}-\p{Number}];
* </pre>
*/
UNICODE_PROPERTY_NOT_ALLOWED_IN_RANGE(
182,
"unicode property escapes not allowed in lexer charset range: <arg>",
ErrorSeverity.ERROR),
/*
* Backward incompatibility errors
*/