New \p{Letter} Unicode property escape
This commit is contained in:
parent
d11968d993
commit
ca03e6ab5e
|
@ -115,6 +115,129 @@ public class TestATNConstruction extends BaseJavaToolTest {
|
|||
"s4->RuleStop_A_2\n";
|
||||
checkTokensRule(g, null, expecting);
|
||||
}
|
||||
@Test public void testCharSet() throws Exception {
|
||||
LexerGrammar g = new LexerGrammar(
|
||||
"lexer grammar P;\n"+
|
||||
"A : [abc] ;"
|
||||
);
|
||||
String expecting =
|
||||
"s0->RuleStart_A_1\n" +
|
||||
"RuleStart_A_1->s3\n" +
|
||||
"s3-{97..99}->s4\n" +
|
||||
"s4->RuleStop_A_2\n";
|
||||
checkTokensRule(g, null, expecting);
|
||||
}
|
||||
@Test public void testCharSetRange() throws Exception {
|
||||
LexerGrammar g = new LexerGrammar(
|
||||
"lexer grammar P;\n"+
|
||||
"A : [a-c] ;"
|
||||
);
|
||||
String expecting =
|
||||
"s0->RuleStart_A_1\n" +
|
||||
"RuleStart_A_1->s3\n" +
|
||||
"s3-{97..99}->s4\n" +
|
||||
"s4->RuleStop_A_2\n";
|
||||
checkTokensRule(g, null, expecting);
|
||||
}
|
||||
@Test public void testCharSetUnicodeBMPEscape() throws Exception {
|
||||
LexerGrammar g = new LexerGrammar(
|
||||
"lexer grammar P;\n"+
|
||||
"A : [\\uABCD] ;"
|
||||
);
|
||||
String expecting =
|
||||
"s0->RuleStart_A_1\n" +
|
||||
"RuleStart_A_1->s3\n" +
|
||||
"s3-43981->s4\n" +
|
||||
"s4->RuleStop_A_2\n";
|
||||
checkTokensRule(g, null, expecting);
|
||||
}
|
||||
@Test public void testCharSetUnicodeBMPEscapeRange() throws Exception {
|
||||
LexerGrammar g = new LexerGrammar(
|
||||
"lexer grammar P;\n"+
|
||||
"A : [a-c\\uABCD-\\uABFF] ;"
|
||||
);
|
||||
String expecting =
|
||||
"s0->RuleStart_A_1\n" +
|
||||
"RuleStart_A_1->s3\n" +
|
||||
"s3-{97..99, 43981..44031}->s4\n" +
|
||||
"s4->RuleStop_A_2\n";
|
||||
checkTokensRule(g, null, expecting);
|
||||
}
|
||||
@Test public void testCharSetUnicodeSMPEscape() throws Exception {
|
||||
LexerGrammar g = new LexerGrammar(
|
||||
"lexer grammar P;\n"+
|
||||
"A : [\\u{10ABCD}] ;"
|
||||
);
|
||||
String expecting =
|
||||
"s0->RuleStart_A_1\n" +
|
||||
"RuleStart_A_1->s3\n" +
|
||||
"s3-1092557->s4\n" +
|
||||
"s4->RuleStop_A_2\n";
|
||||
checkTokensRule(g, null, expecting);
|
||||
}
|
||||
@Test public void testCharSetUnicodeSMPEscapeRange() throws Exception {
|
||||
LexerGrammar g = new LexerGrammar(
|
||||
"lexer grammar P;\n"+
|
||||
"A : [a-c\\u{10ABCD}-\\u{10ABFF}] ;"
|
||||
);
|
||||
String expecting =
|
||||
"s0->RuleStart_A_1\n" +
|
||||
"RuleStart_A_1->s3\n" +
|
||||
"s3-{97..99, 1092557..1092607}->s4\n" +
|
||||
"s4->RuleStop_A_2\n";
|
||||
checkTokensRule(g, null, expecting);
|
||||
}
|
||||
@Test public void testCharSetUnicodePropertyEscape() throws Exception {
|
||||
// The Gothic script is long dead and unlikely to change (which would
|
||||
// cause this test to fail)
|
||||
LexerGrammar g = new LexerGrammar(
|
||||
"lexer grammar P;\n"+
|
||||
"A : [\\p{Gothic}] ;"
|
||||
);
|
||||
String expecting =
|
||||
"s0->RuleStart_A_1\n" +
|
||||
"RuleStart_A_1->s3\n" +
|
||||
"s3-{66352..66378}->s4\n" +
|
||||
"s4->RuleStop_A_2\n";
|
||||
checkTokensRule(g, null, expecting);
|
||||
}
|
||||
@Test public void testCharSetUnicodePropertyInvertEscape() throws Exception {
|
||||
LexerGrammar g = new LexerGrammar(
|
||||
"lexer grammar P;\n"+
|
||||
"A : [\\P{Gothic}] ;"
|
||||
);
|
||||
String expecting =
|
||||
"s0->RuleStart_A_1\n" +
|
||||
"RuleStart_A_1->s3\n" +
|
||||
"s3-{0..66351, 66379..1114111}->s4\n" +
|
||||
"s4->RuleStop_A_2\n";
|
||||
checkTokensRule(g, null, expecting);
|
||||
}
|
||||
@Test public void testCharSetUnicodeMultiplePropertyEscape() throws Exception {
|
||||
// Ditto the Mahajani script. Not going to change soon. I hope.
|
||||
LexerGrammar g = new LexerGrammar(
|
||||
"lexer grammar P;\n"+
|
||||
"A : [\\p{Gothic}\\p{Mahajani}] ;"
|
||||
);
|
||||
String expecting =
|
||||
"s0->RuleStart_A_1\n" +
|
||||
"RuleStart_A_1->s3\n" +
|
||||
"s3-{66352..66378, 69968..70006}->s4\n" +
|
||||
"s4->RuleStop_A_2\n";
|
||||
checkTokensRule(g, null, expecting);
|
||||
}
|
||||
@Test public void testCharSetUnicodePropertyOverlap() throws Exception {
|
||||
LexerGrammar g = new LexerGrammar(
|
||||
"lexer grammar P;\n"+
|
||||
"A : [\\p{ASCII_Hex_Digit}\\p{Hex_Digit}] ;"
|
||||
);
|
||||
String expecting =
|
||||
"s0->RuleStart_A_1\n" +
|
||||
"RuleStart_A_1->s3\n" +
|
||||
"s3-{48..57, 65..70, 97..102, 65296..65305, 65313..65318, 65345..65350}->s4\n" +
|
||||
"s4->RuleStop_A_2\n";
|
||||
checkTokensRule(g, null, expecting);
|
||||
}
|
||||
@Test public void testRangeOrRange() throws Exception {
|
||||
LexerGrammar g = new LexerGrammar(
|
||||
"lexer grammar P;\n"+
|
||||
|
|
|
@ -529,6 +529,44 @@ public class TestToolSyntaxErrors extends BaseJavaToolTest {
|
|||
super.testErrors(pair, true);
|
||||
}
|
||||
|
||||
@Test public void testInvalidUnicodeEscapesInCharSet() {
|
||||
String grammar =
|
||||
"lexer grammar Test;\n" +
|
||||
"INVALID_EXTENDED_UNICODE_EMPTY: [\\u{}];\n" +
|
||||
"INVALID_EXTENDED_UNICODE_NOT_TERMINATED: [\\u{];\n" +
|
||||
"INVALID_EXTENDED_UNICODE_TOO_LONG: [\\u{110000}];\n" +
|
||||
"INVALID_UNICODE_PROPERTY_EMPTY: [\\p{}];\n" +
|
||||
"INVALID_UNICODE_PROPERTY_NOT_TERMINATED: [\\p{];\n" +
|
||||
"INVALID_INVERTED_UNICODE_PROPERTY_EMPTY: [\\P{}];\n" +
|
||||
"INVALID_UNICODE_PROPERTY_UNKNOWN: [\\p{NotAProperty}];\n" +
|
||||
"INVALID_INVERTED_UNICODE_PROPERTY_UNKNOWN: [\\P{NotAProperty}];\n" +
|
||||
"UNICODE_PROPERTY_NOT_ALLOWED_IN_RANGE: [\\p{Uppercase_Letter}-\\p{Lowercase_Letter}];\n" +
|
||||
"UNICODE_PROPERTY_NOT_ALLOWED_IN_RANGE_2: [\\p{Letter}-Z];\n" +
|
||||
"UNICODE_PROPERTY_NOT_ALLOWED_IN_RANGE_3: [A-\\p{Number}];\n" +
|
||||
"INVERTED_UNICODE_PROPERTY_NOT_ALLOWED_IN_RANGE: [\\P{Uppercase_Letter}-\\P{Number}];\n";
|
||||
|
||||
String expected =
|
||||
"error(" + ErrorType.INVALID_ESCAPE_SEQUENCE.code + "): Test.g4:2:32: invalid escape sequence\n" +
|
||||
"error(" + ErrorType.INVALID_ESCAPE_SEQUENCE.code + "): Test.g4:3:41: invalid escape sequence\n" +
|
||||
"error(" + ErrorType.INVALID_ESCAPE_SEQUENCE.code + "): Test.g4:4:35: invalid escape sequence\n" +
|
||||
"error(" + ErrorType.INVALID_ESCAPE_SEQUENCE.code + "): Test.g4:5:32: invalid escape sequence\n" +
|
||||
"error(" + ErrorType.INVALID_ESCAPE_SEQUENCE.code + "): Test.g4:6:41: invalid escape sequence\n" +
|
||||
"error(" + ErrorType.INVALID_ESCAPE_SEQUENCE.code + "): Test.g4:7:41: invalid escape sequence\n" +
|
||||
"error(" + ErrorType.INVALID_ESCAPE_SEQUENCE.code + "): Test.g4:8:34: invalid escape sequence\n" +
|
||||
"error(" + ErrorType.INVALID_ESCAPE_SEQUENCE.code + "): Test.g4:9:43: invalid escape sequence\n" +
|
||||
"error(" + ErrorType.UNICODE_PROPERTY_NOT_ALLOWED_IN_RANGE.code + "): Test.g4:10:39: unicode property escapes not allowed in lexer charset range: [\\p{Uppercase_Letter}-\\p{Lowercase_Letter}]\n" +
|
||||
"error(" + ErrorType.UNICODE_PROPERTY_NOT_ALLOWED_IN_RANGE.code + "): Test.g4:11:41: unicode property escapes not allowed in lexer charset range: [\\p{Letter}-Z]\n" +
|
||||
"error(" + ErrorType.UNICODE_PROPERTY_NOT_ALLOWED_IN_RANGE.code + "): Test.g4:12:41: unicode property escapes not allowed in lexer charset range: [A-\\p{Number}]\n" +
|
||||
"error(" + ErrorType.UNICODE_PROPERTY_NOT_ALLOWED_IN_RANGE.code + "): Test.g4:13:48: unicode property escapes not allowed in lexer charset range: [\\P{Uppercase_Letter}-\\P{Number}]\n";
|
||||
|
||||
String[] pair = new String[] {
|
||||
grammar,
|
||||
expected
|
||||
};
|
||||
|
||||
super.testErrors(pair, true);
|
||||
}
|
||||
|
||||
/**
|
||||
* This test ensures the {@link ErrorType#UNRECOGNIZED_ASSOC_OPTION} warning
|
||||
* is produced as described in the documentation.
|
||||
|
|
|
@ -10,6 +10,7 @@ import org.antlr.runtime.CommonToken;
|
|||
import org.antlr.runtime.Token;
|
||||
import org.antlr.v4.codegen.CodeGenerator;
|
||||
import org.antlr.v4.misc.CharSupport;
|
||||
import org.antlr.v4.misc.EscapeSequenceParsing;
|
||||
import org.antlr.v4.parse.ANTLRParser;
|
||||
import org.antlr.v4.runtime.IntStream;
|
||||
import org.antlr.v4.runtime.Lexer;
|
||||
|
@ -49,6 +50,7 @@ import java.util.ArrayList;
|
|||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Objects;
|
||||
import java.util.Set;
|
||||
|
||||
public class LexerATNFactory extends ParserATNFactory {
|
||||
|
@ -365,7 +367,7 @@ public class LexerATNFactory extends ParserATNFactory {
|
|||
return new Handle(left, right);
|
||||
}
|
||||
|
||||
/** [Aa\t \u1234a-z\]\-] char sets */
|
||||
/** [Aa\t \u1234a-z\]\p{Letter}\-] char sets */
|
||||
@Override
|
||||
public Handle charSetLiteral(GrammarAST charSetAST) {
|
||||
ATNState left = newState(charSetAST);
|
||||
|
@ -376,10 +378,68 @@ public class LexerATNFactory extends ParserATNFactory {
|
|||
return new Handle(left, right);
|
||||
}
|
||||
|
||||
private static class CharSetParseState {
|
||||
enum Mode {
|
||||
NONE,
|
||||
ERROR,
|
||||
PREV_CODE_POINT,
|
||||
PREV_PROPERTY
|
||||
}
|
||||
|
||||
public static final CharSetParseState NONE = new CharSetParseState(Mode.NONE, false, -1, IntervalSet.EMPTY_SET);
|
||||
public static final CharSetParseState ERROR = new CharSetParseState(Mode.ERROR, false, -1, IntervalSet.EMPTY_SET);
|
||||
|
||||
public final Mode mode;
|
||||
public final boolean inRange;
|
||||
public final int prevCodePoint;
|
||||
public final IntervalSet prevProperty;
|
||||
|
||||
public CharSetParseState(
|
||||
Mode mode,
|
||||
boolean inRange,
|
||||
int prevCodePoint,
|
||||
IntervalSet prevProperty) {
|
||||
this.mode = mode;
|
||||
this.inRange = inRange;
|
||||
this.prevCodePoint = prevCodePoint;
|
||||
this.prevProperty = prevProperty;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return String.format(
|
||||
"%s mode=%s inRange=%s prevCodePoint=%d prevProperty=%s",
|
||||
super.toString(),
|
||||
mode,
|
||||
inRange,
|
||||
prevCodePoint,
|
||||
prevProperty);
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean equals(Object other) {
|
||||
if (!(other instanceof CharSetParseState)) {
|
||||
return false;
|
||||
}
|
||||
CharSetParseState that = (CharSetParseState) other;
|
||||
if (this == that) {
|
||||
return true;
|
||||
}
|
||||
return Objects.equals(this.mode, that.mode) &&
|
||||
Objects.equals(this.inRange, that.inRange) &&
|
||||
Objects.equals(this.prevCodePoint, that.prevCodePoint) &&
|
||||
Objects.equals(this.prevProperty, that.prevProperty);
|
||||
}
|
||||
|
||||
@Override
|
||||
public int hashCode() {
|
||||
return Objects.hash(mode, inRange, prevCodePoint, prevProperty);
|
||||
}
|
||||
}
|
||||
|
||||
public IntervalSet getSetFromCharSetLiteral(GrammarAST charSetAST) {
|
||||
String chars = charSetAST.getText();
|
||||
chars = chars.substring(1, chars.length() - 1);
|
||||
String cset = '"' + chars + '"';
|
||||
IntervalSet set = new IntervalSet();
|
||||
|
||||
if (chars.length() == 0) {
|
||||
|
@ -387,46 +447,122 @@ public class LexerATNFactory extends ParserATNFactory {
|
|||
g.fileName, charSetAST.getToken(), "[]");
|
||||
return set;
|
||||
}
|
||||
// unescape all valid escape char like \n, leaving escaped dashes as '\-'
|
||||
// so we can avoid seeing them as '-' range ops.
|
||||
chars = CharSupport.getStringFromGrammarStringLiteral(cset);
|
||||
if (chars == null) {
|
||||
g.tool.errMgr.grammarError(ErrorType.INVALID_ESCAPE_SEQUENCE,
|
||||
g.fileName, charSetAST.getToken());
|
||||
return set;
|
||||
}
|
||||
|
||||
CharSetParseState state = CharSetParseState.NONE;
|
||||
|
||||
int n = chars.length();
|
||||
// now make x-y become set of char
|
||||
for (int i = 0; i < n; ) {
|
||||
if (state.mode == CharSetParseState.Mode.ERROR) {
|
||||
return new IntervalSet();
|
||||
}
|
||||
int c = chars.codePointAt(i);
|
||||
int offset = Character.charCount(c);
|
||||
if (c == '\\' && i+offset < n && chars.codePointAt(i+offset) == '-') { // \-
|
||||
checkSetCollision(charSetAST, set, '-');
|
||||
set.add('-');
|
||||
offset++;
|
||||
}
|
||||
else if (i+offset+1 < n && chars.codePointAt(i+offset) == '-') { // range x-y
|
||||
int x = c;
|
||||
int y = chars.codePointAt(i+offset+1);
|
||||
if (x <= y) {
|
||||
checkSetCollision(charSetAST, set, x, y);
|
||||
set.add(x,y);
|
||||
if (c == '\\') {
|
||||
EscapeSequenceParsing.Result escapeParseResult =
|
||||
EscapeSequenceParsing.parseEscape(chars, i);
|
||||
switch (escapeParseResult.type) {
|
||||
case INVALID:
|
||||
g.tool.errMgr.grammarError(ErrorType.INVALID_ESCAPE_SEQUENCE,
|
||||
g.fileName, charSetAST.getToken(), charSetAST.getText());
|
||||
state = CharSetParseState.ERROR;
|
||||
break;
|
||||
case CODE_POINT:
|
||||
state = applyPrevStateAndMoveToCodePoint(charSetAST, set, state, escapeParseResult.codePoint);
|
||||
break;
|
||||
case PROPERTY:
|
||||
state = applyPrevStateAndMoveToProperty(charSetAST, set, state, escapeParseResult.propertyIntervalSet);
|
||||
break;
|
||||
}
|
||||
else {
|
||||
g.tool.errMgr.grammarError(ErrorType.EMPTY_STRINGS_AND_SETS_NOT_ALLOWED,
|
||||
g.fileName, charSetAST.getToken(), CharSupport.toRange(x, y, CharSupport.ToRangeMode.BRACKETED));
|
||||
offset = escapeParseResult.parseLength;
|
||||
} else if (c == '-' && !state.inRange) {
|
||||
if (state.mode == CharSetParseState.Mode.PREV_PROPERTY) {
|
||||
g.tool.errMgr.grammarError(ErrorType.UNICODE_PROPERTY_NOT_ALLOWED_IN_RANGE,
|
||||
g.fileName, charSetAST.getToken(), charSetAST.getText());
|
||||
state = CharSetParseState.ERROR;
|
||||
} else {
|
||||
state = new CharSetParseState(state.mode, true, state.prevCodePoint, state.prevProperty);
|
||||
}
|
||||
offset += Character.charCount(y) + 1;
|
||||
}
|
||||
else {
|
||||
checkSetCollision(charSetAST, set, c);
|
||||
set.add(c);
|
||||
} else {
|
||||
state = applyPrevStateAndMoveToCodePoint(charSetAST, set, state, c);
|
||||
}
|
||||
i += offset;
|
||||
}
|
||||
if (state.mode == CharSetParseState.Mode.ERROR) {
|
||||
return new IntervalSet();
|
||||
}
|
||||
// Whether or not we were in a range, we'll add the last code point found to the set.
|
||||
// If the range wasn't terminated, we'll treat it as a standalone codepoint.
|
||||
applyPrevState(charSetAST, set, state);
|
||||
if (state.inRange) {
|
||||
// Unterminated range; add a literal hyphen to the set.
|
||||
checkSetCollision(charSetAST, set, '-');
|
||||
set.add('-');
|
||||
}
|
||||
return set;
|
||||
}
|
||||
|
||||
private CharSetParseState applyPrevStateAndMoveToCodePoint(
|
||||
GrammarAST charSetAST,
|
||||
IntervalSet set,
|
||||
CharSetParseState state,
|
||||
int codePoint) {
|
||||
if (state.inRange) {
|
||||
if (state.prevCodePoint > codePoint) {
|
||||
g.tool.errMgr.grammarError(
|
||||
ErrorType.EMPTY_STRINGS_AND_SETS_NOT_ALLOWED,
|
||||
g.fileName,
|
||||
charSetAST.getToken(),
|
||||
CharSupport.toRange(state.prevCodePoint, codePoint, CharSupport.ToRangeMode.BRACKETED));
|
||||
}
|
||||
checkSetCollision(charSetAST, set, state.prevCodePoint, codePoint);
|
||||
set.add(state.prevCodePoint, codePoint);
|
||||
state = CharSetParseState.NONE;
|
||||
} else {
|
||||
applyPrevState(charSetAST, set, state);
|
||||
state = new CharSetParseState(
|
||||
CharSetParseState.Mode.PREV_CODE_POINT,
|
||||
false,
|
||||
codePoint,
|
||||
IntervalSet.EMPTY_SET);
|
||||
}
|
||||
return state;
|
||||
}
|
||||
|
||||
private CharSetParseState applyPrevStateAndMoveToProperty(
|
||||
GrammarAST charSetAST,
|
||||
IntervalSet set,
|
||||
CharSetParseState state,
|
||||
IntervalSet property) {
|
||||
if (state.inRange) {
|
||||
g.tool.errMgr.grammarError(ErrorType.UNICODE_PROPERTY_NOT_ALLOWED_IN_RANGE,
|
||||
g.fileName, charSetAST.getToken(), charSetAST.getText());
|
||||
return CharSetParseState.ERROR;
|
||||
} else {
|
||||
applyPrevState(charSetAST, set, state);
|
||||
state = new CharSetParseState(
|
||||
CharSetParseState.Mode.PREV_PROPERTY,
|
||||
false,
|
||||
-1,
|
||||
property);
|
||||
}
|
||||
return state;
|
||||
}
|
||||
|
||||
private void applyPrevState(GrammarAST charSetAST, IntervalSet set, CharSetParseState state) {
|
||||
switch (state.mode) {
|
||||
case NONE:
|
||||
case ERROR:
|
||||
break;
|
||||
case PREV_CODE_POINT:
|
||||
checkSetCollision(charSetAST, set, state.prevCodePoint);
|
||||
set.add(state.prevCodePoint);
|
||||
break;
|
||||
case PREV_PROPERTY:
|
||||
set.addAll(state.prevProperty);
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
protected void checkSetCollision(GrammarAST ast, IntervalSet set, int el) {
|
||||
if (set.contains(el)) {
|
||||
g.tool.errMgr.grammarError(ErrorType.CHARACTERS_COLLISION_IN_SET, g.fileName, ast.getToken(),
|
||||
|
|
|
@ -1060,6 +1060,20 @@ public enum ErrorType {
|
|||
*/
|
||||
TOKEN_RANGE_IN_PARSER(181, "token ranges not allowed in parser: <arg>..<arg2>", ErrorSeverity.ERROR),
|
||||
|
||||
/**
|
||||
* Compiler Error 182.
|
||||
*
|
||||
* <p>Unicode properties cannot be part of a lexer charset range</p>
|
||||
*
|
||||
* <pre>
|
||||
* A: [\p{Letter}-\p{Number}];
|
||||
* </pre>
|
||||
*/
|
||||
UNICODE_PROPERTY_NOT_ALLOWED_IN_RANGE(
|
||||
182,
|
||||
"unicode property escapes not allowed in lexer charset range: <arg>",
|
||||
ErrorSeverity.ERROR),
|
||||
|
||||
/*
|
||||
* Backward incompatibility errors
|
||||
*/
|
||||
|
|
Loading…
Reference in New Issue