New \p{Letter} Unicode property escape

2017-03-01 14:05:39 -08:00 · 2017-03-01 14:05:39 -08:00 · ca03e6ab5e
parent d11968d993
commit ca03e6ab5e
4 changed files with 341 additions and 30 deletions
--- a/tool-testsuite/test/org/antlr/v4/test/tool/TestATNConstruction.java
+++ b/tool-testsuite/test/org/antlr/v4/test/tool/TestATNConstruction.java
@ -115,6 +115,129 @@ public class TestATNConstruction extends BaseJavaToolTest {
 				"s4->RuleStop_A_2\n";
 		checkTokensRule(g, null, expecting);
 	}
+	/** Checks the ATN built for a charset that lists its characters one by one. */
+	@Test public void testCharSet() throws Exception {
+		String grammarText =
+			"lexer grammar P;\n"+
+			"A : [abc] ;";
+		LexerGrammar lexerGrammar = new LexerGrammar(grammarText);
+		String expectedATN =
+			"s0->RuleStart_A_1\n" +
+				"RuleStart_A_1->s3\n" +
+				"s3-{97..99}->s4\n" +
+				"s4->RuleStop_A_2\n";
+		checkTokensRule(lexerGrammar, null, expectedATN);
+	}
+	/** Checks the ATN built for a charset written as a single {@code x-y} range. */
+	@Test public void testCharSetRange() throws Exception {
+		String grammarText =
+			"lexer grammar P;\n"+
+			"A : [a-c] ;";
+		LexerGrammar lexerGrammar = new LexerGrammar(grammarText);
+		String expectedATN =
+			"s0->RuleStart_A_1\n" +
+				"RuleStart_A_1->s3\n" +
+				"s3-{97..99}->s4\n" +
+				"s4->RuleStop_A_2\n";
+		checkTokensRule(lexerGrammar, null, expectedATN);
+	}
+	/** Checks the ATN built for a charset holding one four-hex-digit Unicode escape (code point 43981). */
+	@Test public void testCharSetUnicodeBMPEscape() throws Exception {
+		String grammarText =
+			"lexer grammar P;\n"+
+			"A : [\\uABCD] ;";
+		LexerGrammar lexerGrammar = new LexerGrammar(grammarText);
+		String expectedATN =
+			"s0->RuleStart_A_1\n" +
+				"RuleStart_A_1->s3\n" +
+				"s3-43981->s4\n" +
+				"s4->RuleStop_A_2\n";
+		checkTokensRule(lexerGrammar, null, expectedATN);
+	}
+	/** Checks the ATN built when a plain range is followed by a range whose ends are four-hex-digit Unicode escapes. */
+	@Test public void testCharSetUnicodeBMPEscapeRange() throws Exception {
+		String grammarText =
+			"lexer grammar P;\n"+
+			"A : [a-c\\uABCD-\\uABFF] ;";
+		LexerGrammar lexerGrammar = new LexerGrammar(grammarText);
+		String expectedATN =
+			"s0->RuleStart_A_1\n" +
+				"RuleStart_A_1->s3\n" +
+				"s3-{97..99, 43981..44031}->s4\n" +
+				"s4->RuleStop_A_2\n";
+		checkTokensRule(lexerGrammar, null, expectedATN);
+	}
+	/** Checks the ATN built for a charset holding one brace-delimited Unicode escape above the BMP (code point 1092557). */
+	@Test public void testCharSetUnicodeSMPEscape() throws Exception {
+		String grammarText =
+			"lexer grammar P;\n"+
+			"A : [\\u{10ABCD}] ;";
+		LexerGrammar lexerGrammar = new LexerGrammar(grammarText);
+		String expectedATN =
+			"s0->RuleStart_A_1\n" +
+				"RuleStart_A_1->s3\n" +
+				"s3-1092557->s4\n" +
+				"s4->RuleStop_A_2\n";
+		checkTokensRule(lexerGrammar, null, expectedATN);
+	}
+	/** Checks the ATN built when a plain range is followed by a range whose ends are brace-delimited Unicode escapes. */
+	@Test public void testCharSetUnicodeSMPEscapeRange() throws Exception {
+		String grammarText =
+			"lexer grammar P;\n"+
+			"A : [a-c\\u{10ABCD}-\\u{10ABFF}] ;";
+		LexerGrammar lexerGrammar = new LexerGrammar(grammarText);
+		String expectedATN =
+			"s0->RuleStart_A_1\n" +
+				"RuleStart_A_1->s3\n" +
+				"s3-{97..99, 1092557..1092607}->s4\n" +
+				"s4->RuleStop_A_2\n";
+		checkTokensRule(lexerGrammar, null, expectedATN);
+	}
+	/** Checks the ATN built for a charset holding a single {@code \p{...}} property escape. */
+	@Test public void testCharSetUnicodePropertyEscape() throws Exception {
+		// Gothic was picked because the script is long dead: its code points
+		// are unlikely to change, and a change would make this test fail.
+		String grammarText =
+			"lexer grammar P;\n"+
+			"A : [\\p{Gothic}] ;";
+		LexerGrammar lexerGrammar = new LexerGrammar(grammarText);
+		String expectedATN =
+			"s0->RuleStart_A_1\n" +
+				"RuleStart_A_1->s3\n" +
+				"s3-{66352..66378}->s4\n" +
+				"s4->RuleStop_A_2\n";
+		checkTokensRule(lexerGrammar, null, expectedATN);
+	}
+	/** Checks the ATN built for a charset holding a single inverted {@code \P{...}} property escape. */
+	@Test public void testCharSetUnicodePropertyInvertEscape() throws Exception {
+		String grammarText =
+			"lexer grammar P;\n"+
+			"A : [\\P{Gothic}] ;";
+		LexerGrammar lexerGrammar = new LexerGrammar(grammarText);
+		String expectedATN =
+			"s0->RuleStart_A_1\n" +
+				"RuleStart_A_1->s3\n" +
+				"s3-{0..66351, 66379..1114111}->s4\n" +
+				"s4->RuleStop_A_2\n";
+		checkTokensRule(lexerGrammar, null, expectedATN);
+	}
+	/** Checks the ATN built for a charset holding two {@code \p{...}} property escapes side by side. */
+	@Test public void testCharSetUnicodeMultiplePropertyEscape() throws Exception {
+		// Mahajani, like Gothic, is hopefully not going to change any time soon.
+		String grammarText =
+			"lexer grammar P;\n"+
+			"A : [\\p{Gothic}\\p{Mahajani}] ;";
+		LexerGrammar lexerGrammar = new LexerGrammar(grammarText);
+		String expectedATN =
+			"s0->RuleStart_A_1\n" +
+				"RuleStart_A_1->s3\n" +
+				"s3-{66352..66378, 69968..70006}->s4\n" +
+				"s4->RuleStop_A_2\n";
+		checkTokensRule(lexerGrammar, null, expectedATN);
+	}
+	/** Checks the ATN built for a charset whose two property escapes are both hex-digit properties. */
+	@Test public void testCharSetUnicodePropertyOverlap() throws Exception {
+		String grammarText =
+			"lexer grammar P;\n"+
+			"A : [\\p{ASCII_Hex_Digit}\\p{Hex_Digit}] ;";
+		LexerGrammar lexerGrammar = new LexerGrammar(grammarText);
+		String expectedATN =
+			"s0->RuleStart_A_1\n" +
+				"RuleStart_A_1->s3\n" +
+				"s3-{48..57, 65..70, 97..102, 65296..65305, 65313..65318, 65345..65350}->s4\n" +
+				"s4->RuleStop_A_2\n";
+		checkTokensRule(lexerGrammar, null, expectedATN);
+	}
 	@Test public void testRangeOrRange() throws Exception {
 		LexerGrammar g = new LexerGrammar(
 			"lexer grammar P;\n"+
--- a/tool-testsuite/test/org/antlr/v4/test/tool/TestToolSyntaxErrors.java
+++ b/tool-testsuite/test/org/antlr/v4/test/tool/TestToolSyntaxErrors.java
@ -529,6 +529,44 @@ public class TestToolSyntaxErrors extends BaseJavaToolTest {
 		super.testErrors(pair, true);
 	}

+	/**
+	 * Feeds the tool a lexer grammar whose rules each contain a malformed or
+	 * misplaced Unicode escape inside a charset and compares the reported
+	 * errors (code, position and message) with the expected list.
+	 */
+	@Test public void testInvalidUnicodeEscapesInCharSet() {
+		String grammarText =
+				"lexer grammar Test;\n" +
+				"INVALID_EXTENDED_UNICODE_EMPTY: [\\u{}];\n" +
+				"INVALID_EXTENDED_UNICODE_NOT_TERMINATED: [\\u{];\n" +
+				"INVALID_EXTENDED_UNICODE_TOO_LONG: [\\u{110000}];\n" +
+				"INVALID_UNICODE_PROPERTY_EMPTY: [\\p{}];\n" +
+				"INVALID_UNICODE_PROPERTY_NOT_TERMINATED: [\\p{];\n" +
+				"INVALID_INVERTED_UNICODE_PROPERTY_EMPTY: [\\P{}];\n" +
+				"INVALID_UNICODE_PROPERTY_UNKNOWN: [\\p{NotAProperty}];\n" +
+				"INVALID_INVERTED_UNICODE_PROPERTY_UNKNOWN: [\\P{NotAProperty}];\n" +
+				"UNICODE_PROPERTY_NOT_ALLOWED_IN_RANGE: [\\p{Uppercase_Letter}-\\p{Lowercase_Letter}];\n" +
+				"UNICODE_PROPERTY_NOT_ALLOWED_IN_RANGE_2: [\\p{Letter}-Z];\n" +
+				"UNICODE_PROPERTY_NOT_ALLOWED_IN_RANGE_3: [A-\\p{Number}];\n" +
+				"INVERTED_UNICODE_PROPERTY_NOT_ALLOWED_IN_RANGE: [\\P{Uppercase_Letter}-\\P{Number}];\n";
+
+		// Every expected line starts with the error code and the file name;
+		// only the line:column position and the message tail vary.
+		String invalidEscapeAt =
+				"error(" + ErrorType.INVALID_ESCAPE_SEQUENCE.code + "): Test.g4:";
+		String propertyInRangeAt =
+				"error(" + ErrorType.UNICODE_PROPERTY_NOT_ALLOWED_IN_RANGE.code + "): Test.g4:";
+
+		String expectedErrors =
+				invalidEscapeAt + "2:32: invalid escape sequence\n" +
+				invalidEscapeAt + "3:41: invalid escape sequence\n" +
+				invalidEscapeAt + "4:35: invalid escape sequence\n" +
+				invalidEscapeAt + "5:32: invalid escape sequence\n" +
+				invalidEscapeAt + "6:41: invalid escape sequence\n" +
+				invalidEscapeAt + "7:41: invalid escape sequence\n" +
+				invalidEscapeAt + "8:34: invalid escape sequence\n" +
+				invalidEscapeAt + "9:43: invalid escape sequence\n" +
+				propertyInRangeAt + "10:39: unicode property escapes not allowed in lexer charset range: [\\p{Uppercase_Letter}-\\p{Lowercase_Letter}]\n" +
+				propertyInRangeAt + "11:41: unicode property escapes not allowed in lexer charset range: [\\p{Letter}-Z]\n" +
+				propertyInRangeAt + "12:41: unicode property escapes not allowed in lexer charset range: [A-\\p{Number}]\n" +
+				propertyInRangeAt + "13:48: unicode property escapes not allowed in lexer charset range: [\\P{Uppercase_Letter}-\\P{Number}]\n";
+
+		super.testErrors(new String[] { grammarText, expectedErrors }, true);
+	}
+
 	/**
 	 * This test ensures the {@link ErrorType#UNRECOGNIZED_ASSOC_OPTION} warning
 	 * is produced as described in the documentation.
--- a/tool/src/org/antlr/v4/automata/LexerATNFactory.java
+++ b/tool/src/org/antlr/v4/automata/LexerATNFactory.java
@ -10,6 +10,7 @@ import org.antlr.runtime.CommonToken;
 import org.antlr.runtime.Token;
 import org.antlr.v4.codegen.CodeGenerator;
 import org.antlr.v4.misc.CharSupport;
+import org.antlr.v4.misc.EscapeSequenceParsing;
 import org.antlr.v4.parse.ANTLRParser;
 import org.antlr.v4.runtime.IntStream;
 import org.antlr.v4.runtime.Lexer;
@ -49,6 +50,7 @@ import java.util.ArrayList;
 import java.util.HashMap;
 import java.util.List;
 import java.util.Map;
+import java.util.Objects;
 import java.util.Set;

 public class LexerATNFactory extends ParserATNFactory {
@ -365,7 +367,7 @@ public class LexerATNFactory extends ParserATNFactory {
 		return new Handle(left, right);
 	}

-	/** [Aa\t \u1234a-z\]\-] char sets */
+	/** [Aa\t \u1234a-z\]\p{Letter}\-] char sets */
 	@Override
 	public Handle charSetLiteral(GrammarAST charSetAST) {
 		ATNState left = newState(charSetAST);
@ -376,10 +378,68 @@ public class LexerATNFactory extends ParserATNFactory {
 		return new Handle(left, right);
 	}

+	/**
+	 * Immutable value describing how far the scan of a {@code [...]} charset
+	 * body has got: the kind of element read last, whether a {@code -} range
+	 * operator is currently open, and the element itself (a code point or the
+	 * interval set of a property escape).
+	 */
+	private static class CharSetParseState {
+		/** Kind of the most recently read element, or a marker for "nothing yet" / "gave up". */
+		enum Mode {
+			NONE,
+			ERROR,
+			PREV_CODE_POINT,
+			PREV_PROPERTY
+		}
+
+		/** Shared state for "no pending element"; its code point is the placeholder {@code -1}. */
+		public static final CharSetParseState NONE = new CharSetParseState(Mode.NONE, false, -1, IntervalSet.EMPTY_SET);
+		/** Shared state recorded once an error has been reported for the charset. */
+		public static final CharSetParseState ERROR = new CharSetParseState(Mode.ERROR, false, -1, IntervalSet.EMPTY_SET);
+
+		public final Mode mode;
+		public final boolean inRange;
+		public final int prevCodePoint;
+		public final IntervalSet prevProperty;
+
+		public CharSetParseState(
+				Mode mode,
+				boolean inRange,
+				int prevCodePoint,
+				IntervalSet prevProperty) {
+			this.mode = mode;
+			this.inRange = inRange;
+			this.prevCodePoint = prevCodePoint;
+			this.prevProperty = prevProperty;
+		}
+
+		@Override
+		public int hashCode() {
+			return Objects.hash(mode, inRange, prevCodePoint, prevProperty);
+		}
+
+		/** Two states are equal when all four fields are equal. */
+		@Override
+		public boolean equals(Object other) {
+			if (this == other) {
+				return true;
+			}
+			if (!(other instanceof CharSetParseState)) {
+				return false;
+			}
+			CharSetParseState that = (CharSetParseState) other;
+			// Enum and primitive fields compare by identity/value; only the
+			// interval set needs a null-safe equals call.
+			return mode == that.mode
+				&& inRange == that.inRange
+				&& prevCodePoint == that.prevCodePoint
+				&& Objects.equals(prevProperty, that.prevProperty);
+		}
+
+		@Override
+		public String toString() {
+			String identity = super.toString();
+			return String.format(
+					"%s mode=%s inRange=%s prevCodePoint=%d prevProperty=%s",
+					identity,
+					mode,
+					inRange,
+					prevCodePoint,
+					prevProperty);
+		}
+	}
+
 	public IntervalSet getSetFromCharSetLiteral(GrammarAST charSetAST) {
 		String chars = charSetAST.getText();
 		chars = chars.substring(1, chars.length() - 1);
-		String cset = '"' + chars + '"';
 		IntervalSet set = new IntervalSet();

 		if (chars.length() == 0) {
@ -387,46 +447,122 @@ public class LexerATNFactory extends ParserATNFactory {
 					g.fileName, charSetAST.getToken(), "[]");
 			return set;
 		}
-		// unescape all valid escape char like \n, leaving escaped dashes as '\-'
-		// so we can avoid seeing them as '-' range ops.
-		chars = CharSupport.getStringFromGrammarStringLiteral(cset);
-		if (chars == null) {
-			g.tool.errMgr.grammarError(ErrorType.INVALID_ESCAPE_SEQUENCE,
-			                           g.fileName, charSetAST.getToken());
-			return set;
-		}
+
+		CharSetParseState state = CharSetParseState.NONE;
+
 		int n = chars.length();
-		// now make x-y become set of char
 		for (int i = 0; i < n; ) {
+			if (state.mode == CharSetParseState.Mode.ERROR) {
+				return new IntervalSet();
+			}
 			int c = chars.codePointAt(i);
 			int offset = Character.charCount(c);
-			if (c == '\\' && i+offset < n && chars.codePointAt(i+offset) == '-') { // \-
-				checkSetCollision(charSetAST, set, '-');
-				set.add('-');
-				offset++;
-			}
-			else if (i+offset+1 < n && chars.codePointAt(i+offset) == '-') { // range x-y
-				int x = c;
-				int y = chars.codePointAt(i+offset+1);
-				if (x <= y) {
-					checkSetCollision(charSetAST, set, x, y);
-					set.add(x,y);
+			if (c == '\\') {
+				EscapeSequenceParsing.Result escapeParseResult =
+					EscapeSequenceParsing.parseEscape(chars, i);
+				switch (escapeParseResult.type) {
+					case INVALID:
+						g.tool.errMgr.grammarError(ErrorType.INVALID_ESCAPE_SEQUENCE,
+									   g.fileName, charSetAST.getToken(), charSetAST.getText());
+						state = CharSetParseState.ERROR;
+						break;
+					case CODE_POINT:
+						state = applyPrevStateAndMoveToCodePoint(charSetAST, set, state, escapeParseResult.codePoint);
+						break;
+					case PROPERTY:
+						state = applyPrevStateAndMoveToProperty(charSetAST, set, state, escapeParseResult.propertyIntervalSet);
+						break;
 				}
-				else {
-					g.tool.errMgr.grammarError(ErrorType.EMPTY_STRINGS_AND_SETS_NOT_ALLOWED,
-								   g.fileName, charSetAST.getToken(), CharSupport.toRange(x, y, CharSupport.ToRangeMode.BRACKETED));
+				offset = escapeParseResult.parseLength;
+			} else if (c == '-' && !state.inRange) {
+				if (state.mode == CharSetParseState.Mode.PREV_PROPERTY) {
+					g.tool.errMgr.grammarError(ErrorType.UNICODE_PROPERTY_NOT_ALLOWED_IN_RANGE,
+								   g.fileName, charSetAST.getToken(), charSetAST.getText());
+					state = CharSetParseState.ERROR;
+				} else {
+					state = new CharSetParseState(state.mode, true, state.prevCodePoint, state.prevProperty);
 				}
-				offset += Character.charCount(y) + 1;
-			}
-			else {
-				checkSetCollision(charSetAST, set, c);
-				set.add(c);
+			} else {
+				state = applyPrevStateAndMoveToCodePoint(charSetAST, set, state, c);
 			}
 			i += offset;
 		}
+		if (state.mode == CharSetParseState.Mode.ERROR) {
+			return new IntervalSet();
+		}
+		// Whether or not we were in a range, we'll add the last code point found to the set.
+		// If the range wasn't terminated, we'll treat it as a standalone codepoint.
+		applyPrevState(charSetAST, set, state);
+		if (state.inRange) {
+			// Unterminated range; add a literal hyphen to the set.
+			checkSetCollision(charSetAST, set, '-');
+			set.add('-');
+		}
 		return set;
 	}

+	/**
+	 * Consumes one code point of a charset body and returns the parse state
+	 * that follows it.
+	 *
+	 * <ul>
+	 * <li>If a range is open and a code point precedes the hyphen, the range
+	 *     {@code prevCodePoint..codePoint} is checked for collisions and added
+	 *     to {@code set}. A reversed range is reported as
+	 *     {@link ErrorType#EMPTY_STRINGS_AND_SETS_NOT_ALLOWED} and adds
+	 *     nothing. Either way the state resets to
+	 *     {@link CharSetParseState#NONE}.</li>
+	 * <li>If a range is open but nothing precedes the hyphen (a leading
+	 *     {@code -}, or one right after a completed range, as in {@code [-a]}
+	 *     or {@code [a-c-e]}), the hyphen is added as a literal {@code '-'}
+	 *     and {@code codePoint} becomes the pending element.</li>
+	 * <li>Otherwise the pending element, if any, is flushed into {@code set}
+	 *     and {@code codePoint} becomes the pending element.</li>
+	 * </ul>
+	 *
+	 * @param charSetAST the charset node, used for error reporting
+	 * @param set the interval set being built; modified in place
+	 * @param state the state before this code point
+	 * @param codePoint the code point just read
+	 * @return the state after this code point
+	 */
+	private CharSetParseState applyPrevStateAndMoveToCodePoint(
+			GrammarAST charSetAST,
+			IntervalSet set,
+			CharSetParseState state,
+			int codePoint) {
+		if (state.inRange && state.mode == CharSetParseState.Mode.PREV_CODE_POINT) {
+			int lower = state.prevCodePoint;
+			if (lower > codePoint) {
+				g.tool.errMgr.grammarError(
+						ErrorType.EMPTY_STRINGS_AND_SETS_NOT_ALLOWED,
+						g.fileName,
+						charSetAST.getToken(),
+						CharSupport.toRange(lower, codePoint, CharSupport.ToRangeMode.BRACKETED));
+			} else {
+				checkSetCollision(charSetAST, set, lower, codePoint);
+				set.add(lower, codePoint);
+			}
+			return CharSetParseState.NONE;
+		}
+
+		if (state.inRange) {
+			// The hyphen had no left operand, so prevCodePoint is still the -1
+			// placeholder. Using it as a lower bound would add -1..codePoint;
+			// treat the hyphen as a literal character instead.
+			checkSetCollision(charSetAST, set, '-');
+			set.add('-');
+		} else {
+			applyPrevState(charSetAST, set, state);
+		}
+		return new CharSetParseState(
+				CharSetParseState.Mode.PREV_CODE_POINT,
+				false,
+				codePoint,
+				IntervalSet.EMPTY_SET);
+	}
+
+	/**
+	 * Consumes one property escape of a charset body and returns the parse
+	 * state that follows it.
+	 *
+	 * <p>A property may not close a range: if a range is open, the error
+	 * {@link ErrorType#UNICODE_PROPERTY_NOT_ALLOWED_IN_RANGE} is reported and
+	 * the error state is returned. Otherwise the pending element, if any, is
+	 * flushed into {@code set} and the property becomes the pending element.</p>
+	 *
+	 * @param charSetAST the charset node, used for error reporting
+	 * @param set the interval set being built; modified in place
+	 * @param state the state before this property escape
+	 * @param property the code points the property escape stands for
+	 * @return the state after this property escape
+	 */
+	private CharSetParseState applyPrevStateAndMoveToProperty(
+			GrammarAST charSetAST,
+			IntervalSet set,
+			CharSetParseState state,
+			IntervalSet property) {
+		if (!state.inRange) {
+			applyPrevState(charSetAST, set, state);
+			return new CharSetParseState(
+					CharSetParseState.Mode.PREV_PROPERTY,
+					false,
+					-1,
+					property);
+		}
+
+		g.tool.errMgr.grammarError(ErrorType.UNICODE_PROPERTY_NOT_ALLOWED_IN_RANGE,
+					   g.fileName, charSetAST.getToken(), charSetAST.getText());
+		return CharSetParseState.ERROR;
+	}
+
+	/**
+	 * Flushes the element remembered in {@code state} into {@code set}: a
+	 * pending code point is collision-checked and added, a pending property
+	 * contributes all of its code points, and the remaining modes add nothing.
+	 */
+	private void applyPrevState(GrammarAST charSetAST, IntervalSet set, CharSetParseState state) {
+		switch (state.mode) {
+			case PREV_PROPERTY:
+				set.addAll(state.prevProperty);
+				return;
+			case PREV_CODE_POINT: {
+				int pending = state.prevCodePoint;
+				checkSetCollision(charSetAST, set, pending);
+				set.add(pending);
+				return;
+			}
+			default:
+				// NONE and ERROR carry nothing to flush.
+				return;
+		}
+	}
+
 	protected void checkSetCollision(GrammarAST ast, IntervalSet set, int el) {
 		if (set.contains(el)) {
 			g.tool.errMgr.grammarError(ErrorType.CHARACTERS_COLLISION_IN_SET, g.fileName, ast.getToken(),
--- a/tool/src/org/antlr/v4/tool/ErrorType.java
+++ b/tool/src/org/antlr/v4/tool/ErrorType.java
@ -1060,6 +1060,20 @@ public enum ErrorType {
 	 */
 	TOKEN_RANGE_IN_PARSER(181, "token ranges not allowed in parser: <arg>..<arg2>", ErrorSeverity.ERROR),

+	/**
+	 * Compiler Error 182.
+	 *
+	 * <p>Unicode properties cannot be part of a lexer charset range.</p>
+	 *
+	 * <pre>
+	 * A: [\p{Letter}-\p{Number}];
+	 * </pre>
+	 */
+	UNICODE_PROPERTY_NOT_ALLOWED_IN_RANGE(
+			182,
+			"unicode property escapes not allowed in lexer charset range: <arg>",
+			ErrorSeverity.ERROR),
+
 	/*
 	 * Backward incompatibility errors
 	 */