EscapeSequenceParsing

2017-02-23 13:19:26 -08:00 · 2017-02-23 13:19:26 -08:00 · 3557f48386
parent b2012d9054
commit 3557f48386
3 changed files with 300 additions and 1 deletions
--- a/tool-testsuite/test/org/antlr/v4/test/tool/TestEscapeSequenceParsing.java
+++ b/tool-testsuite/test/org/antlr/v4/test/tool/TestEscapeSequenceParsing.java
@ -0,0 +1,138 @@
+/*
+ * Copyright (c) 2012-2017 The ANTLR Project. All rights reserved.
+ * Use of this file is governed by the BSD 3-clause license that
+ * can be found in the LICENSE.txt file in the project root.
+ */
+
+package org.antlr.v4.test.tool;
+
+import org.antlr.v4.misc.EscapeSequenceParsing;
+import org.antlr.v4.runtime.misc.IntervalSet;
+
+import org.junit.Test;
+
+import static org.antlr.v4.misc.EscapeSequenceParsing.Result;
+import static org.junit.Assert.assertEquals;
+
+/**
+ * Unit tests for {@link EscapeSequenceParsing#parseEscape(String, int)}.
+ *
+ * Every test parses starting at offset 0 and compares the whole
+ * {@link Result} (type, code point, interval set and parse length).
+ */
+public class TestEscapeSequenceParsing {
+	// Bounds of the code point range these tests expect for \\p{Deseret}.
+	private static final int DESERET_FIRST = 0x10400;
+	private static final int DESERET_LAST = 0x1044F;
+
+	/** Asserts that parsing {@code input} from offset 0 yields {@link Result#INVALID}. */
+	private static void assertInvalid(String input) {
+		assertEquals(Result.INVALID, EscapeSequenceParsing.parseEscape(input, 0));
+	}
+
+	/** Asserts that {@code input} parses to a single code point consuming {@code expectedLength} chars. */
+	private static void assertCodePoint(int expectedCodePoint, int expectedLength, String input) {
+		Result expected = new Result(
+				Result.Type.CODE_POINT,
+				expectedCodePoint,
+				IntervalSet.EMPTY_SET,
+				expectedLength);
+		assertEquals(expected, EscapeSequenceParsing.parseEscape(input, 0));
+	}
+
+	/** Asserts that {@code input} parses to a property set consuming {@code expectedLength} chars. */
+	private static void assertProperty(IntervalSet expectedSet, int expectedLength, String input) {
+		Result expected = new Result(Result.Type.PROPERTY, -1, expectedSet, expectedLength);
+		assertEquals(expected, EscapeSequenceParsing.parseEscape(input, 0));
+	}
+
+	@Test
+	public void testParseEmpty() {
+		assertInvalid("");
+	}
+
+	@Test
+	public void testParseJustBackslash() {
+		assertInvalid("\\");
+	}
+
+	@Test
+	public void testParseInvalidEscape() {
+		assertInvalid("\\z");
+	}
+
+	@Test
+	public void testParseNewline() {
+		assertCodePoint('\n', 2, "\\n");
+	}
+
+	@Test
+	public void testParseTab() {
+		assertCodePoint('\t', 2, "\\t");
+	}
+
+	@Test
+	public void testParseUnicodeTooShort() {
+		assertInvalid("\\uABC");
+	}
+
+	@Test
+	public void testParseUnicodeBMP() {
+		assertCodePoint(0xABCD, 6, "\\uABCD");
+	}
+
+	@Test
+	public void testParseUnicodeSMPTooShort() {
+		assertInvalid("\\u{}");
+	}
+
+	@Test
+	public void testParseUnicodeSMPMissingCloseBrace() {
+		assertInvalid("\\u{12345");
+	}
+
+	@Test
+	public void testParseUnicodeTooBig() {
+		assertInvalid("\\u{110000}");
+	}
+
+	@Test
+	public void testParseUnicodeSMP() {
+		assertCodePoint(0x10ABCD, 10, "\\u{10ABCD}");
+	}
+
+	@Test
+	public void testParseUnicodePropertyTooShort() {
+		assertInvalid("\\p{}");
+	}
+
+	@Test
+	public void testParseUnicodePropertyMissingCloseBrace() {
+		assertInvalid("\\p{1234");
+	}
+
+	@Test
+	public void testParseUnicodeProperty() {
+		assertProperty(IntervalSet.of(DESERET_FIRST, DESERET_LAST), 11, "\\p{Deseret}");
+	}
+
+	@Test
+	public void testParseUnicodePropertyInvertedTooShort() {
+		assertInvalid("\\P{}");
+	}
+
+	@Test
+	public void testParseUnicodePropertyInvertedMissingCloseBrace() {
+		assertInvalid("\\P{Deseret");
+	}
+
+	@Test
+	public void testParseUnicodePropertyInverted() {
+		// Everything below and everything above the expected Deseret range.
+		IntervalSet everythingElse = IntervalSet.of(0, DESERET_FIRST - 1);
+		everythingElse.add(DESERET_LAST + 1, Character.MAX_CODE_POINT);
+		assertProperty(everythingElse, 11, "\\P{Deseret}");
+	}
+}
--- a/tool/src/org/antlr/v4/misc/CharSupport.java
+++ b/tool/src/org/antlr/v4/misc/CharSupport.java
@ -163,7 +163,7 @@ public class CharSupport {
 		}
 	}

-	private static int parseHexValue(String cstr, int startOff, int endOff) {
+	public static int parseHexValue(String cstr, int startOff, int endOff) {
 		if (startOff < 0 || endOff < 0) {
 			return -1;
 		}
--- a/tool/src/org/antlr/v4/misc/EscapeSequenceParsing.java
+++ b/tool/src/org/antlr/v4/misc/EscapeSequenceParsing.java
@ -0,0 +1,161 @@
+/*
+ * Copyright (c) 2012-2017 The ANTLR Project. All rights reserved.
+ * Use of this file is governed by the BSD 3-clause license that
+ * can be found in the LICENSE.txt file in the project root.
+ */
+
+package org.antlr.v4.misc;
+
+import java.util.Objects;
+
+import org.antlr.v4.runtime.Token;
+import org.antlr.v4.runtime.misc.IntervalSet;
+import org.antlr.v4.unicode.UnicodeData;
+
+/**
+ * Utility class to parse a single escape sequence out of a string, such as:
+ *   \\n
+ *   \\uABCD
+ *   \\u{10ABCD}
+ *   \\p{Foo}
+ *   \\P{Bar}
+ *
+ * (Backslashes are doubled in the comments of this file so that the Java
+ * compiler never sees a backslash-u pair it would treat as a Unicode escape.)
+ */
+public abstract class EscapeSequenceParsing {
+	/**
+	 * Immutable outcome of {@link #parseEscape(String, int)}.
+	 *
+	 * Two results are equal when all four fields are equal.
+	 */
+	public static class Result {
+		/** Discriminates what kind of escape (if any) was recognized. */
+		public enum Type {
+			INVALID,
+			CODE_POINT,
+			PROPERTY
+		}
+
+		/** Shared sentinel returned whenever no valid escape sequence could be parsed. */
+		public static final Result INVALID = new Result(Type.INVALID, -1, IntervalSet.EMPTY_SET, -1);
+
+		/** Kind of escape that was parsed. */
+		public final Type type;
+		/** Parsed code point for {@link Type#CODE_POINT}; the parser uses -1 otherwise. */
+		public final int codePoint;
+		/** Code points matched for {@link Type#PROPERTY}; the parser uses the empty set otherwise. */
+		public final IntervalSet propertyIntervalSet;
+		/** Number of chars consumed, counted from the start offset; -1 for {@link #INVALID}. */
+		public final int parseLength;
+
+		public Result(Type type, int codePoint, IntervalSet propertyIntervalSet, int parseLength) {
+			this.type = type;
+			this.codePoint = codePoint;
+			this.propertyIntervalSet = propertyIntervalSet;
+			this.parseLength = parseLength;
+		}
+
+		@Override
+		public String toString() {
+			return String.format(
+					"%s type=%s codePoint=%d propertyIntervalSet=%s parseLength=%d",
+					super.toString(),
+					type,
+					codePoint,
+					propertyIntervalSet,
+					parseLength);
+		}
+
+		@Override
+		public boolean equals(Object other) {
+			if (this == other) {
+				return true;
+			}
+			if (!(other instanceof Result)) {
+				return false;
+			}
+			Result that = (Result) other;
+			// Primitives and enum constants are compared directly; only the
+			// interval set needs a null-safe equals().
+			return this.type == that.type &&
+				this.codePoint == that.codePoint &&
+				this.parseLength == that.parseLength &&
+				Objects.equals(this.propertyIntervalSet, that.propertyIntervalSet);
+		}
+
+		@Override
+		public int hashCode() {
+			return Objects.hash(type, codePoint, propertyIntervalSet, parseLength);
+		}
+	}
+
+	/**
+	 * Parses a single escape sequence starting at {@code startOff}.
+	 *
+	 * @param s the string containing the escape sequence
+	 * @param startOff offset in {@code s} of the leading backslash
+	 * @return {@link Result#INVALID} if no valid escape sequence was found;
+	 *         otherwise a Result whose {@code parseLength} is the number of
+	 *         chars consumed starting at {@code startOff}
+	 */
+	public static Result parseEscape(String s, int startOff) {
+		// Need room for the backslash plus at least one escaped char.
+		if (startOff + 2 > s.length() || s.codePointAt(startOff) != '\\') {
+			return Result.INVALID;
+		}
+		int escaped = s.codePointAt(startOff + 1);
+		// First offset after the backslash and the escaped code point
+		// (which may occupy two chars).
+		int offset = startOff + 1 + Character.charCount(escaped);
+		if (escaped == 'u') {
+			return parseUnicodeEscape(s, startOff, offset);
+		}
+		if (escaped == 'p' || escaped == 'P') {
+			return parsePropertyEscape(s, startOff, offset, escaped == 'P');
+		}
+		return parseSimpleEscape(escaped, offset - startOff);
+	}
+
+	/** Parses the hex part of \\uXXXX or \\u{X...}; {@code offset} points just past the 'u'. */
+	private static Result parseUnicodeEscape(String s, int startOff, int offset) {
+		// \\u{1} is the shortest we support
+		if (offset + 3 > s.length()) {
+			return Result.INVALID;
+		}
+		int hexStartOffset;
+		int hexEndOffset;
+		int endOffset;
+		if (s.codePointAt(offset) == '{') {
+			// Braced form: hex digits run up to the next closing brace.
+			hexStartOffset = offset + 1;
+			hexEndOffset = s.indexOf('}', hexStartOffset);
+			if (hexEndOffset == -1) {
+				return Result.INVALID;
+			}
+			endOffset = hexEndOffset + 1;
+		} else {
+			// Unbraced form: exactly four chars of hex are required.
+			if (offset + 4 > s.length()) {
+				return Result.INVALID;
+			}
+			hexStartOffset = offset;
+			hexEndOffset = offset + 4;
+			endOffset = hexEndOffset;
+		}
+		int codePointValue = CharSupport.parseHexValue(s, hexStartOffset, hexEndOffset);
+		// -1 is parseHexValue's failure value; anything above MAX_CODE_POINT is not a code point.
+		if (codePointValue == -1 || codePointValue > Character.MAX_CODE_POINT) {
+			return Result.INVALID;
+		}
+		return new Result(
+			Result.Type.CODE_POINT,
+			codePointValue,
+			IntervalSet.EMPTY_SET,
+			endOffset - startOff);
+	}
+
+	/** Parses the {Name} part of \\p{Name} or \\P{Name}; {@code offset} points just past the 'p'/'P'. */
+	private static Result parsePropertyEscape(String s, int startOff, int offset, boolean inverted) {
+		// \\p{L} is the shortest we support
+		if (offset + 3 > s.length() || s.codePointAt(offset) != '{') {
+			return Result.INVALID;
+		}
+		int closeBraceOffset = s.indexOf('}', offset);
+		if (closeBraceOffset == -1) {
+			return Result.INVALID;
+		}
+		String propertyName = s.substring(offset + 1, closeBraceOffset);
+		IntervalSet propertyIntervalSet = UnicodeData.getPropertyCodePoints(propertyName);
+		if (propertyIntervalSet == null) {
+			// A null lookup result is treated as an invalid escape.
+			return Result.INVALID;
+		}
+		if (inverted) {
+			propertyIntervalSet = propertyIntervalSet.complement(IntervalSet.COMPLETE_CHAR_SET);
+		}
+		return new Result(
+			Result.Type.PROPERTY,
+			-1,
+			propertyIntervalSet,
+			closeBraceOffset + 1 - startOff);
+	}
+
+	/** Resolves a one-code-point escape such as \\n through {@code CharSupport.ANTLRLiteralEscapedCharValue}. */
+	private static Result parseSimpleEscape(int escaped, int parseLength) {
+		if (escaped >= CharSupport.ANTLRLiteralEscapedCharValue.length) {
+			return Result.INVALID;
+		}
+		int codePoint = CharSupport.ANTLRLiteralEscapedCharValue[escaped];
+		// A zero entry in the table is treated as "no such escape".
+		if (codePoint == 0) {
+			return Result.INVALID;
+		}
+		return new Result(
+			Result.Type.CODE_POINT,
+			codePoint,
+			IntervalSet.EMPTY_SET,
+			parseLength);
+	}
+}