EscapeSequenceParsing

This commit is contained in:
Ben Hamilton 2017-02-23 13:19:26 -08:00
parent b2012d9054
commit 3557f48386
3 changed files with 300 additions and 1 deletions

View File

@ -0,0 +1,138 @@
/*
* Copyright (c) 2012-2017 The ANTLR Project. All rights reserved.
* Use of this file is governed by the BSD 3-clause license that
* can be found in the LICENSE.txt file in the project root.
*/
package org.antlr.v4.test.tool;
import org.antlr.v4.misc.EscapeSequenceParsing;
import org.antlr.v4.runtime.misc.IntervalSet;
import org.junit.Test;
import static org.antlr.v4.misc.EscapeSequenceParsing.Result;
import static org.junit.Assert.assertEquals;
/**
 * Unit tests for {@link EscapeSequenceParsing#parseEscape(String, int)}.
 *
 * Every case parses from offset 0 and compares the returned {@link Result}
 * against either {@link Result#INVALID} or a fully specified expected value.
 */
public class TestEscapeSequenceParsing {
	/** Parses {@code escape} starting at its first character. */
	private static Result parse(String escape) {
		return EscapeSequenceParsing.parseEscape(escape, 0);
	}

	/** Asserts that {@code escape} is rejected as an invalid escape sequence. */
	private static void assertInvalid(String escape) {
		assertEquals(Result.INVALID, parse(escape));
	}

	/** Asserts that {@code escape} parses to a single code point of the given length. */
	private static void assertCodePoint(int expectedCodePoint, int expectedLength, String escape) {
		Result expected =
			new Result(Result.Type.CODE_POINT, expectedCodePoint, IntervalSet.EMPTY_SET, expectedLength);
		assertEquals(expected, parse(escape));
	}

	/** Asserts that {@code escape} parses to a property set of the given length. */
	private static void assertProperty(IntervalSet expectedSet, int expectedLength, String escape) {
		Result expected = new Result(Result.Type.PROPERTY, -1, expectedSet, expectedLength);
		assertEquals(expected, parse(escape));
	}

	// ---- Malformed or unknown escapes ----

	@Test
	public void testParseEmpty() {
		assertInvalid("");
	}

	@Test
	public void testParseJustBackslash() {
		assertInvalid("\\");
	}

	@Test
	public void testParseInvalidEscape() {
		assertInvalid("\\z");
	}

	// ---- Single-character escapes ----

	@Test
	public void testParseNewline() {
		assertCodePoint('\n', 2, "\\n");
	}

	@Test
	public void testParseTab() {
		assertCodePoint('\t', 2, "\\t");
	}

	// ---- Hexadecimal code point escapes ----

	@Test
	public void testParseUnicodeTooShort() {
		assertInvalid("\\uABC");
	}

	@Test
	public void testParseUnicodeBMP() {
		assertCodePoint(0xABCD, 6, "\\uABCD");
	}

	@Test
	public void testParseUnicodeSMPTooShort() {
		assertInvalid("\\u{}");
	}

	@Test
	public void testParseUnicodeSMPMissingCloseBrace() {
		assertInvalid("\\u{12345");
	}

	@Test
	public void testParseUnicodeTooBig() {
		// One past Character.MAX_CODE_POINT (0x10FFFF).
		assertInvalid("\\u{110000}");
	}

	@Test
	public void testParseUnicodeSMP() {
		assertCodePoint(0x10ABCD, 10, "\\u{10ABCD}");
	}

	// ---- Unicode property escapes ----

	@Test
	public void testParseUnicodePropertyTooShort() {
		assertInvalid("\\p{}");
	}

	@Test
	public void testParseUnicodePropertyMissingCloseBrace() {
		assertInvalid("\\p{1234");
	}

	@Test
	public void testParseUnicodeProperty() {
		// 66560..66639 is U+10400..U+1044F.
		assertProperty(IntervalSet.of(66560, 66639), 11, "\\p{Deseret}");
	}

	// ---- Inverted Unicode property escapes ----

	@Test
	public void testParseUnicodePropertyInvertedTooShort() {
		assertInvalid("\\P{}");
	}

	@Test
	public void testParseUnicodePropertyInvertedMissingCloseBrace() {
		assertInvalid("\\P{Deseret");
	}

	@Test
	public void testParseUnicodePropertyInverted() {
		// Everything except 66560..66639: the range below it plus the range above it.
		IntervalSet everythingElse = IntervalSet.of(0, 66559);
		everythingElse.add(66640, Character.MAX_CODE_POINT);
		assertProperty(everythingElse, 11, "\\P{Deseret}");
	}
}

View File

@ -163,7 +163,7 @@ public class CharSupport {
}
}
private static int parseHexValue(String cstr, int startOff, int endOff) {
public static int parseHexValue(String cstr, int startOff, int endOff) {
if (startOff < 0 || endOff < 0) {
return -1;
}

View File

@ -0,0 +1,161 @@
/*
* Copyright (c) 2012-2017 The ANTLR Project. All rights reserved.
* Use of this file is governed by the BSD 3-clause license that
* can be found in the LICENSE.txt file in the project root.
*/
package org.antlr.v4.misc;
import java.util.Objects;
import org.antlr.v4.runtime.Token;
import org.antlr.v4.runtime.misc.IntervalSet;
import org.antlr.v4.unicode.UnicodeData;
/**
 * Utility class to parse escapes like:
 *   \\n
 *   \\uABCD
 *   \\u{10ABCD}
 *   \\p{Foo}
 *   \\P{Bar}
 *
 * (Backslashes are written doubled in these comments on purpose: a single
 * backslash followed by the letter u would be read by the Java compiler as
 * a Unicode escape, even inside a comment.)
 */
public abstract class EscapeSequenceParsing {
	/**
	 * Outcome of parsing one escape sequence.
	 *
	 * Instances are value objects: {@link #equals(Object)} and
	 * {@link #hashCode()} are defined over all four fields.
	 */
	public static class Result {
		/** Discriminates what, if anything, an escape sequence denoted. */
		public enum Type {
			INVALID,
			CODE_POINT,
			PROPERTY
		}

		/**
		 * Shared sentinel returned whenever no valid escape sequence was found.
		 * Declared {@code final} so callers cannot replace the sentinel.
		 */
		public static final Result INVALID = new Result(Type.INVALID, -1, IntervalSet.EMPTY_SET, -1);

		/** Kind of result. */
		public final Type type;
		/** Parsed code point for {@link Type#CODE_POINT}; {@code -1} is stored for the other types. */
		public final int codePoint;
		/** Code point set for {@link Type#PROPERTY}; {@link IntervalSet#EMPTY_SET} is stored for the other types. */
		public final IntervalSet propertyIntervalSet;
		/** Number of {@code char}s consumed, counted from the backslash; {@code -1} for {@link #INVALID}. */
		public final int parseLength;

		public Result(Type type, int codePoint, IntervalSet propertyIntervalSet, int parseLength) {
			this.type = type;
			this.codePoint = codePoint;
			this.propertyIntervalSet = propertyIntervalSet;
			this.parseLength = parseLength;
		}

		@Override
		public String toString() {
			return String.format(
				"%s type=%s codePoint=%d propertyIntervalSet=%s parseLength=%d",
				super.toString(),
				type,
				codePoint,
				propertyIntervalSet,
				parseLength);
		}

		@Override
		public boolean equals(Object other) {
			if (this == other) {
				return true;
			}
			if (!(other instanceof Result)) {
				return false;
			}
			Result that = (Result) other;
			return this.type == that.type &&
				this.codePoint == that.codePoint &&
				Objects.equals(this.propertyIntervalSet, that.propertyIntervalSet) &&
				this.parseLength == that.parseLength;
		}

		@Override
		public int hashCode() {
			return Objects.hash(type, codePoint, propertyIntervalSet, parseLength);
		}
	}

	/**
	 * Parses a single escape sequence starting at {@code startOff}.
	 *
	 * @param s the text containing the escape sequence
	 * @param startOff index in {@code s} of the leading backslash
	 * @return {@link Result#INVALID} if no valid escape sequence was found,
	 *         otherwise a {@link Result} whose {@code parseLength} is the
	 *         number of {@code char}s the escape occupies
	 */
	public static Result parseEscape(String s, int startOff) {
		int offset = startOff;
		// Need at least a backslash plus one escaped character.
		if (offset + 2 > s.length() || s.codePointAt(offset) != '\\') {
			return Result.INVALID;
		}
		// Move past backslash
		offset++;
		int escaped = s.codePointAt(offset);
		// Move past escaped code point
		offset += Character.charCount(escaped);

		if (escaped == 'u') {
			return parseCodePointEscape(s, startOff, offset);
		}
		if (escaped == 'p' || escaped == 'P') {
			return parsePropertyEscape(s, startOff, offset, escaped == 'P');
		}
		if (escaped < CharSupport.ANTLRLiteralEscapedCharValue.length) {
			int codePoint = CharSupport.ANTLRLiteralEscapedCharValue[escaped];
			// A zero table entry is treated as "not a recognized escape".
			if (codePoint == 0) {
				return Result.INVALID;
			}
			return new Result(
				Result.Type.CODE_POINT,
				codePoint,
				IntervalSet.EMPTY_SET,
				offset - startOff);
		}
		return Result.INVALID;
	}

	/**
	 * Parses the hexadecimal part of a {@code u} escape, in either the
	 * fixed four-digit form or the brace-delimited form.
	 *
	 * @param offset index of the first character after the {@code u}
	 */
	private static Result parseCodePointEscape(String s, int startOff, int offset) {
		// \\u{1} is the shortest we support
		if (offset + 3 > s.length()) {
			return Result.INVALID;
		}
		int hexStartOffset;
		int hexEndOffset;
		if (s.codePointAt(offset) == '{') {
			hexStartOffset = offset + 1;
			hexEndOffset = s.indexOf('}', hexStartOffset);
			if (hexEndOffset == -1) {
				return Result.INVALID;
			}
			offset = hexEndOffset + 1;
		} else {
			if (offset + 4 > s.length()) {
				return Result.INVALID;
			}
			hexStartOffset = offset;
			hexEndOffset = offset + 4;
			offset = hexEndOffset;
		}
		int codePointValue = CharSupport.parseHexValue(s, hexStartOffset, hexEndOffset);
		// parseHexValue returns -1 on the failure path visible in this file.
		// No negative number is a code point, so reject the whole negative
		// range rather than only that sentinel.
		// NOTE(review): confirm whether parseHexValue can yield other
		// negative values (e.g. for signed input such as "-2").
		if (codePointValue < 0 || codePointValue > Character.MAX_CODE_POINT) {
			return Result.INVALID;
		}
		return new Result(
			Result.Type.CODE_POINT,
			codePointValue,
			IntervalSet.EMPTY_SET,
			offset - startOff);
	}

	/**
	 * Parses the brace-delimited property name of a {@code p} / {@code P}
	 * escape and resolves it through {@link UnicodeData#getPropertyCodePoints}.
	 *
	 * @param offset index of the first character after the {@code p} / {@code P}
	 * @param inverted {@code true} for the upper-case form, whose result is the
	 *        complement of the property within {@link IntervalSet#COMPLETE_CHAR_SET}
	 */
	private static Result parsePropertyEscape(String s, int startOff, int offset, boolean inverted) {
		// \p{L} is the shortest we support
		if (offset + 3 > s.length() || s.codePointAt(offset) != '{') {
			return Result.INVALID;
		}
		int openBraceOffset = offset;
		int closeBraceOffset = s.indexOf('}', openBraceOffset);
		if (closeBraceOffset == -1) {
			return Result.INVALID;
		}
		String propertyName = s.substring(openBraceOffset + 1, closeBraceOffset);
		IntervalSet propertyIntervalSet = UnicodeData.getPropertyCodePoints(propertyName);
		// A null lookup result is treated as an unknown property name.
		if (propertyIntervalSet == null) {
			return Result.INVALID;
		}
		if (inverted) {
			propertyIntervalSet = propertyIntervalSet.complement(IntervalSet.COMPLETE_CHAR_SET);
		}
		return new Result(
			Result.Type.PROPERTY,
			-1,
			propertyIntervalSet,
			closeBraceOffset + 1 - startOff);
	}
}