forked from jasder/antlr
EscapeSequenceParsing
This commit is contained in:
parent
b2012d9054
commit
3557f48386
|
@ -0,0 +1,138 @@
|
|||
/*
|
||||
* Copyright (c) 2012-2017 The ANTLR Project. All rights reserved.
|
||||
* Use of this file is governed by the BSD 3-clause license that
|
||||
* can be found in the LICENSE.txt file in the project root.
|
||||
*/
|
||||
|
||||
package org.antlr.v4.test.tool;
|
||||
|
||||
import org.antlr.v4.misc.EscapeSequenceParsing;
|
||||
import org.antlr.v4.runtime.misc.IntervalSet;
|
||||
|
||||
import org.junit.Test;
|
||||
|
||||
import static org.antlr.v4.misc.EscapeSequenceParsing.Result;
|
||||
import static org.junit.Assert.assertEquals;
|
||||
|
||||
public class TestEscapeSequenceParsing {
|
||||
@Test
|
||||
public void testParseEmpty() {
|
||||
assertEquals(
|
||||
EscapeSequenceParsing.Result.INVALID,
|
||||
EscapeSequenceParsing.parseEscape("", 0));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testParseJustBackslash() {
|
||||
assertEquals(
|
||||
EscapeSequenceParsing.Result.INVALID,
|
||||
EscapeSequenceParsing.parseEscape("\\", 0));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testParseInvalidEscape() {
|
||||
assertEquals(
|
||||
EscapeSequenceParsing.Result.INVALID,
|
||||
EscapeSequenceParsing.parseEscape("\\z", 0));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testParseNewline() {
|
||||
assertEquals(
|
||||
new Result(Result.Type.CODE_POINT, '\n', IntervalSet.EMPTY_SET, 2),
|
||||
EscapeSequenceParsing.parseEscape("\\n", 0));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testParseTab() {
|
||||
assertEquals(
|
||||
new Result(Result.Type.CODE_POINT, '\t', IntervalSet.EMPTY_SET, 2),
|
||||
EscapeSequenceParsing.parseEscape("\\t", 0));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testParseUnicodeTooShort() {
|
||||
assertEquals(
|
||||
EscapeSequenceParsing.Result.INVALID,
|
||||
EscapeSequenceParsing.parseEscape("\\uABC", 0));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testParseUnicodeBMP() {
|
||||
assertEquals(
|
||||
new Result(Result.Type.CODE_POINT, 0xABCD, IntervalSet.EMPTY_SET, 6),
|
||||
EscapeSequenceParsing.parseEscape("\\uABCD", 0));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testParseUnicodeSMPTooShort() {
|
||||
assertEquals(
|
||||
EscapeSequenceParsing.Result.INVALID,
|
||||
EscapeSequenceParsing.parseEscape("\\u{}", 0));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testParseUnicodeSMPMissingCloseBrace() {
|
||||
assertEquals(
|
||||
EscapeSequenceParsing.Result.INVALID,
|
||||
EscapeSequenceParsing.parseEscape("\\u{12345", 0));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testParseUnicodeTooBig() {
|
||||
assertEquals(
|
||||
EscapeSequenceParsing.Result.INVALID,
|
||||
EscapeSequenceParsing.parseEscape("\\u{110000}", 0));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testParseUnicodeSMP() {
|
||||
assertEquals(
|
||||
new Result(Result.Type.CODE_POINT, 0x10ABCD, IntervalSet.EMPTY_SET, 10),
|
||||
EscapeSequenceParsing.parseEscape("\\u{10ABCD}", 0));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testParseUnicodePropertyTooShort() {
|
||||
assertEquals(
|
||||
EscapeSequenceParsing.Result.INVALID,
|
||||
EscapeSequenceParsing.parseEscape("\\p{}", 0));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testParseUnicodePropertyMissingCloseBrace() {
|
||||
assertEquals(
|
||||
EscapeSequenceParsing.Result.INVALID,
|
||||
EscapeSequenceParsing.parseEscape("\\p{1234", 0));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testParseUnicodeProperty() {
|
||||
assertEquals(
|
||||
new Result(Result.Type.PROPERTY, -1, IntervalSet.of(66560, 66639), 11),
|
||||
EscapeSequenceParsing.parseEscape("\\p{Deseret}", 0));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testParseUnicodePropertyInvertedTooShort() {
|
||||
assertEquals(
|
||||
EscapeSequenceParsing.Result.INVALID,
|
||||
EscapeSequenceParsing.parseEscape("\\P{}", 0));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testParseUnicodePropertyInvertedMissingCloseBrace() {
|
||||
assertEquals(
|
||||
EscapeSequenceParsing.Result.INVALID,
|
||||
EscapeSequenceParsing.parseEscape("\\P{Deseret", 0));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testParseUnicodePropertyInverted() {
|
||||
IntervalSet expected = IntervalSet.of(0, 66559);
|
||||
expected.add(66640, Character.MAX_CODE_POINT);
|
||||
assertEquals(
|
||||
new Result(Result.Type.PROPERTY, -1, expected, 11),
|
||||
EscapeSequenceParsing.parseEscape("\\P{Deseret}", 0));
|
||||
}
|
||||
}
|
|
@ -163,7 +163,7 @@ public class CharSupport {
|
|||
}
|
||||
}
|
||||
|
||||
private static int parseHexValue(String cstr, int startOff, int endOff) {
|
||||
public static int parseHexValue(String cstr, int startOff, int endOff) {
|
||||
if (startOff < 0 || endOff < 0) {
|
||||
return -1;
|
||||
}
|
||||
|
|
|
@ -0,0 +1,161 @@
|
|||
/*
|
||||
* Copyright (c) 2012-2017 The ANTLR Project. All rights reserved.
|
||||
* Use of this file is governed by the BSD 3-clause license that
|
||||
* can be found in the LICENSE.txt file in the project root.
|
||||
*/
|
||||
|
||||
package org.antlr.v4.misc;
|
||||
|
||||
import java.util.Objects;
|
||||
|
||||
import org.antlr.v4.runtime.Token;
|
||||
import org.antlr.v4.runtime.misc.IntervalSet;
|
||||
import org.antlr.v4.unicode.UnicodeData;
|
||||
|
||||
/**
|
||||
* Utility class to parse escapes like:
|
||||
* \\n
|
||||
* \\uABCD
|
||||
* \\u{10ABCD}
|
||||
* \\p{Foo}
|
||||
* \\P{Bar}
|
||||
*/
|
||||
public abstract class EscapeSequenceParsing {
|
||||
public static class Result {
|
||||
public enum Type {
|
||||
INVALID,
|
||||
CODE_POINT,
|
||||
PROPERTY
|
||||
};
|
||||
|
||||
public static Result INVALID = new Result(Type.INVALID, -1, IntervalSet.EMPTY_SET, -1);
|
||||
|
||||
public final Type type;
|
||||
public final int codePoint;
|
||||
public final IntervalSet propertyIntervalSet;
|
||||
public final int parseLength;
|
||||
|
||||
public Result(Type type, int codePoint, IntervalSet propertyIntervalSet, int parseLength) {
|
||||
this.type = type;
|
||||
this.codePoint = codePoint;
|
||||
this.propertyIntervalSet = propertyIntervalSet;
|
||||
this.parseLength = parseLength;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return String.format(
|
||||
"%s type=%s codePoint=%d propertyIntervalSet=%s parseLength=%d",
|
||||
super.toString(),
|
||||
type,
|
||||
codePoint,
|
||||
propertyIntervalSet,
|
||||
parseLength);
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean equals(Object other) {
|
||||
if (!(other instanceof Result)) {
|
||||
return false;
|
||||
}
|
||||
Result that = (Result) other;
|
||||
if (this == that) {
|
||||
return true;
|
||||
}
|
||||
return Objects.equals(this.type, that.type) &&
|
||||
Objects.equals(this.codePoint, that.codePoint) &&
|
||||
Objects.equals(this.propertyIntervalSet, that.propertyIntervalSet) &&
|
||||
Objects.equals(this.parseLength, that.parseLength);
|
||||
}
|
||||
|
||||
@Override
|
||||
public int hashCode() {
|
||||
return Objects.hash(type, codePoint, propertyIntervalSet, parseLength);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Parses a single escape sequence starting at {@code startOff}.
|
||||
*
|
||||
* Returns {@link Result.INVALID} if no valid escape sequence was found, a Result otherwise.
|
||||
*/
|
||||
public static Result parseEscape(String s, int startOff) {
|
||||
int offset = startOff;
|
||||
if (offset + 2 > s.length() || s.codePointAt(offset) != '\\') {
|
||||
return Result.INVALID;
|
||||
}
|
||||
// Move past backslash
|
||||
offset++;
|
||||
int escaped = s.codePointAt(offset);
|
||||
// Move past escaped code point
|
||||
offset += Character.charCount(escaped);
|
||||
if (escaped == 'u') {
|
||||
// \\u{1} is the shortest we support
|
||||
if (offset + 3 > s.length()) {
|
||||
return Result.INVALID;
|
||||
}
|
||||
int hexStartOffset;
|
||||
int hexEndOffset;
|
||||
if (s.codePointAt(offset) == '{') {
|
||||
hexStartOffset = offset + 1;
|
||||
hexEndOffset = s.indexOf('}', hexStartOffset);
|
||||
if (hexEndOffset == -1) {
|
||||
return Result.INVALID;
|
||||
}
|
||||
offset = hexEndOffset + 1;
|
||||
} else {
|
||||
if (offset + 4 > s.length()) {
|
||||
return Result.INVALID;
|
||||
}
|
||||
hexStartOffset = offset;
|
||||
hexEndOffset = offset + 4;
|
||||
offset = hexEndOffset;
|
||||
}
|
||||
int codePointValue = CharSupport.parseHexValue(s, hexStartOffset, hexEndOffset);
|
||||
if (codePointValue == -1 || codePointValue > Character.MAX_CODE_POINT) {
|
||||
return Result.INVALID;
|
||||
}
|
||||
return new Result(
|
||||
Result.Type.CODE_POINT,
|
||||
codePointValue,
|
||||
IntervalSet.EMPTY_SET,
|
||||
offset - startOff);
|
||||
} else if (escaped == 'p' || escaped == 'P') {
|
||||
// \p{L} is the shortest we support
|
||||
if (offset + 3 > s.length() || s.codePointAt(offset) != '{') {
|
||||
return Result.INVALID;
|
||||
}
|
||||
int openBraceOffset = offset;
|
||||
int closeBraceOffset = s.indexOf('}', openBraceOffset);
|
||||
if (closeBraceOffset == -1) {
|
||||
return Result.INVALID;
|
||||
}
|
||||
String propertyName = s.substring(openBraceOffset + 1, closeBraceOffset);
|
||||
IntervalSet propertyIntervalSet = UnicodeData.getPropertyCodePoints(propertyName);
|
||||
if (propertyIntervalSet == null) {
|
||||
return Result.INVALID;
|
||||
}
|
||||
offset = closeBraceOffset + 1;
|
||||
if (escaped == 'P') {
|
||||
propertyIntervalSet = propertyIntervalSet.complement(IntervalSet.COMPLETE_CHAR_SET);
|
||||
}
|
||||
return new Result(
|
||||
Result.Type.PROPERTY,
|
||||
-1,
|
||||
propertyIntervalSet,
|
||||
offset - startOff);
|
||||
} else if (escaped < CharSupport.ANTLRLiteralEscapedCharValue.length) {
|
||||
int codePoint = CharSupport.ANTLRLiteralEscapedCharValue[escaped];
|
||||
if (codePoint == 0) {
|
||||
return Result.INVALID;
|
||||
}
|
||||
return new Result(
|
||||
Result.Type.CODE_POINT,
|
||||
codePoint,
|
||||
IntervalSet.EMPTY_SET,
|
||||
offset - startOff);
|
||||
} else {
|
||||
return Result.INVALID;
|
||||
}
|
||||
}
|
||||
}
|
Loading…
Reference in New Issue