Merge pull request #1626 from bhamiltoncx/new-code-point-char-stream
New class CodePointCharStream (alternative to ANTLRInputStream)
This commit is contained in:
commit
508d2f988f
|
@ -0,0 +1,293 @@
|
|||
/*
|
||||
* Copyright (c) 2012-2017 The ANTLR Project. All rights reserved.
|
||||
* Use of this file is governed by the BSD 3-clause license that
|
||||
* can be found in the LICENSE.txt file in the project root.
|
||||
*/
|
||||
package org.antlr.v4.test.runtime.java;
|
||||
|
||||
import static org.junit.Assert.assertEquals;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.nio.IntBuffer;
|
||||
|
||||
import org.antlr.v4.runtime.CodePointCharStream;
|
||||
import org.antlr.v4.runtime.IntStream;
|
||||
|
||||
import org.antlr.v4.runtime.misc.Interval;
|
||||
|
||||
import org.junit.Rule;
|
||||
import org.junit.Test;
|
||||
import org.junit.rules.ExpectedException;
|
||||
|
||||
public class TestCodePointCharStream {
|
||||
@Rule
|
||||
public ExpectedException thrown = ExpectedException.none();
|
||||
|
||||
@Test
|
||||
public void emptyBytesHasSize0() {
|
||||
CodePointCharStream s = CodePointCharStream.createWithString("");
|
||||
assertEquals(0, s.size());
|
||||
assertEquals(0, s.index());
|
||||
}
|
||||
|
||||
@Test
|
||||
public void emptyBytesLookAheadReturnsEOF() {
|
||||
CodePointCharStream s = CodePointCharStream.createWithString("");
|
||||
assertEquals(IntStream.EOF, s.LA(1));
|
||||
assertEquals(0, s.index());
|
||||
}
|
||||
|
||||
@Test
|
||||
public void consumingEmptyStreamShouldThrow() {
|
||||
CodePointCharStream s = CodePointCharStream.createWithString("");
|
||||
thrown.expect(IllegalStateException.class);
|
||||
thrown.expectMessage("cannot consume EOF");
|
||||
s.consume();
|
||||
}
|
||||
|
||||
@Test
|
||||
public void singleLatinCodePointHasSize1() {
|
||||
CodePointCharStream s = CodePointCharStream.createWithString("X");
|
||||
assertEquals(1, s.size());
|
||||
}
|
||||
|
||||
@Test
|
||||
public void consumingSingleLatinCodePointShouldMoveIndex() {
|
||||
CodePointCharStream s = CodePointCharStream.createWithString("X");
|
||||
assertEquals(0, s.index());
|
||||
s.consume();
|
||||
assertEquals(1, s.index());
|
||||
}
|
||||
|
||||
@Test
|
||||
public void consumingPastSingleLatinCodePointShouldThrow() {
|
||||
CodePointCharStream s = CodePointCharStream.createWithString("X");
|
||||
s.consume();
|
||||
thrown.expect(IllegalStateException.class);
|
||||
thrown.expectMessage("cannot consume EOF");
|
||||
s.consume();
|
||||
}
|
||||
|
||||
@Test
|
||||
public void singleLatinCodePointLookAheadShouldReturnCodePoint() {
|
||||
CodePointCharStream s = CodePointCharStream.createWithString("X");
|
||||
assertEquals('X', s.LA(1));
|
||||
assertEquals(0, s.index());
|
||||
}
|
||||
|
||||
@Test
|
||||
public void multipleLatinCodePointsLookAheadShouldReturnCodePoints() {
|
||||
CodePointCharStream s = CodePointCharStream.createWithString("XYZ");
|
||||
assertEquals('X', s.LA(1));
|
||||
assertEquals(0, s.index());
|
||||
assertEquals('Y', s.LA(2));
|
||||
assertEquals(0, s.index());
|
||||
assertEquals('Z', s.LA(3));
|
||||
assertEquals(0, s.index());
|
||||
}
|
||||
|
||||
@Test
|
||||
public void singleLatinCodePointLookAheadPastEndShouldReturnEOF() {
|
||||
CodePointCharStream s = CodePointCharStream.createWithString("X");
|
||||
assertEquals(IntStream.EOF, s.LA(2));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void singleCJKCodePointHasSize1() {
|
||||
CodePointCharStream s = CodePointCharStream.createWithString("\u611B");
|
||||
assertEquals(1, s.size());
|
||||
assertEquals(0, s.index());
|
||||
}
|
||||
|
||||
@Test
|
||||
public void consumingSingleCJKCodePointShouldMoveIndex() {
|
||||
CodePointCharStream s = CodePointCharStream.createWithString("\u611B");
|
||||
assertEquals(0, s.index());
|
||||
s.consume();
|
||||
assertEquals(1, s.index());
|
||||
}
|
||||
|
||||
@Test
|
||||
public void consumingPastSingleCJKCodePointShouldThrow() {
|
||||
CodePointCharStream s = CodePointCharStream.createWithString("\u611B");
|
||||
s.consume();
|
||||
thrown.expect(IllegalStateException.class);
|
||||
thrown.expectMessage("cannot consume EOF");
|
||||
s.consume();
|
||||
}
|
||||
|
||||
@Test
|
||||
public void singleCJKCodePointLookAheadShouldReturnCodePoint() {
|
||||
CodePointCharStream s = CodePointCharStream.createWithString("\u611B");
|
||||
assertEquals(0x611B, s.LA(1));
|
||||
assertEquals(0, s.index());
|
||||
}
|
||||
|
||||
@Test
|
||||
public void singleCJKCodePointLookAheadPastEndShouldReturnEOF() {
|
||||
CodePointCharStream s = CodePointCharStream.createWithString("\u611B");
|
||||
assertEquals(IntStream.EOF, s.LA(2));
|
||||
assertEquals(0, s.index());
|
||||
}
|
||||
|
||||
@Test
|
||||
public void singleEmojiCodePointHasSize1() {
|
||||
CodePointCharStream s = CodePointCharStream.createWithString(
|
||||
new StringBuilder().appendCodePoint(0x1F4A9).toString());
|
||||
assertEquals(1, s.size());
|
||||
assertEquals(0, s.index());
|
||||
}
|
||||
|
||||
@Test
|
||||
public void consumingSingleEmojiCodePointShouldMoveIndex() {
|
||||
CodePointCharStream s = CodePointCharStream.createWithString(
|
||||
new StringBuilder().appendCodePoint(0x1F4A9).toString());
|
||||
assertEquals(0, s.index());
|
||||
s.consume();
|
||||
assertEquals(1, s.index());
|
||||
}
|
||||
|
||||
@Test
|
||||
public void consumingPastEndOfEmojiCodePointWithShouldThrow() {
|
||||
CodePointCharStream s = CodePointCharStream.createWithString(
|
||||
new StringBuilder().appendCodePoint(0x1F4A9).toString());
|
||||
assertEquals(0, s.index());
|
||||
s.consume();
|
||||
assertEquals(1, s.index());
|
||||
thrown.expect(IllegalStateException.class);
|
||||
thrown.expectMessage("cannot consume EOF");
|
||||
s.consume();
|
||||
}
|
||||
|
||||
@Test
|
||||
public void singleEmojiCodePointLookAheadShouldReturnCodePoint() {
|
||||
CodePointCharStream s = CodePointCharStream.createWithString(
|
||||
new StringBuilder().appendCodePoint(0x1F4A9).toString());
|
||||
assertEquals(0x1F4A9, s.LA(1));
|
||||
assertEquals(0, s.index());
|
||||
}
|
||||
|
||||
@Test
|
||||
public void singleEmojiCodePointLookAheadPastEndShouldReturnEOF() {
|
||||
CodePointCharStream s = CodePointCharStream.createWithString(
|
||||
new StringBuilder().appendCodePoint(0x1F4A9).toString());
|
||||
assertEquals(IntStream.EOF, s.LA(2));
|
||||
assertEquals(0, s.index());
|
||||
}
|
||||
|
||||
@Test
|
||||
public void getTextWithLatin() {
|
||||
CodePointCharStream s = CodePointCharStream.createWithString("0123456789");
|
||||
assertEquals("34567", s.getText(Interval.of(3, 7)));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void getTextWithCJK() {
|
||||
CodePointCharStream s = CodePointCharStream.createWithString("01234\u40946789");
|
||||
assertEquals("34\u409467", s.getText(Interval.of(3, 7)));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void getTextWithEmoji() {
|
||||
CodePointCharStream s = CodePointCharStream.createWithString(
|
||||
new StringBuilder("01234")
|
||||
.appendCodePoint(0x1F522)
|
||||
.append("6789")
|
||||
.toString());
|
||||
assertEquals("34\uD83D\uDD2267", s.getText(Interval.of(3, 7)));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void toStringWithLatin() {
|
||||
CodePointCharStream s = CodePointCharStream.createWithString("0123456789");
|
||||
assertEquals("0123456789", s.toString());
|
||||
}
|
||||
|
||||
@Test
|
||||
public void toStringWithCJK() {
|
||||
CodePointCharStream s = CodePointCharStream.createWithString("01234\u40946789");
|
||||
assertEquals("01234\u40946789", s.toString());
|
||||
}
|
||||
|
||||
@Test
|
||||
public void toStringWithEmoji() {
|
||||
CodePointCharStream s = CodePointCharStream.createWithString(
|
||||
new StringBuilder("01234")
|
||||
.appendCodePoint(0x1F522)
|
||||
.append("6789")
|
||||
.toString());
|
||||
assertEquals("01234\uD83D\uDD226789", s.toString());
|
||||
}
|
||||
|
||||
@Test
|
||||
public void lookAheadWithLatin() {
|
||||
CodePointCharStream s = CodePointCharStream.createWithString("0123456789");
|
||||
assertEquals('5', s.LA(6));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void lookAheadWithCJK() {
|
||||
CodePointCharStream s = CodePointCharStream.createWithString("01234\u40946789");
|
||||
assertEquals(0x4094, s.LA(6));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void lookAheadWithEmoji() {
|
||||
CodePointCharStream s = CodePointCharStream.createWithString(
|
||||
new StringBuilder("01234")
|
||||
.appendCodePoint(0x1F522)
|
||||
.append("6789")
|
||||
.toString());
|
||||
assertEquals(0x1F522, s.LA(6));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void seekWithLatin() {
|
||||
CodePointCharStream s = CodePointCharStream.createWithString("0123456789");
|
||||
s.seek(5);
|
||||
assertEquals('5', s.LA(1));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void seekWithCJK() {
|
||||
CodePointCharStream s = CodePointCharStream.createWithString("01234\u40946789");
|
||||
s.seek(5);
|
||||
assertEquals(0x4094, s.LA(1));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void seekWithEmoji() {
|
||||
CodePointCharStream s = CodePointCharStream.createWithString(
|
||||
new StringBuilder("01234")
|
||||
.appendCodePoint(0x1F522)
|
||||
.append("6789")
|
||||
.toString());
|
||||
s.seek(5);
|
||||
assertEquals(0x1F522, s.LA(1));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void lookBehindWithLatin() {
|
||||
CodePointCharStream s = CodePointCharStream.createWithString("0123456789");
|
||||
s.seek(6);
|
||||
assertEquals('5', s.LA(-1));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void lookBehindWithCJK() {
|
||||
CodePointCharStream s = CodePointCharStream.createWithString("01234\u40946789");
|
||||
s.seek(6);
|
||||
assertEquals(0x4094, s.LA(-1));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void lookBehindWithEmoji() {
|
||||
CodePointCharStream s = CodePointCharStream.createWithString(
|
||||
new StringBuilder("01234")
|
||||
.appendCodePoint(0x1F522)
|
||||
.append("6789")
|
||||
.toString());
|
||||
s.seek(6);
|
||||
assertEquals(0x1F522, s.LA(-1));
|
||||
}
|
||||
}
|
|
@ -0,0 +1,161 @@
|
|||
/*
|
||||
* Copyright (c) 2012-2017 The ANTLR Project. All rights reserved.
|
||||
* Use of this file is governed by the BSD 3-clause license that
|
||||
* can be found in the LICENSE.txt file in the project root.
|
||||
*/
|
||||
package org.antlr.v4.runtime;
|
||||
|
||||
import org.antlr.v4.runtime.misc.Interval;
|
||||
|
||||
import java.nio.IntBuffer;
|
||||
|
||||
/**
|
||||
* Alternative to {@link ANTLRInputStream} which treats the input
|
||||
* as a series of Unicode code points, instead of a series of UTF-16
|
||||
* code units.
|
||||
*
|
||||
* Use this if you need to parse input which potentially contains
|
||||
* Unicode values > U+FFFF.
|
||||
*/
|
||||
public final class CodePointCharStream implements CharStream {
|
||||
private final IntBuffer codePointBuffer;
|
||||
private final int initialPosition;
|
||||
private final int size;
|
||||
private final String name;
|
||||
|
||||
/**
|
||||
* Convenience method to create a {@link CodePointCharStream}
|
||||
* for the Unicode code points in a Java {@link String}.
|
||||
*/
|
||||
public static CodePointCharStream createWithString(String s) {
|
||||
// Initial guess assumes no code points > U+FFFF: one code
|
||||
// point for each code unit in the string
|
||||
IntBuffer codePointBuffer = IntBuffer.allocate(s.length());
|
||||
int stringIdx = 0;
|
||||
while (stringIdx < s.length()) {
|
||||
if (!codePointBuffer.hasRemaining()) {
|
||||
// Grow the code point buffer size by 2.
|
||||
IntBuffer newBuffer = IntBuffer.allocate(codePointBuffer.capacity() * 2);
|
||||
codePointBuffer.flip();
|
||||
newBuffer.put(codePointBuffer);
|
||||
codePointBuffer = newBuffer;
|
||||
}
|
||||
int codePoint = Character.codePointAt(s, stringIdx);
|
||||
codePointBuffer.put(codePoint);
|
||||
stringIdx += Character.charCount(codePoint);
|
||||
}
|
||||
codePointBuffer.flip();
|
||||
return new CodePointCharStream(codePointBuffer);
|
||||
}
|
||||
|
||||
/**
|
||||
* Constructs a {@link CodePointCharStream} which provides access
|
||||
* to the Unicode code points stored in {@code codePointBuffer}.
|
||||
*
|
||||
* {@code codePointBuffer}'s {@link IntBuffer#position position}
|
||||
* reflects the first code point of the stream, and its
|
||||
* {@link IntBuffer#limit limit} is just after the last code point
|
||||
* of the stream.
|
||||
*/
|
||||
public CodePointCharStream(IntBuffer codePointBuffer) {
|
||||
this(codePointBuffer, UNKNOWN_SOURCE_NAME);
|
||||
}
|
||||
|
||||
/**
|
||||
* Constructs a named {@link CodePointCharStream} which provides access
|
||||
* to the Unicode code points stored in {@code codePointBuffer}.
|
||||
*
|
||||
* {@code codePointBuffer}'s {@link IntBuffer#position position}
|
||||
* reflects the first code point of the stream, and its
|
||||
* {@link IntBuffer#limit limit} is just after the last code point
|
||||
* of the stream.
|
||||
*/
|
||||
public CodePointCharStream(IntBuffer codePointBuffer, String name) {
|
||||
this.codePointBuffer = codePointBuffer;
|
||||
this.initialPosition = codePointBuffer.position();
|
||||
this.size = codePointBuffer.remaining();
|
||||
this.name = name;
|
||||
}
|
||||
|
||||
private int relativeBufferPosition(int i) {
|
||||
return initialPosition + codePointBuffer.position() + i;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void consume() {
|
||||
if (!codePointBuffer.hasRemaining()) {
|
||||
assert LA(1) == IntStream.EOF;
|
||||
throw new IllegalStateException("cannot consume EOF");
|
||||
}
|
||||
codePointBuffer.position(codePointBuffer.position() + 1);
|
||||
}
|
||||
|
||||
@Override
|
||||
public int LA(int i) {
|
||||
if (i == 0) {
|
||||
// Undefined
|
||||
return 0;
|
||||
} else if (i < 0) {
|
||||
if (codePointBuffer.position() + i < initialPosition) {
|
||||
return IntStream.EOF;
|
||||
}
|
||||
return codePointBuffer.get(relativeBufferPosition(i));
|
||||
} else if (i > codePointBuffer.remaining()) {
|
||||
return IntStream.EOF;
|
||||
} else {
|
||||
return codePointBuffer.get(relativeBufferPosition(i - 1));
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public int index() {
|
||||
return codePointBuffer.position() - initialPosition;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int size() {
|
||||
return size;
|
||||
}
|
||||
|
||||
/** mark/release do nothing; we have entire buffer */
|
||||
@Override
|
||||
public int mark() {
|
||||
return -1;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void release(int marker) {
|
||||
}
|
||||
|
||||
@Override
|
||||
public void seek(int index) {
|
||||
codePointBuffer.position(initialPosition + index);
|
||||
}
|
||||
|
||||
@Override
|
||||
public String getText(Interval interval) {
|
||||
final int startIdx = initialPosition + Math.min(interval.a, size - 1);
|
||||
final int stopIdx = initialPosition + Math.min(interval.b, size - 1);
|
||||
// interval.length() will be too small if we contain any code points > U+FFFF,
|
||||
// but it's just a hint for initial capacity; StringBuilder will grow anyway.
|
||||
StringBuilder sb = new StringBuilder(interval.length());
|
||||
for (int codePointIdx = startIdx; codePointIdx <= stopIdx; codePointIdx++) {
|
||||
sb.appendCodePoint(codePointBuffer.get(codePointIdx));
|
||||
}
|
||||
return sb.toString();
|
||||
}
|
||||
|
||||
@Override
|
||||
public String getSourceName() {
|
||||
if (name == null || name.isEmpty()) {
|
||||
return UNKNOWN_SOURCE_NAME;
|
||||
}
|
||||
|
||||
return name;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return getText(Interval.of(0, size - 1));
|
||||
}
|
||||
}
|
Loading…
Reference in New Issue