CodePointCharStream

This commit is contained in:
Ben Hamilton 2017-01-24 13:06:29 -08:00
parent 449a32d4ae
commit 212a948656
2 changed files with 454 additions and 0 deletions

View File

@ -0,0 +1,293 @@
/*
* Copyright (c) 2012-2017 The ANTLR Project. All rights reserved.
* Use of this file is governed by the BSD 3-clause license that
* can be found in the LICENSE.txt file in the project root.
*/
package org.antlr.v4.test.runtime.java;
import static org.junit.Assert.assertEquals;
import java.io.IOException;
import java.nio.IntBuffer;
import org.antlr.v4.runtime.CodePointCharStream;
import org.antlr.v4.runtime.IntStream;
import org.antlr.v4.runtime.misc.Interval;
import org.junit.Rule;
import org.junit.Test;
import org.junit.rules.ExpectedException;
public class TestCodePointCharStream {
@Rule
public ExpectedException thrown = ExpectedException.none();
@Test
public void emptyBytesHasSize0() {
CodePointCharStream s = CodePointCharStream.createWithString("");
assertEquals(0, s.size());
assertEquals(0, s.index());
}
@Test
public void emptyBytesLookAheadReturnsEOF() {
CodePointCharStream s = CodePointCharStream.createWithString("");
assertEquals(IntStream.EOF, s.LA(1));
assertEquals(0, s.index());
}
@Test
public void consumingEmptyStreamShouldThrow() {
CodePointCharStream s = CodePointCharStream.createWithString("");
thrown.expect(IllegalStateException.class);
thrown.expectMessage("cannot consume EOF");
s.consume();
}
@Test
public void singleLatinCodePointHasSize1() {
CodePointCharStream s = CodePointCharStream.createWithString("X");
assertEquals(1, s.size());
}
@Test
public void consumingSingleLatinCodePointShouldMoveIndex() {
CodePointCharStream s = CodePointCharStream.createWithString("X");
assertEquals(0, s.index());
s.consume();
assertEquals(1, s.index());
}
@Test
public void consumingPastSingleLatinCodePointShouldThrow() {
CodePointCharStream s = CodePointCharStream.createWithString("X");
s.consume();
thrown.expect(IllegalStateException.class);
thrown.expectMessage("cannot consume EOF");
s.consume();
}
@Test
public void singleLatinCodePointLookAheadShouldReturnCodePoint() {
CodePointCharStream s = CodePointCharStream.createWithString("X");
assertEquals('X', s.LA(1));
assertEquals(0, s.index());
}
@Test
public void multipleLatinCodePointsLookAheadShouldReturnCodePoints() {
CodePointCharStream s = CodePointCharStream.createWithString("XYZ");
assertEquals('X', s.LA(1));
assertEquals(0, s.index());
assertEquals('Y', s.LA(2));
assertEquals(0, s.index());
assertEquals('Z', s.LA(3));
assertEquals(0, s.index());
}
@Test
public void singleLatinCodePointLookAheadPastEndShouldReturnEOF() {
CodePointCharStream s = CodePointCharStream.createWithString("X");
assertEquals(IntStream.EOF, s.LA(2));
}
@Test
public void singleCJKCodePointHasSize1() {
CodePointCharStream s = CodePointCharStream.createWithString("\u611B");
assertEquals(1, s.size());
assertEquals(0, s.index());
}
@Test
public void consumingSingleCJKCodePointShouldMoveIndex() {
CodePointCharStream s = CodePointCharStream.createWithString("\u611B");
assertEquals(0, s.index());
s.consume();
assertEquals(1, s.index());
}
@Test
public void consumingPastSingleCJKCodePointShouldThrow() {
CodePointCharStream s = CodePointCharStream.createWithString("\u611B");
s.consume();
thrown.expect(IllegalStateException.class);
thrown.expectMessage("cannot consume EOF");
s.consume();
}
@Test
public void singleCJKCodePointLookAheadShouldReturnCodePoint() {
CodePointCharStream s = CodePointCharStream.createWithString("\u611B");
assertEquals(0x611B, s.LA(1));
assertEquals(0, s.index());
}
@Test
public void singleCJKCodePointLookAheadPastEndShouldReturnEOF() {
CodePointCharStream s = CodePointCharStream.createWithString("\u611B");
assertEquals(IntStream.EOF, s.LA(2));
assertEquals(0, s.index());
}
@Test
public void singleEmojiCodePointHasSize1() {
CodePointCharStream s = CodePointCharStream.createWithString(
new StringBuilder().appendCodePoint(0x1F4A9).toString());
assertEquals(1, s.size());
assertEquals(0, s.index());
}
@Test
public void consumingSingleEmojiCodePointShouldMoveIndex() {
CodePointCharStream s = CodePointCharStream.createWithString(
new StringBuilder().appendCodePoint(0x1F4A9).toString());
assertEquals(0, s.index());
s.consume();
assertEquals(1, s.index());
}
@Test
public void consumingPastEndOfEmojiCodePointWithShouldThrow() {
CodePointCharStream s = CodePointCharStream.createWithString(
new StringBuilder().appendCodePoint(0x1F4A9).toString());
assertEquals(0, s.index());
s.consume();
assertEquals(1, s.index());
thrown.expect(IllegalStateException.class);
thrown.expectMessage("cannot consume EOF");
s.consume();
}
@Test
public void singleEmojiCodePointLookAheadShouldReturnCodePoint() {
CodePointCharStream s = CodePointCharStream.createWithString(
new StringBuilder().appendCodePoint(0x1F4A9).toString());
assertEquals(0x1F4A9, s.LA(1));
assertEquals(0, s.index());
}
@Test
public void singleEmojiCodePointLookAheadPastEndShouldReturnEOF() {
CodePointCharStream s = CodePointCharStream.createWithString(
new StringBuilder().appendCodePoint(0x1F4A9).toString());
assertEquals(IntStream.EOF, s.LA(2));
assertEquals(0, s.index());
}
@Test
public void getTextWithLatin() {
CodePointCharStream s = CodePointCharStream.createWithString("0123456789");
assertEquals("34567", s.getText(Interval.of(3, 7)));
}
@Test
public void getTextWithCJK() {
CodePointCharStream s = CodePointCharStream.createWithString("01234\u40946789");
assertEquals("34\u409467", s.getText(Interval.of(3, 7)));
}
@Test
public void getTextWithEmoji() {
CodePointCharStream s = CodePointCharStream.createWithString(
new StringBuilder("01234")
.appendCodePoint(0x1F522)
.append("6789")
.toString());
assertEquals("34\uD83D\uDD2267", s.getText(Interval.of(3, 7)));
}
@Test
public void toStringWithLatin() {
CodePointCharStream s = CodePointCharStream.createWithString("0123456789");
assertEquals("0123456789", s.toString());
}
@Test
public void toStringWithCJK() {
CodePointCharStream s = CodePointCharStream.createWithString("01234\u40946789");
assertEquals("01234\u40946789", s.toString());
}
@Test
public void toStringWithEmoji() {
CodePointCharStream s = CodePointCharStream.createWithString(
new StringBuilder("01234")
.appendCodePoint(0x1F522)
.append("6789")
.toString());
assertEquals("01234\uD83D\uDD226789", s.toString());
}
@Test
public void lookAheadWithLatin() {
CodePointCharStream s = CodePointCharStream.createWithString("0123456789");
assertEquals('5', s.LA(6));
}
@Test
public void lookAheadWithCJK() {
CodePointCharStream s = CodePointCharStream.createWithString("01234\u40946789");
assertEquals(0x4094, s.LA(6));
}
@Test
public void lookAheadWithEmoji() {
CodePointCharStream s = CodePointCharStream.createWithString(
new StringBuilder("01234")
.appendCodePoint(0x1F522)
.append("6789")
.toString());
assertEquals(0x1F522, s.LA(6));
}
@Test
public void seekWithLatin() {
CodePointCharStream s = CodePointCharStream.createWithString("0123456789");
s.seek(5);
assertEquals('5', s.LA(1));
}
@Test
public void seekWithCJK() {
CodePointCharStream s = CodePointCharStream.createWithString("01234\u40946789");
s.seek(5);
assertEquals(0x4094, s.LA(1));
}
@Test
public void seekWithEmoji() {
CodePointCharStream s = CodePointCharStream.createWithString(
new StringBuilder("01234")
.appendCodePoint(0x1F522)
.append("6789")
.toString());
s.seek(5);
assertEquals(0x1F522, s.LA(1));
}
@Test
public void lookBehindWithLatin() {
CodePointCharStream s = CodePointCharStream.createWithString("0123456789");
s.seek(6);
assertEquals('5', s.LA(-1));
}
@Test
public void lookBehindWithCJK() {
CodePointCharStream s = CodePointCharStream.createWithString("01234\u40946789");
s.seek(6);
assertEquals(0x4094, s.LA(-1));
}
@Test
public void lookBehindWithEmoji() {
CodePointCharStream s = CodePointCharStream.createWithString(
new StringBuilder("01234")
.appendCodePoint(0x1F522)
.append("6789")
.toString());
s.seek(6);
assertEquals(0x1F522, s.LA(-1));
}
}

View File

@ -0,0 +1,161 @@
/*
* Copyright (c) 2012-2017 The ANTLR Project. All rights reserved.
* Use of this file is governed by the BSD 3-clause license that
* can be found in the LICENSE.txt file in the project root.
*/
package org.antlr.v4.runtime;
import org.antlr.v4.runtime.misc.Interval;
import java.nio.IntBuffer;
/**
* Alternative to {@link ANTLRInputStream} which treats the input
* as a series of Unicode code points, instead of a series of UTF-16
* code units.
*
* Use this if you need to parse input which potentially contains
* Unicode values > U+FFFF.
*/
public final class CodePointCharStream implements CharStream {
private final IntBuffer codePointBuffer;
private final int initialPosition;
private final int size;
private final String name;
/**
* Convenience method to create a {@link CodePointCharStream}
* for the Unicode code points in a Java {@link String}.
*/
public static CodePointCharStream createWithString(String s) {
// Initial guess assumes no code points > U+FFFF: one code
// point for each code unit in the string
IntBuffer codePointBuffer = IntBuffer.allocate(s.length());
int stringIdx = 0;
while (stringIdx < s.length()) {
if (!codePointBuffer.hasRemaining()) {
// Grow the code point buffer size by 2.
IntBuffer newBuffer = IntBuffer.allocate(codePointBuffer.capacity() * 2);
codePointBuffer.flip();
newBuffer.put(codePointBuffer);
codePointBuffer = newBuffer;
}
int codePoint = Character.codePointAt(s, stringIdx);
codePointBuffer.put(codePoint);
stringIdx += Character.charCount(codePoint);
}
codePointBuffer.flip();
return new CodePointCharStream(codePointBuffer);
}
/**
* Constructs a {@link CodePointCharStream} which provides access
* to the Unicode code points stored in {@code codePointBuffer}.
*
* {@code codePointBuffer}'s {@link IntBuffer#position position}
* reflects the first code point of the stream, and its
* {@link IntBuffer#limit limit} is just after the last code point
* of the stream.
*/
public CodePointCharStream(IntBuffer codePointBuffer) {
this(codePointBuffer, UNKNOWN_SOURCE_NAME);
}
/**
* Constructs a named {@link CodePointCharStream} which provides access
* to the Unicode code points stored in {@code codePointBuffer}.
*
* {@code codePointBuffer}'s {@link IntBuffer#position position}
* reflects the first code point of the stream, and its
* {@link IntBuffer#limit limit} is just after the last code point
* of the stream.
*/
public CodePointCharStream(IntBuffer codePointBuffer, String name) {
this.codePointBuffer = codePointBuffer;
this.initialPosition = codePointBuffer.position();
this.size = codePointBuffer.remaining();
this.name = name;
}
private int relativeBufferPosition(int i) {
return initialPosition + codePointBuffer.position() + i;
}
@Override
public void consume() {
if (!codePointBuffer.hasRemaining()) {
assert LA(1) == IntStream.EOF;
throw new IllegalStateException("cannot consume EOF");
}
codePointBuffer.position(codePointBuffer.position() + 1);
}
@Override
public int LA(int i) {
if (i == 0) {
// Undefined
return 0;
} else if (i < 0) {
if (codePointBuffer.position() + i < initialPosition) {
return IntStream.EOF;
}
return codePointBuffer.get(relativeBufferPosition(i));
} else if (i > codePointBuffer.remaining()) {
return IntStream.EOF;
} else {
return codePointBuffer.get(relativeBufferPosition(i - 1));
}
}
@Override
public int index() {
return codePointBuffer.position() - initialPosition;
}
@Override
public int size() {
return size;
}
/** mark/release do nothing; we have entire buffer */
@Override
public int mark() {
return -1;
}
@Override
public void release(int marker) {
}
@Override
public void seek(int index) {
codePointBuffer.position(initialPosition + index);
}
@Override
public String getText(Interval interval) {
final int startIdx = initialPosition + Math.min(interval.a, size - 1);
final int stopIdx = initialPosition + Math.min(interval.b, size - 1);
// interval.length() will be too small if we contain any code points > U+FFFF,
// but it's just a hint for initial capacity; StringBuilder will grow anyway.
StringBuilder sb = new StringBuilder(interval.length());
for (int codePointIdx = startIdx; codePointIdx <= stopIdx; codePointIdx++) {
sb.appendCodePoint(codePointBuffer.get(codePointIdx));
}
return sb.toString();
}
@Override
public String getSourceName() {
if (name == null || name.isEmpty()) {
return UNKNOWN_SOURCE_NAME;
}
return name;
}
@Override
public String toString() {
return getText(Interval.of(0, size - 1));
}
}