From 212a948656d3464b43cd82fd7a7457dc62d233a3 Mon Sep 17 00:00:00 2001
From: Ben Hamilton
Date: Tue, 24 Jan 2017 13:06:29 -0800
Subject: [PATCH] CodePointCharStream

---
 .../runtime/java/TestCodePointCharStream.java | 327 ++++++++++++++++++
 .../antlr/v4/runtime/CodePointCharStream.java | 204 +++++++++++
 2 files changed, 531 insertions(+)
 create mode 100644 runtime-testsuite/test/org/antlr/v4/test/runtime/java/TestCodePointCharStream.java
 create mode 100644 runtime/Java/src/org/antlr/v4/runtime/CodePointCharStream.java

diff --git a/runtime-testsuite/test/org/antlr/v4/test/runtime/java/TestCodePointCharStream.java b/runtime-testsuite/test/org/antlr/v4/test/runtime/java/TestCodePointCharStream.java
new file mode 100644
index 000000000..d585d8265
--- /dev/null
+++ b/runtime-testsuite/test/org/antlr/v4/test/runtime/java/TestCodePointCharStream.java
@@ -0,0 +1,327 @@
+/*
+ * Copyright (c) 2012-2017 The ANTLR Project. All rights reserved.
+ * Use of this file is governed by the BSD 3-clause license that
+ * can be found in the LICENSE.txt file in the project root.
+ */
+package org.antlr.v4.test.runtime.java;
+
+import static org.junit.Assert.assertEquals;
+
+import java.io.IOException;
+import java.nio.IntBuffer;
+
+import org.antlr.v4.runtime.CodePointCharStream;
+import org.antlr.v4.runtime.IntStream;
+
+import org.antlr.v4.runtime.misc.Interval;
+
+import org.junit.Rule;
+import org.junit.Test;
+import org.junit.rules.ExpectedException;
+
+public class TestCodePointCharStream {
+    @Rule
+    public ExpectedException thrown = ExpectedException.none();
+
+    @Test
+    public void emptyBytesHasSize0() {
+        CodePointCharStream s = CodePointCharStream.createWithString("");
+        assertEquals(0, s.size());
+        assertEquals(0, s.index());
+    }
+
+    @Test
+    public void emptyBytesLookAheadReturnsEOF() {
+        CodePointCharStream s = CodePointCharStream.createWithString("");
+        assertEquals(IntStream.EOF, s.LA(1));
+        assertEquals(0, s.index());
+    }
+
+    @Test
+    public void consumingEmptyStreamShouldThrow() {
+        CodePointCharStream s = CodePointCharStream.createWithString("");
+        thrown.expect(IllegalStateException.class);
+        thrown.expectMessage("cannot consume EOF");
+        s.consume();
+    }
+
+    @Test
+    public void singleLatinCodePointHasSize1() {
+        CodePointCharStream s = CodePointCharStream.createWithString("X");
+        assertEquals(1, s.size());
+    }
+
+    @Test
+    public void consumingSingleLatinCodePointShouldMoveIndex() {
+        CodePointCharStream s = CodePointCharStream.createWithString("X");
+        assertEquals(0, s.index());
+        s.consume();
+        assertEquals(1, s.index());
+    }
+
+    @Test
+    public void consumingPastSingleLatinCodePointShouldThrow() {
+        CodePointCharStream s = CodePointCharStream.createWithString("X");
+        s.consume();
+        thrown.expect(IllegalStateException.class);
+        thrown.expectMessage("cannot consume EOF");
+        s.consume();
+    }
+
+    @Test
+    public void singleLatinCodePointLookAheadShouldReturnCodePoint() {
+        CodePointCharStream s = CodePointCharStream.createWithString("X");
+        assertEquals('X', s.LA(1));
+        assertEquals(0, s.index());
+    }
+
+    @Test
+    public void multipleLatinCodePointsLookAheadShouldReturnCodePoints() {
+        CodePointCharStream s = CodePointCharStream.createWithString("XYZ");
+        assertEquals('X', s.LA(1));
+        assertEquals(0, s.index());
+        assertEquals('Y', s.LA(2));
+        assertEquals(0, s.index());
+        assertEquals('Z', s.LA(3));
+        assertEquals(0, s.index());
+    }
+
+    @Test
+    public void singleLatinCodePointLookAheadPastEndShouldReturnEOF() {
+        CodePointCharStream s = CodePointCharStream.createWithString("X");
+        assertEquals(IntStream.EOF, s.LA(2));
+    }
+
+    @Test
+    public void singleCJKCodePointHasSize1() {
+        CodePointCharStream s = CodePointCharStream.createWithString("\u611B");
+        assertEquals(1, s.size());
+        assertEquals(0, s.index());
+    }
+
+    @Test
+    public void consumingSingleCJKCodePointShouldMoveIndex() {
+        CodePointCharStream s = CodePointCharStream.createWithString("\u611B");
+        assertEquals(0, s.index());
+        s.consume();
+        assertEquals(1, s.index());
+    }
+
+    @Test
+    public void consumingPastSingleCJKCodePointShouldThrow() {
+        CodePointCharStream s = CodePointCharStream.createWithString("\u611B");
+        s.consume();
+        thrown.expect(IllegalStateException.class);
+        thrown.expectMessage("cannot consume EOF");
+        s.consume();
+    }
+
+    @Test
+    public void singleCJKCodePointLookAheadShouldReturnCodePoint() {
+        CodePointCharStream s = CodePointCharStream.createWithString("\u611B");
+        assertEquals(0x611B, s.LA(1));
+        assertEquals(0, s.index());
+    }
+
+    @Test
+    public void singleCJKCodePointLookAheadPastEndShouldReturnEOF() {
+        CodePointCharStream s = CodePointCharStream.createWithString("\u611B");
+        assertEquals(IntStream.EOF, s.LA(2));
+        assertEquals(0, s.index());
+    }
+
+    @Test
+    public void singleEmojiCodePointHasSize1() {
+        CodePointCharStream s = CodePointCharStream.createWithString(
+                new StringBuilder().appendCodePoint(0x1F4A9).toString());
+        assertEquals(1, s.size());
+        assertEquals(0, s.index());
+    }
+
+    @Test
+    public void consumingSingleEmojiCodePointShouldMoveIndex() {
+        CodePointCharStream s = CodePointCharStream.createWithString(
+                new StringBuilder().appendCodePoint(0x1F4A9).toString());
+        assertEquals(0, s.index());
+        s.consume();
+        assertEquals(1, s.index());
+    }
+
+    @Test
+    public void consumingPastEndOfEmojiCodePointWithShouldThrow() {
+        CodePointCharStream s = CodePointCharStream.createWithString(
+                new StringBuilder().appendCodePoint(0x1F4A9).toString());
+        assertEquals(0, s.index());
+        s.consume();
+        assertEquals(1, s.index());
+        thrown.expect(IllegalStateException.class);
+        thrown.expectMessage("cannot consume EOF");
+        s.consume();
+    }
+
+    @Test
+    public void singleEmojiCodePointLookAheadShouldReturnCodePoint() {
+        CodePointCharStream s = CodePointCharStream.createWithString(
+                new StringBuilder().appendCodePoint(0x1F4A9).toString());
+        assertEquals(0x1F4A9, s.LA(1));
+        assertEquals(0, s.index());
+    }
+
+    @Test
+    public void singleEmojiCodePointLookAheadPastEndShouldReturnEOF() {
+        CodePointCharStream s = CodePointCharStream.createWithString(
+                new StringBuilder().appendCodePoint(0x1F4A9).toString());
+        assertEquals(IntStream.EOF, s.LA(2));
+        assertEquals(0, s.index());
+    }
+
+    @Test
+    public void getTextWithLatin() {
+        CodePointCharStream s = CodePointCharStream.createWithString("0123456789");
+        assertEquals("34567", s.getText(Interval.of(3, 7)));
+    }
+
+    @Test
+    public void getTextWithCJK() {
+        CodePointCharStream s = CodePointCharStream.createWithString("01234\u40946789");
+        assertEquals("34\u409467", s.getText(Interval.of(3, 7)));
+    }
+
+    @Test
+    public void getTextWithEmoji() {
+        CodePointCharStream s = CodePointCharStream.createWithString(
+                new StringBuilder("01234")
+                    .appendCodePoint(0x1F522)
+                    .append("6789")
+                    .toString());
+        assertEquals("34\uD83D\uDD2267", s.getText(Interval.of(3, 7)));
+    }
+
+    @Test
+    public void toStringWithLatin() {
+        CodePointCharStream s = CodePointCharStream.createWithString("0123456789");
+        assertEquals("0123456789", s.toString());
+    }
+
+    @Test
+    public void toStringWithCJK() {
+        CodePointCharStream s = CodePointCharStream.createWithString("01234\u40946789");
+        assertEquals("01234\u40946789", s.toString());
+    }
+
+    @Test
+    public void toStringWithEmoji() {
+        CodePointCharStream s = CodePointCharStream.createWithString(
+                new StringBuilder("01234")
+                    .appendCodePoint(0x1F522)
+                    .append("6789")
+                    .toString());
+        assertEquals("01234\uD83D\uDD226789", s.toString());
+    }
+
+    @Test
+    public void lookAheadWithLatin() {
+        CodePointCharStream s = CodePointCharStream.createWithString("0123456789");
+        assertEquals('5', s.LA(6));
+    }
+
+    @Test
+    public void lookAheadWithCJK() {
+        CodePointCharStream s = CodePointCharStream.createWithString("01234\u40946789");
+        assertEquals(0x4094, s.LA(6));
+    }
+
+    @Test
+    public void lookAheadWithEmoji() {
+        CodePointCharStream s = CodePointCharStream.createWithString(
+                new StringBuilder("01234")
+                    .appendCodePoint(0x1F522)
+                    .append("6789")
+                    .toString());
+        assertEquals(0x1F522, s.LA(6));
+    }
+
+    @Test
+    public void seekWithLatin() {
+        CodePointCharStream s = CodePointCharStream.createWithString("0123456789");
+        s.seek(5);
+        assertEquals('5', s.LA(1));
+    }
+
+    @Test
+    public void seekWithCJK() {
+        CodePointCharStream s = CodePointCharStream.createWithString("01234\u40946789");
+        s.seek(5);
+        assertEquals(0x4094, s.LA(1));
+    }
+
+    @Test
+    public void seekWithEmoji() {
+        CodePointCharStream s = CodePointCharStream.createWithString(
+                new StringBuilder("01234")
+                    .appendCodePoint(0x1F522)
+                    .append("6789")
+                    .toString());
+        s.seek(5);
+        assertEquals(0x1F522, s.LA(1));
+    }
+
+    @Test
+    public void lookBehindWithLatin() {
+        CodePointCharStream s = CodePointCharStream.createWithString("0123456789");
+        s.seek(6);
+        assertEquals('5', s.LA(-1));
+    }
+
+    @Test
+    public void lookBehindWithCJK() {
+        CodePointCharStream s = CodePointCharStream.createWithString("01234\u40946789");
+        s.seek(6);
+        assertEquals(0x4094, s.LA(-1));
+    }
+
+    @Test
+    public void lookBehindWithEmoji() {
+        CodePointCharStream s = CodePointCharStream.createWithString(
+                new StringBuilder("01234")
+                    .appendCodePoint(0x1F522)
+                    .append("6789")
+                    .toString());
+        s.seek(6);
+        assertEquals(0x1F522, s.LA(-1));
+    }
+
+    @Test
+    public void toStringOfEmptyStreamShouldBeEmpty() {
+        CodePointCharStream s = CodePointCharStream.createWithString("");
+        assertEquals("", s.toString());
+    }
+
+    @Test
+    public void getTextStartingPastEndShouldBeEmpty() {
+        CodePointCharStream s = CodePointCharStream.createWithString("0123456789");
+        assertEquals("", s.getText(Interval.of(10, 12)));
+    }
+
+    @Test
+    public void seekPastEndShouldMoveToEOF() {
+        CodePointCharStream s = CodePointCharStream.createWithString("X");
+        s.seek(5);
+        assertEquals(1, s.index());
+        assertEquals(IntStream.EOF, s.LA(1));
+    }
+
+    @Test
+    public void bufferWithNonZeroInitialPosition() {
+        CodePointCharStream s = new CodePointCharStream(
+                IntBuffer.wrap(new int[] { 'a', 'b', 'c', 'd' }, 1, 3));
+        assertEquals(3, s.size());
+        assertEquals('b', s.LA(1));
+        assertEquals('d', s.LA(3));
+        assertEquals(IntStream.EOF, s.LA(4));
+        s.seek(2);
+        assertEquals('c', s.LA(-1));
+        assertEquals(IntStream.EOF, s.LA(-3));
+        assertEquals("bcd", s.toString());
+    }
+}
diff --git a/runtime/Java/src/org/antlr/v4/runtime/CodePointCharStream.java b/runtime/Java/src/org/antlr/v4/runtime/CodePointCharStream.java
new file mode 100644
index 000000000..a43f63edf
--- /dev/null
+++ b/runtime/Java/src/org/antlr/v4/runtime/CodePointCharStream.java
@@ -0,0 +1,204 @@
+/*
+ * Copyright (c) 2012-2017 The ANTLR Project. All rights reserved.
+ * Use of this file is governed by the BSD 3-clause license that
+ * can be found in the LICENSE.txt file in the project root.
+ */
+package org.antlr.v4.runtime;
+
+import org.antlr.v4.runtime.misc.Interval;
+
+import java.nio.IntBuffer;
+
+/**
+ * Alternative to {@link ANTLRInputStream} which treats the input
+ * as a series of Unicode code points, instead of a series of UTF-16
+ * code units.
+ *
+ * Use this if you need to parse input which potentially contains
+ * Unicode values &gt; U+FFFF.
+ *
+ * All stream indexes are relative to the position the backing buffer
+ * had when the stream was constructed; the buffer's own position is
+ * used as the stream cursor.
+ */
+public final class CodePointCharStream implements CharStream {
+    /** Backing storage; its position is the (absolute) stream cursor. */
+    private final IntBuffer codePointBuffer;
+    /** Absolute buffer position of stream index 0. */
+    private final int initialPosition;
+    /** Number of code points in the stream. */
+    private final int size;
+    /** Source name given at construction; may be null or empty. */
+    private final String name;
+
+    /**
+     * Convenience method to create a {@link CodePointCharStream}
+     * for the Unicode code points in a Java {@link String}.
+     *
+     * @param s the text to decode; must not be {@code null}
+     * @return a stream positioned at the first code point of {@code s}
+     */
+    public static CodePointCharStream createWithString(String s) {
+        // A string never holds more code points than UTF-16 code units,
+        // so a buffer of s.length() ints is always large enough.
+        IntBuffer codePointBuffer = IntBuffer.allocate(s.length());
+        int stringIdx = 0;
+        while (stringIdx < s.length()) {
+            int codePoint = Character.codePointAt(s, stringIdx);
+            codePointBuffer.put(codePoint);
+            stringIdx += Character.charCount(codePoint);
+        }
+        codePointBuffer.flip();
+        return new CodePointCharStream(codePointBuffer);
+    }
+
+    /**
+     * Constructs a {@link CodePointCharStream} which provides access
+     * to the Unicode code points stored in {@code codePointBuffer}.
+     *
+     * {@code codePointBuffer}'s {@link IntBuffer#position position}
+     * reflects the first code point of the stream, and its
+     * {@link IntBuffer#limit limit} is just after the last code point
+     * of the stream.
+     */
+    public CodePointCharStream(IntBuffer codePointBuffer) {
+        this(codePointBuffer, UNKNOWN_SOURCE_NAME);
+    }
+
+    /**
+     * Constructs a named {@link CodePointCharStream} which provides access
+     * to the Unicode code points stored in {@code codePointBuffer}.
+     *
+     * {@code codePointBuffer}'s {@link IntBuffer#position position}
+     * reflects the first code point of the stream, and its
+     * {@link IntBuffer#limit limit} is just after the last code point
+     * of the stream.
+     */
+    public CodePointCharStream(IntBuffer codePointBuffer, String name) {
+        this.codePointBuffer = codePointBuffer;
+        this.initialPosition = codePointBuffer.position();
+        this.size = codePointBuffer.remaining();
+        this.name = name;
+    }
+
+    /**
+     * Returns the absolute buffer index {@code i} code points away from
+     * the cursor. The buffer position is already absolute, so
+     * {@code initialPosition} must not be added a second time.
+     */
+    private int relativeBufferPosition(int i) {
+        return codePointBuffer.position() + i;
+    }
+
+    /**
+     * Advances the cursor by one code point.
+     *
+     * @throws IllegalStateException if the cursor is already at EOF
+     */
+    @Override
+    public void consume() {
+        if (!codePointBuffer.hasRemaining()) {
+            assert LA(1) == IntStream.EOF;
+            throw new IllegalStateException("cannot consume EOF");
+        }
+        codePointBuffer.position(codePointBuffer.position() + 1);
+    }
+
+    /**
+     * Returns the code point {@code i} positions from the cursor
+     * ({@code 1} is the next code point, {@code -1} the previous one),
+     * or {@link IntStream#EOF} when that position is outside the stream.
+     * {@code LA(0)} is undefined and returns 0.
+     */
+    @Override
+    public int LA(int i) {
+        if (i == 0) {
+            // Undefined
+            return 0;
+        } else if (i < 0) {
+            if (codePointBuffer.position() + i < initialPosition) {
+                return IntStream.EOF;
+            }
+            return codePointBuffer.get(relativeBufferPosition(i));
+        } else if (i > codePointBuffer.remaining()) {
+            return IntStream.EOF;
+        } else {
+            return codePointBuffer.get(relativeBufferPosition(i - 1));
+        }
+    }
+
+    /** Returns the cursor as a code point index relative to the stream start. */
+    @Override
+    public int index() {
+        return codePointBuffer.position() - initialPosition;
+    }
+
+    /** Returns the number of code points in the stream. */
+    @Override
+    public int size() {
+        return size;
+    }
+
+    /** mark/release do nothing; we have entire buffer */
+    @Override
+    public int mark() {
+        return -1;
+    }
+
+    @Override
+    public void release(int marker) {
+    }
+
+    /**
+     * Moves the cursor to stream index {@code index}. An index past the
+     * end of the stream positions the cursor at EOF.
+     *
+     * @throws IllegalArgumentException if {@code index} is negative
+     */
+    @Override
+    public void seek(int index) {
+        if (index < 0) {
+            throw new IllegalArgumentException("cannot seek to negative index " + index);
+        }
+        codePointBuffer.position(initialPosition + Math.min(index, size));
+    }
+
+    /**
+     * Returns the text of the code points in {@code interval} (both ends
+     * inclusive, in stream indexes). The interval is clamped to the
+     * stream; an empty string is returned when nothing is left.
+     */
+    @Override
+    public String getText(Interval interval) {
+        final int firstIdx = Math.max(interval.a, 0);
+        final int lastIdx = Math.min(interval.b, size - 1);
+        if (firstIdx > lastIdx) {
+            // Covers the empty stream, reversed intervals and intervals
+            // that start at or after EOF.
+            return "";
+        }
+        // The count will be too small if we contain any code points > U+FFFF,
+        // but it's just a hint for initial capacity; StringBuilder will grow anyway.
+        StringBuilder sb = new StringBuilder(lastIdx - firstIdx + 1);
+        for (int codePointIdx = firstIdx; codePointIdx <= lastIdx; codePointIdx++) {
+            sb.appendCodePoint(codePointBuffer.get(initialPosition + codePointIdx));
+        }
+        return sb.toString();
+    }
+
+    /** Returns the stream name, or {@code UNKNOWN_SOURCE_NAME} if none was given. */
+    @Override
+    public String getSourceName() {
+        if (name == null || name.isEmpty()) {
+            return UNKNOWN_SOURCE_NAME;
+        }
+
+        return name;
+    }
+
+    /** Returns the whole stream as a string, independent of the cursor. */
+    @Override
+    public String toString() {
+        return getText(Interval.of(0, size - 1));
+    }
+}