forked from jasder/antlr
Improve memory usage of CodePointCharStream: Use 8-bit, 16-bit, or 32-bit buffer
This commit is contained in:
parent
de4d129921
commit
ab0655598e
|
@ -3,17 +3,14 @@
|
|||
* Use of this file is governed by the BSD 3-clause license that
|
||||
* can be found in the LICENSE.txt file in the project root.
|
||||
*/
|
||||
package org.antlr.v4.test.runtime.java;
|
||||
package org.antlr.v4.runtime;
|
||||
|
||||
import static org.junit.Assert.assertEquals;
|
||||
import static org.junit.Assert.assertTrue;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.nio.IntBuffer;
|
||||
|
||||
import org.antlr.v4.runtime.CharStreams;
|
||||
import org.antlr.v4.runtime.CodePointCharStream;
|
||||
import org.antlr.v4.runtime.IntStream;
|
||||
|
||||
import org.antlr.v4.runtime.misc.Interval;
|
||||
|
||||
import org.junit.Rule;
|
||||
|
@ -291,4 +288,25 @@ public class TestCodePointCharStream {
|
|||
s.seek(6);
|
||||
assertEquals(0x1F522, s.LA(-1));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void asciiContentsShouldUse8BitBuffer() {
|
||||
CodePointCharStream s = CharStreams.fromString("hello");
|
||||
assertTrue(s.getInternalStorage() instanceof byte[]);
|
||||
assertEquals(5, s.size());
|
||||
}
|
||||
|
||||
@Test
|
||||
public void bmpContentsShouldUse16BitBuffer() {
|
||||
CodePointCharStream s = CharStreams.fromString("hello \u4E16\u754C");
|
||||
assertTrue(s.getInternalStorage() instanceof char[]);
|
||||
assertEquals(8, s.size());
|
||||
}
|
||||
|
||||
@Test
|
||||
public void smpContentsShouldUse32BitBuffer() {
|
||||
CodePointCharStream s = CharStreams.fromString("hello \uD83C\uDF0D");
|
||||
assertTrue(s.getInternalStorage() instanceof int[]);
|
||||
assertEquals(7, s.size());
|
||||
}
|
||||
}
|
|
@ -137,9 +137,9 @@ public class TestCharStreams {
|
|||
try (SeekableByteChannel c = Files.newByteChannel(p)) {
|
||||
CharStream s = CharStreams.fromChannel(
|
||||
c, 4096, CodingErrorAction.REPLACE, "foo");
|
||||
assertEquals(3, s.size());
|
||||
assertEquals(4, s.size());
|
||||
assertEquals(0, s.index());
|
||||
assertEquals("\uFFFD\uFFFD\uFFFD", s.toString());
|
||||
assertEquals("\uFFFD\uFFFD\uFFFD\uFFFD", s.toString());
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -1,162 +0,0 @@
|
|||
/*
|
||||
* Copyright (c) 2012-2017 The ANTLR Project. All rights reserved.
|
||||
* Use of this file is governed by the BSD 3-clause license that
|
||||
* can be found in the LICENSE.txt file in the project root.
|
||||
*/
|
||||
package org.antlr.v4.test.runtime.java;
|
||||
|
||||
import static org.junit.Assert.assertEquals;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.nio.ByteBuffer;
|
||||
import java.nio.IntBuffer;
|
||||
|
||||
import java.nio.charset.CharacterCodingException;
|
||||
import java.nio.charset.CodingErrorAction;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
|
||||
import org.antlr.v4.runtime.UTF8CodePointDecoder;
|
||||
import org.junit.Rule;
|
||||
import org.junit.Test;
|
||||
import org.junit.rules.ExpectedException;
|
||||
|
||||
public class TestUTF8CodePointDecoder {
|
||||
@Rule
|
||||
public ExpectedException thrown = ExpectedException.none();
|
||||
|
||||
@Test
|
||||
public void decodeEmptyByteBufferWritesNothing() throws Exception {
|
||||
UTF8CodePointDecoder decoder = new UTF8CodePointDecoder(CodingErrorAction.REPLACE);
|
||||
ByteBuffer utf8BytesIn = ByteBuffer.allocate(0);
|
||||
IntBuffer codePointsOut = IntBuffer.allocate(0);
|
||||
IntBuffer result = decoder.decodeCodePointsFromBuffer(
|
||||
utf8BytesIn,
|
||||
codePointsOut,
|
||||
true);
|
||||
result.flip();
|
||||
assertEquals(0, result.remaining());
|
||||
}
|
||||
|
||||
@Test
|
||||
public void decodeLatinByteBufferWritesCodePoint() throws Exception {
|
||||
UTF8CodePointDecoder decoder = new UTF8CodePointDecoder(CodingErrorAction.REPLACE);
|
||||
ByteBuffer utf8BytesIn = StandardCharsets.UTF_8.encode("X");
|
||||
IntBuffer codePointsOut = IntBuffer.allocate(1);
|
||||
IntBuffer result = decoder.decodeCodePointsFromBuffer(
|
||||
utf8BytesIn,
|
||||
codePointsOut,
|
||||
true);
|
||||
result.flip();
|
||||
assertEquals(1, result.remaining());
|
||||
assertEquals('X', result.get(0));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void decodeCyrillicByteBufferWritesCodePoint() throws Exception {
|
||||
UTF8CodePointDecoder decoder = new UTF8CodePointDecoder(CodingErrorAction.REPLACE);
|
||||
ByteBuffer utf8BytesIn = StandardCharsets.UTF_8.encode("\u042F");
|
||||
IntBuffer codePointsOut = IntBuffer.allocate(1);
|
||||
IntBuffer result = decoder.decodeCodePointsFromBuffer(
|
||||
utf8BytesIn,
|
||||
codePointsOut,
|
||||
true);
|
||||
result.flip();
|
||||
assertEquals(1, result.remaining());
|
||||
assertEquals(0x042F, result.get(0));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void decodeCJKByteBufferWritesCodePoint() throws Exception {
|
||||
UTF8CodePointDecoder decoder = new UTF8CodePointDecoder(CodingErrorAction.REPLACE);
|
||||
ByteBuffer utf8BytesIn = StandardCharsets.UTF_8.encode("\u611B");
|
||||
IntBuffer codePointsOut = IntBuffer.allocate(1);
|
||||
IntBuffer result = decoder.decodeCodePointsFromBuffer(
|
||||
utf8BytesIn,
|
||||
codePointsOut,
|
||||
true);
|
||||
result.flip();
|
||||
assertEquals(1, result.remaining());
|
||||
assertEquals(0x611B, result.get(0));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void decodeEmojiByteBufferWritesCodePoint() throws Exception {
|
||||
UTF8CodePointDecoder decoder = new UTF8CodePointDecoder(CodingErrorAction.REPLACE);
|
||||
ByteBuffer utf8BytesIn = StandardCharsets.UTF_8.encode(
|
||||
new StringBuilder().appendCodePoint(0x1F4A9).toString()
|
||||
);
|
||||
IntBuffer codePointsOut = IntBuffer.allocate(1);
|
||||
IntBuffer result = decoder.decodeCodePointsFromBuffer(
|
||||
utf8BytesIn,
|
||||
codePointsOut,
|
||||
true);
|
||||
result.flip();
|
||||
assertEquals(1, result.remaining());
|
||||
assertEquals(0x1F4A9, result.get(0));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void decodingInvalidLeadInReplaceModeWritesSubstitutionCharacter() throws Exception {
|
||||
UTF8CodePointDecoder decoder = new UTF8CodePointDecoder(CodingErrorAction.REPLACE);
|
||||
ByteBuffer utf8BytesIn = ByteBuffer.wrap(new byte[] { (byte)0xF8 });
|
||||
IntBuffer codePointsOut = IntBuffer.allocate(1);
|
||||
IntBuffer result = decoder.decodeCodePointsFromBuffer(utf8BytesIn, codePointsOut, true);
|
||||
result.flip();
|
||||
assertEquals(1, result.remaining());
|
||||
assertEquals(0xFFFD, result.get(0));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void decodingInvalidLeadInReportModeThrows() throws Exception {
|
||||
UTF8CodePointDecoder decoder = new UTF8CodePointDecoder(CodingErrorAction.REPORT);
|
||||
ByteBuffer utf8BytesIn = ByteBuffer.wrap(new byte[] { (byte)0xF8 });
|
||||
IntBuffer codePointsOut = IntBuffer.allocate(1);
|
||||
thrown.expect(CharacterCodingException.class);
|
||||
thrown.expectMessage("Invalid UTF-8 leading byte 0xF8");
|
||||
decoder.decodeCodePointsFromBuffer(utf8BytesIn, codePointsOut, true);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void decodingInvalidTrailInReplaceModeWritesSubstitutionCharacter() throws Exception {
|
||||
UTF8CodePointDecoder decoder = new UTF8CodePointDecoder(CodingErrorAction.REPLACE);
|
||||
ByteBuffer utf8BytesIn = ByteBuffer.wrap(new byte[] { (byte)0xC0, (byte)0xC0 });
|
||||
IntBuffer codePointsOut = IntBuffer.allocate(1);
|
||||
IntBuffer result = decoder.decodeCodePointsFromBuffer(utf8BytesIn, codePointsOut, true);
|
||||
result.flip();
|
||||
assertEquals(1, result.remaining());
|
||||
assertEquals(0xFFFD, result.get(0));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void decodingInvalidTrailInReportModeThrows() throws Exception {
|
||||
UTF8CodePointDecoder decoder = new UTF8CodePointDecoder(CodingErrorAction.REPORT);
|
||||
ByteBuffer utf8BytesIn = ByteBuffer.wrap(new byte[] { (byte)0xC0, (byte)0xC0 });
|
||||
IntBuffer codePointsOut = IntBuffer.allocate(1);
|
||||
thrown.expect(CharacterCodingException.class);
|
||||
thrown.expectMessage("Invalid UTF-8 trailing byte 0xC0");
|
||||
decoder.decodeCodePointsFromBuffer(utf8BytesIn, codePointsOut, true);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void decodingNonShortestFormInReplaceModeWritesSubstitutionCharacter() throws Exception {
|
||||
UTF8CodePointDecoder decoder = new UTF8CodePointDecoder(CodingErrorAction.REPLACE);
|
||||
// 0xC1 0x9C would decode to \ (U+005C) if we didn't have this check
|
||||
ByteBuffer utf8BytesIn = ByteBuffer.wrap(new byte[] { (byte)0xC1, (byte)0x9C });
|
||||
IntBuffer codePointsOut = IntBuffer.allocate(1);
|
||||
IntBuffer result = decoder.decodeCodePointsFromBuffer(utf8BytesIn, codePointsOut, true);
|
||||
result.flip();
|
||||
assertEquals(1, result.remaining());
|
||||
assertEquals(0xFFFD, result.get(0));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void decodingNonShortestFormInReportModeThrows() throws Exception {
|
||||
UTF8CodePointDecoder decoder = new UTF8CodePointDecoder(CodingErrorAction.REPORT);
|
||||
// 0xC1 0x9C would decode to \ (U+005C) if we didn't have this check
|
||||
ByteBuffer utf8BytesIn = ByteBuffer.wrap(new byte[] { (byte)0xC1, (byte)0x9C });
|
||||
IntBuffer codePointsOut = IntBuffer.allocate(1);
|
||||
thrown.expect(CharacterCodingException.class);
|
||||
thrown.expectMessage("Code point 92 is out of expected range 128..2047");
|
||||
decoder.decodeCodePointsFromBuffer(utf8BytesIn, codePointsOut, true);
|
||||
}
|
||||
}
|
|
@ -13,9 +13,11 @@ import java.io.BufferedReader;
|
|||
import java.io.File;
|
||||
import java.io.InputStream;
|
||||
import java.io.InputStreamReader;
|
||||
import java.io.IOException;
|
||||
import java.lang.management.ManagementFactory;
|
||||
import java.lang.management.RuntimeMXBean;
|
||||
import java.net.URL;
|
||||
import java.net.URLConnection;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.nio.file.Path;
|
||||
import java.nio.file.Paths;
|
||||
|
@ -331,10 +333,12 @@ public class TimeLexerSpeed { // don't call it Test else it'll run during "mvn t
|
|||
for (int i = 0; i<n; i++) {
|
||||
streams[i] = loader.getResourceAsStream(resourceName);
|
||||
}
|
||||
URLConnection uc = null;
|
||||
long streamLength = getResourceSize(loader, resourceName);
|
||||
long start = System.nanoTime(); // track only time to suck data out of stream
|
||||
for (int i = 0; i<n; i++) {
|
||||
try (InputStream is = streams[i]) {
|
||||
input[i] = CharStreams.fromStream(is);
|
||||
input[i] = CharStreams.fromStream(is, StandardCharsets.UTF_8, streamLength);
|
||||
}
|
||||
}
|
||||
long stop = System.nanoTime();
|
||||
|
@ -370,8 +374,10 @@ public class TimeLexerSpeed { // don't call it Test else it'll run during "mvn t
|
|||
}
|
||||
|
||||
public void lex_new_java_utf8(int n, boolean clearLexerDFACache) throws Exception {
|
||||
try (InputStream is = TimeLexerSpeed.class.getClassLoader().getResourceAsStream(Parser_java_file);) {
|
||||
CharStream input = CharStreams.fromStream(is);
|
||||
ClassLoader loader = TimeLexerSpeed.class.getClassLoader();
|
||||
try (InputStream is = loader.getResourceAsStream(Parser_java_file);) {
|
||||
long size = getResourceSize(loader, Parser_java_file);
|
||||
CharStream input = CharStreams.fromStream(is, StandardCharsets.UTF_8, size);
|
||||
JavaLexer lexer = new JavaLexer(input);
|
||||
double avg = tokenize(lexer, n, clearLexerDFACache);
|
||||
String currentMethodName = new Exception().getStackTrace()[0].getMethodName();
|
||||
|
@ -403,8 +409,11 @@ public class TimeLexerSpeed { // don't call it Test else it'll run during "mvn t
|
|||
}
|
||||
|
||||
public void lex_new_grapheme_utf8(String fileName, int n, boolean clearLexerDFACache) throws Exception {
|
||||
try (InputStream is = TimeLexerSpeed.class.getClassLoader().getResourceAsStream(PerfDir+"/"+fileName)) {
|
||||
CharStream input = CharStreams.fromStream(is);
|
||||
String resourceName = PerfDir+"/"+fileName;
|
||||
ClassLoader loader = TimeLexerSpeed.class.getClassLoader();
|
||||
try (InputStream is = loader.getResourceAsStream(resourceName)) {
|
||||
long size = getResourceSize(loader, resourceName);
|
||||
CharStream input = CharStreams.fromStream(is, StandardCharsets.UTF_8, size);
|
||||
graphemesLexer lexer = new graphemesLexer(input);
|
||||
double avg = tokenize(lexer, n, clearLexerDFACache);
|
||||
String currentMethodName = new Exception().getStackTrace()[0].getMethodName();
|
||||
|
@ -474,4 +483,18 @@ public class TimeLexerSpeed { // don't call it Test else it'll run during "mvn t
|
|||
public static String dirname(Path path) {
|
||||
return path.getName(0).toString();
|
||||
}
|
||||
|
||||
public static final long getResourceSize(ClassLoader loader, String resourceName) throws IOException {
|
||||
URLConnection uc = null;
|
||||
try {
|
||||
// Sadly, URLConnection is not AutoCloseable, but it leaks resources if
|
||||
// we don't close its stream.
|
||||
uc = loader.getResource(resourceName).openConnection();
|
||||
return uc.getContentLengthLong();
|
||||
} finally {
|
||||
if (uc != null) {
|
||||
uc.getInputStream().close();
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -10,10 +10,12 @@ import java.io.InputStream;
|
|||
import java.io.InputStreamReader;
|
||||
import java.io.Reader;
|
||||
import java.nio.ByteBuffer;
|
||||
import java.nio.IntBuffer;
|
||||
import java.nio.CharBuffer;
|
||||
import java.nio.channels.Channels;
|
||||
import java.nio.channels.ReadableByteChannel;
|
||||
import java.nio.charset.Charset;
|
||||
import java.nio.charset.CharsetDecoder;
|
||||
import java.nio.charset.CoderResult;
|
||||
import java.nio.charset.CodingErrorAction;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.nio.file.Files;
|
||||
|
@ -55,17 +57,15 @@ public final class CharStreams {
|
|||
* For other sources, only supports Unicode code points up to U+FFFF.
|
||||
*/
|
||||
public static CharStream fromPath(Path path, Charset charset) throws IOException {
|
||||
if (charset.equals(StandardCharsets.UTF_8)) {
|
||||
try (ReadableByteChannel channel = Files.newByteChannel(path)) {
|
||||
return fromChannel(
|
||||
channel,
|
||||
DEFAULT_BUFFER_SIZE,
|
||||
CodingErrorAction.REPLACE,
|
||||
path.toString());
|
||||
}
|
||||
}
|
||||
else {
|
||||
return new ANTLRFileStream(path.toString(), charset.toString());
|
||||
long size = Files.size(path);
|
||||
try (ReadableByteChannel channel = Files.newByteChannel(path)) {
|
||||
return fromChannel(
|
||||
channel,
|
||||
charset,
|
||||
DEFAULT_BUFFER_SIZE,
|
||||
CodingErrorAction.REPLACE,
|
||||
path.toString(),
|
||||
size);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -120,19 +120,18 @@ public final class CharStreams {
|
|||
* For other sources, only supports Unicode code points up to U+FFFF.
|
||||
*/
|
||||
public static CharStream fromStream(InputStream is, Charset charset) throws IOException {
|
||||
if (charset.equals(StandardCharsets.UTF_8)) {
|
||||
try (ReadableByteChannel channel = Channels.newChannel(is)) {
|
||||
return fromChannel(
|
||||
channel,
|
||||
DEFAULT_BUFFER_SIZE,
|
||||
CodingErrorAction.REPLACE,
|
||||
IntStream.UNKNOWN_SOURCE_NAME);
|
||||
}
|
||||
}
|
||||
else {
|
||||
try (InputStreamReader isr = new InputStreamReader(is, charset)) {
|
||||
return new ANTLRInputStream(isr);
|
||||
}
|
||||
return fromStream(is, charset, -1);
|
||||
}
|
||||
|
||||
public static CharStream fromStream(InputStream is, Charset charset, long inputSize) throws IOException {
|
||||
try (ReadableByteChannel channel = Channels.newChannel(is)) {
|
||||
return fromChannel(
|
||||
channel,
|
||||
charset,
|
||||
DEFAULT_BUFFER_SIZE,
|
||||
CodingErrorAction.REPLACE,
|
||||
IntStream.UNKNOWN_SOURCE_NAME,
|
||||
inputSize);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -160,18 +159,11 @@ public final class CharStreams {
|
|||
* For other sources, only supports Unicode code points up to U+FFFF.
|
||||
*/
|
||||
public static CharStream fromChannel(ReadableByteChannel channel, Charset charset) throws IOException {
|
||||
if (charset.equals(StandardCharsets.UTF_8)) {
|
||||
return fromChannel(
|
||||
channel,
|
||||
DEFAULT_BUFFER_SIZE,
|
||||
CodingErrorAction.REPLACE,
|
||||
IntStream.UNKNOWN_SOURCE_NAME);
|
||||
}
|
||||
else {
|
||||
try (InputStreamReader isr = new InputStreamReader(Channels.newInputStream(channel), charset)) {
|
||||
return new ANTLRInputStream(isr);
|
||||
}
|
||||
}
|
||||
return fromChannel(
|
||||
channel,
|
||||
DEFAULT_BUFFER_SIZE,
|
||||
CodingErrorAction.REPLACE,
|
||||
IntStream.UNKNOWN_SOURCE_NAME);
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -187,50 +179,15 @@ public final class CharStreams {
|
|||
* source name. Closes the reader before returning.
|
||||
*/
|
||||
public static CodePointCharStream fromReader(Reader r, String sourceName) throws IOException {
|
||||
IntBuffer codePointBuffer = IntBuffer.allocate(DEFAULT_BUFFER_SIZE);
|
||||
int highSurrogate = -1;
|
||||
int curCodeUnit;
|
||||
try {
|
||||
while ((curCodeUnit = r.read()) != -1) {
|
||||
if (!codePointBuffer.hasRemaining()) {
|
||||
// Grow the code point buffer size by 2.
|
||||
IntBuffer newBuffer = IntBuffer.allocate(codePointBuffer.capacity() * 2);
|
||||
codePointBuffer.flip();
|
||||
newBuffer.put(codePointBuffer);
|
||||
codePointBuffer = newBuffer;
|
||||
}
|
||||
if (Character.isHighSurrogate((char) curCodeUnit)) {
|
||||
if (highSurrogate != -1) {
|
||||
// Dangling high surrogate followed by another high surrogate.
|
||||
codePointBuffer.put(highSurrogate);
|
||||
}
|
||||
highSurrogate = curCodeUnit;
|
||||
}
|
||||
else if (Character.isLowSurrogate((char) curCodeUnit)) {
|
||||
if (highSurrogate == -1) {
|
||||
// Low surrogate not preceded by high surrogate.
|
||||
codePointBuffer.put(curCodeUnit);
|
||||
}
|
||||
else {
|
||||
codePointBuffer.put(Character.toCodePoint((char) highSurrogate, (char) curCodeUnit));
|
||||
highSurrogate = -1;
|
||||
}
|
||||
}
|
||||
else {
|
||||
if (highSurrogate != -1) {
|
||||
// Dangling high surrogate followed by a non-surrogate.
|
||||
codePointBuffer.put(highSurrogate);
|
||||
highSurrogate = -1;
|
||||
}
|
||||
codePointBuffer.put(curCodeUnit);
|
||||
}
|
||||
CodePointBuffer.Builder codePointBufferBuilder = CodePointBuffer.builder(DEFAULT_BUFFER_SIZE);
|
||||
CharBuffer charBuffer = CharBuffer.allocate(DEFAULT_BUFFER_SIZE);
|
||||
while ((r.read(charBuffer)) != -1) {
|
||||
charBuffer.flip();
|
||||
codePointBufferBuilder.append(charBuffer);
|
||||
charBuffer.compact();
|
||||
}
|
||||
if (highSurrogate != -1) {
|
||||
// Dangling high surrogate at end of file.
|
||||
codePointBuffer.put(highSurrogate);
|
||||
}
|
||||
codePointBuffer.flip();
|
||||
return new CodePointCharStream(codePointBuffer, sourceName);
|
||||
return CodePointCharStream.fromBuffer(codePointBufferBuilder.build(), sourceName);
|
||||
}
|
||||
finally {
|
||||
r.close();
|
||||
|
@ -251,22 +208,14 @@ public final class CharStreams {
|
|||
public static CodePointCharStream fromString(String s, String sourceName) {
|
||||
// Initial guess assumes no code points > U+FFFF: one code
|
||||
// point for each code unit in the string
|
||||
IntBuffer codePointBuffer = IntBuffer.allocate(s.length());
|
||||
int stringIdx = 0;
|
||||
while (stringIdx < s.length()) {
|
||||
if (!codePointBuffer.hasRemaining()) {
|
||||
// Grow the code point buffer size by 2.
|
||||
IntBuffer newBuffer = IntBuffer.allocate(codePointBuffer.capacity() * 2);
|
||||
codePointBuffer.flip();
|
||||
newBuffer.put(codePointBuffer);
|
||||
codePointBuffer = newBuffer;
|
||||
}
|
||||
int codePoint = Character.codePointAt(s, stringIdx);
|
||||
codePointBuffer.put(codePoint);
|
||||
stringIdx += Character.charCount(codePoint);
|
||||
}
|
||||
codePointBuffer.flip();
|
||||
return new CodePointCharStream(codePointBuffer, sourceName);
|
||||
CodePointBuffer.Builder codePointBufferBuilder = CodePointBuffer.builder(s.length());
|
||||
// TODO: CharBuffer.wrap(String) rightfully returns a read-only buffer
|
||||
// which doesn't expose its array, so we make a copy.
|
||||
CharBuffer cb = CharBuffer.allocate(s.length());
|
||||
cb.put(s);
|
||||
cb.flip();
|
||||
codePointBufferBuilder.append(cb);
|
||||
return CodePointCharStream.fromBuffer(codePointBufferBuilder.build(), sourceName);
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -282,25 +231,62 @@ public final class CharStreams {
|
|||
CodingErrorAction decodingErrorAction,
|
||||
String sourceName)
|
||||
throws IOException
|
||||
{
|
||||
return fromChannel(channel, StandardCharsets.UTF_8, bufferSize, decodingErrorAction, sourceName, -1);
|
||||
}
|
||||
|
||||
public static CodePointCharStream fromChannel(
|
||||
ReadableByteChannel channel,
|
||||
Charset charset,
|
||||
int bufferSize,
|
||||
CodingErrorAction decodingErrorAction,
|
||||
String sourceName,
|
||||
long inputSize)
|
||||
throws IOException
|
||||
{
|
||||
try {
|
||||
ByteBuffer utf8BytesIn = ByteBuffer.allocateDirect(bufferSize);
|
||||
IntBuffer codePointsOut = IntBuffer.allocate(bufferSize);
|
||||
ByteBuffer utf8BytesIn = ByteBuffer.allocate(bufferSize);
|
||||
CharBuffer utf16CodeUnitsOut = CharBuffer.allocate(bufferSize);
|
||||
if (inputSize == -1) {
|
||||
inputSize = bufferSize;
|
||||
} else if (inputSize > Integer.MAX_VALUE) {
|
||||
// ByteBuffer et al don't support long sizes
|
||||
throw new IOException(String.format("inputSize %d larger than max %d", inputSize, Integer.MAX_VALUE));
|
||||
}
|
||||
CodePointBuffer.Builder codePointBufferBuilder = CodePointBuffer.builder((int) inputSize);
|
||||
CharsetDecoder decoder = charset
|
||||
.newDecoder()
|
||||
.onMalformedInput(decodingErrorAction)
|
||||
.onUnmappableCharacter(decodingErrorAction);
|
||||
|
||||
boolean endOfInput = false;
|
||||
UTF8CodePointDecoder decoder = new UTF8CodePointDecoder(decodingErrorAction);
|
||||
while (!endOfInput) {
|
||||
int bytesRead = channel.read(utf8BytesIn);
|
||||
endOfInput = (bytesRead == -1);
|
||||
utf8BytesIn.flip();
|
||||
codePointsOut = decoder.decodeCodePointsFromBuffer(
|
||||
CoderResult result = decoder.decode(
|
||||
utf8BytesIn,
|
||||
codePointsOut,
|
||||
utf16CodeUnitsOut,
|
||||
endOfInput);
|
||||
if (result.isError() && decodingErrorAction.equals(CodingErrorAction.REPORT)) {
|
||||
result.throwException();
|
||||
}
|
||||
utf16CodeUnitsOut.flip();
|
||||
codePointBufferBuilder.append(utf16CodeUnitsOut);
|
||||
utf8BytesIn.compact();
|
||||
utf16CodeUnitsOut.compact();
|
||||
}
|
||||
codePointsOut.limit(codePointsOut.position());
|
||||
codePointsOut.flip();
|
||||
return new CodePointCharStream(codePointsOut, sourceName);
|
||||
// Handle any bytes at the end of the file which need to
|
||||
// be represented as errors or substitution characters.
|
||||
CoderResult flushResult = decoder.flush(utf16CodeUnitsOut);
|
||||
if (flushResult.isError() && decodingErrorAction.equals(CodingErrorAction.REPORT)) {
|
||||
flushResult.throwException();
|
||||
}
|
||||
utf16CodeUnitsOut.flip();
|
||||
codePointBufferBuilder.append(utf16CodeUnitsOut);
|
||||
|
||||
CodePointBuffer codePointBuffer = codePointBufferBuilder.build();
|
||||
return CodePointCharStream.fromBuffer(codePointBuffer, sourceName);
|
||||
}
|
||||
finally {
|
||||
channel.close();
|
||||
|
|
|
@ -0,0 +1,388 @@
|
|||
/*
|
||||
* Copyright (c) 2012-2017 The ANTLR Project. All rights reserved.
|
||||
* Use of this file is governed by the BSD 3-clause license that
|
||||
* can be found in the LICENSE.txt file in the project root.
|
||||
*/
|
||||
package org.antlr.v4.runtime;
|
||||
|
||||
import java.nio.ByteBuffer;
|
||||
import java.nio.CharBuffer;
|
||||
import java.nio.IntBuffer;
|
||||
|
||||
/**
|
||||
* Wrapper for {@link ByteBuffer} / {@link CharBuffer} / {@link IntBuffer}.
|
||||
*
|
||||
* Because Java lacks generics on primitive types, these three types
|
||||
* do not share an interface, so we have to write one manually.
|
||||
*/
|
||||
public class CodePointBuffer {
|
||||
/** Identifies which of the three backing buffers holds the code points. */
public enum Type {
    /** Backed by a {@link ByteBuffer}: one byte per code point. */
    BYTE,
    /** Backed by a {@link CharBuffer}: one 16-bit unit per code point. */
    CHAR,
    /** Backed by an {@link IntBuffer}: one 32-bit int per code point. */
    INT
}
|
||||
private final Type type;
|
||||
private final ByteBuffer byteBuffer;
|
||||
private final CharBuffer charBuffer;
|
||||
private final IntBuffer intBuffer;
|
||||
|
||||
/**
 * Stores the storage type together with all three buffer references as given.
 * The factory methods and the builder pass the buffer matching {@code type};
 * the factory methods pass {@code null} for the other two.
 */
private CodePointBuffer(Type type, ByteBuffer byteBuffer, CharBuffer charBuffer, IntBuffer intBuffer) {
    this.intBuffer = intBuffer;
    this.charBuffer = charBuffer;
    this.byteBuffer = byteBuffer;
    this.type = type;
}
|
||||
|
||||
public static CodePointBuffer withBytes(ByteBuffer byteBuffer) {
|
||||
return new CodePointBuffer(Type.BYTE, byteBuffer, null, null);
|
||||
}
|
||||
|
||||
public static CodePointBuffer withChars(CharBuffer charBuffer) {
|
||||
return new CodePointBuffer(Type.CHAR, null, charBuffer, null);
|
||||
}
|
||||
|
||||
public static CodePointBuffer withInts(IntBuffer intBuffer) {
|
||||
return new CodePointBuffer(Type.INT, null, null, intBuffer);
|
||||
}
|
||||
|
||||
public int position() {
|
||||
switch (type) {
|
||||
case BYTE:
|
||||
return byteBuffer.position();
|
||||
case CHAR:
|
||||
return charBuffer.position();
|
||||
case INT:
|
||||
return intBuffer.position();
|
||||
}
|
||||
throw new UnsupportedOperationException("Not reached");
|
||||
}
|
||||
|
||||
public void position(int newPosition) {
|
||||
switch (type) {
|
||||
case BYTE:
|
||||
byteBuffer.position(newPosition);
|
||||
break;
|
||||
case CHAR:
|
||||
charBuffer.position(newPosition);
|
||||
break;
|
||||
case INT:
|
||||
intBuffer.position(newPosition);
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
public int remaining() {
|
||||
switch (type) {
|
||||
case BYTE:
|
||||
return byteBuffer.remaining();
|
||||
case CHAR:
|
||||
return charBuffer.remaining();
|
||||
case INT:
|
||||
return intBuffer.remaining();
|
||||
}
|
||||
throw new UnsupportedOperationException("Not reached");
|
||||
}
|
||||
|
||||
public int get(int offset) {
|
||||
switch (type) {
|
||||
case BYTE:
|
||||
return byteBuffer.get(offset);
|
||||
case CHAR:
|
||||
return charBuffer.get(offset);
|
||||
case INT:
|
||||
return intBuffer.get(offset);
|
||||
}
|
||||
throw new UnsupportedOperationException("Not reached");
|
||||
}
|
||||
|
||||
Type getType() {
|
||||
return type;
|
||||
}
|
||||
|
||||
int arrayOffset() {
|
||||
switch (type) {
|
||||
case BYTE:
|
||||
return byteBuffer.arrayOffset();
|
||||
case CHAR:
|
||||
return charBuffer.arrayOffset();
|
||||
case INT:
|
||||
return intBuffer.arrayOffset();
|
||||
}
|
||||
throw new UnsupportedOperationException("Not reached");
|
||||
}
|
||||
|
||||
byte[] byteArray() {
|
||||
assert type == Type.BYTE;
|
||||
return byteBuffer.array();
|
||||
}
|
||||
|
||||
char[] charArray() {
|
||||
assert type == Type.CHAR;
|
||||
return charBuffer.array();
|
||||
}
|
||||
|
||||
int[] intArray() {
|
||||
assert type == Type.INT;
|
||||
return intBuffer.array();
|
||||
}
|
||||
|
||||
public static Builder builder(int initialBufferSize) {
|
||||
return new Builder(initialBufferSize);
|
||||
}
|
||||
|
||||
public static class Builder {
|
||||
private Type type;
|
||||
private ByteBuffer byteBuffer;
|
||||
private CharBuffer charBuffer;
|
||||
private IntBuffer intBuffer;
|
||||
private int prevHighSurrogate;
|
||||
|
||||
/**
 * Starts in {@link Type#BYTE} mode with a heap byte buffer of capacity
 * {@code initialBufferSize}, no char or int buffer, and no pending high surrogate.
 */
private Builder(int initialBufferSize) {
    this.type = Type.BYTE;
    this.byteBuffer = ByteBuffer.allocate(initialBufferSize);
    this.charBuffer = null;
    this.intBuffer = null;
    // -1 means "no high surrogate is waiting for its low half".
    this.prevHighSurrogate = -1;
}
|
||||
|
||||
Type getType() {
|
||||
return type;
|
||||
}
|
||||
|
||||
ByteBuffer getByteBuffer() {
|
||||
return byteBuffer;
|
||||
}
|
||||
|
||||
CharBuffer getCharBuffer() {
|
||||
return charBuffer;
|
||||
}
|
||||
|
||||
IntBuffer getIntBuffer() {
|
||||
return intBuffer;
|
||||
}
|
||||
|
||||
public CodePointBuffer build() {
|
||||
switch (type) {
|
||||
case BYTE:
|
||||
byteBuffer.flip();
|
||||
break;
|
||||
case CHAR:
|
||||
charBuffer.flip();
|
||||
break;
|
||||
case INT:
|
||||
intBuffer.flip();
|
||||
break;
|
||||
}
|
||||
return new CodePointBuffer(type, byteBuffer, charBuffer, intBuffer);
|
||||
}
|
||||
|
||||
private static int roundUpToNextPowerOfTwo(int i) {
|
||||
int nextPowerOfTwo = 32 - Integer.numberOfLeadingZeros(i - 1);
|
||||
return (int) Math.pow(2, nextPowerOfTwo);
|
||||
}
|
||||
|
||||
public void ensureRemaining(int remainingNeeded) {
|
||||
switch (type) {
|
||||
case BYTE:
|
||||
if (byteBuffer.remaining() < remainingNeeded) {
|
||||
int newCapacity = roundUpToNextPowerOfTwo(byteBuffer.capacity() + remainingNeeded);
|
||||
ByteBuffer newBuffer = ByteBuffer.allocate(newCapacity);
|
||||
byteBuffer.flip();
|
||||
newBuffer.put(byteBuffer);
|
||||
byteBuffer = newBuffer;
|
||||
}
|
||||
break;
|
||||
case CHAR:
|
||||
if (charBuffer.remaining() < remainingNeeded) {
|
||||
int newCapacity = roundUpToNextPowerOfTwo(charBuffer.capacity() + remainingNeeded);
|
||||
CharBuffer newBuffer = CharBuffer.allocate(newCapacity);
|
||||
charBuffer.flip();
|
||||
newBuffer.put(charBuffer);
|
||||
charBuffer = newBuffer;
|
||||
}
|
||||
break;
|
||||
case INT:
|
||||
if (intBuffer.remaining() < remainingNeeded) {
|
||||
int newCapacity = roundUpToNextPowerOfTwo(intBuffer.capacity() + remainingNeeded);
|
||||
IntBuffer newBuffer = IntBuffer.allocate(newCapacity);
|
||||
intBuffer.flip();
|
||||
newBuffer.put(intBuffer);
|
||||
intBuffer = newBuffer;
|
||||
}
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
public void append(CharBuffer utf16In) {
|
||||
ensureRemaining(utf16In.remaining());
|
||||
if (utf16In.hasArray()) {
|
||||
appendArray(utf16In);
|
||||
} else {
|
||||
// TODO
|
||||
throw new UnsupportedOperationException("TODO");
|
||||
}
|
||||
}
|
||||
|
||||
private void appendArray(CharBuffer utf16In) {
|
||||
assert utf16In.hasArray();
|
||||
|
||||
switch (type) {
|
||||
case BYTE:
|
||||
appendArrayByte(utf16In);
|
||||
break;
|
||||
case CHAR:
|
||||
appendArrayChar(utf16In);
|
||||
break;
|
||||
case INT:
|
||||
appendArrayInt(utf16In);
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
private void appendArrayByte(CharBuffer utf16In) {
|
||||
assert prevHighSurrogate == -1;
|
||||
|
||||
char[] in = utf16In.array();
|
||||
int inOffset = utf16In.arrayOffset() + utf16In.position();
|
||||
int inLimit = utf16In.arrayOffset() + utf16In.limit();
|
||||
|
||||
byte[] outByte = byteBuffer.array();
|
||||
int outOffset = byteBuffer.arrayOffset() + byteBuffer.position();
|
||||
|
||||
while (inOffset < inLimit) {
|
||||
char c = in[inOffset];
|
||||
if (c <= 0xFF) {
|
||||
outByte[outOffset] = (byte)(c & 0xFF);
|
||||
} else {
|
||||
utf16In.position(inOffset - utf16In.arrayOffset());
|
||||
byteBuffer.position(outOffset - byteBuffer.arrayOffset());
|
||||
if (!Character.isHighSurrogate(c)) {
|
||||
byteToCharBuffer(utf16In.remaining());
|
||||
appendArrayChar(utf16In);
|
||||
return;
|
||||
} else {
|
||||
byteToIntBuffer(utf16In.remaining());
|
||||
appendArrayInt(utf16In);
|
||||
return;
|
||||
}
|
||||
}
|
||||
inOffset++;
|
||||
outOffset++;
|
||||
}
|
||||
|
||||
utf16In.position(inOffset - utf16In.arrayOffset());
|
||||
byteBuffer.position(outOffset - byteBuffer.arrayOffset());
|
||||
}
|
||||
|
||||
private void appendArrayChar(CharBuffer utf16In) {
|
||||
assert prevHighSurrogate == -1;
|
||||
|
||||
char[] in = utf16In.array();
|
||||
int inOffset = utf16In.arrayOffset() + utf16In.position();
|
||||
int inLimit = utf16In.arrayOffset() + utf16In.limit();
|
||||
|
||||
char[] outChar = charBuffer.array();
|
||||
int outOffset = charBuffer.arrayOffset() + charBuffer.position();
|
||||
|
||||
while (inOffset < inLimit) {
|
||||
char c = in[inOffset];
|
||||
if (!Character.isHighSurrogate(c)) {
|
||||
outChar[outOffset] = c;
|
||||
} else {
|
||||
utf16In.position(inOffset - utf16In.arrayOffset());
|
||||
charBuffer.position(outOffset - charBuffer.arrayOffset());
|
||||
charToIntBuffer(utf16In.remaining());
|
||||
appendArrayInt(utf16In);
|
||||
return;
|
||||
}
|
||||
inOffset++;
|
||||
outOffset++;
|
||||
}
|
||||
|
||||
utf16In.position(inOffset - utf16In.arrayOffset());
|
||||
charBuffer.position(outOffset - charBuffer.arrayOffset());
|
||||
}
|
||||
|
||||
private void appendArrayInt(CharBuffer utf16In) {
|
||||
char[] in = utf16In.array();
|
||||
int inOffset = utf16In.arrayOffset() + utf16In.position();
|
||||
int inLimit = utf16In.arrayOffset() + utf16In.limit();
|
||||
|
||||
int[] outInt = intBuffer.array();
|
||||
int outOffset = intBuffer.arrayOffset() + intBuffer.position();
|
||||
|
||||
while (inOffset < inLimit) {
|
||||
char c = in[inOffset];
|
||||
inOffset++;
|
||||
if (prevHighSurrogate != -1) {
|
||||
if (Character.isLowSurrogate(c)) {
|
||||
outInt[outOffset] = Character.toCodePoint((char) prevHighSurrogate, c);
|
||||
outOffset++;
|
||||
prevHighSurrogate = -1;
|
||||
} else {
|
||||
// Dangling high surrogate
|
||||
outInt[outOffset] = prevHighSurrogate;
|
||||
outOffset++;
|
||||
if (Character.isHighSurrogate(c)) {
|
||||
prevHighSurrogate = c & 0xFFFF;
|
||||
} else {
|
||||
outInt[outOffset] = c & 0xFFFF;
|
||||
outOffset++;
|
||||
prevHighSurrogate = -1;
|
||||
}
|
||||
}
|
||||
} else if (Character.isHighSurrogate(c)) {
|
||||
prevHighSurrogate = c & 0xFFFF;
|
||||
} else {
|
||||
outInt[outOffset] = c & 0xFFFF;
|
||||
outOffset++;
|
||||
}
|
||||
}
|
||||
|
||||
if (prevHighSurrogate != -1) {
|
||||
// Dangling high surrogate
|
||||
outInt[outOffset] = prevHighSurrogate & 0xFFFF;
|
||||
outOffset++;
|
||||
}
|
||||
|
||||
utf16In.position(inOffset - utf16In.arrayOffset());
|
||||
intBuffer.position(outOffset - intBuffer.arrayOffset());
|
||||
}
|
||||
|
||||
private void byteToCharBuffer(int toAppend) {
|
||||
byteBuffer.flip();
|
||||
// CharBuffers hold twice as much per unit as ByteBuffers, so start with half the capacity.
|
||||
CharBuffer newBuffer = CharBuffer.allocate(Math.max(byteBuffer.remaining() + toAppend, byteBuffer.capacity() / 2));
|
||||
while (byteBuffer.hasRemaining()) {
|
||||
newBuffer.put((char) (byteBuffer.get() & 0xFF));
|
||||
}
|
||||
type = Type.CHAR;
|
||||
byteBuffer = null;
|
||||
charBuffer = newBuffer;
|
||||
}
|
||||
|
||||
private void byteToIntBuffer(int toAppend) {
|
||||
byteBuffer.flip();
|
||||
// IntBuffers hold four times as much per unit as ByteBuffers, so start with one quarter the capacity.
|
||||
IntBuffer newBuffer = IntBuffer.allocate(Math.max(byteBuffer.remaining() + toAppend, byteBuffer.capacity() / 4));
|
||||
while (byteBuffer.hasRemaining()) {
|
||||
newBuffer.put(byteBuffer.get() & 0xFF);
|
||||
}
|
||||
type = Type.INT;
|
||||
byteBuffer = null;
|
||||
intBuffer = newBuffer;
|
||||
}
|
||||
|
||||
private void charToIntBuffer(int toAppend) {
|
||||
charBuffer.flip();
|
||||
// IntBuffers hold two times as much per unit as ByteBuffers, so start with one half the capacity.
|
||||
IntBuffer newBuffer = IntBuffer.allocate(Math.max(charBuffer.remaining() + toAppend, charBuffer.capacity() / 2));
|
||||
while (charBuffer.hasRemaining()) {
|
||||
newBuffer.put(charBuffer.get() & 0xFFFF);
|
||||
}
|
||||
type = Type.INT;
|
||||
charBuffer = null;
|
||||
intBuffer = newBuffer;
|
||||
}
|
||||
}
|
||||
}
|
|
@ -7,7 +7,7 @@ package org.antlr.v4.runtime;
|
|||
|
||||
import org.antlr.v4.runtime.misc.Interval;
|
||||
|
||||
import java.nio.IntBuffer;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
|
||||
/**
|
||||
* Alternative to {@link ANTLRInputStream} which treats the input
|
||||
|
@ -17,115 +17,113 @@ import java.nio.IntBuffer;
|
|||
* Use this if you need to parse input which potentially contains
|
||||
* Unicode values > U+FFFF.
|
||||
*/
|
||||
public final class CodePointCharStream implements CharStream {
|
||||
private final IntBuffer codePointBuffer;
|
||||
private final int initialPosition;
|
||||
private final int size;
|
||||
private final String name;
|
||||
public abstract class CodePointCharStream implements CharStream {
|
||||
protected final int size;
|
||||
protected final String name;
|
||||
|
||||
// To avoid lots of virtual method calls, we directly access
|
||||
// the state of the underlying code points in the
|
||||
// CodePointBuffer.
|
||||
protected int position;
|
||||
|
||||
// Use the factory method {@link #fromBuffer(CodePointBuffer)} to
|
||||
// construct instances of this type.
|
||||
private CodePointCharStream(int position, int remaining, String name) {
    // TODO: only buffers positioned at 0 are handled for now, which is why
    // the stream position below is set to 0 rather than to the argument.
    assert position == 0;
    this.name = name;
    this.size = remaining;
    this.position = 0;
}
|
||||
|
||||
// Visible for testing.
|
||||
abstract Object getInternalStorage();
|
||||
|
||||
/**
|
||||
* Constructs a {@link CodePointCharStream} which provides access
|
||||
* to the Unicode code points stored in {@code codePointBuffer}.
|
||||
*
|
||||
* {@code codePointBuffer}'s {@link IntBuffer#position position}
|
||||
* reflects the first code point of the stream, and its
|
||||
* {@link IntBuffer#limit limit} is just after the last code point
|
||||
* of the stream.
|
||||
*/
|
||||
public CodePointCharStream(IntBuffer codePointBuffer) {
|
||||
this(codePointBuffer, UNKNOWN_SOURCE_NAME);
|
||||
public static CodePointCharStream fromBuffer(CodePointBuffer codePointBuffer) {
|
||||
return fromBuffer(codePointBuffer, UNKNOWN_SOURCE_NAME);
|
||||
}
|
||||
|
||||
/**
|
||||
* Constructs a named {@link CodePointCharStream} which provides access
|
||||
* to the Unicode code points stored in {@code codePointBuffer}.
|
||||
*
|
||||
* {@code codePointBuffer}'s {@link IntBuffer#position position}
|
||||
* reflects the first code point of the stream, and its
|
||||
* {@link IntBuffer#limit limit} is just after the last code point
|
||||
* of the stream.
|
||||
*/
|
||||
public CodePointCharStream(IntBuffer codePointBuffer, String name) {
|
||||
this.codePointBuffer = codePointBuffer;
|
||||
this.initialPosition = codePointBuffer.position();
|
||||
this.size = codePointBuffer.remaining();
|
||||
this.name = name;
|
||||
}
|
||||
|
||||
private int relativeBufferPosition(int i) {
|
||||
return initialPosition + codePointBuffer.position() + i;
|
||||
public static CodePointCharStream fromBuffer(CodePointBuffer codePointBuffer, String name) {
|
||||
// Java lacks generics on primitive types.
|
||||
//
|
||||
// To avoid lots of calls to virtual methods in the
|
||||
// very hot codepath of LA() below, we construct one
|
||||
// of three concrete subclasses.
|
||||
//
|
||||
// The concrete subclasses directly access the code
|
||||
// points stored in the underlying array (byte[],
|
||||
// char[], or int[]), so we can avoid lots of virtual
|
||||
// method calls to ByteBuffer.get(offset).
|
||||
switch (codePointBuffer.getType()) {
|
||||
case BYTE:
|
||||
return new CodePoint8BitCharStream(
|
||||
codePointBuffer.position(),
|
||||
codePointBuffer.remaining(),
|
||||
name,
|
||||
codePointBuffer.byteArray(),
|
||||
codePointBuffer.arrayOffset());
|
||||
case CHAR:
|
||||
return new CodePoint16BitCharStream(
|
||||
codePointBuffer.position(),
|
||||
codePointBuffer.remaining(),
|
||||
name,
|
||||
codePointBuffer.charArray(),
|
||||
codePointBuffer.arrayOffset());
|
||||
case INT:
|
||||
return new CodePoint32BitCharStream(
|
||||
codePointBuffer.position(),
|
||||
codePointBuffer.remaining(),
|
||||
name,
|
||||
codePointBuffer.intArray(),
|
||||
codePointBuffer.arrayOffset());
|
||||
}
|
||||
throw new UnsupportedOperationException("Not reached");
|
||||
}
|
||||
|
||||
@Override
|
||||
public void consume() {
|
||||
if (!codePointBuffer.hasRemaining()) {
|
||||
public final void consume() {
|
||||
if (size - position == 0) {
|
||||
assert LA(1) == IntStream.EOF;
|
||||
throw new IllegalStateException("cannot consume EOF");
|
||||
}
|
||||
codePointBuffer.position(codePointBuffer.position() + 1);
|
||||
position = position + 1;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int LA(int i) {
|
||||
if (i == 0) {
|
||||
// Undefined
|
||||
return 0;
|
||||
}
|
||||
else if (i < 0) {
|
||||
if (codePointBuffer.position() + i < initialPosition) {
|
||||
return IntStream.EOF;
|
||||
}
|
||||
return codePointBuffer.get(relativeBufferPosition(i));
|
||||
}
|
||||
else if (i > codePointBuffer.remaining()) {
|
||||
return IntStream.EOF;
|
||||
}
|
||||
else {
|
||||
return codePointBuffer.get(relativeBufferPosition(i - 1));
|
||||
}
|
||||
public final int index() {
|
||||
return position;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int index() {
|
||||
return codePointBuffer.position() - initialPosition;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int size() {
|
||||
public final int size() {
|
||||
return size;
|
||||
}
|
||||
|
||||
/** mark/release do nothing; we have entire buffer */
|
||||
@Override
|
||||
public int mark() {
|
||||
public final int mark() {
|
||||
return -1;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void release(int marker) {
|
||||
public final void release(int marker) {
|
||||
}
|
||||
|
||||
@Override
|
||||
public void seek(int index) {
|
||||
codePointBuffer.position(initialPosition + index);
|
||||
}
|
||||
|
||||
/** Return the UTF-16 encoded string for the given interval */
|
||||
@Override
|
||||
public String getText(Interval interval) {
|
||||
final int startIdx = initialPosition + Math.min(interval.a, size - 1);
|
||||
final int stopIdx = initialPosition + Math.min(interval.b, size - 1);
|
||||
// interval.length() will be too small if we contain any code points > U+FFFF,
|
||||
// but it's just a hint for initial capacity; StringBuilder will grow anyway.
|
||||
StringBuilder sb = new StringBuilder(interval.length());
|
||||
for (int codePointIdx = startIdx; codePointIdx <= stopIdx; codePointIdx++) {
|
||||
sb.appendCodePoint(codePointBuffer.get(codePointIdx));
|
||||
}
|
||||
return sb.toString();
|
||||
public final void seek(int index) {
|
||||
position = index;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String getSourceName() {
|
||||
public final String getSourceName() {
|
||||
if (name == null || name.isEmpty()) {
|
||||
return UNKNOWN_SOURCE_NAME;
|
||||
}
|
||||
|
@ -134,7 +132,165 @@ public final class CodePointCharStream implements CharStream {
|
|||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
public final String toString() {
|
||||
return getText(Interval.of(0, size - 1));
|
||||
}
|
||||
|
||||
// 8-bit storage for code points <= U+00FF.
|
||||
private static final class CodePoint8BitCharStream extends CodePointCharStream {
|
||||
private final byte[] byteArray;
|
||||
|
||||
private CodePoint8BitCharStream(int position, int remaining, String name, byte[] byteArray, int arrayOffset) {
|
||||
super(position, remaining, name);
|
||||
// TODO
|
||||
assert arrayOffset == 0;
|
||||
this.byteArray = byteArray;
|
||||
}
|
||||
|
||||
/** Return the UTF-16 encoded string for the given interval */
|
||||
@Override
|
||||
public String getText(Interval interval) {
|
||||
int startIdx = Math.min(interval.a, size - 1);
|
||||
int len = Math.min(interval.b - interval.a + 1, size);
|
||||
|
||||
// We know the maximum code point in byteArray is U+00FF,
|
||||
// so we can treat this as if it were ISO-8859-1, aka Latin-1,
|
||||
// which shares the same code points up to 0xFF.
|
||||
return new String(byteArray, startIdx, len, StandardCharsets.ISO_8859_1);
|
||||
}
|
||||
|
||||
@Override
|
||||
public int LA(int i) {
|
||||
int offset;
|
||||
switch (Integer.signum(i)) {
|
||||
case -1:
|
||||
offset = position + i;
|
||||
if (offset < 0) {
|
||||
return IntStream.EOF;
|
||||
}
|
||||
return byteArray[offset] & 0xFF;
|
||||
case 0:
|
||||
// Undefined
|
||||
return 0;
|
||||
case 1:
|
||||
offset = position + i - 1;
|
||||
if (offset >= size) {
|
||||
return IntStream.EOF;
|
||||
}
|
||||
return byteArray[offset] & 0xFF;
|
||||
}
|
||||
throw new UnsupportedOperationException("Not reached");
|
||||
}
|
||||
|
||||
@Override
|
||||
Object getInternalStorage() {
|
||||
return byteArray;
|
||||
}
|
||||
}
|
||||
|
||||
// 16-bit internal storage for code points between U+0100 and U+FFFF.
|
||||
private static final class CodePoint16BitCharStream extends CodePointCharStream {
|
||||
private final char[] charArray;
|
||||
|
||||
private CodePoint16BitCharStream(int position, int remaining, String name, char[] charArray, int arrayOffset) {
|
||||
super(position, remaining, name);
|
||||
this.charArray = charArray;
|
||||
// TODO
|
||||
assert arrayOffset == 0;
|
||||
}
|
||||
|
||||
/** Return the UTF-16 encoded string for the given interval */
|
||||
@Override
|
||||
public String getText(Interval interval) {
|
||||
int startIdx = Math.min(interval.a, size - 1);
|
||||
int len = Math.min(interval.b - interval.a + 1, size);
|
||||
|
||||
// We know there are no surrogates in this
|
||||
// array, since otherwise we would be given a
|
||||
// 32-bit int[] array.
|
||||
//
|
||||
// So, it's safe to treat this as if it were
|
||||
// UTF-16.
|
||||
return new String(charArray, startIdx, len);
|
||||
}
|
||||
|
||||
@Override
|
||||
public int LA(int i) {
|
||||
int offset;
|
||||
switch (Integer.signum(i)) {
|
||||
case -1:
|
||||
offset = position + i;
|
||||
if (offset < 0) {
|
||||
return IntStream.EOF;
|
||||
}
|
||||
return charArray[offset] & 0xFFFF;
|
||||
case 0:
|
||||
// Undefined
|
||||
return 0;
|
||||
case 1:
|
||||
offset = position + i - 1;
|
||||
if (offset >= size) {
|
||||
return IntStream.EOF;
|
||||
}
|
||||
return charArray[offset] & 0xFFFF;
|
||||
}
|
||||
throw new UnsupportedOperationException("Not reached");
|
||||
}
|
||||
|
||||
@Override
|
||||
Object getInternalStorage() {
|
||||
return charArray;
|
||||
}
|
||||
}
|
||||
|
||||
// 32-bit internal storage for code points between U+10000 and U+10FFFF.
|
||||
private static final class CodePoint32BitCharStream extends CodePointCharStream {
|
||||
private final int[] intArray;
|
||||
|
||||
private CodePoint32BitCharStream(int position, int remaining, String name, int[] intArray, int arrayOffset) {
|
||||
super(position, remaining, name);
|
||||
this.intArray = intArray;
|
||||
// TODO
|
||||
assert arrayOffset == 0;
|
||||
}
|
||||
|
||||
/** Return the UTF-16 encoded string for the given interval */
|
||||
@Override
|
||||
public String getText(Interval interval) {
|
||||
int startIdx = Math.min(interval.a, size - 1);
|
||||
int len = Math.min(interval.b - interval.a + 1, size);
|
||||
|
||||
// Note that we pass the int[] code points to the String constructor --
|
||||
// this is supported, and the constructor will convert to UTF-16 internally.
|
||||
return new String(intArray, startIdx, len);
|
||||
}
|
||||
|
||||
@Override
|
||||
public int LA(int i) {
|
||||
int offset;
|
||||
switch (Integer.signum(i)) {
|
||||
case -1:
|
||||
offset = position + i;
|
||||
if (offset < 0) {
|
||||
return IntStream.EOF;
|
||||
}
|
||||
return intArray[offset];
|
||||
case 0:
|
||||
// Undefined
|
||||
return 0;
|
||||
case 1:
|
||||
offset = position + i - 1;
|
||||
if (offset >= size) {
|
||||
return IntStream.EOF;
|
||||
}
|
||||
return intArray[offset];
|
||||
}
|
||||
throw new UnsupportedOperationException("Not reached");
|
||||
}
|
||||
|
||||
@Override
|
||||
Object getInternalStorage() {
|
||||
return intArray;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -1,281 +0,0 @@
|
|||
/*
|
||||
* Copyright (c) 2012-2017 The ANTLR Project. All rights reserved.
|
||||
* Use of this file is governed by the BSD 3-clause license that
|
||||
* can be found in the LICENSE.txt file in the project root.
|
||||
*/
|
||||
package org.antlr.v4.runtime;
|
||||
|
||||
import org.antlr.v4.runtime.misc.Interval;
|
||||
|
||||
import java.nio.ByteBuffer;
|
||||
import java.nio.IntBuffer;
|
||||
import java.nio.charset.CharacterCodingException;
|
||||
import java.nio.charset.CodingErrorAction;
|
||||
|
||||
/**
|
||||
* Decodes UTF-8 bytes directly to Unicode code points, stored in an
|
||||
* {@link IntBuffer}.
|
||||
*
|
||||
* Unlike {@link java.nio.charset.CharsetDecoder}, this does not use UTF-16 as an
|
||||
* intermediate representation, so this optimizes the common case of
|
||||
* decoding a UTF-8 file for parsing as Unicode code points.
|
||||
*/
|
||||
public class UTF8CodePointDecoder {
|
||||
private static final int SUBSTITUTION_CHARACTER = 0xFFFD;
|
||||
private static final byte NVAL = (byte) 0xFF;
|
||||
|
||||
// Table mapping UTF-8 leading byte to the length of the trailing
|
||||
// sequence.
|
||||
protected static final byte[] UTF8_LEADING_BYTE_LENGTHS = new byte[] {
|
||||
// [0x00, 0x7F] -> 0 trailing bytes
|
||||
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
|
||||
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
|
||||
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
|
||||
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
|
||||
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
|
||||
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
|
||||
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
|
||||
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
|
||||
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
|
||||
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
|
||||
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
|
||||
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
|
||||
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
|
||||
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
|
||||
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
|
||||
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
|
||||
|
||||
// [0x80, 0xBF] -> invalid leading byte
|
||||
NVAL, NVAL, NVAL, NVAL, NVAL, NVAL, NVAL, NVAL,
|
||||
NVAL, NVAL, NVAL, NVAL, NVAL, NVAL, NVAL, NVAL,
|
||||
NVAL, NVAL, NVAL, NVAL, NVAL, NVAL, NVAL, NVAL,
|
||||
NVAL, NVAL, NVAL, NVAL, NVAL, NVAL, NVAL, NVAL,
|
||||
NVAL, NVAL, NVAL, NVAL, NVAL, NVAL, NVAL, NVAL,
|
||||
NVAL, NVAL, NVAL, NVAL, NVAL, NVAL, NVAL, NVAL,
|
||||
NVAL, NVAL, NVAL, NVAL, NVAL, NVAL, NVAL, NVAL,
|
||||
NVAL, NVAL, NVAL, NVAL, NVAL, NVAL, NVAL, NVAL,
|
||||
|
||||
// [0xC0, 0xDF] -> one trailing byte
|
||||
0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
|
||||
0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
|
||||
0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
|
||||
0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
|
||||
|
||||
// [0xE0, 0xEF] -> two trailing bytes
|
||||
0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02,
|
||||
0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02,
|
||||
|
||||
// [0xF0, 0xF7] -> three trailing bytes
|
||||
0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03,
|
||||
|
||||
// [0xF8, 0xFF] -> invalid leading sequence
|
||||
NVAL, NVAL, NVAL, NVAL, NVAL, NVAL, NVAL, NVAL
|
||||
};
|
||||
|
||||
// Table mapping UTF-8 sequence length to valid Unicode code point
|
||||
// ranges for that sequence length.
|
||||
protected static final Interval[] UTF8_VALID_INTERVALS = new Interval[] {
|
||||
Interval.of(0x00, 0x7F),
|
||||
Interval.of(0x80, 0x7FF),
|
||||
Interval.of(0x800, 0xFFFF),
|
||||
Interval.of(0x10000, 0x10FFFF)
|
||||
};
|
||||
|
||||
protected final CodingErrorAction decodingErrorAction;
|
||||
protected int decodingTrailBytesNeeded;
|
||||
protected int decodingCurrentCodePoint;
|
||||
protected Interval validDecodedCodePointRange;
|
||||
|
||||
/**
 * Creates a decoder that applies {@code decodingErrorAction} whenever it
 * meets an invalid UTF-8 sequence: {@link CodingErrorAction#REPLACE}
 * substitutes U+FFFD, {@link CodingErrorAction#REPORT} throws a
 * {@link CharacterCodingException}, and any other action skips the
 * invalid sequence.
 *
 * @param decodingErrorAction how invalid input is handled
 */
public UTF8CodePointDecoder(CodingErrorAction decodingErrorAction) {
    this.decodingErrorAction = decodingErrorAction;
    // Begin in the "no sequence in progress" state.
    reset();
}
|
||||
|
||||
/**
|
||||
* Resets the state in this {@link UTF8CodePointDecoder}, preparing it
|
||||
* for use with a new input buffer.
|
||||
*/
|
||||
public void reset() {
|
||||
this.decodingTrailBytesNeeded = -1;
|
||||
this.decodingCurrentCodePoint = -1;
|
||||
this.validDecodedCodePointRange = Interval.INVALID;
|
||||
}
|
||||
|
||||
/**
|
||||
* Decodes as many UTF-8 bytes as possible from {@code utf8BytesIn},
|
||||
* writing the result to {@code codePointsOut}.
|
||||
*
|
||||
* If you have more bytes to decode, set {@code endOfInput} to
|
||||
* {@code false} and call this method again once more bytes
|
||||
* are available.
|
||||
*
|
||||
* If there are no more bytes available, make sure to call this
|
||||
* setting {@code endOfInput} to {@code true} so that any invalid
|
||||
* UTF-8 sequence at the end of the input is handled.
|
||||
*
|
||||
* If {@code codePointsOut} is not large enough to store the result,
|
||||
* a new buffer is allocated and returned. Otherwise, returns
|
||||
* {@code codePointsOut}.
|
||||
*
|
||||
* After returning, the {@link ByteBuffer#position position} of
|
||||
* {@code utf8BytesIn} is moved forward to reflect the bytes consumed,
|
||||
* and the {@link IntBuffer#position position} of the result
|
||||
* is moved forward to reflect the code points written.
|
||||
*
|
||||
* The {@link IntBuffer#limit limit} of the result is not changed,
|
||||
* so if this is the end of the input, you will want to set the
|
||||
* limit to the {@link IntBuffer#position position}, then
|
||||
* {@link IntBuffer#flip flip} the result to prepare for reading.
|
||||
*/
|
||||
public IntBuffer decodeCodePointsFromBuffer(
|
||||
ByteBuffer utf8BytesIn,
|
||||
IntBuffer codePointsOut,
|
||||
boolean endOfInput)
|
||||
throws CharacterCodingException
|
||||
{
|
||||
while (utf8BytesIn.hasRemaining()) {
|
||||
if (decodingTrailBytesNeeded == -1) {
|
||||
// Start a new UTF-8 sequence by checking the leading byte.
|
||||
byte leadingByte = utf8BytesIn.get();
|
||||
if (!decodeLeadingByte(leadingByte)) {
|
||||
codePointsOut = handleDecodeError(
|
||||
String.format("Invalid UTF-8 leading byte 0x%02X", leadingByte),
|
||||
codePointsOut);
|
||||
reset();
|
||||
continue;
|
||||
}
|
||||
}
|
||||
assert decodingTrailBytesNeeded != -1;
|
||||
if (utf8BytesIn.remaining() < decodingTrailBytesNeeded) {
|
||||
// The caller will have to call us back with more bytes.
|
||||
break;
|
||||
}
|
||||
// Now we know the input buffer has enough bytes to decode
|
||||
// the entire sequence.
|
||||
while (decodingTrailBytesNeeded > 0) {
|
||||
// Continue a multi-byte UTF-8 sequence by checking the next trailing byte.
|
||||
byte trailingByte = utf8BytesIn.get();
|
||||
decodingTrailBytesNeeded--;
|
||||
if (!decodeTrailingByte(trailingByte)) {
|
||||
codePointsOut = handleDecodeError(
|
||||
String.format("Invalid UTF-8 trailing byte 0x%02X", trailingByte),
|
||||
codePointsOut);
|
||||
// Skip past any remaining trailing bytes in the sequence.
|
||||
utf8BytesIn.position(utf8BytesIn.position() + decodingTrailBytesNeeded);
|
||||
reset();
|
||||
continue;
|
||||
}
|
||||
}
|
||||
if (decodingTrailBytesNeeded == 0) {
|
||||
codePointsOut = appendCodePointFromInterval(
|
||||
decodingCurrentCodePoint,
|
||||
validDecodedCodePointRange,
|
||||
codePointsOut);
|
||||
reset();
|
||||
continue;
|
||||
}
|
||||
}
|
||||
if (endOfInput) {
|
||||
if (decodingTrailBytesNeeded != -1) {
|
||||
codePointsOut = handleDecodeError(
|
||||
"Unterminated UTF-8 sequence at end of bytes",
|
||||
codePointsOut);
|
||||
}
|
||||
}
|
||||
return codePointsOut;
|
||||
}
|
||||
|
||||
private boolean decodeLeadingByte(byte leadingByte) {
|
||||
// Be careful about Java silently widening (unsigned)
|
||||
// byte to (signed) int and sign-extending here.
|
||||
//
|
||||
// We use binary AND liberally below to prevent widening.
|
||||
int leadingByteIdx = leadingByte & 0xFF;
|
||||
decodingTrailBytesNeeded = UTF8_LEADING_BYTE_LENGTHS[leadingByteIdx];
|
||||
switch (decodingTrailBytesNeeded) {
|
||||
case 0:
|
||||
decodingCurrentCodePoint = leadingByte;
|
||||
break;
|
||||
case 1:
|
||||
case 2:
|
||||
case 3:
|
||||
int mask = (0b00111111 >> decodingTrailBytesNeeded);
|
||||
decodingCurrentCodePoint = leadingByte & mask;
|
||||
break;
|
||||
default:
|
||||
return false;
|
||||
}
|
||||
validDecodedCodePointRange = UTF8_VALID_INTERVALS[decodingTrailBytesNeeded];
|
||||
return true;
|
||||
}
|
||||
|
||||
private boolean decodeTrailingByte(byte trailingByte) {
|
||||
int trailingValue = (trailingByte & 0xFF) - 0x80;
|
||||
if (trailingValue < 0x00 || trailingValue > 0x3F) {
|
||||
return false;
|
||||
}
|
||||
else {
|
||||
decodingCurrentCodePoint = (decodingCurrentCodePoint << 6) | trailingValue;
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
private IntBuffer appendCodePointFromInterval(
|
||||
int codePoint,
|
||||
Interval validCodePointRange,
|
||||
IntBuffer codePointsOut)
|
||||
throws CharacterCodingException
|
||||
{
|
||||
assert validCodePointRange != Interval.INVALID;
|
||||
|
||||
// Security check: UTF-8 must represent code points using their
|
||||
// shortest encoded form.
|
||||
if (codePoint < validCodePointRange.a ||
|
||||
codePoint > validCodePointRange.b) {
|
||||
return handleDecodeError(
|
||||
String.format(
|
||||
"Code point %d is out of expected range %s",
|
||||
codePoint,
|
||||
validCodePointRange),
|
||||
codePointsOut);
|
||||
}
|
||||
else {
|
||||
return appendCodePoint(codePoint, codePointsOut);
|
||||
}
|
||||
}
|
||||
|
||||
private IntBuffer appendCodePoint(int codePoint, IntBuffer codePointsOut) {
|
||||
if (!codePointsOut.hasRemaining()) {
|
||||
// Grow the code point buffer size by 2.
|
||||
IntBuffer newBuffer = IntBuffer.allocate(codePointsOut.capacity() * 2);
|
||||
codePointsOut.flip();
|
||||
newBuffer.put(codePointsOut);
|
||||
codePointsOut = newBuffer;
|
||||
}
|
||||
codePointsOut.put(codePoint);
|
||||
return codePointsOut;
|
||||
}
|
||||
|
||||
private IntBuffer handleDecodeError(
|
||||
final String error,
|
||||
IntBuffer codePointsOut)
|
||||
throws CharacterCodingException
|
||||
{
|
||||
if (decodingErrorAction == CodingErrorAction.REPLACE) {
|
||||
codePointsOut = appendCodePoint(SUBSTITUTION_CHARACTER, codePointsOut);
|
||||
}
|
||||
else if (decodingErrorAction == CodingErrorAction.REPORT) {
|
||||
throw new CharacterCodingException() {
|
||||
@Override
|
||||
public String getMessage() {
|
||||
return error;
|
||||
}
|
||||
};
|
||||
}
|
||||
return codePointsOut;
|
||||
}
|
||||
}
|
Loading…
Reference in New Issue