Improve memory usage of CodePointCharStream: Use 8-bit, 16-bit, or 32-bit buffer

This commit is contained in:
Ben Hamilton 2017-03-21 18:04:11 -07:00
parent de4d129921
commit ab0655598e
8 changed files with 759 additions and 631 deletions

View File

@ -3,17 +3,14 @@
* Use of this file is governed by the BSD 3-clause license that
* can be found in the LICENSE.txt file in the project root.
*/
package org.antlr.v4.test.runtime.java;
package org.antlr.v4.runtime;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertTrue;
import java.io.IOException;
import java.nio.IntBuffer;
import org.antlr.v4.runtime.CharStreams;
import org.antlr.v4.runtime.CodePointCharStream;
import org.antlr.v4.runtime.IntStream;
import org.antlr.v4.runtime.misc.Interval;
import org.junit.Rule;
@ -291,4 +288,25 @@ public class TestCodePointCharStream {
s.seek(6);
assertEquals(0x1F522, s.LA(-1));
}
@Test
public void asciiContentsShouldUse8BitBuffer() {
    // Input made only of ASCII characters is expected to be backed by a byte[].
    final CodePointCharStream stream = CharStreams.fromString("hello");
    final Object storage = stream.getInternalStorage();
    assertTrue(storage instanceof byte[]);
    assertEquals(5, stream.size());
}
@Test
public void bmpContentsShouldUse16BitBuffer() {
    // Input containing BMP characters above U+00FF (but no surrogate pairs)
    // is expected to be backed by a char[].
    final CodePointCharStream stream = CharStreams.fromString("hello \u4E16\u754C");
    final Object storage = stream.getInternalStorage();
    assertTrue(storage instanceof char[]);
    assertEquals(8, stream.size());
}
@Test
public void smpContentsShouldUse32BitBuffer() {
    // Input containing a surrogate pair (one supplementary code point) is
    // expected to be backed by an int[]; the pair counts as a single element.
    final CodePointCharStream stream = CharStreams.fromString("hello \uD83C\uDF0D");
    final Object storage = stream.getInternalStorage();
    assertTrue(storage instanceof int[]);
    assertEquals(7, stream.size());
}
}

View File

@ -137,9 +137,9 @@ public class TestCharStreams {
try (SeekableByteChannel c = Files.newByteChannel(p)) {
CharStream s = CharStreams.fromChannel(
c, 4096, CodingErrorAction.REPLACE, "foo");
assertEquals(3, s.size());
assertEquals(4, s.size());
assertEquals(0, s.index());
assertEquals("\uFFFD\uFFFD\uFFFD", s.toString());
assertEquals("\uFFFD\uFFFD\uFFFD\uFFFD", s.toString());
}
}

View File

@ -1,162 +0,0 @@
/*
* Copyright (c) 2012-2017 The ANTLR Project. All rights reserved.
* Use of this file is governed by the BSD 3-clause license that
* can be found in the LICENSE.txt file in the project root.
*/
package org.antlr.v4.test.runtime.java;
import static org.junit.Assert.assertEquals;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.nio.IntBuffer;
import java.nio.charset.CharacterCodingException;
import java.nio.charset.CodingErrorAction;
import java.nio.charset.StandardCharsets;
import org.antlr.v4.runtime.UTF8CodePointDecoder;
import org.junit.Rule;
import org.junit.Test;
import org.junit.rules.ExpectedException;
/**
 * Tests for {@link UTF8CodePointDecoder}: well-formed input of every UTF-8
 * sequence length, plus malformed input in both REPLACE and REPORT modes.
 */
public class TestUTF8CodePointDecoder {
    @Rule
    public ExpectedException thrown = ExpectedException.none();

    /**
     * Decodes {@code utf8} in REPLACE mode into a one-element output buffer
     * and asserts that exactly one code point, {@code expectedCodePoint},
     * was produced.
     */
    private static void assertDecodesToSingleCodePoint(int expectedCodePoint, ByteBuffer utf8) throws Exception {
        UTF8CodePointDecoder replacingDecoder = new UTF8CodePointDecoder(CodingErrorAction.REPLACE);
        IntBuffer decoded = replacingDecoder.decodeCodePointsFromBuffer(utf8, IntBuffer.allocate(1), true);
        decoded.flip();
        assertEquals(1, decoded.remaining());
        assertEquals(expectedCodePoint, decoded.get(0));
    }

    /**
     * Decodes {@code malformed} in REPORT mode and expects a
     * {@link CharacterCodingException} whose message contains
     * {@code expectedMessage}. The expectation is registered immediately
     * before the decode call.
     */
    private void assertDecodeIsReported(byte[] malformed, String expectedMessage) throws Exception {
        UTF8CodePointDecoder reportingDecoder = new UTF8CodePointDecoder(CodingErrorAction.REPORT);
        ByteBuffer in = ByteBuffer.wrap(malformed);
        IntBuffer out = IntBuffer.allocate(1);
        thrown.expect(CharacterCodingException.class);
        thrown.expectMessage(expectedMessage);
        reportingDecoder.decodeCodePointsFromBuffer(in, out, true);
    }

    @Test
    public void decodeEmptyByteBufferWritesNothing() throws Exception {
        UTF8CodePointDecoder replacingDecoder = new UTF8CodePointDecoder(CodingErrorAction.REPLACE);
        IntBuffer decoded = replacingDecoder.decodeCodePointsFromBuffer(
            ByteBuffer.allocate(0),
            IntBuffer.allocate(0),
            true);
        decoded.flip();
        assertEquals(0, decoded.remaining());
    }

    @Test
    public void decodeLatinByteBufferWritesCodePoint() throws Exception {
        assertDecodesToSingleCodePoint('X', StandardCharsets.UTF_8.encode("X"));
    }

    @Test
    public void decodeCyrillicByteBufferWritesCodePoint() throws Exception {
        assertDecodesToSingleCodePoint(0x042F, StandardCharsets.UTF_8.encode("\u042F"));
    }

    @Test
    public void decodeCJKByteBufferWritesCodePoint() throws Exception {
        assertDecodesToSingleCodePoint(0x611B, StandardCharsets.UTF_8.encode("\u611B"));
    }

    @Test
    public void decodeEmojiByteBufferWritesCodePoint() throws Exception {
        String emoji = new StringBuilder().appendCodePoint(0x1F4A9).toString();
        assertDecodesToSingleCodePoint(0x1F4A9, StandardCharsets.UTF_8.encode(emoji));
    }

    @Test
    public void decodingInvalidLeadInReplaceModeWritesSubstitutionCharacter() throws Exception {
        assertDecodesToSingleCodePoint(0xFFFD, ByteBuffer.wrap(new byte[] { (byte)0xF8 }));
    }

    @Test
    public void decodingInvalidLeadInReportModeThrows() throws Exception {
        assertDecodeIsReported(
            new byte[] { (byte)0xF8 },
            "Invalid UTF-8 leading byte 0xF8");
    }

    @Test
    public void decodingInvalidTrailInReplaceModeWritesSubstitutionCharacter() throws Exception {
        assertDecodesToSingleCodePoint(0xFFFD, ByteBuffer.wrap(new byte[] { (byte)0xC0, (byte)0xC0 }));
    }

    @Test
    public void decodingInvalidTrailInReportModeThrows() throws Exception {
        assertDecodeIsReported(
            new byte[] { (byte)0xC0, (byte)0xC0 },
            "Invalid UTF-8 trailing byte 0xC0");
    }

    @Test
    public void decodingNonShortestFormInReplaceModeWritesSubstitutionCharacter() throws Exception {
        // 0xC1 0x9C would decode to \ (U+005C) if we didn't have this check
        assertDecodesToSingleCodePoint(0xFFFD, ByteBuffer.wrap(new byte[] { (byte)0xC1, (byte)0x9C }));
    }

    @Test
    public void decodingNonShortestFormInReportModeThrows() throws Exception {
        // 0xC1 0x9C would decode to \ (U+005C) if we didn't have this check
        assertDecodeIsReported(
            new byte[] { (byte)0xC1, (byte)0x9C },
            "Code point 92 is out of expected range 128..2047");
    }
}

View File

@ -13,9 +13,11 @@ import java.io.BufferedReader;
import java.io.File;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.IOException;
import java.lang.management.ManagementFactory;
import java.lang.management.RuntimeMXBean;
import java.net.URL;
import java.net.URLConnection;
import java.nio.charset.StandardCharsets;
import java.nio.file.Path;
import java.nio.file.Paths;
@ -331,10 +333,12 @@ public class TimeLexerSpeed { // don't call it Test else it'll run during "mvn t
for (int i = 0; i<n; i++) {
streams[i] = loader.getResourceAsStream(resourceName);
}
URLConnection uc = null;
long streamLength = getResourceSize(loader, resourceName);
long start = System.nanoTime(); // track only time to suck data out of stream
for (int i = 0; i<n; i++) {
try (InputStream is = streams[i]) {
input[i] = CharStreams.fromStream(is);
input[i] = CharStreams.fromStream(is, StandardCharsets.UTF_8, streamLength);
}
}
long stop = System.nanoTime();
@ -370,8 +374,10 @@ public class TimeLexerSpeed { // don't call it Test else it'll run during "mvn t
}
public void lex_new_java_utf8(int n, boolean clearLexerDFACache) throws Exception {
try (InputStream is = TimeLexerSpeed.class.getClassLoader().getResourceAsStream(Parser_java_file);) {
CharStream input = CharStreams.fromStream(is);
ClassLoader loader = TimeLexerSpeed.class.getClassLoader();
try (InputStream is = loader.getResourceAsStream(Parser_java_file);) {
long size = getResourceSize(loader, Parser_java_file);
CharStream input = CharStreams.fromStream(is, StandardCharsets.UTF_8, size);
JavaLexer lexer = new JavaLexer(input);
double avg = tokenize(lexer, n, clearLexerDFACache);
String currentMethodName = new Exception().getStackTrace()[0].getMethodName();
@ -403,8 +409,11 @@ public class TimeLexerSpeed { // don't call it Test else it'll run during "mvn t
}
public void lex_new_grapheme_utf8(String fileName, int n, boolean clearLexerDFACache) throws Exception {
try (InputStream is = TimeLexerSpeed.class.getClassLoader().getResourceAsStream(PerfDir+"/"+fileName)) {
CharStream input = CharStreams.fromStream(is);
String resourceName = PerfDir+"/"+fileName;
ClassLoader loader = TimeLexerSpeed.class.getClassLoader();
try (InputStream is = loader.getResourceAsStream(resourceName)) {
long size = getResourceSize(loader, resourceName);
CharStream input = CharStreams.fromStream(is, StandardCharsets.UTF_8, size);
graphemesLexer lexer = new graphemesLexer(input);
double avg = tokenize(lexer, n, clearLexerDFACache);
String currentMethodName = new Exception().getStackTrace()[0].getMethodName();
@ -474,4 +483,18 @@ public class TimeLexerSpeed { // don't call it Test else it'll run during "mvn t
public static String dirname(Path path) {
return path.getName(0).toString();
}
/**
 * Returns the content length reported by a {@link URLConnection} opened on
 * the named class-loader resource.
 *
 * @param loader class loader used to locate the resource
 * @param resourceName name of the resource to measure
 * @return the value of {@link URLConnection#getContentLengthLong()}
 * @throws IOException if the connection or its stream cannot be opened or closed
 */
public static final long getResourceSize(ClassLoader loader, String resourceName) throws IOException {
    // Opening the connection happens outside the try: if it fails there is
    // nothing to clean up.
    final URLConnection connection = loader.getResource(resourceName).openConnection();
    try {
        return connection.getContentLengthLong();
    } finally {
        // Sadly, URLConnection is not AutoCloseable, but it leaks resources if
        // we don't close its stream.
        connection.getInputStream().close();
    }
}
}

View File

@ -10,10 +10,12 @@ import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.nio.ByteBuffer;
import java.nio.IntBuffer;
import java.nio.CharBuffer;
import java.nio.channels.Channels;
import java.nio.channels.ReadableByteChannel;
import java.nio.charset.Charset;
import java.nio.charset.CharsetDecoder;
import java.nio.charset.CoderResult;
import java.nio.charset.CodingErrorAction;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
@ -55,17 +57,15 @@ public final class CharStreams {
* For other sources, only supports Unicode code points up to U+FFFF.
*/
public static CharStream fromPath(Path path, Charset charset) throws IOException {
if (charset.equals(StandardCharsets.UTF_8)) {
try (ReadableByteChannel channel = Files.newByteChannel(path)) {
return fromChannel(
channel,
DEFAULT_BUFFER_SIZE,
CodingErrorAction.REPLACE,
path.toString());
}
}
else {
return new ANTLRFileStream(path.toString(), charset.toString());
long size = Files.size(path);
try (ReadableByteChannel channel = Files.newByteChannel(path)) {
return fromChannel(
channel,
charset,
DEFAULT_BUFFER_SIZE,
CodingErrorAction.REPLACE,
path.toString(),
size);
}
}
@ -120,19 +120,18 @@ public final class CharStreams {
* For other sources, only supports Unicode code points up to U+FFFF.
*/
public static CharStream fromStream(InputStream is, Charset charset) throws IOException {
if (charset.equals(StandardCharsets.UTF_8)) {
try (ReadableByteChannel channel = Channels.newChannel(is)) {
return fromChannel(
channel,
DEFAULT_BUFFER_SIZE,
CodingErrorAction.REPLACE,
IntStream.UNKNOWN_SOURCE_NAME);
}
}
else {
try (InputStreamReader isr = new InputStreamReader(is, charset)) {
return new ANTLRInputStream(isr);
}
return fromStream(is, charset, -1);
}
public static CharStream fromStream(InputStream is, Charset charset, long inputSize) throws IOException {
try (ReadableByteChannel channel = Channels.newChannel(is)) {
return fromChannel(
channel,
charset,
DEFAULT_BUFFER_SIZE,
CodingErrorAction.REPLACE,
IntStream.UNKNOWN_SOURCE_NAME,
inputSize);
}
}
@ -160,18 +159,11 @@ public final class CharStreams {
* For other sources, only supports Unicode code points up to U+FFFF.
*/
public static CharStream fromChannel(ReadableByteChannel channel, Charset charset) throws IOException {
if (charset.equals(StandardCharsets.UTF_8)) {
return fromChannel(
channel,
DEFAULT_BUFFER_SIZE,
CodingErrorAction.REPLACE,
IntStream.UNKNOWN_SOURCE_NAME);
}
else {
try (InputStreamReader isr = new InputStreamReader(Channels.newInputStream(channel), charset)) {
return new ANTLRInputStream(isr);
}
}
return fromChannel(
channel,
DEFAULT_BUFFER_SIZE,
CodingErrorAction.REPLACE,
IntStream.UNKNOWN_SOURCE_NAME);
}
/**
@ -187,50 +179,15 @@ public final class CharStreams {
* source name. Closes the reader before returning.
*/
public static CodePointCharStream fromReader(Reader r, String sourceName) throws IOException {
IntBuffer codePointBuffer = IntBuffer.allocate(DEFAULT_BUFFER_SIZE);
int highSurrogate = -1;
int curCodeUnit;
try {
while ((curCodeUnit = r.read()) != -1) {
if (!codePointBuffer.hasRemaining()) {
// Grow the code point buffer size by 2.
IntBuffer newBuffer = IntBuffer.allocate(codePointBuffer.capacity() * 2);
codePointBuffer.flip();
newBuffer.put(codePointBuffer);
codePointBuffer = newBuffer;
}
if (Character.isHighSurrogate((char) curCodeUnit)) {
if (highSurrogate != -1) {
// Dangling high surrogate followed by another high surrogate.
codePointBuffer.put(highSurrogate);
}
highSurrogate = curCodeUnit;
}
else if (Character.isLowSurrogate((char) curCodeUnit)) {
if (highSurrogate == -1) {
// Low surrogate not preceded by high surrogate.
codePointBuffer.put(curCodeUnit);
}
else {
codePointBuffer.put(Character.toCodePoint((char) highSurrogate, (char) curCodeUnit));
highSurrogate = -1;
}
}
else {
if (highSurrogate != -1) {
// Dangling high surrogate followed by a non-surrogate.
codePointBuffer.put(highSurrogate);
highSurrogate = -1;
}
codePointBuffer.put(curCodeUnit);
}
CodePointBuffer.Builder codePointBufferBuilder = CodePointBuffer.builder(DEFAULT_BUFFER_SIZE);
CharBuffer charBuffer = CharBuffer.allocate(DEFAULT_BUFFER_SIZE);
while ((r.read(charBuffer)) != -1) {
charBuffer.flip();
codePointBufferBuilder.append(charBuffer);
charBuffer.compact();
}
if (highSurrogate != -1) {
// Dangling high surrogate at end of file.
codePointBuffer.put(highSurrogate);
}
codePointBuffer.flip();
return new CodePointCharStream(codePointBuffer, sourceName);
return CodePointCharStream.fromBuffer(codePointBufferBuilder.build(), sourceName);
}
finally {
r.close();
@ -251,22 +208,14 @@ public final class CharStreams {
public static CodePointCharStream fromString(String s, String sourceName) {
// Initial guess assumes no code points > U+FFFF: one code
// point for each code unit in the string
IntBuffer codePointBuffer = IntBuffer.allocate(s.length());
int stringIdx = 0;
while (stringIdx < s.length()) {
if (!codePointBuffer.hasRemaining()) {
// Grow the code point buffer size by 2.
IntBuffer newBuffer = IntBuffer.allocate(codePointBuffer.capacity() * 2);
codePointBuffer.flip();
newBuffer.put(codePointBuffer);
codePointBuffer = newBuffer;
}
int codePoint = Character.codePointAt(s, stringIdx);
codePointBuffer.put(codePoint);
stringIdx += Character.charCount(codePoint);
}
codePointBuffer.flip();
return new CodePointCharStream(codePointBuffer, sourceName);
CodePointBuffer.Builder codePointBufferBuilder = CodePointBuffer.builder(s.length());
// TODO: CharBuffer.wrap(String) rightfully returns a read-only buffer
// which doesn't expose its array, so we make a copy.
CharBuffer cb = CharBuffer.allocate(s.length());
cb.put(s);
cb.flip();
codePointBufferBuilder.append(cb);
return CodePointCharStream.fromBuffer(codePointBufferBuilder.build(), sourceName);
}
/**
@ -282,25 +231,62 @@ public final class CharStreams {
CodingErrorAction decodingErrorAction,
String sourceName)
throws IOException
{
return fromChannel(channel, StandardCharsets.UTF_8, bufferSize, decodingErrorAction, sourceName, -1);
}
public static CodePointCharStream fromChannel(
ReadableByteChannel channel,
Charset charset,
int bufferSize,
CodingErrorAction decodingErrorAction,
String sourceName,
long inputSize)
throws IOException
{
try {
ByteBuffer utf8BytesIn = ByteBuffer.allocateDirect(bufferSize);
IntBuffer codePointsOut = IntBuffer.allocate(bufferSize);
ByteBuffer utf8BytesIn = ByteBuffer.allocate(bufferSize);
CharBuffer utf16CodeUnitsOut = CharBuffer.allocate(bufferSize);
if (inputSize == -1) {
inputSize = bufferSize;
} else if (inputSize > Integer.MAX_VALUE) {
// ByteBuffer et al don't support long sizes
throw new IOException(String.format("inputSize %d larger than max %d", inputSize, Integer.MAX_VALUE));
}
CodePointBuffer.Builder codePointBufferBuilder = CodePointBuffer.builder((int) inputSize);
CharsetDecoder decoder = charset
.newDecoder()
.onMalformedInput(decodingErrorAction)
.onUnmappableCharacter(decodingErrorAction);
boolean endOfInput = false;
UTF8CodePointDecoder decoder = new UTF8CodePointDecoder(decodingErrorAction);
while (!endOfInput) {
int bytesRead = channel.read(utf8BytesIn);
endOfInput = (bytesRead == -1);
utf8BytesIn.flip();
codePointsOut = decoder.decodeCodePointsFromBuffer(
CoderResult result = decoder.decode(
utf8BytesIn,
codePointsOut,
utf16CodeUnitsOut,
endOfInput);
if (result.isError() && decodingErrorAction.equals(CodingErrorAction.REPORT)) {
result.throwException();
}
utf16CodeUnitsOut.flip();
codePointBufferBuilder.append(utf16CodeUnitsOut);
utf8BytesIn.compact();
utf16CodeUnitsOut.compact();
}
codePointsOut.limit(codePointsOut.position());
codePointsOut.flip();
return new CodePointCharStream(codePointsOut, sourceName);
// Handle any bytes at the end of the file which need to
// be represented as errors or substitution characters.
CoderResult flushResult = decoder.flush(utf16CodeUnitsOut);
if (flushResult.isError() && decodingErrorAction.equals(CodingErrorAction.REPORT)) {
flushResult.throwException();
}
utf16CodeUnitsOut.flip();
codePointBufferBuilder.append(utf16CodeUnitsOut);
CodePointBuffer codePointBuffer = codePointBufferBuilder.build();
return CodePointCharStream.fromBuffer(codePointBuffer, sourceName);
}
finally {
channel.close();

View File

@ -0,0 +1,388 @@
/*
* Copyright (c) 2012-2017 The ANTLR Project. All rights reserved.
* Use of this file is governed by the BSD 3-clause license that
* can be found in the LICENSE.txt file in the project root.
*/
package org.antlr.v4.runtime;
import java.nio.ByteBuffer;
import java.nio.CharBuffer;
import java.nio.IntBuffer;
/**
 * Wrapper for {@link ByteBuffer} / {@link CharBuffer} / {@link IntBuffer}.
 *
 * Because Java lacks generics on primitive types, these three types
 * do not share an interface, so we have to write one manually.
 *
 * <p>Exactly one of the three backing buffers is non-null; {@link Type}
 * records which one. Instances are created either by wrapping an existing
 * buffer ({@link #withBytes}, {@link #withChars}, {@link #withInts}) or by
 * feeding UTF-16 code units to a {@link Builder}, which starts with 8-bit
 * storage and widens to 16-bit or 32-bit storage only when the input
 * requires it.
 */
public class CodePointBuffer {
    /** Element width of the backing buffer. */
    public enum Type {
        BYTE,
        CHAR,
        INT
    }

    private final Type type;
    private final ByteBuffer byteBuffer;
    private final CharBuffer charBuffer;
    private final IntBuffer intBuffer;

    private CodePointBuffer(Type type, ByteBuffer byteBuffer, CharBuffer charBuffer, IntBuffer intBuffer) {
        this.type = type;
        this.byteBuffer = byteBuffer;
        this.charBuffer = charBuffer;
        this.intBuffer = intBuffer;
    }

    /** Wraps {@code byteBuffer}; each byte is one code point in the range 0..0xFF. */
    public static CodePointBuffer withBytes(ByteBuffer byteBuffer) {
        return new CodePointBuffer(Type.BYTE, byteBuffer, null, null);
    }

    /** Wraps {@code charBuffer}; each char is treated as one code point. */
    public static CodePointBuffer withChars(CharBuffer charBuffer) {
        return new CodePointBuffer(Type.CHAR, null, charBuffer, null);
    }

    /** Wraps {@code intBuffer}; each int is one code point. */
    public static CodePointBuffer withInts(IntBuffer intBuffer) {
        return new CodePointBuffer(Type.INT, null, null, intBuffer);
    }

    /** Returns the position of the active backing buffer. */
    public int position() {
        switch (type) {
            case BYTE:
                return byteBuffer.position();
            case CHAR:
                return charBuffer.position();
            case INT:
                return intBuffer.position();
        }
        throw new UnsupportedOperationException("Not reached");
    }

    /** Sets the position of the active backing buffer. */
    public void position(int newPosition) {
        switch (type) {
            case BYTE:
                byteBuffer.position(newPosition);
                break;
            case CHAR:
                charBuffer.position(newPosition);
                break;
            case INT:
                intBuffer.position(newPosition);
                break;
        }
    }

    /** Returns the number of elements between position and limit of the active backing buffer. */
    public int remaining() {
        switch (type) {
            case BYTE:
                return byteBuffer.remaining();
            case CHAR:
                return charBuffer.remaining();
            case INT:
                return intBuffer.remaining();
        }
        throw new UnsupportedOperationException("Not reached");
    }

    /**
     * Returns the code point stored at absolute index {@code offset} of the
     * active backing buffer.
     *
     * @param offset absolute index into the backing buffer
     * @return the code point, always non-negative for BYTE and CHAR storage
     */
    public int get(int offset) {
        switch (type) {
            case BYTE:
                // Bytes are signed in Java; mask so that code points
                // 0x80..0xFF are not returned as negative values.
                return byteBuffer.get(offset) & 0xFF;
            case CHAR:
                return charBuffer.get(offset);
            case INT:
                return intBuffer.get(offset);
        }
        throw new UnsupportedOperationException("Not reached");
    }

    Type getType() {
        return type;
    }

    /** Returns the array offset of the active backing buffer. */
    int arrayOffset() {
        switch (type) {
            case BYTE:
                return byteBuffer.arrayOffset();
            case CHAR:
                return charBuffer.arrayOffset();
            case INT:
                return intBuffer.arrayOffset();
        }
        throw new UnsupportedOperationException("Not reached");
    }

    /** Returns the backing array; only valid when the type is {@link Type#BYTE}. */
    byte[] byteArray() {
        assert type == Type.BYTE;
        return byteBuffer.array();
    }

    /** Returns the backing array; only valid when the type is {@link Type#CHAR}. */
    char[] charArray() {
        assert type == Type.CHAR;
        return charBuffer.array();
    }

    /** Returns the backing array; only valid when the type is {@link Type#INT}. */
    int[] intArray() {
        assert type == Type.INT;
        return intBuffer.array();
    }

    /**
     * Creates a {@link Builder} whose initial 8-bit storage has room for
     * {@code initialBufferSize} code points.
     */
    public static Builder builder(int initialBufferSize) {
        return new Builder(initialBufferSize);
    }

    /**
     * Accumulates UTF-16 code units into the narrowest storage that can hold
     * them: bytes while every unit is {@code <= 0xFF}, chars once a larger
     * non-high-surrogate unit is seen, and ints once a high surrogate is seen.
     */
    public static class Builder {
        private Type type;
        private ByteBuffer byteBuffer;
        private CharBuffer charBuffer;
        private IntBuffer intBuffer;
        // High surrogate seen at the end of the input so far and not yet
        // written to intBuffer, or -1 if there is none. Only ever set while
        // type == INT.
        private int prevHighSurrogate;

        private Builder(int initialBufferSize) {
            type = Type.BYTE;
            byteBuffer = ByteBuffer.allocate(initialBufferSize);
            charBuffer = null;
            intBuffer = null;
            prevHighSurrogate = -1;
        }

        Type getType() {
            return type;
        }

        ByteBuffer getByteBuffer() {
            return byteBuffer;
        }

        CharBuffer getCharBuffer() {
            return charBuffer;
        }

        IntBuffer getIntBuffer() {
            return intBuffer;
        }

        /**
         * Finishes building and returns a {@link CodePointBuffer} that shares
         * this builder's backing buffer, flipped so that position is 0 and
         * limit is just past the last code point written.
         *
         * <p>A high surrogate still pending at this point has no matching low
         * surrogate and is written out as a lone code point.
         */
        public CodePointBuffer build() {
            if (prevHighSurrogate != -1) {
                // prevHighSurrogate is only set by appendArrayInt, so the
                // active storage is the int buffer here.
                ensureRemaining(1);
                intBuffer.put(prevHighSurrogate & 0xFFFF);
                prevHighSurrogate = -1;
            }
            switch (type) {
                case BYTE:
                    byteBuffer.flip();
                    break;
                case CHAR:
                    charBuffer.flip();
                    break;
                case INT:
                    intBuffer.flip();
                    break;
            }
            return new CodePointBuffer(type, byteBuffer, charBuffer, intBuffer);
        }

        /**
         * Returns the smallest power of two {@code >= i} for positive
         * {@code i}. Results that would exceed the int range saturate at
         * {@link Integer#MAX_VALUE} through the double-to-int cast.
         */
        private static int roundUpToNextPowerOfTwo(int i) {
            int nextPowerOfTwo = 32 - Integer.numberOfLeadingZeros(i - 1);
            return (int) Math.pow(2, nextPowerOfTwo);
        }

        /**
         * Grows the active buffer, if necessary, so that at least
         * {@code remainingNeeded} more elements can be written. Existing
         * contents are copied into the new buffer.
         */
        public void ensureRemaining(int remainingNeeded) {
            switch (type) {
                case BYTE:
                    if (byteBuffer.remaining() < remainingNeeded) {
                        int newCapacity = roundUpToNextPowerOfTwo(byteBuffer.capacity() + remainingNeeded);
                        ByteBuffer newBuffer = ByteBuffer.allocate(newCapacity);
                        byteBuffer.flip();
                        newBuffer.put(byteBuffer);
                        byteBuffer = newBuffer;
                    }
                    break;
                case CHAR:
                    if (charBuffer.remaining() < remainingNeeded) {
                        int newCapacity = roundUpToNextPowerOfTwo(charBuffer.capacity() + remainingNeeded);
                        CharBuffer newBuffer = CharBuffer.allocate(newCapacity);
                        charBuffer.flip();
                        newBuffer.put(charBuffer);
                        charBuffer = newBuffer;
                    }
                    break;
                case INT:
                    if (intBuffer.remaining() < remainingNeeded) {
                        int newCapacity = roundUpToNextPowerOfTwo(intBuffer.capacity() + remainingNeeded);
                        IntBuffer newBuffer = IntBuffer.allocate(newCapacity);
                        intBuffer.flip();
                        newBuffer.put(intBuffer);
                        intBuffer = newBuffer;
                    }
                    break;
            }
        }

        /**
         * Appends every remaining UTF-16 code unit of {@code utf16In},
         * widening the storage if needed. On return {@code utf16In} has been
         * fully consumed (its position equals its limit).
         *
         * <p>A surrogate pair may be split across two calls: a high surrogate
         * that ends one call is held back and combined with a low surrogate
         * that starts the next one.
         *
         * @param utf16In UTF-16 code units to append; buffers without an
         *        accessible backing array (for example read-only buffers)
         *        are copied first
         */
        public void append(CharBuffer utf16In) {
            // A held-back high surrogate may turn out to be dangling, in
            // which case this call writes one element more than it reads.
            int pending = (prevHighSurrogate != -1) ? 1 : 0;
            ensureRemaining(utf16In.remaining() + pending);
            if (utf16In.hasArray()) {
                appendArray(utf16In);
            } else {
                // The fast paths below read the backing array directly, so
                // copy buffers that do not expose one. put() consumes utf16In.
                CharBuffer copy = CharBuffer.allocate(utf16In.remaining());
                copy.put(utf16In);
                copy.flip();
                appendArray(copy);
            }
        }

        /** Dispatches to the append routine matching the current storage type. */
        private void appendArray(CharBuffer utf16In) {
            assert utf16In.hasArray();

            switch (type) {
                case BYTE:
                    appendArrayByte(utf16In);
                    break;
                case CHAR:
                    appendArrayChar(utf16In);
                    break;
                case INT:
                    appendArrayInt(utf16In);
                    break;
            }
        }

        /**
         * Appends to 8-bit storage. On the first code unit above 0xFF the
         * storage is widened (to chars, or to ints for a high surrogate) and
         * the rest of the input is handed to the matching routine.
         */
        private void appendArrayByte(CharBuffer utf16In) {
            assert prevHighSurrogate == -1;

            char[] in = utf16In.array();
            int inOffset = utf16In.arrayOffset() + utf16In.position();
            int inLimit = utf16In.arrayOffset() + utf16In.limit();

            byte[] outByte = byteBuffer.array();
            int outOffset = byteBuffer.arrayOffset() + byteBuffer.position();

            while (inOffset < inLimit) {
                char c = in[inOffset];
                if (c <= 0xFF) {
                    outByte[outOffset] = (byte)(c & 0xFF);
                } else {
                    // Publish progress so the widening step copies exactly
                    // what has been written and resumes at the right unit.
                    utf16In.position(inOffset - utf16In.arrayOffset());
                    byteBuffer.position(outOffset - byteBuffer.arrayOffset());
                    if (!Character.isHighSurrogate(c)) {
                        byteToCharBuffer(utf16In.remaining());
                        appendArrayChar(utf16In);
                        return;
                    } else {
                        byteToIntBuffer(utf16In.remaining());
                        appendArrayInt(utf16In);
                        return;
                    }
                }
                inOffset++;
                outOffset++;
            }

            utf16In.position(inOffset - utf16In.arrayOffset());
            byteBuffer.position(outOffset - byteBuffer.arrayOffset());
        }

        /**
         * Appends to 16-bit storage. On the first high surrogate the storage
         * is widened to ints and the rest of the input is handed to
         * {@link #appendArrayInt}.
         */
        private void appendArrayChar(CharBuffer utf16In) {
            assert prevHighSurrogate == -1;

            char[] in = utf16In.array();
            int inOffset = utf16In.arrayOffset() + utf16In.position();
            int inLimit = utf16In.arrayOffset() + utf16In.limit();

            char[] outChar = charBuffer.array();
            int outOffset = charBuffer.arrayOffset() + charBuffer.position();

            while (inOffset < inLimit) {
                char c = in[inOffset];
                if (!Character.isHighSurrogate(c)) {
                    outChar[outOffset] = c;
                } else {
                    utf16In.position(inOffset - utf16In.arrayOffset());
                    charBuffer.position(outOffset - charBuffer.arrayOffset());
                    charToIntBuffer(utf16In.remaining());
                    appendArrayInt(utf16In);
                    return;
                }
                inOffset++;
                outOffset++;
            }

            utf16In.position(inOffset - utf16In.arrayOffset());
            charBuffer.position(outOffset - charBuffer.arrayOffset());
        }

        /**
         * Appends to 32-bit storage, combining surrogate pairs into single
         * code points. Unpaired surrogates are written as-is.
         */
        private void appendArrayInt(CharBuffer utf16In) {
            char[] in = utf16In.array();
            int inOffset = utf16In.arrayOffset() + utf16In.position();
            int inLimit = utf16In.arrayOffset() + utf16In.limit();

            int[] outInt = intBuffer.array();
            int outOffset = intBuffer.arrayOffset() + intBuffer.position();

            while (inOffset < inLimit) {
                char c = in[inOffset];
                inOffset++;
                if (prevHighSurrogate != -1) {
                    if (Character.isLowSurrogate(c)) {
                        outInt[outOffset] = Character.toCodePoint((char) prevHighSurrogate, c);
                        outOffset++;
                        prevHighSurrogate = -1;
                    } else {
                        // Dangling high surrogate
                        outInt[outOffset] = prevHighSurrogate;
                        outOffset++;
                        if (Character.isHighSurrogate(c)) {
                            prevHighSurrogate = c & 0xFFFF;
                        } else {
                            outInt[outOffset] = c & 0xFFFF;
                            outOffset++;
                            prevHighSurrogate = -1;
                        }
                    }
                } else if (Character.isHighSurrogate(c)) {
                    prevHighSurrogate = c & 0xFFFF;
                } else {
                    outInt[outOffset] = c & 0xFFFF;
                    outOffset++;
                }
            }

            // A high surrogate that ends this chunk is intentionally NOT
            // written here: its low surrogate may arrive with the next
            // append(). It stays in prevHighSurrogate and is resolved by the
            // next call or, if none follows, by build().

            utf16In.position(inOffset - utf16In.arrayOffset());
            intBuffer.position(outOffset - intBuffer.arrayOffset());
        }

        /**
         * Switches from 8-bit to 16-bit storage, copying what has been
         * written so far and leaving room for {@code toAppend} more elements.
         */
        private void byteToCharBuffer(int toAppend) {
            byteBuffer.flip();
            // CharBuffers hold twice as much per unit as ByteBuffers, so start with half the capacity.
            CharBuffer newBuffer = CharBuffer.allocate(Math.max(byteBuffer.remaining() + toAppend, byteBuffer.capacity() / 2));
            while (byteBuffer.hasRemaining()) {
                newBuffer.put((char) (byteBuffer.get() & 0xFF));
            }
            type = Type.CHAR;
            byteBuffer = null;
            charBuffer = newBuffer;
        }

        /**
         * Switches from 8-bit to 32-bit storage, copying what has been
         * written so far and leaving room for {@code toAppend} more elements.
         */
        private void byteToIntBuffer(int toAppend) {
            byteBuffer.flip();
            // IntBuffers hold four times as much per unit as ByteBuffers, so start with one quarter the capacity.
            IntBuffer newBuffer = IntBuffer.allocate(Math.max(byteBuffer.remaining() + toAppend, byteBuffer.capacity() / 4));
            while (byteBuffer.hasRemaining()) {
                newBuffer.put(byteBuffer.get() & 0xFF);
            }
            type = Type.INT;
            byteBuffer = null;
            intBuffer = newBuffer;
        }

        /**
         * Switches from 16-bit to 32-bit storage, copying what has been
         * written so far and leaving room for {@code toAppend} more elements.
         */
        private void charToIntBuffer(int toAppend) {
            charBuffer.flip();
            // IntBuffers hold two times as much per unit as CharBuffers, so start with one half the capacity.
            IntBuffer newBuffer = IntBuffer.allocate(Math.max(charBuffer.remaining() + toAppend, charBuffer.capacity() / 2));
            while (charBuffer.hasRemaining()) {
                newBuffer.put(charBuffer.get() & 0xFFFF);
            }
            type = Type.INT;
            charBuffer = null;
            intBuffer = newBuffer;
        }
    }
}

View File

@ -7,7 +7,7 @@ package org.antlr.v4.runtime;
import org.antlr.v4.runtime.misc.Interval;
import java.nio.IntBuffer;
import java.nio.charset.StandardCharsets;
/**
* Alternative to {@link ANTLRInputStream} which treats the input
@ -17,115 +17,113 @@ import java.nio.IntBuffer;
* Use this if you need to parse input which potentially contains
* Unicode values > U+FFFF.
*/
public final class CodePointCharStream implements CharStream {
private final IntBuffer codePointBuffer;
private final int initialPosition;
private final int size;
private final String name;
public abstract class CodePointCharStream implements CharStream {
protected final int size;
protected final String name;
// To avoid lots of virtual method calls, we directly access
// the state of the underlying code points in the
// CodePointBuffer.
protected int position;
// Use the factory method {@link #fromBuffer(CodePointBuffer)} to
// construct instances of this type.
private CodePointCharStream(int position, int remaining, String name) {
// TODO
assert position == 0;
this.size = remaining;
this.name = name;
this.position = 0;
}
// Visible for testing.
abstract Object getInternalStorage();
/**
* Constructs a {@link CodePointCharStream} which provides access
* to the Unicode code points stored in {@code codePointBuffer}.
*
* {@code codePointBuffer}'s {@link IntBuffer#position position}
* reflects the first code point of the stream, and its
* {@link IntBuffer#limit limit} is just after the last code point
* of the stream.
*/
public CodePointCharStream(IntBuffer codePointBuffer) {
this(codePointBuffer, UNKNOWN_SOURCE_NAME);
public static CodePointCharStream fromBuffer(CodePointBuffer codePointBuffer) {
return fromBuffer(codePointBuffer, UNKNOWN_SOURCE_NAME);
}
/**
* Constructs a named {@link CodePointCharStream} which provides access
* to the Unicode code points stored in {@code codePointBuffer}.
*
* {@code codePointBuffer}'s {@link IntBuffer#position position}
* reflects the first code point of the stream, and its
* {@link IntBuffer#limit limit} is just after the last code point
* of the stream.
*/
public CodePointCharStream(IntBuffer codePointBuffer, String name) {
this.codePointBuffer = codePointBuffer;
this.initialPosition = codePointBuffer.position();
this.size = codePointBuffer.remaining();
this.name = name;
}
private int relativeBufferPosition(int i) {
return initialPosition + codePointBuffer.position() + i;
public static CodePointCharStream fromBuffer(CodePointBuffer codePointBuffer, String name) {
// Java lacks generics on primitive types.
//
// To avoid lots of calls to virtual methods in the
// very hot codepath of LA() below, we construct one
// of three concrete subclasses.
//
// The concrete subclasses directly access the code
// points stored in the underlying array (byte[],
// char[], or int[]), so we can avoid lots of virtual
// method calls to ByteBuffer.get(offset).
switch (codePointBuffer.getType()) {
case BYTE:
return new CodePoint8BitCharStream(
codePointBuffer.position(),
codePointBuffer.remaining(),
name,
codePointBuffer.byteArray(),
codePointBuffer.arrayOffset());
case CHAR:
return new CodePoint16BitCharStream(
codePointBuffer.position(),
codePointBuffer.remaining(),
name,
codePointBuffer.charArray(),
codePointBuffer.arrayOffset());
case INT:
return new CodePoint32BitCharStream(
codePointBuffer.position(),
codePointBuffer.remaining(),
name,
codePointBuffer.intArray(),
codePointBuffer.arrayOffset());
}
throw new UnsupportedOperationException("Not reached");
}
// NOTE(review): this span contains two interleaved variants of consume() —
// one driven by codePointBuffer and one driven by the position/size fields.
// It looks like diff residue; confirm which lines are live before editing.
//
// Both variants refuse to move past the end of the input: they throw
// IllegalStateException("cannot consume EOF") when nothing remains, and
// otherwise advance the current position by exactly one code point.
@Override
public void consume() {
if (!codePointBuffer.hasRemaining()) {
public final void consume() {
if (size - position == 0) {
// Sanity check (assertions enabled only): at the end, lookahead must report EOF.
assert LA(1) == IntStream.EOF;
throw new IllegalStateException("cannot consume EOF");
}
codePointBuffer.position(codePointBuffer.position() + 1);
position = position + 1;
}
// NOTE(review): this span holds an IntBuffer-based LA(int) whose closing brace
// is missing, immediately followed by a position-based index(). It looks like
// diff residue; confirm which lines are live before editing.
//
// LA(i) returns the code point at lookahead offset i without consuming:
// i > 0 looks forward (LA(1) is the code point at the current position),
// i < 0 looks backward, and i == 0 is undefined and yields 0.
// IntStream.EOF is returned when the requested offset falls outside the stream.
@Override
public int LA(int i) {
if (i == 0) {
// Undefined
return 0;
}
else if (i < 0) {
// Looking back past the first code point of the stream.
if (codePointBuffer.position() + i < initialPosition) {
return IntStream.EOF;
}
return codePointBuffer.get(relativeBufferPosition(i));
}
else if (i > codePointBuffer.remaining()) {
// Looking ahead past the last code point of the stream.
return IntStream.EOF;
}
else {
// LA(1) is the current code point, hence the i - 1.
return codePointBuffer.get(relativeBufferPosition(i - 1));
}
// Returns the current position field as the stream index.
public final int index() {
return position;
}
/**
 * Returns the current index: the buffer's position measured relative to
 * the position it had when this stream was constructed.
 */
@Override
public int index() {
    final int absolutePosition = codePointBuffer.position();
    return absolutePosition - initialPosition;
}
// NOTE(review): two method headers for size() appear back to back (a
// non-final and a final one) — looks like diff residue; confirm which is live.
//
// Returns the value of the size field.
@Override
public int size() {
public final int size() {
return size;
}
// NOTE(review): two method headers for mark() appear back to back (a
// non-final and a final one) — looks like diff residue; confirm which is live.
/** mark/release do nothing; we have entire buffer */
@Override
public int mark() {
public final int mark() {
// No marker bookkeeping is performed; -1 is always returned.
return -1;
}
// NOTE(review): two method headers for release() appear back to back (a
// non-final and a final one) — looks like diff residue; confirm which is live.
//
// Intentionally empty: the marker argument is ignored and no state changes.
@Override
public void release(int marker) {
public final void release(int marker) {
}
/**
 * Moves the stream to {@code index}, expressed relative to the position
 * the underlying buffer had when this stream was constructed.
 */
@Override
public void seek(int index) {
    final int absoluteIndex = initialPosition + index;
    codePointBuffer.position(absoluteIndex);
}
// NOTE(review): this span holds an IntBuffer-based getText(Interval) whose
// closing brace is missing, immediately followed by a position-based
// seek(int). It looks like diff residue; confirm which lines are live.
/** Return the UTF-16 encoded string for the given interval */
@Override
public String getText(Interval interval) {
// Both ends of the interval are clamped to the last code point (size - 1)
// and shifted by initialPosition to obtain indices into the buffer.
final int startIdx = initialPosition + Math.min(interval.a, size - 1);
final int stopIdx = initialPosition + Math.min(interval.b, size - 1);
// interval.length() will be too small if we contain any code points > U+FFFF,
// but it's just a hint for initial capacity; StringBuilder will grow anyway.
StringBuilder sb = new StringBuilder(interval.length());
// stopIdx is inclusive.
for (int codePointIdx = startIdx; codePointIdx <= stopIdx; codePointIdx++) {
sb.appendCodePoint(codePointBuffer.get(codePointIdx));
}
return sb.toString();
// Sets the current position to index; no range check is performed here.
public final void seek(int index) {
position = index;
}
// NOTE(review): two method headers for getSourceName() appear back to back,
// and the line starting with "@ -134,7" is a diff hunk header rather than
// Java; the remainder of the method body is not visible in this span.
//
// Visible behavior: when name is null or empty, UNKNOWN_SOURCE_NAME is returned.
@Override
public String getSourceName() {
public final String getSourceName() {
if (name == null || name.isEmpty()) {
return UNKNOWN_SOURCE_NAME;
}
@ -134,7 +132,165 @@ public final class CodePointCharStream implements CharStream {
}
// NOTE(review): two method headers for toString() appear back to back (a
// non-final and a final one) — looks like diff residue; confirm which is live.
//
// Returns the text of the whole stream by asking getText for the inclusive
// interval [0, size - 1]. For an empty stream this is the interval [0, -1].
@Override
public String toString() {
public final String toString() {
return getText(Interval.of(0, size - 1));
}
// 8-bit storage for code points <= U+00FF.
private static final class CodePoint8BitCharStream extends CodePointCharStream {
    /** Backing storage: one code point per byte, read as an unsigned value. */
    private final byte[] byteArray;

    /**
     * @param position passed through to the superclass constructor
     * @param remaining passed through to the superclass constructor
     * @param name passed through to the superclass constructor
     * @param byteArray array holding the code points of the stream
     * @param arrayOffset offset of the first code point in {@code byteArray};
     *        only {@code 0} is currently supported
     */
    private CodePoint8BitCharStream(int position, int remaining, String name, byte[] byteArray, int arrayOffset) {
        super(position, remaining, name);
        // TODO
        assert arrayOffset == 0;
        this.byteArray = byteArray;
    }

    /**
     * Return the UTF-16 encoded string for the given interval.
     *
     * The interval is clipped to the contents of the stream: a start at or
     * beyond the end yields the empty string, and a stop beyond the end is
     * treated as the last code point. In particular, an empty stream
     * yields the empty string instead of failing.
     */
    @Override
    public String getText(Interval interval) {
        // Clamp the start to size (not size - 1) so that an empty stream
        // produces offset 0 rather than -1.
        int startIdx = Math.min(interval.a, size);
        // Never read past the end of the stream, whatever the start is.
        int len = Math.min(interval.b - interval.a + 1, size - startIdx);

        // We know the maximum code point in byteArray is U+00FF,
        // so we can treat this as if it were ISO-8859-1, aka Latin-1,
        // which shares the same code points up to 0xFF.
        return new String(byteArray, startIdx, len, StandardCharsets.ISO_8859_1);
    }

    /**
     * Returns the code point at lookahead offset {@code i} without
     * consuming it: {@code LA(1)} is the code point at the current
     * position, negative values look backwards, and {@code 0} is
     * undefined (returns 0). Returns {@link IntStream#EOF} when the
     * requested offset lies before the start or at/after {@code size}.
     */
    @Override
    public int LA(int i) {
        int offset;
        switch (Integer.signum(i)) {
            case -1:
                offset = position + i;
                if (offset < 0) {
                    return IntStream.EOF;
                }
                // Mask to undo sign extension of the byte.
                return byteArray[offset] & 0xFF;
            case 0:
                // Undefined
                return 0;
            case 1:
                offset = position + i - 1;
                if (offset >= size) {
                    return IntStream.EOF;
                }
                // Mask to undo sign extension of the byte.
                return byteArray[offset] & 0xFF;
        }
        throw new UnsupportedOperationException("Not reached");
    }

    /** Returns the backing {@code byte[]} itself (not a copy). */
    @Override
    Object getInternalStorage() {
        return byteArray;
    }
}
// 16-bit internal storage for code points between U+0100 and U+FFFF.
private static final class CodePoint16BitCharStream extends CodePointCharStream {
    /** Backing storage: one code point per char. */
    private final char[] charArray;

    /**
     * @param position passed through to the superclass constructor
     * @param remaining passed through to the superclass constructor
     * @param name passed through to the superclass constructor
     * @param charArray array holding the code points of the stream
     * @param arrayOffset offset of the first code point in {@code charArray};
     *        only {@code 0} is currently supported
     */
    private CodePoint16BitCharStream(int position, int remaining, String name, char[] charArray, int arrayOffset) {
        super(position, remaining, name);
        this.charArray = charArray;
        // TODO
        assert arrayOffset == 0;
    }

    /**
     * Return the UTF-16 encoded string for the given interval.
     *
     * The interval is clipped to the contents of the stream: a start at or
     * beyond the end yields the empty string, and a stop beyond the end is
     * treated as the last code point. In particular, an empty stream
     * yields the empty string instead of failing.
     */
    @Override
    public String getText(Interval interval) {
        // Clamp the start to size (not size - 1) so that an empty stream
        // produces offset 0 rather than -1.
        int startIdx = Math.min(interval.a, size);
        // Never read past the end of the stream, whatever the start is.
        int len = Math.min(interval.b - interval.a + 1, size - startIdx);

        // We know there are no surrogates in this
        // array, since otherwise we would be given a
        // 32-bit int[] array.
        //
        // So, it's safe to treat this as if it were
        // UTF-16.
        return new String(charArray, startIdx, len);
    }

    /**
     * Returns the code point at lookahead offset {@code i} without
     * consuming it: {@code LA(1)} is the code point at the current
     * position, negative values look backwards, and {@code 0} is
     * undefined (returns 0). Returns {@link IntStream#EOF} when the
     * requested offset lies before the start or at/after {@code size}.
     */
    @Override
    public int LA(int i) {
        int offset;
        switch (Integer.signum(i)) {
            case -1:
                offset = position + i;
                if (offset < 0) {
                    return IntStream.EOF;
                }
                return charArray[offset] & 0xFFFF;
            case 0:
                // Undefined
                return 0;
            case 1:
                offset = position + i - 1;
                if (offset >= size) {
                    return IntStream.EOF;
                }
                return charArray[offset] & 0xFFFF;
        }
        throw new UnsupportedOperationException("Not reached");
    }

    /** Returns the backing {@code char[]} itself (not a copy). */
    @Override
    Object getInternalStorage() {
        return charArray;
    }
}
// 32-bit internal storage for code points between U+10000 and U+10FFFF.
private static final class CodePoint32BitCharStream extends CodePointCharStream {
    /** Backing storage: one code point per int. */
    private final int[] intArray;

    /**
     * @param position passed through to the superclass constructor
     * @param remaining passed through to the superclass constructor
     * @param name passed through to the superclass constructor
     * @param intArray array holding the code points of the stream
     * @param arrayOffset offset of the first code point in {@code intArray};
     *        only {@code 0} is currently supported
     */
    private CodePoint32BitCharStream(int position, int remaining, String name, int[] intArray, int arrayOffset) {
        super(position, remaining, name);
        this.intArray = intArray;
        // TODO
        assert arrayOffset == 0;
    }

    /**
     * Return the UTF-16 encoded string for the given interval.
     *
     * The interval is clipped to the contents of the stream: a start at or
     * beyond the end yields the empty string, and a stop beyond the end is
     * treated as the last code point. In particular, an empty stream
     * yields the empty string instead of failing.
     */
    @Override
    public String getText(Interval interval) {
        // Clamp the start to size (not size - 1) so that an empty stream
        // produces offset 0 rather than -1.
        int startIdx = Math.min(interval.a, size);
        // Never read past the end of the stream, whatever the start is.
        int len = Math.min(interval.b - interval.a + 1, size - startIdx);

        // Note that we pass the int[] code points to the String constructor --
        // this is supported, and the constructor will convert to UTF-16 internally.
        return new String(intArray, startIdx, len);
    }

    /**
     * Returns the code point at lookahead offset {@code i} without
     * consuming it: {@code LA(1)} is the code point at the current
     * position, negative values look backwards, and {@code 0} is
     * undefined (returns 0). Returns {@link IntStream#EOF} when the
     * requested offset lies before the start or at/after {@code size}.
     */
    @Override
    public int LA(int i) {
        int offset;
        switch (Integer.signum(i)) {
            case -1:
                offset = position + i;
                if (offset < 0) {
                    return IntStream.EOF;
                }
                return intArray[offset];
            case 0:
                // Undefined
                return 0;
            case 1:
                offset = position + i - 1;
                if (offset >= size) {
                    return IntStream.EOF;
                }
                return intArray[offset];
        }
        throw new UnsupportedOperationException("Not reached");
    }

    /** Returns the backing {@code int[]} itself (not a copy). */
    @Override
    Object getInternalStorage() {
        return intArray;
    }
}
}

View File

@ -1,281 +0,0 @@
/*
* Copyright (c) 2012-2017 The ANTLR Project. All rights reserved.
* Use of this file is governed by the BSD 3-clause license that
* can be found in the LICENSE.txt file in the project root.
*/
package org.antlr.v4.runtime;
import org.antlr.v4.runtime.misc.Interval;
import java.nio.ByteBuffer;
import java.nio.IntBuffer;
import java.nio.charset.CharacterCodingException;
import java.nio.charset.CodingErrorAction;
/**
 * Decodes UTF-8 bytes directly to Unicode code points, stored in an
 * {@link IntBuffer}.
 *
 * Unlike {@link java.nio.charset.CharsetDecoder}, this does not use UTF-16 as an
 * intermediate representation, so this optimizes the common case of
 * decoding a UTF-8 file for parsing as Unicode code points.
 *
 * A decoder keeps the state of a partially decoded sequence between calls
 * to {@link #decodeCodePointsFromBuffer}; call {@link #reset} before
 * reusing it for unrelated input.
 */
public class UTF8CodePointDecoder {
    private static final int SUBSTITUTION_CHARACTER = 0xFFFD;
    // Marker for "not a valid leading byte"; as a byte this is -1.
    private static final byte NVAL = (byte) 0xFF;

    // Table mapping UTF-8 leading byte to the length of the trailing
    // sequence.
    protected static final byte[] UTF8_LEADING_BYTE_LENGTHS = new byte[] {
        // [0x00, 0x7F] -> 0 trailing bytes
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
        // [0x80, 0xBF] -> invalid leading byte
        NVAL, NVAL, NVAL, NVAL, NVAL, NVAL, NVAL, NVAL,
        NVAL, NVAL, NVAL, NVAL, NVAL, NVAL, NVAL, NVAL,
        NVAL, NVAL, NVAL, NVAL, NVAL, NVAL, NVAL, NVAL,
        NVAL, NVAL, NVAL, NVAL, NVAL, NVAL, NVAL, NVAL,
        NVAL, NVAL, NVAL, NVAL, NVAL, NVAL, NVAL, NVAL,
        NVAL, NVAL, NVAL, NVAL, NVAL, NVAL, NVAL, NVAL,
        NVAL, NVAL, NVAL, NVAL, NVAL, NVAL, NVAL, NVAL,
        NVAL, NVAL, NVAL, NVAL, NVAL, NVAL, NVAL, NVAL,
        // [0xC0, 0xDF] -> one trailing byte
        0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
        0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
        0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
        0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
        // [0xE0, 0xEF] -> two trailing bytes
        0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02,
        0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02,
        // [0xF0, 0xF7] -> three trailing bytes
        0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03,
        // [0xF8, 0xFF] -> invalid leading sequence
        NVAL, NVAL, NVAL, NVAL, NVAL, NVAL, NVAL, NVAL
    };

    // Table mapping UTF-8 sequence length to valid Unicode code point
    // ranges for that sequence length.
    protected static final Interval[] UTF8_VALID_INTERVALS = new Interval[] {
        Interval.of(0x00, 0x7F),
        Interval.of(0x80, 0x7FF),
        Interval.of(0x800, 0xFFFF),
        Interval.of(0x10000, 0x10FFFF)
    };

    /** What to do when an ill-formed sequence is found (replace, report, or anything else: drop). */
    protected final CodingErrorAction decodingErrorAction;
    /** Trailing bytes still expected for the sequence in progress, or -1 when no sequence is in progress. */
    protected int decodingTrailBytesNeeded;
    /** Bits of the code point accumulated so far for the sequence in progress. */
    protected int decodingCurrentCodePoint;
    /** Code point range permitted for the length of the sequence in progress. */
    protected Interval validDecodedCodePointRange;

    /**
     * Constructs a new {@link UTF8CodePointDecoder} with a specified
     * {@link CodingErrorAction} to handle invalid UTF-8 sequences.
     */
    public UTF8CodePointDecoder(CodingErrorAction decodingErrorAction) {
        this.decodingErrorAction = decodingErrorAction;
        reset();
    }

    /**
     * Resets the state in this {@link UTF8CodePointDecoder}, preparing it
     * for use with a new input buffer.
     */
    public void reset() {
        this.decodingTrailBytesNeeded = -1;
        this.decodingCurrentCodePoint = -1;
        this.validDecodedCodePointRange = Interval.INVALID;
    }

    /**
     * Decodes as many UTF-8 bytes as possible from {@code utf8BytesIn},
     * writing the result to {@code codePointsOut}.
     *
     * If you have more bytes to decode, set {@code endOfInput} to
     * {@code false} and call this method again once more bytes
     * are available.
     *
     * If there are no more bytes available, make sure to call this
     * setting {@code endOfInput} to {@code true} so that any invalid
     * UTF-8 sequence at the end of the input is handled.
     *
     * If {@code codePointsOut} is not large enough to store the result,
     * a new buffer is allocated and returned. Otherwise, returns
     * {@code codePointsOut}.
     *
     * After returning, the {@link ByteBuffer#position position} of
     * {@code utf8BytesIn} is moved forward to reflect the bytes consumed,
     * and the {@link IntBuffer#position position} of the result
     * is moved forward to reflect the code points written.
     *
     * The {@link IntBuffer#limit limit} of the result is not changed,
     * so if this is the end of the input, you will want to
     * {@link IntBuffer#flip flip} the result (which sets the limit to
     * the current position and rewinds) to prepare it for reading.
     *
     * @param utf8BytesIn the UTF-8 bytes to decode
     * @param codePointsOut the buffer to append decoded code points to
     * @param endOfInput {@code true} if no further bytes will follow
     * @return the buffer holding the decoded code points; either
     *         {@code codePointsOut} or a larger replacement
     * @throws CharacterCodingException if ill-formed input is found and
     *         the error action is {@link CodingErrorAction#REPORT}
     */
    public IntBuffer decodeCodePointsFromBuffer(
        ByteBuffer utf8BytesIn,
        IntBuffer codePointsOut,
        boolean endOfInput)
        throws CharacterCodingException
    {
        while (utf8BytesIn.hasRemaining()) {
            if (decodingTrailBytesNeeded == -1) {
                // Start a new UTF-8 sequence by checking the leading byte.
                byte leadingByte = utf8BytesIn.get();
                if (!decodeLeadingByte(leadingByte)) {
                    codePointsOut = handleDecodeError(
                        String.format("Invalid UTF-8 leading byte 0x%02X", leadingByte),
                        codePointsOut);
                    reset();
                    continue;
                }
            }
            assert decodingTrailBytesNeeded != -1;
            if (utf8BytesIn.remaining() < decodingTrailBytesNeeded) {
                // The caller will have to call us back with more bytes.
                break;
            }
            // Now we know the input buffer has enough bytes to decode
            // the entire sequence.
            while (decodingTrailBytesNeeded > 0) {
                // Continue a multi-byte UTF-8 sequence by checking the next trailing byte.
                byte trailingByte = utf8BytesIn.get();
                decodingTrailBytesNeeded--;
                if (!decodeTrailingByte(trailingByte)) {
                    codePointsOut = handleDecodeError(
                        String.format("Invalid UTF-8 trailing byte 0x%02X", trailingByte),
                        codePointsOut);
                    // Skip past any remaining trailing bytes in the sequence.
                    utf8BytesIn.position(utf8BytesIn.position() + decodingTrailBytesNeeded);
                    // reset() sets the counter to -1, which also ends this inner loop
                    // and bypasses the "sequence complete" branch below.
                    reset();
                    continue;
                }
            }
            if (decodingTrailBytesNeeded == 0) {
                codePointsOut = appendCodePointFromInterval(
                    decodingCurrentCodePoint,
                    validDecodedCodePointRange,
                    codePointsOut);
                reset();
                continue;
            }
        }
        if (endOfInput) {
            // A sequence is still in progress but no more bytes will arrive.
            if (decodingTrailBytesNeeded != -1) {
                codePointsOut = handleDecodeError(
                    "Unterminated UTF-8 sequence at end of bytes",
                    codePointsOut);
            }
        }
        return codePointsOut;
    }

    /**
     * Starts a new sequence from {@code leadingByte}, recording how many
     * trailing bytes are expected, the payload bits of the leading byte,
     * and the code point range permitted for that sequence length.
     *
     * @return {@code false} if the byte cannot start a UTF-8 sequence
     */
    private boolean decodeLeadingByte(byte leadingByte) {
        // Be careful about Java silently widening (unsigned)
        // byte to (signed) int and sign-extending here.
        //
        // We use binary AND liberally below to prevent widening.
        int leadingByteIdx = leadingByte & 0xFF;
        decodingTrailBytesNeeded = UTF8_LEADING_BYTE_LENGTHS[leadingByteIdx];
        switch (decodingTrailBytesNeeded) {
            case 0:
                // Single-byte sequence: the byte is in [0x00, 0x7F], so it is
                // non-negative and is the code point itself.
                decodingCurrentCodePoint = leadingByte;
                break;
            case 1:
            case 2:
            case 3:
                // The more trailing bytes, the fewer payload bits in the leading byte.
                int mask = (0b00111111 >> decodingTrailBytesNeeded);
                decodingCurrentCodePoint = leadingByte & mask;
                break;
            default:
                // NVAL (-1): not a valid leading byte.
                return false;
        }
        validDecodedCodePointRange = UTF8_VALID_INTERVALS[decodingTrailBytesNeeded];
        return true;
    }

    /**
     * Folds the six payload bits of {@code trailingByte} into the code
     * point being accumulated.
     *
     * @return {@code false} if the byte is not in [0x80, 0xBF]
     */
    private boolean decodeTrailingByte(byte trailingByte) {
        int trailingValue = (trailingByte & 0xFF) - 0x80;
        if (trailingValue < 0x00 || trailingValue > 0x3F) {
            return false;
        }
        else {
            decodingCurrentCodePoint = (decodingCurrentCodePoint << 6) | trailingValue;
            return true;
        }
    }

    /**
     * Appends a fully decoded code point after validating it; ill-formed
     * values are routed to the configured error action instead.
     */
    private IntBuffer appendCodePointFromInterval(
        int codePoint,
        Interval validCodePointRange,
        IntBuffer codePointsOut)
        throws CharacterCodingException
    {
        assert validCodePointRange != Interval.INVALID;
        // Security check: UTF-8 must represent code points using their
        // shortest encoded form.
        if (codePoint < validCodePointRange.a ||
            codePoint > validCodePointRange.b) {
            return handleDecodeError(
                String.format(
                    "Code point %d is out of expected range %s",
                    codePoint,
                    validCodePointRange),
                codePointsOut);
        }
        // Surrogates (U+D800..U+DFFF) are not Unicode scalar values, so a
        // UTF-8 sequence that decodes to one is ill-formed even though it
        // falls inside the numeric range of a three-byte sequence.
        else if (codePoint >= Character.MIN_SURROGATE &&
                 codePoint <= Character.MAX_SURROGATE) {
            return handleDecodeError(
                String.format(
                    "Code point U+%04X is a surrogate and is not valid in UTF-8",
                    codePoint),
                codePointsOut);
        }
        else {
            return appendCodePoint(codePoint, codePointsOut);
        }
    }

    /**
     * Appends {@code codePoint}, first moving the contents to a larger
     * buffer if {@code codePointsOut} is full.
     *
     * @return the buffer written to (a new one if growth was needed)
     */
    private IntBuffer appendCodePoint(int codePoint, IntBuffer codePointsOut) {
        if (!codePointsOut.hasRemaining()) {
            // Grow the code point buffer size by 2, but always by at least
            // one slot: doubling a zero-capacity buffer would not grow it.
            int oldCapacity = codePointsOut.capacity();
            IntBuffer newBuffer = IntBuffer.allocate(Math.max(oldCapacity * 2, oldCapacity + 1));
            codePointsOut.flip();
            newBuffer.put(codePointsOut);
            codePointsOut = newBuffer;
        }
        codePointsOut.put(codePoint);
        return codePointsOut;
    }

    /**
     * Applies the configured {@link CodingErrorAction}: REPLACE appends
     * U+FFFD, REPORT throws, and any other action leaves the output
     * untouched.
     *
     * @throws CharacterCodingException carrying {@code error} as its
     *         message when the action is {@link CodingErrorAction#REPORT}
     */
    private IntBuffer handleDecodeError(
        final String error,
        IntBuffer codePointsOut)
        throws CharacterCodingException
    {
        if (decodingErrorAction == CodingErrorAction.REPLACE) {
            codePointsOut = appendCodePoint(SUBSTITUTION_CHARACTER, codePointsOut);
        }
        else if (decodingErrorAction == CodingErrorAction.REPORT) {
            // CharacterCodingException has no message constructor, hence the subclass.
            throw new CharacterCodingException() {
                @Override
                public String getMessage() {
                    return error;
                }
            };
        }
        return codePointsOut;
    }
}