forked from jasder/antlr
UTF8CodePointDecoder
This commit is contained in:
parent
93f916745d
commit
558aa7b011
|
@ -0,0 +1,162 @@
|
|||
/*
|
||||
* Copyright (c) 2012-2017 The ANTLR Project. All rights reserved.
|
||||
* Use of this file is governed by the BSD 3-clause license that
|
||||
* can be found in the LICENSE.txt file in the project root.
|
||||
*/
|
||||
package org.antlr.v4.test.runtime.java;
|
||||
|
||||
import static org.junit.Assert.assertEquals;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.nio.ByteBuffer;
|
||||
import java.nio.IntBuffer;
|
||||
|
||||
import java.nio.charset.CharacterCodingException;
|
||||
import java.nio.charset.CodingErrorAction;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
|
||||
import org.antlr.v4.runtime.UTF8CodePointDecoder;
|
||||
import org.junit.Rule;
|
||||
import org.junit.Test;
|
||||
import org.junit.rules.ExpectedException;
|
||||
|
||||
public class TestUTF8CodePointDecoder {
|
||||
@Rule
|
||||
public ExpectedException thrown = ExpectedException.none();
|
||||
|
||||
@Test
|
||||
public void decodeEmptyByteBufferWritesNothing() throws Exception {
|
||||
UTF8CodePointDecoder decoder = new UTF8CodePointDecoder(CodingErrorAction.REPLACE);
|
||||
ByteBuffer utf8BytesIn = ByteBuffer.allocate(0);
|
||||
IntBuffer codePointsOut = IntBuffer.allocate(0);
|
||||
IntBuffer result = decoder.decodeCodePointsFromBuffer(
|
||||
utf8BytesIn,
|
||||
codePointsOut,
|
||||
true);
|
||||
result.flip();
|
||||
assertEquals(0, result.remaining());
|
||||
}
|
||||
|
||||
@Test
|
||||
public void decodeLatinByteBufferWritesCodePoint() throws Exception {
|
||||
UTF8CodePointDecoder decoder = new UTF8CodePointDecoder(CodingErrorAction.REPLACE);
|
||||
ByteBuffer utf8BytesIn = StandardCharsets.UTF_8.encode("X");
|
||||
IntBuffer codePointsOut = IntBuffer.allocate(1);
|
||||
IntBuffer result = decoder.decodeCodePointsFromBuffer(
|
||||
utf8BytesIn,
|
||||
codePointsOut,
|
||||
true);
|
||||
result.flip();
|
||||
assertEquals(1, result.remaining());
|
||||
assertEquals('X', result.get(0));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void decodeCyrillicByteBufferWritesCodePoint() throws Exception {
|
||||
UTF8CodePointDecoder decoder = new UTF8CodePointDecoder(CodingErrorAction.REPLACE);
|
||||
ByteBuffer utf8BytesIn = StandardCharsets.UTF_8.encode("\u042F");
|
||||
IntBuffer codePointsOut = IntBuffer.allocate(1);
|
||||
IntBuffer result = decoder.decodeCodePointsFromBuffer(
|
||||
utf8BytesIn,
|
||||
codePointsOut,
|
||||
true);
|
||||
result.flip();
|
||||
assertEquals(1, result.remaining());
|
||||
assertEquals(0x042F, result.get(0));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void decodeCJKByteBufferWritesCodePoint() throws Exception {
|
||||
UTF8CodePointDecoder decoder = new UTF8CodePointDecoder(CodingErrorAction.REPLACE);
|
||||
ByteBuffer utf8BytesIn = StandardCharsets.UTF_8.encode("\u611B");
|
||||
IntBuffer codePointsOut = IntBuffer.allocate(1);
|
||||
IntBuffer result = decoder.decodeCodePointsFromBuffer(
|
||||
utf8BytesIn,
|
||||
codePointsOut,
|
||||
true);
|
||||
result.flip();
|
||||
assertEquals(1, result.remaining());
|
||||
assertEquals(0x611B, result.get(0));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void decodeEmojiByteBufferWritesCodePoint() throws Exception {
|
||||
UTF8CodePointDecoder decoder = new UTF8CodePointDecoder(CodingErrorAction.REPLACE);
|
||||
ByteBuffer utf8BytesIn = StandardCharsets.UTF_8.encode(
|
||||
new StringBuilder().appendCodePoint(0x1F4A9).toString()
|
||||
);
|
||||
IntBuffer codePointsOut = IntBuffer.allocate(1);
|
||||
IntBuffer result = decoder.decodeCodePointsFromBuffer(
|
||||
utf8BytesIn,
|
||||
codePointsOut,
|
||||
true);
|
||||
result.flip();
|
||||
assertEquals(1, result.remaining());
|
||||
assertEquals(0x1F4A9, result.get(0));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void decodingInvalidLeadInReplaceModeWritesSubstitutionCharacter() throws Exception {
|
||||
UTF8CodePointDecoder decoder = new UTF8CodePointDecoder(CodingErrorAction.REPLACE);
|
||||
ByteBuffer utf8BytesIn = ByteBuffer.wrap(new byte[] { (byte)0xF8 });
|
||||
IntBuffer codePointsOut = IntBuffer.allocate(1);
|
||||
IntBuffer result = decoder.decodeCodePointsFromBuffer(utf8BytesIn, codePointsOut, true);
|
||||
result.flip();
|
||||
assertEquals(1, result.remaining());
|
||||
assertEquals(0xFFFD, result.get(0));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void decodingInvalidLeadInReportModeThrows() throws Exception {
|
||||
UTF8CodePointDecoder decoder = new UTF8CodePointDecoder(CodingErrorAction.REPORT);
|
||||
ByteBuffer utf8BytesIn = ByteBuffer.wrap(new byte[] { (byte)0xF8 });
|
||||
IntBuffer codePointsOut = IntBuffer.allocate(1);
|
||||
thrown.expect(CharacterCodingException.class);
|
||||
thrown.expectMessage("Invalid UTF-8 leading byte 0xF8");
|
||||
decoder.decodeCodePointsFromBuffer(utf8BytesIn, codePointsOut, true);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void decodingInvalidTrailInReplaceModeWritesSubstitutionCharacter() throws Exception {
|
||||
UTF8CodePointDecoder decoder = new UTF8CodePointDecoder(CodingErrorAction.REPLACE);
|
||||
ByteBuffer utf8BytesIn = ByteBuffer.wrap(new byte[] { (byte)0xC0, (byte)0xC0 });
|
||||
IntBuffer codePointsOut = IntBuffer.allocate(1);
|
||||
IntBuffer result = decoder.decodeCodePointsFromBuffer(utf8BytesIn, codePointsOut, true);
|
||||
result.flip();
|
||||
assertEquals(1, result.remaining());
|
||||
assertEquals(0xFFFD, result.get(0));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void decodingInvalidTrailInReportModeThrows() throws Exception {
|
||||
UTF8CodePointDecoder decoder = new UTF8CodePointDecoder(CodingErrorAction.REPORT);
|
||||
ByteBuffer utf8BytesIn = ByteBuffer.wrap(new byte[] { (byte)0xC0, (byte)0xC0 });
|
||||
IntBuffer codePointsOut = IntBuffer.allocate(1);
|
||||
thrown.expect(CharacterCodingException.class);
|
||||
thrown.expectMessage("Invalid UTF-8 trailing byte 0xC0");
|
||||
decoder.decodeCodePointsFromBuffer(utf8BytesIn, codePointsOut, true);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void decodingNonShortestFormInReplaceModeWritesSubstitutionCharacter() throws Exception {
|
||||
UTF8CodePointDecoder decoder = new UTF8CodePointDecoder(CodingErrorAction.REPLACE);
|
||||
// 0xC1 0x9C would decode to \ (U+005C) if we didn't have this check
|
||||
ByteBuffer utf8BytesIn = ByteBuffer.wrap(new byte[] { (byte)0xC1, (byte)0x9C });
|
||||
IntBuffer codePointsOut = IntBuffer.allocate(1);
|
||||
IntBuffer result = decoder.decodeCodePointsFromBuffer(utf8BytesIn, codePointsOut, true);
|
||||
result.flip();
|
||||
assertEquals(1, result.remaining());
|
||||
assertEquals(0xFFFD, result.get(0));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void decodingNonShortestFormInReportModeThrows() throws Exception {
|
||||
UTF8CodePointDecoder decoder = new UTF8CodePointDecoder(CodingErrorAction.REPORT);
|
||||
// 0xC1 0x9C would decode to \ (U+005C) if we didn't have this check
|
||||
ByteBuffer utf8BytesIn = ByteBuffer.wrap(new byte[] { (byte)0xC1, (byte)0x9C });
|
||||
IntBuffer codePointsOut = IntBuffer.allocate(1);
|
||||
thrown.expect(CharacterCodingException.class);
|
||||
thrown.expectMessage("Code point 92 is out of expected range 128..2047");
|
||||
decoder.decodeCodePointsFromBuffer(utf8BytesIn, codePointsOut, true);
|
||||
}
|
||||
}
|
|
@ -0,0 +1,275 @@
|
|||
/*
|
||||
* Copyright (c) 2012-2017 The ANTLR Project. All rights reserved.
|
||||
* Use of this file is governed by the BSD 3-clause license that
|
||||
* can be found in the LICENSE.txt file in the project root.
|
||||
*/
|
||||
package org.antlr.v4.runtime;
|
||||
|
||||
import org.antlr.v4.runtime.misc.Interval;
|
||||
|
||||
import java.nio.ByteBuffer;
|
||||
import java.nio.IntBuffer;
|
||||
import java.nio.charset.CharacterCodingException;
|
||||
import java.nio.charset.CodingErrorAction;
|
||||
|
||||
/**
 * Decodes UTF-8 bytes directly to Unicode code points, stored in an
 * {@link IntBuffer}.
 *
 * <p>Unlike {@link java.nio.charset.CharsetDecoder}, this does not use
 * UTF-16 as an intermediate representation, so this optimizes the common
 * case of decoding a UTF-8 file for parsing as Unicode code points.
 *
 * <p>An instance remembers a partially decoded sequence between calls and
 * performs no synchronization, so it should be used for one input at a time.
 */
public final class UTF8CodePointDecoder {
    private static final int SUBSTITUTION_CHARACTER = 0xFFFD;
    private static final byte NVAL = (byte) 0xFF;

    // Table mapping a UTF-8 leading byte to the number of trailing bytes
    // that follow it, or NVAL when the byte cannot start a sequence.
    private static final byte[] UTF8_LEADING_BYTE_LENGTHS = buildLeadingByteLengths();

    // Smallest and largest code point that may legally be encoded with a
    // given number of trailing bytes (the array index).
    private static final int[] UTF8_MIN_CODE_POINTS = { 0x00, 0x80, 0x800, 0x10000 };
    private static final int[] UTF8_MAX_CODE_POINTS = { 0x7F, 0x7FF, 0xFFFF, 0x10FFFF };

    private final CodingErrorAction decodingErrorAction;

    // Trailing bytes still to be read for the current sequence, or -1 when
    // no sequence is in progress.
    private int decodingTrailBytesNeeded;

    // Bits of the code point accumulated so far, or -1 when idle.
    private int decodingCurrentCodePoint;

    // Total number of trailing bytes of the current sequence (index into
    // the range tables), or -1 when idle.
    private int decodingSequenceTrailCount;

    /**
     * Constructs a new {@link UTF8CodePointDecoder} with a specified
     * {@link CodingErrorAction} to handle invalid UTF-8 sequences.
     *
     * @param decodingErrorAction {@link CodingErrorAction#REPLACE} writes
     *        U+FFFD for each malformed sequence,
     *        {@link CodingErrorAction#REPORT} throws; any other action
     *        drops malformed sequences without output
     */
    public UTF8CodePointDecoder(CodingErrorAction decodingErrorAction) {
        this.decodingErrorAction = decodingErrorAction;
        reset();
    }

    /**
     * Resets the state in this {@link UTF8CodePointDecoder}, preparing it
     * for use with a new input buffer.
     */
    public void reset() {
        this.decodingTrailBytesNeeded = -1;
        this.decodingCurrentCodePoint = -1;
        this.decodingSequenceTrailCount = -1;
    }

    /**
     * Decodes as many UTF-8 bytes as possible from {@code utf8BytesIn},
     * writing the result to {@code codePointsOut}.
     *
     * <p>If you have more bytes to decode, set {@code endOfInput} to
     * {@code false} and call this method again once more bytes are
     * available. A sequence whose trailing bytes have not all arrived yet
     * is left pending: its leading byte is consumed and remembered, its
     * trailing bytes stay in {@code utf8BytesIn}.
     *
     * <p>If there are no more bytes available, make sure to call this
     * setting {@code endOfInput} to {@code true} so that any unterminated
     * UTF-8 sequence at the end of the input is handled. In that case the
     * dangling bytes are consumed and the decoder state is reset.
     *
     * <p>If {@code codePointsOut} is not large enough to store the result,
     * a new, larger buffer is allocated and returned (the old buffer is
     * flipped while its contents are copied). Otherwise, returns
     * {@code codePointsOut}.
     *
     * <p>After returning, the {@link ByteBuffer#position position} of
     * {@code utf8BytesIn} is moved forward to reflect the bytes consumed,
     * and the {@link IntBuffer#position position} of the result is moved
     * forward to reflect the code points written.
     *
     * <p>The {@link IntBuffer#limit limit} of the result is not changed;
     * once the input is exhausted, call {@link IntBuffer#flip flip} on the
     * result to prepare it for reading.
     *
     * @param utf8BytesIn UTF-8 encoded input
     * @param codePointsOut buffer receiving decoded code points
     * @param endOfInput {@code true} if no further input will follow
     * @return the buffer holding the decoded code points
     * @throws CharacterCodingException if the input is malformed and the
     *         error action is {@link CodingErrorAction#REPORT}
     */
    public IntBuffer decodeCodePointsFromBuffer(
        ByteBuffer utf8BytesIn,
        IntBuffer codePointsOut,
        boolean endOfInput
    ) throws CharacterCodingException {
        while (utf8BytesIn.hasRemaining()) {
            if (decodingTrailBytesNeeded == -1) {
                // Start a new UTF-8 sequence by checking the leading byte.
                byte leadingByte = utf8BytesIn.get();
                if (!decodeLeadingByte(leadingByte)) {
                    codePointsOut = handleDecodeError(
                        String.format("Invalid UTF-8 leading byte 0x%02X", leadingByte),
                        codePointsOut);
                    reset();
                    continue;
                }
            }
            assert decodingTrailBytesNeeded != -1;
            if (utf8BytesIn.remaining() < decodingTrailBytesNeeded) {
                // The caller will have to call us back with more bytes.
                break;
            }
            // Now we know the input buffer has enough bytes to decode
            // the entire sequence.
            boolean sequenceIsWellFormed = true;
            while (decodingTrailBytesNeeded > 0) {
                byte trailingByte = utf8BytesIn.get();
                decodingTrailBytesNeeded--;
                if (!decodeTrailingByte(trailingByte)) {
                    codePointsOut = handleDecodeError(
                        String.format("Invalid UTF-8 trailing byte 0x%02X", trailingByte),
                        codePointsOut);
                    // Skip past any remaining trailing bytes in the sequence.
                    utf8BytesIn.position(utf8BytesIn.position() + decodingTrailBytesNeeded);
                    sequenceIsWellFormed = false;
                    break;
                }
            }
            if (sequenceIsWellFormed) {
                codePointsOut = appendValidatedCodePoint(
                    decodingCurrentCodePoint,
                    decodingSequenceTrailCount,
                    codePointsOut);
            }
            reset();
        }
        if (endOfInput && decodingTrailBytesNeeded != -1) {
            // The input ended in the middle of a sequence. Consume the
            // dangling bytes and clear the partial state before reporting,
            // so the decoder is not left half-way through a sequence.
            utf8BytesIn.position(utf8BytesIn.limit());
            reset();
            codePointsOut = handleDecodeError(
                "Unterminated UTF-8 sequence at end of bytes",
                codePointsOut);
        }
        return codePointsOut;
    }

    /**
     * Builds the 256-entry leading-byte table: the number of trailing bytes
     * announced by each possible leading byte, or {@code NVAL} for bytes
     * that cannot start a sequence.
     */
    private static byte[] buildLeadingByteLengths() {
        byte[] lengths = new byte[256];
        for (int b = 0x00; b <= 0xFF; b++) {
            byte trailCount;
            if (b <= 0x7F) {
                trailCount = 0;        // [0x00, 0x7F] -> 0 trailing bytes
            } else if (b <= 0xBF) {
                trailCount = NVAL;     // [0x80, 0xBF] -> invalid leading byte
            } else if (b <= 0xDF) {
                trailCount = 1;        // [0xC0, 0xDF] -> one trailing byte
            } else if (b <= 0xEF) {
                trailCount = 2;        // [0xE0, 0xEF] -> two trailing bytes
            } else if (b <= 0xF7) {
                trailCount = 3;        // [0xF0, 0xF7] -> three trailing bytes
            } else {
                trailCount = NVAL;     // [0xF8, 0xFF] -> invalid leading byte
            }
            lengths[b] = trailCount;
        }
        return lengths;
    }

    /**
     * Starts a new sequence from {@code leadingByte}, recording how many
     * trailing bytes are expected and the payload bits of the leading byte.
     *
     * @return {@code false} if the byte cannot start a UTF-8 sequence
     */
    private boolean decodeLeadingByte(byte leadingByte) {
        // Mask to 8 bits: Java sign-extends a byte when widening to int,
        // which would otherwise produce a negative table index.
        int trailCount = UTF8_LEADING_BYTE_LENGTHS[leadingByte & 0xFF];
        decodingTrailBytesNeeded = trailCount;
        switch (trailCount) {
            case 0:
                // ASCII: the byte is the code point.
                decodingCurrentCodePoint = leadingByte;
                break;
            case 1:
            case 2:
            case 3:
                // The leading byte carries 5, 4 or 3 payload bits.
                decodingCurrentCodePoint = leadingByte & (0b00111111 >> trailCount);
                break;
            default:
                return false;
        }
        decodingSequenceTrailCount = trailCount;
        return true;
    }

    /**
     * Folds the six payload bits of {@code trailingByte} into the code
     * point being accumulated.
     *
     * @return {@code false} if the byte is not of the form {@code 10xxxxxx}
     */
    private boolean decodeTrailingByte(byte trailingByte) {
        if ((trailingByte & 0xC0) != 0x80) {
            return false;
        }
        decodingCurrentCodePoint = (decodingCurrentCodePoint << 6) | (trailingByte & 0x3F);
        return true;
    }

    /**
     * Appends {@code codePoint} after checking that it is legal for a
     * sequence with {@code trailCount} trailing bytes; otherwise applies
     * the configured error action.
     */
    private IntBuffer appendValidatedCodePoint(
        int codePoint,
        int trailCount,
        IntBuffer codePointsOut
    ) throws CharacterCodingException {
        assert trailCount >= 0;

        int minCodePoint = UTF8_MIN_CODE_POINTS[trailCount];
        int maxCodePoint = UTF8_MAX_CODE_POINTS[trailCount];

        // Security check: UTF-8 must represent code points using their
        // shortest encoded form. This also rejects values above U+10FFFF.
        if (codePoint < minCodePoint || codePoint > maxCodePoint) {
            return handleDecodeError(
                String.format(
                    "Code point %d is out of expected range %s",
                    codePoint,
                    minCodePoint + ".." + maxCodePoint),
                codePointsOut);
        }

        // Surrogates (U+D800..U+DFFF) are reserved for UTF-16 and are not
        // valid in well-formed UTF-8.
        if (codePoint >= Character.MIN_SURROGATE && codePoint <= Character.MAX_SURROGATE) {
            return handleDecodeError(
                String.format(
                    "Code point U+%04X is a surrogate and is not valid in UTF-8",
                    codePoint),
                codePointsOut);
        }

        return appendCodePoint(codePoint, codePointsOut);
    }

    /**
     * Writes {@code codePoint} to {@code codePointsOut}, first moving the
     * contents to a larger buffer if there is no room left.
     *
     * @return the buffer that now holds the code point
     */
    private IntBuffer appendCodePoint(int codePoint, IntBuffer codePointsOut) {
        if (!codePointsOut.hasRemaining()) {
            // Double the capacity, but always make room for at least one
            // element: doubling a zero-capacity buffer would stay at zero.
            int oldCapacity = codePointsOut.capacity();
            int newCapacity = (oldCapacity == 0) ? 1 : oldCapacity * 2;
            IntBuffer newBuffer = IntBuffer.allocate(newCapacity);
            codePointsOut.flip();
            newBuffer.put(codePointsOut);
            codePointsOut = newBuffer;
        }
        codePointsOut.put(codePoint);
        return codePointsOut;
    }

    /**
     * Applies the configured error action: writes U+FFFD for
     * {@code REPLACE}, throws for {@code REPORT}, and does nothing for any
     * other action.
     *
     * @throws CharacterCodingException carrying {@code error} as its
     *         message when the action is {@code REPORT}
     */
    private IntBuffer handleDecodeError(
        final String error,
        IntBuffer codePointsOut
    ) throws CharacterCodingException {
        if (decodingErrorAction == CodingErrorAction.REPLACE) {
            codePointsOut = appendCodePoint(SUBSTITUTION_CHARACTER, codePointsOut);
        } else if (decodingErrorAction == CodingErrorAction.REPORT) {
            // CharacterCodingException has no message constructor, so the
            // message is supplied by overriding getMessage().
            throw new CharacterCodingException() {
                @Override
                public String getMessage() {
                    return error;
                }
            };
        }
        return codePointsOut;
    }
}
|
Loading…
Reference in New Issue