Merge pull request #1627 from bhamiltoncx/utf8-code-point-decoder

New class UTF8CodePointDecoder
2017-01-29 12:54:58 -08:00 · 2017-01-29 12:54:58 -08:00 · bd8a367398
parent 3c924384eb 558aa7b011
commit bd8a367398
2 changed files with 437 additions and 0 deletions
--- a/runtime-testsuite/test/org/antlr/v4/test/runtime/java/TestUTF8CodePointDecoder.java
+++ b/runtime-testsuite/test/org/antlr/v4/test/runtime/java/TestUTF8CodePointDecoder.java
@ -0,0 +1,162 @@
+/*
+ * Copyright (c) 2012-2017 The ANTLR Project. All rights reserved.
+ * Use of this file is governed by the BSD 3-clause license that
+ * can be found in the LICENSE.txt file in the project root.
+ */
+package org.antlr.v4.test.runtime.java;
+
+import static org.junit.Assert.assertEquals;
+
+import java.io.IOException;
+import java.nio.ByteBuffer;
+import java.nio.IntBuffer;
+
+import java.nio.charset.CharacterCodingException;
+import java.nio.charset.CodingErrorAction;
+import java.nio.charset.StandardCharsets;
+
+import org.antlr.v4.runtime.UTF8CodePointDecoder;
+import org.junit.Rule;
+import org.junit.Test;
+import org.junit.rules.ExpectedException;
+
+public class TestUTF8CodePointDecoder {
+	@Rule
+	public ExpectedException thrown = ExpectedException.none();
+
+	@Test
+	public void decodeEmptyByteBufferWritesNothing() throws Exception {
+		UTF8CodePointDecoder decoder = new UTF8CodePointDecoder(CodingErrorAction.REPLACE);
+		ByteBuffer utf8BytesIn = ByteBuffer.allocate(0);
+		IntBuffer codePointsOut = IntBuffer.allocate(0);
+		IntBuffer result = decoder.decodeCodePointsFromBuffer(
+				utf8BytesIn,
+				codePointsOut,
+				true);
+		result.flip();
+		assertEquals(0, result.remaining());
+	}
+
+	@Test
+	public void decodeLatinByteBufferWritesCodePoint() throws Exception {
+		UTF8CodePointDecoder decoder = new UTF8CodePointDecoder(CodingErrorAction.REPLACE);
+		ByteBuffer utf8BytesIn = StandardCharsets.UTF_8.encode("X");
+		IntBuffer codePointsOut = IntBuffer.allocate(1);
+		IntBuffer result = decoder.decodeCodePointsFromBuffer(
+				utf8BytesIn,
+				codePointsOut,
+				true);
+		result.flip();
+		assertEquals(1, result.remaining());
+		assertEquals('X', result.get(0));
+	}
+
+	@Test
+	public void decodeCyrillicByteBufferWritesCodePoint() throws Exception {
+		UTF8CodePointDecoder decoder = new UTF8CodePointDecoder(CodingErrorAction.REPLACE);
+		ByteBuffer utf8BytesIn = StandardCharsets.UTF_8.encode("\u042F");
+		IntBuffer codePointsOut = IntBuffer.allocate(1);
+		IntBuffer result = decoder.decodeCodePointsFromBuffer(
+				utf8BytesIn,
+				codePointsOut,
+				true);
+		result.flip();
+		assertEquals(1, result.remaining());
+		assertEquals(0x042F, result.get(0));
+	}
+
+	@Test
+	public void decodeCJKByteBufferWritesCodePoint() throws Exception {
+		UTF8CodePointDecoder decoder = new UTF8CodePointDecoder(CodingErrorAction.REPLACE);
+		ByteBuffer utf8BytesIn = StandardCharsets.UTF_8.encode("\u611B");
+		IntBuffer codePointsOut = IntBuffer.allocate(1);
+		IntBuffer result = decoder.decodeCodePointsFromBuffer(
+				utf8BytesIn,
+				codePointsOut,
+				true);
+		result.flip();
+		assertEquals(1, result.remaining());
+		assertEquals(0x611B, result.get(0));
+	}
+
+	@Test
+	public void decodeEmojiByteBufferWritesCodePoint() throws Exception {
+		UTF8CodePointDecoder decoder = new UTF8CodePointDecoder(CodingErrorAction.REPLACE);
+		ByteBuffer utf8BytesIn = StandardCharsets.UTF_8.encode(
+				new StringBuilder().appendCodePoint(0x1F4A9).toString()
+		);
+		IntBuffer codePointsOut = IntBuffer.allocate(1);
+		IntBuffer result = decoder.decodeCodePointsFromBuffer(
+				utf8BytesIn,
+				codePointsOut,
+				true);
+		result.flip();
+		assertEquals(1, result.remaining());
+		assertEquals(0x1F4A9, result.get(0));
+	}
+
+	@Test
+	public void decodingInvalidLeadInReplaceModeWritesSubstitutionCharacter() throws Exception {
+		UTF8CodePointDecoder decoder = new UTF8CodePointDecoder(CodingErrorAction.REPLACE);
+		ByteBuffer utf8BytesIn = ByteBuffer.wrap(new byte[] { (byte)0xF8 });
+		IntBuffer codePointsOut = IntBuffer.allocate(1);
+		IntBuffer result = decoder.decodeCodePointsFromBuffer(utf8BytesIn, codePointsOut, true);
+		result.flip();
+		assertEquals(1, result.remaining());
+		assertEquals(0xFFFD, result.get(0));
+	}
+
+	@Test
+	public void decodingInvalidLeadInReportModeThrows() throws Exception {
+		UTF8CodePointDecoder decoder = new UTF8CodePointDecoder(CodingErrorAction.REPORT);
+		ByteBuffer utf8BytesIn = ByteBuffer.wrap(new byte[] { (byte)0xF8 });
+		IntBuffer codePointsOut = IntBuffer.allocate(1);
+		thrown.expect(CharacterCodingException.class);
+		thrown.expectMessage("Invalid UTF-8 leading byte 0xF8");
+		decoder.decodeCodePointsFromBuffer(utf8BytesIn, codePointsOut, true);
+	}
+
+	@Test
+	public void decodingInvalidTrailInReplaceModeWritesSubstitutionCharacter() throws Exception {
+		UTF8CodePointDecoder decoder = new UTF8CodePointDecoder(CodingErrorAction.REPLACE);
+		ByteBuffer utf8BytesIn = ByteBuffer.wrap(new byte[] { (byte)0xC0, (byte)0xC0 });
+		IntBuffer codePointsOut = IntBuffer.allocate(1);
+		IntBuffer result = decoder.decodeCodePointsFromBuffer(utf8BytesIn, codePointsOut, true);
+		result.flip();
+		assertEquals(1, result.remaining());
+		assertEquals(0xFFFD, result.get(0));
+	}
+
+	@Test
+	public void decodingInvalidTrailInReportModeThrows() throws Exception {
+		UTF8CodePointDecoder decoder = new UTF8CodePointDecoder(CodingErrorAction.REPORT);
+		ByteBuffer utf8BytesIn = ByteBuffer.wrap(new byte[] { (byte)0xC0, (byte)0xC0 });
+		IntBuffer codePointsOut = IntBuffer.allocate(1);
+		thrown.expect(CharacterCodingException.class);
+		thrown.expectMessage("Invalid UTF-8 trailing byte 0xC0");
+		decoder.decodeCodePointsFromBuffer(utf8BytesIn, codePointsOut, true);
+	}
+
+	@Test
+	public void decodingNonShortestFormInReplaceModeWritesSubstitutionCharacter() throws Exception {
+		UTF8CodePointDecoder decoder = new UTF8CodePointDecoder(CodingErrorAction.REPLACE);
+		// 0xC1 0x9C would decode to \ (U+005C) if we didn't have this check
+		ByteBuffer utf8BytesIn = ByteBuffer.wrap(new byte[] { (byte)0xC1, (byte)0x9C });
+		IntBuffer codePointsOut = IntBuffer.allocate(1);
+		IntBuffer result = decoder.decodeCodePointsFromBuffer(utf8BytesIn, codePointsOut, true);
+		result.flip();
+		assertEquals(1, result.remaining());
+		assertEquals(0xFFFD, result.get(0));
+	}
+
+	@Test
+	public void decodingNonShortestFormInReportModeThrows() throws Exception {
+		UTF8CodePointDecoder decoder = new UTF8CodePointDecoder(CodingErrorAction.REPORT);
+		// 0xC1 0x9C would decode to \ (U+005C) if we didn't have this check
+		ByteBuffer utf8BytesIn = ByteBuffer.wrap(new byte[] { (byte)0xC1, (byte)0x9C });
+		IntBuffer codePointsOut = IntBuffer.allocate(1);
+		thrown.expect(CharacterCodingException.class);
+		thrown.expectMessage("Code point 92 is out of expected range 128..2047");
+		decoder.decodeCodePointsFromBuffer(utf8BytesIn, codePointsOut, true);
+	}
+}
--- a/runtime/Java/src/org/antlr/v4/runtime/UTF8CodePointDecoder.java
+++ b/runtime/Java/src/org/antlr/v4/runtime/UTF8CodePointDecoder.java
@ -0,0 +1,275 @@
+/*
+ * Copyright (c) 2012-2017 The ANTLR Project. All rights reserved.
+ * Use of this file is governed by the BSD 3-clause license that
+ * can be found in the LICENSE.txt file in the project root.
+ */
+package org.antlr.v4.runtime;
+
+import org.antlr.v4.runtime.misc.Interval;
+
+import java.nio.ByteBuffer;
+import java.nio.IntBuffer;
+import java.nio.charset.CharacterCodingException;
+import java.nio.charset.CodingErrorAction;
+
+/**
+ * Decodes UTF-8 bytes directly to Unicode code points, stored in an
+ * {@link IntBuffer}.
+ *
+ * Unlike {@link CharsetDecoder}, this does not use UTF-16 as an
+ * intermediate representation, so this optimizes the common case of
+ * decoding a UTF-8 file for parsing as Unicode code points.
+ */
+public final class UTF8CodePointDecoder {
+	private static final int SUBSTITUTION_CHARACTER = 0xFFFD;
+	private static final byte NVAL = (byte) 0xFF;
+
+	// Table mapping UTF-8 leading byte to the length of the trailing
+	// sequence.
+	private static final byte[] UTF8_LEADING_BYTE_LENGTHS = new byte[] {
+		// [0x00, 0x7F] -> 0 trailing bytes
+		0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+		0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+		0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+		0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+		0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+		0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+		0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+		0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+		0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+		0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+		0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+		0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+		0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+		0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+		0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+		0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+
+		// [0x80, 0xBF] -> invalid leading byte
+		NVAL, NVAL, NVAL, NVAL, NVAL, NVAL, NVAL, NVAL,
+		NVAL, NVAL, NVAL, NVAL, NVAL, NVAL, NVAL, NVAL,
+		NVAL, NVAL, NVAL, NVAL, NVAL, NVAL, NVAL, NVAL,
+		NVAL, NVAL, NVAL, NVAL, NVAL, NVAL, NVAL, NVAL,
+		NVAL, NVAL, NVAL, NVAL, NVAL, NVAL, NVAL, NVAL,
+		NVAL, NVAL, NVAL, NVAL, NVAL, NVAL, NVAL, NVAL,
+		NVAL, NVAL, NVAL, NVAL, NVAL, NVAL, NVAL, NVAL,
+		NVAL, NVAL, NVAL, NVAL, NVAL, NVAL, NVAL, NVAL,
+
+		// [0xC0, 0xDF] -> one trailing byte
+		0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
+		0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
+		0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
+		0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
+
+		// [0xE0, 0xEF] -> two trailing bytes
+		0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02,
+		0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02,
+
+		// [0xF0, 0xF7] -> three trailing bytes
+		0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03,
+
+		// [0xF8, 0xFF] -> invalid leading sequence
+		NVAL, NVAL, NVAL, NVAL, NVAL, NVAL, NVAL, NVAL
+	};
+
+	// Table mapping UTF-8 sequence length to valid Unicode code point
+	// ranges for that sequence length.
+	private static final Interval[] UTF8_VALID_INTERVALS = new Interval[] {
+		Interval.of(0x00, 0x7F),
+		Interval.of(0x80, 0x7FF),
+		Interval.of(0x800, 0xFFFF),
+		Interval.of(0x10000, 0x10FFFF)
+	};
+
+	private final CodingErrorAction decodingErrorAction;
+	private int decodingTrailBytesNeeded;
+	private int decodingCurrentCodePoint;
+	private Interval validDecodedCodePointRange;
+
+	/**
+	 * Constructs a new {@link UTF8CodePointDecoder} with a specified
+	 * {@link CodingErrorAction} to handle invalid UTF-8 sequences.
+	 */
+	public UTF8CodePointDecoder(CodingErrorAction decodingErrorAction) {
+		this.decodingErrorAction = decodingErrorAction;
+		reset();
+	}
+
+	/**
+	 * Resets the state in this {@link UTF8CodePointDecoder}, preparing it
+	 * for use with a new input buffer.
+	 */
+	public void reset() {
+		this.decodingTrailBytesNeeded = -1;
+		this.decodingCurrentCodePoint = -1;
+		this.validDecodedCodePointRange = Interval.INVALID;
+	}
+
+	/**
+	 * Decodes as many UTF-8 bytes as possible from {@code utf8BytesIn},
+	 * writing the result to {@code codePointsOut}.
+	 *
+	 * If you have more bytes to decode, set {@code endOfInput} to
+	 * {@code false} and call this method again once more bytes
+	 * are available.
+	 *
+	 * If there are no more bytes available, make sure to call this
+	 * setting {@code endOfInput} to {@code true} so that any invalid
+	 * UTF-8 sequence at the end of the input is handled.
+	 *
+	 * If {@code codePointsOut} is not large enough to store the result,
+	 * a new buffer is allocated and returned. Otherwise, returns
+	 * {@code codePointsOut}.
+	 *
+	 * After returning, the {@link ByteBuffer#position position} of
+	 * {@code utf8BytesIn} is moved forward to reflect the bytes consumed,
+	 * and the {@link IntBuffer#position position} of the result
+	 * is moved forward to reflect the code points written.
+	 *
+	 * The {@link IntBuffer#limit limit} of the result is not changed,
+	 * so if this is the end of the input, you will want to set the
+	 * limit to the {@link IntBuffer#position position}, then
+	 * {@link IntBuffer#flip flip} the result to prepare for reading.
+	 */
+	public IntBuffer decodeCodePointsFromBuffer(
+			ByteBuffer utf8BytesIn,
+			IntBuffer codePointsOut,
+			boolean endOfInput
+	) throws CharacterCodingException {
+		while (utf8BytesIn.hasRemaining()) {
+			if (decodingTrailBytesNeeded == -1) {
+				// Start a new UTF-8 sequence by checking the leading byte.
+				byte leadingByte = utf8BytesIn.get();
+				if (!decodeLeadingByte(leadingByte)) {
+					codePointsOut = handleDecodeError(
+						String.format("Invalid UTF-8 leading byte 0x%02X", leadingByte),
+						codePointsOut);
+					reset();
+					continue;
+				}
+			}
+			assert decodingTrailBytesNeeded != -1;
+			if (utf8BytesIn.remaining() < decodingTrailBytesNeeded) {
+				// The caller will have to call us back with more bytes.
+				break;
+			}
+			// Now we know the input buffer has enough bytes to decode
+			// the entire sequence.
+			while (decodingTrailBytesNeeded > 0) {
+				// Continue a multi-byte UTF-8 sequence by checking the next trailing byte.
+				byte trailingByte = utf8BytesIn.get();
+				decodingTrailBytesNeeded--;
+				if (!decodeTrailingByte(trailingByte)) {
+					codePointsOut = handleDecodeError(
+							String.format("Invalid UTF-8 trailing byte 0x%02X", trailingByte),
+							codePointsOut);
+					// Skip past any remaining trailing bytes in the sequence.
+					utf8BytesIn.position(utf8BytesIn.position() + decodingTrailBytesNeeded);
+					reset();
+					continue;
+				}
+			}
+			if (decodingTrailBytesNeeded == 0) {
+				codePointsOut = appendCodePointFromInterval(
+						decodingCurrentCodePoint,
+						validDecodedCodePointRange,
+						codePointsOut);
+				reset();
+				continue;
+			}
+		}
+		if (endOfInput) {
+			if (decodingTrailBytesNeeded != -1) {
+				codePointsOut = handleDecodeError(
+						"Unterminated UTF-8 sequence at end of bytes",
+						codePointsOut);
+			}
+		}
+		return codePointsOut;
+	}
+
+	private boolean decodeLeadingByte(byte leadingByte) {
+		// Be careful about Java silently widening (unsigned)
+		// byte to (signed) int and sign-extending here.
+		//
+		// We use binary AND liberally below to prevent widening.
+		int leadingByteIdx = leadingByte & 0xFF;
+		decodingTrailBytesNeeded = UTF8_LEADING_BYTE_LENGTHS[leadingByteIdx];
+		switch (decodingTrailBytesNeeded) {
+			case 0:
+				decodingCurrentCodePoint = leadingByte;
+				break;
+			case 1:
+			case 2:
+			case 3:
+				int mask = (0b00111111 >> decodingTrailBytesNeeded);
+				decodingCurrentCodePoint = leadingByte & mask;
+				break;
+			default:
+				return false;
+		}
+		validDecodedCodePointRange = UTF8_VALID_INTERVALS[decodingTrailBytesNeeded];
+		return true;
+	}
+
+	private boolean decodeTrailingByte(byte trailingByte) {
+		int trailingValue = (trailingByte & 0xFF) - 0x80;
+		if (trailingValue < 0x00 || trailingValue > 0x3F) {
+			return false;
+		} else {
+			decodingCurrentCodePoint = (decodingCurrentCodePoint << 6) | trailingValue;
+			return true;
+		}
+	}
+
+	private IntBuffer appendCodePointFromInterval(
+			int codePoint,
+			Interval validCodePointRange,
+			IntBuffer codePointsOut
+	) throws CharacterCodingException {
+		assert validCodePointRange != Interval.INVALID;
+
+		// Security check: UTF-8 must represent code points using their
+		// shortest encoded form.
+		if (codePoint < validCodePointRange.a ||
+			codePoint > validCodePointRange.b) {
+			return handleDecodeError(
+					String.format(
+							"Code point %d is out of expected range %s",
+							codePoint,
+							validCodePointRange),
+					codePointsOut);
+		} else {
+			return appendCodePoint(codePoint, codePointsOut);
+		}
+	}
+
+	private IntBuffer appendCodePoint(int codePoint, IntBuffer codePointsOut) {
+		if (!codePointsOut.hasRemaining()) {
+			// Grow the code point buffer size by 2.
+			IntBuffer newBuffer = IntBuffer.allocate(codePointsOut.capacity() * 2);
+			codePointsOut.flip();
+			newBuffer.put(codePointsOut);
+			codePointsOut = newBuffer;
+		}
+		codePointsOut.put(codePoint);
+		return codePointsOut;
+	}
+
+	private IntBuffer handleDecodeError(
+			final String error,
+			IntBuffer codePointsOut
+	) throws CharacterCodingException {
+		if (decodingErrorAction == CodingErrorAction.REPLACE) {
+			codePointsOut = appendCodePoint(SUBSTITUTION_CHARACTER, codePointsOut);
+		} else if (decodingErrorAction == CodingErrorAction.REPORT) {
+			throw new CharacterCodingException() {
+				@Override
+				public String getMessage() {
+					return error;
+				}
+			};
+		}
+		return codePointsOut;
+	}
+}