Merge pull request #1765 from bhamiltoncx/unicode-cleanup-and-doc

Tidy up CharStreams and add new doc/unicode.md
This commit is contained in:
Terence Parr 2017-03-16 17:03:06 -07:00 committed by GitHub
commit 1f6a329692
10 changed files with 415 additions and 126 deletions

View File

@ -82,7 +82,7 @@ Make sure to use two-stage parsing. See example in [bug report](https://github.c
```Java
CharStream input = new ANTLRFileStream(args[0]);
CharStream input = CharStreams.fromPath(Paths.get(args[0]));
ExprLexer lexer = new ExprLexer(input);
CommonTokenStream tokens = new CommonTokenStream(lexer);
ExprParser parser = new ExprParser(tokens);

View File

@ -30,7 +30,7 @@ public static ParseTree parse(String fileName,
throws IOException
{
final Grammar g = Grammar.load(combinedGrammarFileName);
LexerInterpreter lexEngine = g.createLexerInterpreter(new ANTLRFileStream(fileName));
LexerInterpreter lexEngine = g.createLexerInterpreter(CharStreams.fromPath(Paths.get(fileName)));
CommonTokenStream tokens = new CommonTokenStream(lexEngine);
ParserInterpreter parser = g.createParserInterpreter(tokens);
ParseTree t = parser.parse(g.getRule(startRule).index);
@ -58,7 +58,7 @@ public static ParseTree parse(String fileNameToParse,
{
final LexerGrammar lg = (LexerGrammar) Grammar.load(lexerGrammarFileName);
final Grammar pg = Grammar.load(parserGrammarFileName, lg);
ANTLRFileStream input = new ANTLRFileStream(fileNameToParse);
CharStream input = CharStreams.fromPath(Paths.get(fileNameToParse));
LexerInterpreter lexEngine = lg.createLexerInterpreter(input);
CommonTokenStream tokens = new CommonTokenStream(lexEngine);
ParserInterpreter parser = pg.createParserInterpreter(tokens);

68
doc/unicode.md Normal file
View File

@ -0,0 +1,68 @@
# Lexers and Unicode text
Until ANTLR 4.7, generated lexers only supported part of the Unicode standard
(code points up to `U+FFFF`).
With ANTLR 4.7 and later, lexers as well as all languages' runtimes
support the full range of Unicode code points up to `U+10FFFF`, as
long as the input `CharStream` is opened using `CharStreams.fromPath()`
or the equivalent method for your runtime's language.
# Unicode Code Points in Lexer Grammars
To refer to Unicode [code points](https://en.wikipedia.org/wiki/Code_point)
in lexer grammars, use the `\u` string escape. For example, to create
a lexer rule for a single Cyrillic character by creating a range from
`U+0400` to `U+04FF`:
```ANTLR
CYRILLIC : '\u0400'..'\u04FF' ;
```
Unicode literals larger than U+FFFF must use the extended `\u{12345}` syntax.
For example, to create a lexer rule for a selection of smiley faces
from the [Emoticons Unicode block](http://www.unicode.org/charts/PDF/U1F600.pdf):
```ANTLR
EMOTICONS : ('\u{1F600}' | '\u{1F602}' | '\u{1F615}') ;
```
Finally, lexer char sets can include Unicode properties:
```ANTLR
EMOJI : [\p{Emoji}] ;
JAPANESE : [\p{Script=Hiragana}\p{Script=Katakana}\p{Script=Han}] ;
NOT_CYRILLIC : [\P{Script=Cyrillic}] ;
```
See [lexer-rules.md](lexer-rules.md#lexer-rule-elements) for more detail on Unicode
escapes in lexer rules.
# CharStreams and UTF-8
If your lexer grammar contains code points larger than `U+FFFF`, your
lexer client code must open the file using `CharStreams.fromPath()` or
equivalent in your runtime's language, or input values larger than
`U+FFFF` will *not* match.
For backwards compatibility, the existing `ANTLRInputStream` and
`ANTLRFileStream` APIs only support Unicode code points up to `U+FFFF`.
The existing `TestRig` command-line interface supports all Unicode
code points.
# Example
If you have generated a lexer named `UnicodeLexer`:
```Java
// CharStreams.fromPath() reads the file eagerly and can fail with an
// IOException, so main must declare it for this example to compile.
public static void main(String[] args) throws IOException {
    // Open the input as UTF-8 so code points above U+FFFF are supported.
    CharStream charStream = CharStreams.fromPath(Paths.get(args[0]));
    Lexer lexer = new UnicodeLexer(charStream);
    CommonTokenStream tokens = new CommonTokenStream(lexer);
    // Force the lexer to tokenize the whole input before printing.
    tokens.fill();
    for (Token token : tokens.getTokens()) {
        System.out.println("Got token: " + token.toString());
    }
}
```

View File

@ -923,7 +923,7 @@ public class BaseJavaTest implements RuntimeTestSupport {
"\n" +
"public class Test {\n" +
" public static void main(String[] args) throws Exception {\n" +
" CharStream input = CharStreams.createWithUTF8(Paths.get(args[0]));\n" +
" CharStream input = CharStreams.fromPath(Paths.get(args[0]));\n" +
" <lexerName> lex = new <lexerName>(input);\n" +
" CommonTokenStream tokens = new CommonTokenStream(lex);\n" +
" <createParser>\n"+
@ -980,7 +980,7 @@ public class BaseJavaTest implements RuntimeTestSupport {
"\n" +
"public class Test {\n" +
" public static void main(String[] args) throws Exception {\n" +
" CharStream input = CharStreams.createWithUTF8(Paths.get(args[0]));\n" +
" CharStream input = CharStreams.fromPath(Paths.get(args[0]));\n" +
" <lexerName> lex = new <lexerName>(input);\n" +
" CommonTokenStream tokens = new CommonTokenStream(lex);\n" +
" tokens.fill();\n" +

View File

@ -10,6 +10,7 @@ import static org.junit.Assert.assertEquals;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.io.Reader;
import java.nio.channels.SeekableByteChannel;
import java.nio.charset.CharacterCodingException;
@ -20,6 +21,7 @@ import java.nio.file.Path;
import java.util.Arrays;
import org.antlr.v4.runtime.CharStream;
import org.antlr.v4.runtime.CharStreams;
import org.antlr.v4.runtime.CodePointCharStream;
@ -36,16 +38,16 @@ public class TestCharStreams {
public ExpectedException thrown = ExpectedException.none();
@Test
public void createWithBMPStringHasExpectedSize() {
CodePointCharStream s = CharStreams.createWithString("hello");
public void fromBMPStringHasExpectedSize() {
CharStream s = CharStreams.fromString("hello");
assertEquals(5, s.size());
assertEquals(0, s.index());
assertEquals("hello", s.toString());
}
@Test
public void createWithSMPStringHasExpectedSize() {
CodePointCharStream s = CharStreams.createWithString(
public void fromSMPStringHasExpectedSize() {
CharStream s = CharStreams.fromString(
"hello \uD83C\uDF0E");
assertEquals(7, s.size());
assertEquals(0, s.index());
@ -53,10 +55,10 @@ public class TestCharStreams {
}
@Test
public void createWithBMPUTF8PathHasExpectedSize() throws Exception {
public void fromBMPUTF8PathHasExpectedSize() throws Exception {
Path p = folder.newFile().toPath();
Files.write(p, "hello".getBytes(StandardCharsets.UTF_8));
CodePointCharStream s = CharStreams.createWithUTF8(p);
CharStream s = CharStreams.fromPath(p);
assertEquals(5, s.size());
assertEquals(0, s.index());
assertEquals("hello", s.toString());
@ -64,10 +66,10 @@ public class TestCharStreams {
}
@Test
public void createWithSMPUTF8PathHasExpectedSize() throws Exception {
public void fromSMPUTF8PathHasExpectedSize() throws Exception {
Path p = folder.newFile().toPath();
Files.write(p, "hello \uD83C\uDF0E".getBytes(StandardCharsets.UTF_8));
CodePointCharStream s = CharStreams.createWithUTF8(p);
CharStream s = CharStreams.fromPath(p);
assertEquals(7, s.size());
assertEquals(0, s.index());
assertEquals("hello \uD83C\uDF0E", s.toString());
@ -75,11 +77,11 @@ public class TestCharStreams {
}
@Test
public void createWithBMPUTF8InputStreamHasExpectedSize() throws Exception {
public void fromBMPUTF8InputStreamHasExpectedSize() throws Exception {
Path p = folder.newFile().toPath();
Files.write(p, "hello".getBytes(StandardCharsets.UTF_8));
try (InputStream is = Files.newInputStream(p)) {
CodePointCharStream s = CharStreams.createWithUTF8Stream(is);
CharStream s = CharStreams.fromStream(is);
assertEquals(5, s.size());
assertEquals(0, s.index());
assertEquals("hello", s.toString());
@ -87,11 +89,11 @@ public class TestCharStreams {
}
@Test
public void createWithSMPUTF8InputStreamHasExpectedSize() throws Exception {
public void fromSMPUTF8InputStreamHasExpectedSize() throws Exception {
Path p = folder.newFile().toPath();
Files.write(p, "hello \uD83C\uDF0E".getBytes(StandardCharsets.UTF_8));
try (InputStream is = Files.newInputStream(p)) {
CodePointCharStream s = CharStreams.createWithUTF8Stream(is);
CharStream s = CharStreams.fromStream(is);
assertEquals(7, s.size());
assertEquals(0, s.index());
assertEquals("hello \uD83C\uDF0E", s.toString());
@ -99,11 +101,11 @@ public class TestCharStreams {
}
@Test
public void createWithBMPUTF8ChannelHasExpectedSize() throws Exception {
public void fromBMPUTF8ChannelHasExpectedSize() throws Exception {
Path p = folder.newFile().toPath();
Files.write(p, "hello".getBytes(StandardCharsets.UTF_8));
try (SeekableByteChannel c = Files.newByteChannel(p)) {
CodePointCharStream s = CharStreams.createWithUTF8Channel(
CharStream s = CharStreams.fromChannel(
c, 4096, CodingErrorAction.REPLACE, "foo");
assertEquals(5, s.size());
assertEquals(0, s.index());
@ -113,11 +115,11 @@ public class TestCharStreams {
}
@Test
public void createWithSMPUTF8ChannelHasExpectedSize() throws Exception {
public void fromSMPUTF8ChannelHasExpectedSize() throws Exception {
Path p = folder.newFile().toPath();
Files.write(p, "hello \uD83C\uDF0E".getBytes(StandardCharsets.UTF_8));
try (SeekableByteChannel c = Files.newByteChannel(p)) {
CodePointCharStream s = CharStreams.createWithUTF8Channel(
CharStream s = CharStreams.fromChannel(
c, 4096, CodingErrorAction.REPLACE, "foo");
assertEquals(7, s.size());
assertEquals(0, s.index());
@ -127,13 +129,13 @@ public class TestCharStreams {
}
@Test
public void createWithInvalidUTF8BytesChannelReplacesWithSubstCharInReplaceMode()
public void fromInvalidUTF8BytesChannelReplacesWithSubstCharInReplaceMode()
throws Exception {
Path p = folder.newFile().toPath();
byte[] toWrite = new byte[] { (byte)0xCA, (byte)0xFE, (byte)0xFE, (byte)0xED };
Files.write(p, toWrite);
try (SeekableByteChannel c = Files.newByteChannel(p)) {
CodePointCharStream s = CharStreams.createWithUTF8Channel(
CharStream s = CharStreams.fromChannel(
c, 4096, CodingErrorAction.REPLACE, "foo");
assertEquals(3, s.size());
assertEquals(0, s.index());
@ -142,22 +144,22 @@ public class TestCharStreams {
}
@Test
public void createWithInvalidUTF8BytesThrowsInReportMode() throws Exception {
public void fromInvalidUTF8BytesThrowsInReportMode() throws Exception {
Path p = folder.newFile().toPath();
byte[] toWrite = new byte[] { (byte)0xCA, (byte)0xFE };
Files.write(p, toWrite);
try (SeekableByteChannel c = Files.newByteChannel(p)) {
thrown.expect(CharacterCodingException.class);
CharStreams.createWithUTF8Channel(c, 4096, CodingErrorAction.REPORT, "foo");
CharStreams.fromChannel(c, 4096, CodingErrorAction.REPORT, "foo");
}
}
@Test
public void createWithSMPUTF8SequenceStraddlingBufferBoundary() throws Exception {
public void fromSMPUTF8SequenceStraddlingBufferBoundary() throws Exception {
Path p = folder.newFile().toPath();
Files.write(p, "hello \uD83C\uDF0E".getBytes(StandardCharsets.UTF_8));
try (SeekableByteChannel c = Files.newByteChannel(p)) {
CodePointCharStream s = CharStreams.createWithUTF8Channel(
CharStream s = CharStreams.fromChannel(
c,
// Note this buffer size ensures the SMP code point
// straddles the boundary of two buffers
@ -169,4 +171,40 @@ public class TestCharStreams {
assertEquals("hello \uD83C\uDF0E", s.toString());
}
}
/**
 * A UTF-8 file opened by file name yields one symbol per code point
 * (the trailing supplementary character counts once) and reports the
 * file name as its source name.
 */
@Test
public void fromFileName() throws Exception {
    final String text = "hello \uD83C\uDF0E";
    final Path file = folder.newFile().toPath();
    Files.write(file, text.getBytes(StandardCharsets.UTF_8));
    final String fileName = file.toString();

    final CharStream stream = CharStreams.fromFileName(fileName);

    assertEquals(7, stream.size());
    assertEquals(0, stream.index());
    assertEquals(text, stream.toString());
    assertEquals(fileName, stream.getSourceName());
}
/**
 * A Latin-1 encoded file opened by file name with an explicit charset
 * is decoded with that charset and reports the file name as its source
 * name.
 */
@Test
public void fromFileNameWithLatin1() throws Exception {
    final String text = "hello \u00CA\u00FE";
    final Path file = folder.newFile().toPath();
    Files.write(file, text.getBytes(StandardCharsets.ISO_8859_1));
    final String fileName = file.toString();

    final CharStream stream =
        CharStreams.fromFileName(fileName, StandardCharsets.ISO_8859_1);

    assertEquals(8, stream.size());
    assertEquals(0, stream.index());
    assertEquals(text, stream.toString());
    assertEquals(fileName, stream.getSourceName());
}
/**
 * A stream built from a {@link Reader} combines the surrogate pair at
 * the end of the text into a single symbol.
 */
@Test
public void fromReader() throws Exception {
    final String text = "hello \uD83C\uDF0E";
    final Path file = folder.newFile().toPath();
    Files.write(file, text.getBytes(StandardCharsets.UTF_8));

    try (Reader reader = Files.newBufferedReader(file, StandardCharsets.UTF_8)) {
        final CharStream stream = CharStreams.fromReader(reader);

        assertEquals(7, stream.size());
        assertEquals(0, stream.index());
        assertEquals(text, stream.toString());
    }
}
}

View File

@ -26,21 +26,21 @@ public class TestCodePointCharStream {
@Test
public void emptyBytesHasSize0() {
CodePointCharStream s = CharStreams.createWithString("");
CodePointCharStream s = CharStreams.fromString("");
assertEquals(0, s.size());
assertEquals(0, s.index());
}
@Test
public void emptyBytesLookAheadReturnsEOF() {
CodePointCharStream s = CharStreams.createWithString("");
CodePointCharStream s = CharStreams.fromString("");
assertEquals(IntStream.EOF, s.LA(1));
assertEquals(0, s.index());
}
@Test
public void consumingEmptyStreamShouldThrow() {
CodePointCharStream s = CharStreams.createWithString("");
CodePointCharStream s = CharStreams.fromString("");
thrown.expect(IllegalStateException.class);
thrown.expectMessage("cannot consume EOF");
s.consume();
@ -48,13 +48,13 @@ public class TestCodePointCharStream {
@Test
public void singleLatinCodePointHasSize1() {
CodePointCharStream s = CharStreams.createWithString("X");
CodePointCharStream s = CharStreams.fromString("X");
assertEquals(1, s.size());
}
@Test
public void consumingSingleLatinCodePointShouldMoveIndex() {
CodePointCharStream s = CharStreams.createWithString("X");
CodePointCharStream s = CharStreams.fromString("X");
assertEquals(0, s.index());
s.consume();
assertEquals(1, s.index());
@ -62,7 +62,7 @@ public class TestCodePointCharStream {
@Test
public void consumingPastSingleLatinCodePointShouldThrow() {
CodePointCharStream s = CharStreams.createWithString("X");
CodePointCharStream s = CharStreams.fromString("X");
s.consume();
thrown.expect(IllegalStateException.class);
thrown.expectMessage("cannot consume EOF");
@ -71,14 +71,14 @@ public class TestCodePointCharStream {
@Test
public void singleLatinCodePointLookAheadShouldReturnCodePoint() {
CodePointCharStream s = CharStreams.createWithString("X");
CodePointCharStream s = CharStreams.fromString("X");
assertEquals('X', s.LA(1));
assertEquals(0, s.index());
}
@Test
public void multipleLatinCodePointsLookAheadShouldReturnCodePoints() {
CodePointCharStream s = CharStreams.createWithString("XYZ");
CodePointCharStream s = CharStreams.fromString("XYZ");
assertEquals('X', s.LA(1));
assertEquals(0, s.index());
assertEquals('Y', s.LA(2));
@ -89,20 +89,20 @@ public class TestCodePointCharStream {
@Test
public void singleLatinCodePointLookAheadPastEndShouldReturnEOF() {
CodePointCharStream s = CharStreams.createWithString("X");
CodePointCharStream s = CharStreams.fromString("X");
assertEquals(IntStream.EOF, s.LA(2));
}
@Test
public void singleCJKCodePointHasSize1() {
CodePointCharStream s = CharStreams.createWithString("\u611B");
CodePointCharStream s = CharStreams.fromString("\u611B");
assertEquals(1, s.size());
assertEquals(0, s.index());
}
@Test
public void consumingSingleCJKCodePointShouldMoveIndex() {
CodePointCharStream s = CharStreams.createWithString("\u611B");
CodePointCharStream s = CharStreams.fromString("\u611B");
assertEquals(0, s.index());
s.consume();
assertEquals(1, s.index());
@ -110,7 +110,7 @@ public class TestCodePointCharStream {
@Test
public void consumingPastSingleCJKCodePointShouldThrow() {
CodePointCharStream s = CharStreams.createWithString("\u611B");
CodePointCharStream s = CharStreams.fromString("\u611B");
s.consume();
thrown.expect(IllegalStateException.class);
thrown.expectMessage("cannot consume EOF");
@ -119,21 +119,21 @@ public class TestCodePointCharStream {
@Test
public void singleCJKCodePointLookAheadShouldReturnCodePoint() {
CodePointCharStream s = CharStreams.createWithString("\u611B");
CodePointCharStream s = CharStreams.fromString("\u611B");
assertEquals(0x611B, s.LA(1));
assertEquals(0, s.index());
}
@Test
public void singleCJKCodePointLookAheadPastEndShouldReturnEOF() {
CodePointCharStream s = CharStreams.createWithString("\u611B");
CodePointCharStream s = CharStreams.fromString("\u611B");
assertEquals(IntStream.EOF, s.LA(2));
assertEquals(0, s.index());
}
@Test
public void singleEmojiCodePointHasSize1() {
CodePointCharStream s = CharStreams.createWithString(
CodePointCharStream s = CharStreams.fromString(
new StringBuilder().appendCodePoint(0x1F4A9).toString());
assertEquals(1, s.size());
assertEquals(0, s.index());
@ -141,7 +141,7 @@ public class TestCodePointCharStream {
@Test
public void consumingSingleEmojiCodePointShouldMoveIndex() {
CodePointCharStream s = CharStreams.createWithString(
CodePointCharStream s = CharStreams.fromString(
new StringBuilder().appendCodePoint(0x1F4A9).toString());
assertEquals(0, s.index());
s.consume();
@ -150,7 +150,7 @@ public class TestCodePointCharStream {
@Test
public void consumingPastEndOfEmojiCodePointWithShouldThrow() {
CodePointCharStream s = CharStreams.createWithString(
CodePointCharStream s = CharStreams.fromString(
new StringBuilder().appendCodePoint(0x1F4A9).toString());
assertEquals(0, s.index());
s.consume();
@ -162,7 +162,7 @@ public class TestCodePointCharStream {
@Test
public void singleEmojiCodePointLookAheadShouldReturnCodePoint() {
CodePointCharStream s = CharStreams.createWithString(
CodePointCharStream s = CharStreams.fromString(
new StringBuilder().appendCodePoint(0x1F4A9).toString());
assertEquals(0x1F4A9, s.LA(1));
assertEquals(0, s.index());
@ -170,7 +170,7 @@ public class TestCodePointCharStream {
@Test
public void singleEmojiCodePointLookAheadPastEndShouldReturnEOF() {
CodePointCharStream s = CharStreams.createWithString(
CodePointCharStream s = CharStreams.fromString(
new StringBuilder().appendCodePoint(0x1F4A9).toString());
assertEquals(IntStream.EOF, s.LA(2));
assertEquals(0, s.index());
@ -178,19 +178,19 @@ public class TestCodePointCharStream {
@Test
public void getTextWithLatin() {
CodePointCharStream s = CharStreams.createWithString("0123456789");
CodePointCharStream s = CharStreams.fromString("0123456789");
assertEquals("34567", s.getText(Interval.of(3, 7)));
}
@Test
public void getTextWithCJK() {
CodePointCharStream s = CharStreams.createWithString("01234\u40946789");
CodePointCharStream s = CharStreams.fromString("01234\u40946789");
assertEquals("34\u409467", s.getText(Interval.of(3, 7)));
}
@Test
public void getTextWithEmoji() {
CodePointCharStream s = CharStreams.createWithString(
CodePointCharStream s = CharStreams.fromString(
new StringBuilder("01234")
.appendCodePoint(0x1F522)
.append("6789")
@ -200,19 +200,19 @@ public class TestCodePointCharStream {
@Test
public void toStringWithLatin() {
CodePointCharStream s = CharStreams.createWithString("0123456789");
CodePointCharStream s = CharStreams.fromString("0123456789");
assertEquals("0123456789", s.toString());
}
@Test
public void toStringWithCJK() {
CodePointCharStream s = CharStreams.createWithString("01234\u40946789");
CodePointCharStream s = CharStreams.fromString("01234\u40946789");
assertEquals("01234\u40946789", s.toString());
}
@Test
public void toStringWithEmoji() {
CodePointCharStream s = CharStreams.createWithString(
CodePointCharStream s = CharStreams.fromString(
new StringBuilder("01234")
.appendCodePoint(0x1F522)
.append("6789")
@ -222,19 +222,19 @@ public class TestCodePointCharStream {
@Test
public void lookAheadWithLatin() {
CodePointCharStream s = CharStreams.createWithString("0123456789");
CodePointCharStream s = CharStreams.fromString("0123456789");
assertEquals('5', s.LA(6));
}
@Test
public void lookAheadWithCJK() {
CodePointCharStream s = CharStreams.createWithString("01234\u40946789");
CodePointCharStream s = CharStreams.fromString("01234\u40946789");
assertEquals(0x4094, s.LA(6));
}
@Test
public void lookAheadWithEmoji() {
CodePointCharStream s = CharStreams.createWithString(
CodePointCharStream s = CharStreams.fromString(
new StringBuilder("01234")
.appendCodePoint(0x1F522)
.append("6789")
@ -244,21 +244,21 @@ public class TestCodePointCharStream {
@Test
public void seekWithLatin() {
CodePointCharStream s = CharStreams.createWithString("0123456789");
CodePointCharStream s = CharStreams.fromString("0123456789");
s.seek(5);
assertEquals('5', s.LA(1));
}
@Test
public void seekWithCJK() {
CodePointCharStream s = CharStreams.createWithString("01234\u40946789");
CodePointCharStream s = CharStreams.fromString("01234\u40946789");
s.seek(5);
assertEquals(0x4094, s.LA(1));
}
@Test
public void seekWithEmoji() {
CodePointCharStream s = CharStreams.createWithString(
CodePointCharStream s = CharStreams.fromString(
new StringBuilder("01234")
.appendCodePoint(0x1F522)
.append("6789")
@ -269,21 +269,21 @@ public class TestCodePointCharStream {
@Test
public void lookBehindWithLatin() {
CodePointCharStream s = CharStreams.createWithString("0123456789");
CodePointCharStream s = CharStreams.fromString("0123456789");
s.seek(6);
assertEquals('5', s.LA(-1));
}
@Test
public void lookBehindWithCJK() {
CodePointCharStream s = CharStreams.createWithString("01234\u40946789");
CodePointCharStream s = CharStreams.fromString("01234\u40946789");
s.seek(6);
assertEquals(0x4094, s.LA(-1));
}
@Test
public void lookBehindWithEmoji() {
CodePointCharStream s = CharStreams.createWithString(
CodePointCharStream s = CharStreams.fromString(
new StringBuilder("01234")
.appendCodePoint(0x1F522)
.append("6789")

View File

@ -7,19 +7,27 @@ package org.antlr.v4.runtime;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.nio.ByteBuffer;
import java.nio.IntBuffer;
import java.nio.charset.Charset;
import java.nio.charset.CodingErrorAction;
import java.nio.charset.StandardCharsets;
import java.nio.channels.Channels;
import java.nio.channels.FileChannel;
import java.nio.channels.ReadableByteChannel;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
/**
* Utility class to create {@link CodePointCharStream}s from
* various sources of Unicode data.
* Utility class to create {@link CharStream}s from various sources of
* string data.
*
* Main entry points are the factory methods {@code CharStreams.fromPath()},
* {@code CharStreams.fromString()}, etc.
*/
public final class CharStreams {
private static final int DEFAULT_BUFFER_SIZE = 4096;
@ -28,14 +36,215 @@ public final class CharStreams {
private CharStreams() { }
/**
* Convenience method to create a {@link CodePointCharStream}
* for the Unicode code points in a Java {@link String}.
* Creates a {@link CharStream} given a path to a UTF-8
* encoded file on disk.
*
* Reads the entire contents of the file into the result before returning.
*/
public static CodePointCharStream createWithString(String s) {
return createWithString(s, IntStream.UNKNOWN_SOURCE_NAME);
public static CharStream fromPath(Path path) throws IOException {
    // UTF-8 is the default encoding when the caller does not name one.
    final Charset defaultCharset = StandardCharsets.UTF_8;
    return fromPath(path, defaultCharset);
}
public static CodePointCharStream createWithString(String s, String sourceName) {
/**
 * Builds a {@link CharStream} from the file at {@code path}, decoding
 * its bytes with {@code charset}.
 *
 * The file is read completely before this method returns.
 *
 * UTF-8 input is decoded through {@code fromChannel} and supports the
 * full Unicode code point range. Any other charset is delegated to
 * {@link ANTLRFileStream}, which only supports code points up to U+FFFF.
 */
public static CharStream fromPath(Path path, Charset charset) throws IOException {
    if (!charset.equals(StandardCharsets.UTF_8)) {
        // Legacy path: 16-bit code units only.
        final String fileName = path.toString();
        final String encoding = charset.toString();
        return new ANTLRFileStream(fileName, encoding);
    }

    try (ReadableByteChannel utf8Channel = Files.newByteChannel(path)) {
        final String sourceName = path.toString();
        return fromChannel(
            utf8Channel,
            DEFAULT_BUFFER_SIZE,
            CodingErrorAction.REPLACE,
            sourceName);
    }
}
/**
 * Builds a {@link CharStream} from the UTF-8 encoded file named by
 * {@code fileName}.
 *
 * The file is read completely before this method returns.
 */
public static CharStream fromFileName(String fileName) throws IOException {
    final Path path = Paths.get(fileName);
    return fromPath(path, StandardCharsets.UTF_8);
}
/**
 * Builds a {@link CharStream} from the file named by {@code fileName},
 * decoding its bytes with {@code charset}.
 *
 * The file is read completely before this method returns.
 *
 * UTF-8 input supports the full Unicode code point range; input in any
 * other charset only supports code points up to U+FFFF.
 */
public static CharStream fromFileName(String fileName, Charset charset) throws IOException {
    final Path path = Paths.get(fileName);
    return fromPath(path, charset);
}
/**
 * Builds a {@link CharStream} from an open {@link InputStream} that
 * supplies UTF-8 bytes.
 *
 * The stream is read to the end before this method returns and is
 * then closed.
 */
public static CharStream fromStream(InputStream is) throws IOException {
    final Charset defaultCharset = StandardCharsets.UTF_8;
    return fromStream(is, defaultCharset);
}
/**
 * Builds a {@link CharStream} from an open {@link InputStream},
 * decoding its bytes with {@code charset}.
 *
 * The stream is read to the end before this method returns and is
 * then closed.
 *
 * UTF-8 input is decoded through {@code fromChannel} and supports the
 * full Unicode code point range. Any other charset is read through an
 * {@link InputStreamReader} into an {@link ANTLRInputStream}, which
 * only supports code points up to U+FFFF.
 */
public static CharStream fromStream(InputStream is, Charset charset) throws IOException {
    final boolean isUtf8 = charset.equals(StandardCharsets.UTF_8);
    if (!isUtf8) {
        // Legacy path: 16-bit code units only.
        try (InputStreamReader reader = new InputStreamReader(is, charset)) {
            return new ANTLRInputStream(reader);
        }
    }

    try (ReadableByteChannel utf8Channel = Channels.newChannel(is)) {
        return fromChannel(
            utf8Channel,
            DEFAULT_BUFFER_SIZE,
            CodingErrorAction.REPLACE,
            IntStream.UNKNOWN_SOURCE_NAME);
    }
}
/**
 * Builds a {@link CharStream} from an open {@link ReadableByteChannel}
 * that supplies UTF-8 bytes.
 *
 * The {@code channel} is read to the end before this method returns
 * and is then closed.
 */
public static CharStream fromChannel(ReadableByteChannel channel) throws IOException {
    final Charset defaultCharset = StandardCharsets.UTF_8;
    return fromChannel(channel, defaultCharset);
}
/**
 * Creates a {@link CharStream} given an opened {@link ReadableByteChannel} and the
 * charset of the bytes contained in the channel.
 *
 * Reads the entire contents of the {@code channel} into
 * the result before returning, then closes the {@code channel}.
 *
 * For sources encoded in UTF-8, supports the full Unicode code point
 * range.
 *
 * For other sources, only supports Unicode code points up to U+FFFF.
 *
 * @param channel the open channel to read from
 * @param charset the charset used to decode the channel's bytes
 * @return a stream over the decoded contents of {@code channel}
 * @throws IOException if reading from the channel fails
 */
public static CharStream fromChannel(ReadableByteChannel channel, Charset charset) throws IOException {
    if (charset.equals(StandardCharsets.UTF_8)) {
        return fromChannel(
            channel,
            DEFAULT_BUFFER_SIZE,
            CodingErrorAction.REPLACE,
            IntStream.UNKNOWN_SOURCE_NAME);
    } else {
        // Wrap the channel exactly once: the reader decodes from the same
        // stream that try-with-resources closes, instead of from a second,
        // separately created stream over the channel.
        try (InputStream is = Channels.newInputStream(channel);
             InputStreamReader isr = new InputStreamReader(is, charset)) {
            return new ANTLRInputStream(isr);
        }
    }
}
/**
 * Builds a {@link CharStream} from the characters supplied by
 * {@code r}, using the unknown-source placeholder as the source name.
 * The reader is closed before this method returns.
 */
public static CodePointCharStream fromReader(Reader r) throws IOException {
    final String sourceName = IntStream.UNKNOWN_SOURCE_NAME;
    return fromReader(r, sourceName);
}
/**
 * Creates a {@link CharStream} given a {@link Reader} and its
 * source name. Closes the reader before returning.
 *
 * Reads the reader one UTF-16 code unit at a time and combines each
 * high/low surrogate pair into a single code point. Unpaired
 * surrogates are stored in the result as-is.
 *
 * @param r the reader to drain; closed before this method returns,
 *     even when reading fails
 * @param sourceName the name reported by the resulting stream
 * @return a stream over the code points read from {@code r}
 * @throws IOException if reading from or closing {@code r} fails
 */
public static CodePointCharStream fromReader(Reader r, String sourceName) throws IOException {
    IntBuffer codePointBuffer = IntBuffer.allocate(DEFAULT_BUFFER_SIZE);
    int highSurrogate = -1;
    int curCodeUnit;
    try {
        while ((curCodeUnit = r.read()) != -1) {
            // One iteration can store up to two values (a dangling high
            // surrogate plus the current code unit), and a pending high
            // surrogate may still need a slot after the loop ends. Keep
            // at least two free slots so no put() can overflow.
            if (codePointBuffer.remaining() < 2) {
                int newCapacity = Math.max(
                    codePointBuffer.capacity() * 2,
                    codePointBuffer.position() + 2);
                IntBuffer newBuffer = IntBuffer.allocate(newCapacity);
                codePointBuffer.flip();
                newBuffer.put(codePointBuffer);
                codePointBuffer = newBuffer;
            }
            if (Character.isHighSurrogate((char) curCodeUnit)) {
                if (highSurrogate != -1) {
                    // Dangling high surrogate followed by another high surrogate.
                    codePointBuffer.put(highSurrogate);
                }
                highSurrogate = curCodeUnit;
            } else if (Character.isLowSurrogate((char) curCodeUnit)) {
                if (highSurrogate == -1) {
                    // Low surrogate not preceded by high surrogate.
                    codePointBuffer.put(curCodeUnit);
                } else {
                    codePointBuffer.put(Character.toCodePoint((char) highSurrogate, (char) curCodeUnit));
                    highSurrogate = -1;
                }
            } else {
                if (highSurrogate != -1) {
                    // Dangling high surrogate followed by a non-surrogate.
                    codePointBuffer.put(highSurrogate);
                    highSurrogate = -1;
                }
                codePointBuffer.put(curCodeUnit);
            }
        }
        if (highSurrogate != -1) {
            // Dangling high surrogate at end of file.
            codePointBuffer.put(highSurrogate);
        }
        codePointBuffer.flip();
        return new CodePointCharStream(codePointBuffer, sourceName);
    } finally {
        r.close();
    }
}
/**
 * Builds a {@link CharStream} over the contents of {@code s}, using
 * the unknown-source placeholder as the source name.
 */
public static CodePointCharStream fromString(String s) {
    final String sourceName = IntStream.UNKNOWN_SOURCE_NAME;
    return fromString(s, sourceName);
}
/**
* Creates a {@link CharStream} given a {@link String} and the {@code sourceName}
* from which it came.
*/
public static CodePointCharStream fromString(String s, String sourceName) {
// Initial guess assumes no code points > U+FFFF: one code
// point for each code unit in the string
IntBuffer codePointBuffer = IntBuffer.allocate(s.length());
@ -56,48 +265,39 @@ public final class CharStreams {
return new CodePointCharStream(codePointBuffer, sourceName);
}
public static CodePointCharStream createWithUTF8(Path path) throws IOException {
try (ReadableByteChannel channel = Files.newByteChannel(path)) {
return createWithUTF8Channel(
channel,
DEFAULT_BUFFER_SIZE,
CodingErrorAction.REPLACE,
path.toString());
}
}
public static CodePointCharStream createWithUTF8Stream(InputStream is) throws IOException {
try (ReadableByteChannel channel = Channels.newChannel(is)) {
return createWithUTF8Channel(
channel,
DEFAULT_BUFFER_SIZE,
CodingErrorAction.REPLACE,
IntStream.UNKNOWN_SOURCE_NAME);
}
}
public static CodePointCharStream createWithUTF8Channel(
/**
* Creates a {@link CharStream} given an opened {@link ReadableByteChannel}
* containing UTF-8 bytes.
*
* Reads the entire contents of the {@code channel} into
* the result before returning, then closes the {@code channel}.
*/
public static CodePointCharStream fromChannel(
ReadableByteChannel channel,
int bufferSize,
CodingErrorAction decodingErrorAction,
String sourceName
) throws IOException {
ByteBuffer utf8BytesIn = ByteBuffer.allocateDirect(bufferSize);
IntBuffer codePointsOut = IntBuffer.allocate(bufferSize);
boolean endOfInput = false;
UTF8CodePointDecoder decoder = new UTF8CodePointDecoder(decodingErrorAction);
while (!endOfInput) {
int bytesRead = channel.read(utf8BytesIn);
endOfInput = (bytesRead == -1);
utf8BytesIn.flip();
codePointsOut = decoder.decodeCodePointsFromBuffer(
utf8BytesIn,
codePointsOut,
endOfInput);
utf8BytesIn.compact();
try {
ByteBuffer utf8BytesIn = ByteBuffer.allocateDirect(bufferSize);
IntBuffer codePointsOut = IntBuffer.allocate(bufferSize);
boolean endOfInput = false;
UTF8CodePointDecoder decoder = new UTF8CodePointDecoder(decodingErrorAction);
while (!endOfInput) {
int bytesRead = channel.read(utf8BytesIn);
endOfInput = (bytesRead == -1);
utf8BytesIn.flip();
codePointsOut = decoder.decodeCodePointsFromBuffer(
utf8BytesIn,
codePointsOut,
endOfInput);
utf8BytesIn.compact();
}
codePointsOut.limit(codePointsOut.position());
codePointsOut.flip();
return new CodePointCharStream(codePointsOut, sourceName);
} finally {
channel.close();
}
codePointsOut.limit(codePointsOut.position());
codePointsOut.flip();
return new CodePointCharStream(codePointsOut, sourceName);
}
}

View File

@ -381,7 +381,7 @@ public class TestATNLexerInterpreter extends BaseJavaToolTest {
protected void checkLexerMatches(LexerGrammar lg, String inputString, String expecting) {
ATN atn = createATN(lg, true);
CharStream input = CharStreams.createWithString(inputString);
CharStream input = CharStreams.fromString(inputString);
ATNState startState = atn.modeNameToStartState.get("DEFAULT_MODE");
DOTGenerator dot = new DOTGenerator(lg);
// System.out.println(dot.getDOT(startState, true));

View File

@ -161,7 +161,7 @@ public class TestUnicodeGrammar extends BaseJavaToolTest {
String inputText) throws Exception {
Grammar grammar = new Grammar(grammarText);
LexerInterpreter lexEngine = grammar.createLexerInterpreter(
CharStreams.createWithString(inputText));
CharStreams.fromString(inputText));
CommonTokenStream tokens = new CommonTokenStream(lexEngine);
GrammarParserInterpreter parser = grammar.createGrammarParserInterpreter(tokens);
ParseTree parseTree = parser.parse(grammar.rules.get(rootRule).index);

View File

@ -26,7 +26,6 @@ import java.lang.reflect.Constructor;
import java.lang.reflect.InvocationTargetException;
import java.lang.reflect.Method;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.List;
@ -157,28 +156,12 @@ public class TestRig {
Charset charset = ( encoding == null ? Charset.defaultCharset () : Charset.forName(encoding) );
if ( inputFiles.size()==0 ) {
CharStream charStream;
if ( charset.equals(StandardCharsets.UTF_8)) {
charStream = CharStreams.createWithUTF8Stream(System.in);
}
else {
try ( InputStreamReader r = new InputStreamReader(System.in, charset) ) {
charStream = new ANTLRInputStream(r);
}
}
CharStream charStream = CharStreams.fromStream(System.in, charset);
process(lexer, parserClass, parser, charStream);
return;
}
for (String inputFile : inputFiles) {
CharStream charStream;
if ( charset.equals(StandardCharsets.UTF_8) ) {
charStream = CharStreams.createWithUTF8(Paths.get(inputFile));
}
else {
try ( InputStreamReader r = new InputStreamReader(System.in, charset) ) {
charStream = new ANTLRInputStream(r);
}
}
CharStream charStream = CharStreams.fromPath(Paths.get(inputFile), charset);
if ( inputFiles.size()>1 ) {
System.err.println(inputFile);
}