Merge pull request #1765 from bhamiltoncx/unicode-cleanup-and-doc
Tidy up CharStreams and add new doc/unicode.md
This commit is contained in:
commit
1f6a329692
|
@ -82,7 +82,7 @@ Make sure to use two-stage parsing. See example in [bug report](https://github.c
|
|||
|
||||
```Java
|
||||
|
||||
CharStream input = new ANTLRFileStream(args[0]);
|
||||
CharStream input = CharStreams.fromPath(Paths.get(args[0]));
|
||||
ExprLexer lexer = new ExprLexer(input);
|
||||
CommonTokenStream tokens = new CommonTokenStream(lexer);
|
||||
ExprParser parser = new ExprParser(tokens);
|
||||
|
|
|
@ -30,7 +30,7 @@ public static ParseTree parse(String fileName,
|
|||
throws IOException
|
||||
{
|
||||
final Grammar g = Grammar.load(combinedGrammarFileName);
|
||||
LexerInterpreter lexEngine = g.createLexerInterpreter(new ANTLRFileStream(fileName));
|
||||
LexerInterpreter lexEngine = g.createLexerInterpreter(CharStreams.fromPath(Paths.get(fileName)));
|
||||
CommonTokenStream tokens = new CommonTokenStream(lexEngine);
|
||||
ParserInterpreter parser = g.createParserInterpreter(tokens);
|
||||
ParseTree t = parser.parse(g.getRule(startRule).index);
|
||||
|
@ -58,7 +58,7 @@ public static ParseTree parse(String fileNameToParse,
|
|||
{
|
||||
final LexerGrammar lg = (LexerGrammar) Grammar.load(lexerGrammarFileName);
|
||||
final Grammar pg = Grammar.load(parserGrammarFileName, lg);
|
||||
ANTLRFileStream input = new ANTLRFileStream(fileNameToParse);
|
||||
CharStream input = CharStreams.fromPath(Paths.get(fileNameToParse));
|
||||
LexerInterpreter lexEngine = lg.createLexerInterpreter(input);
|
||||
CommonTokenStream tokens = new CommonTokenStream(lexEngine);
|
||||
ParserInterpreter parser = pg.createParserInterpreter(tokens);
|
||||
|
|
|
@ -0,0 +1,68 @@
|
|||
# Lexers and Unicode text
|
||||
|
||||
Until ANTLR 4.7, generated lexers only supported part of the Unicode standard
|
||||
(code points up to `U+FFFF`).
|
||||
|
||||
With ANTLR 4.7 and later, lexers as well as all languages' runtimes
|
||||
support the full range of Unicode code points up to `U+10FFFF`, as
|
||||
long as the input `CharStream` is opened using `CharStreams.fromPath()`
|
||||
or the equivalent method for your runtime's language.
|
||||
|
||||
# Unicode Code Points in Lexer Grammars
|
||||
|
||||
To refer to Unicode [code points](https://en.wikipedia.org/wiki/Code_point)
|
||||
in lexer grammars, use the `\u` string escape. For example, to create
|
||||
a lexer rule for a single Cyrillic character by creating a range from
|
||||
`U+0400` to `U+04FF`:
|
||||
|
||||
```ANTLR
|
||||
CYRILLIC : '\u0400'..'\u04FF' ;
|
||||
```
|
||||
|
||||
Unicode literals larger than U+FFFF must use the extended `\u{12345}` syntax.
|
||||
For example, to create a lexer rule for a selection of smiley faces
|
||||
from the [Emoticons Unicode block](http://www.unicode.org/charts/PDF/U1F600.pdf):
|
||||
|
||||
```ANTLR
|
||||
EMOTICONS : ('\u{1F600}' | '\u{1F602}' | '\u{1F615}') ;
|
||||
```
|
||||
|
||||
Finally, lexer char sets can include Unicode properties:
|
||||
|
||||
```ANTLR
|
||||
EMOJI : [\p{Emoji}] ;
|
||||
JAPANESE : [\p{Script=Hiragana}\p{Script=Katakana}\p{Script=Han}] ;
|
||||
NOT_CYRILLIC : [\P{Script=Cyrillic}] ;
|
||||
```
|
||||
|
||||
See [lexer-rules.md](lexer-rules.md#lexer-rule-elements) for more detail on Unicode
|
||||
escapes in lexer rules.
|
||||
|
||||
# CharStreams and UTF-8
|
||||
|
||||
If your lexer grammar contains code points larger than `U+FFFF`, your
|
||||
lexer client code must open the file using `CharStreams.fromPath()` or
|
||||
equivalent in your runtime's language, or input values larger than
|
||||
`U+FFFF` will *not* match.
|
||||
|
||||
For backwards compatibility, the existing `ANTLRInputStream` and
|
||||
`ANTLRFileStream` APIs only support Unicode code points up to `U+FFFF`.
|
||||
|
||||
The existing `TestRig` command-line interface supports all Unicode
|
||||
code points.
|
||||
|
||||
# Example
|
||||
|
||||
If you have generated a lexer named `UnicodeLexer`:
|
||||
|
||||
```Java
|
||||
public static void main(String[] args) throws IOException {
|
||||
CharStream charStream = CharStreams.fromPath(Paths.get(args[0]));
|
||||
Lexer lexer = new UnicodeLexer(charStream);
|
||||
CommonTokenStream tokens = new CommonTokenStream(lexer);
|
||||
tokens.fill();
|
||||
for (Token token : tokens.getTokens()) {
|
||||
System.out.println("Got token: " + token.toString());
|
||||
}
|
||||
}
|
||||
```
|
|
@ -923,7 +923,7 @@ public class BaseJavaTest implements RuntimeTestSupport {
|
|||
"\n" +
|
||||
"public class Test {\n" +
|
||||
" public static void main(String[] args) throws Exception {\n" +
|
||||
" CharStream input = CharStreams.createWithUTF8(Paths.get(args[0]));\n" +
|
||||
" CharStream input = CharStreams.fromPath(Paths.get(args[0]));\n" +
|
||||
" <lexerName> lex = new <lexerName>(input);\n" +
|
||||
" CommonTokenStream tokens = new CommonTokenStream(lex);\n" +
|
||||
" <createParser>\n"+
|
||||
|
@ -980,7 +980,7 @@ public class BaseJavaTest implements RuntimeTestSupport {
|
|||
"\n" +
|
||||
"public class Test {\n" +
|
||||
" public static void main(String[] args) throws Exception {\n" +
|
||||
" CharStream input = CharStreams.createWithUTF8(Paths.get(args[0]));\n" +
|
||||
" CharStream input = CharStreams.fromPath(Paths.get(args[0]));\n" +
|
||||
" <lexerName> lex = new <lexerName>(input);\n" +
|
||||
" CommonTokenStream tokens = new CommonTokenStream(lex);\n" +
|
||||
" tokens.fill();\n" +
|
||||
|
|
|
@ -10,6 +10,7 @@ import static org.junit.Assert.assertEquals;
|
|||
import java.io.File;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.io.Reader;
|
||||
|
||||
import java.nio.channels.SeekableByteChannel;
|
||||
import java.nio.charset.CharacterCodingException;
|
||||
|
@ -20,6 +21,7 @@ import java.nio.file.Path;
|
|||
|
||||
import java.util.Arrays;
|
||||
|
||||
import org.antlr.v4.runtime.CharStream;
|
||||
import org.antlr.v4.runtime.CharStreams;
|
||||
import org.antlr.v4.runtime.CodePointCharStream;
|
||||
|
||||
|
@ -36,16 +38,16 @@ public class TestCharStreams {
|
|||
public ExpectedException thrown = ExpectedException.none();
|
||||
|
||||
@Test
|
||||
public void createWithBMPStringHasExpectedSize() {
|
||||
CodePointCharStream s = CharStreams.createWithString("hello");
|
||||
public void fromBMPStringHasExpectedSize() {
|
||||
CharStream s = CharStreams.fromString("hello");
|
||||
assertEquals(5, s.size());
|
||||
assertEquals(0, s.index());
|
||||
assertEquals("hello", s.toString());
|
||||
}
|
||||
|
||||
@Test
|
||||
public void createWithSMPStringHasExpectedSize() {
|
||||
CodePointCharStream s = CharStreams.createWithString(
|
||||
public void fromSMPStringHasExpectedSize() {
|
||||
CharStream s = CharStreams.fromString(
|
||||
"hello \uD83C\uDF0E");
|
||||
assertEquals(7, s.size());
|
||||
assertEquals(0, s.index());
|
||||
|
@ -53,10 +55,10 @@ public class TestCharStreams {
|
|||
}
|
||||
|
||||
@Test
|
||||
public void createWithBMPUTF8PathHasExpectedSize() throws Exception {
|
||||
public void fromBMPUTF8PathHasExpectedSize() throws Exception {
|
||||
Path p = folder.newFile().toPath();
|
||||
Files.write(p, "hello".getBytes(StandardCharsets.UTF_8));
|
||||
CodePointCharStream s = CharStreams.createWithUTF8(p);
|
||||
CharStream s = CharStreams.fromPath(p);
|
||||
assertEquals(5, s.size());
|
||||
assertEquals(0, s.index());
|
||||
assertEquals("hello", s.toString());
|
||||
|
@ -64,10 +66,10 @@ public class TestCharStreams {
|
|||
}
|
||||
|
||||
@Test
|
||||
public void createWithSMPUTF8PathHasExpectedSize() throws Exception {
|
||||
public void fromSMPUTF8PathHasExpectedSize() throws Exception {
|
||||
Path p = folder.newFile().toPath();
|
||||
Files.write(p, "hello \uD83C\uDF0E".getBytes(StandardCharsets.UTF_8));
|
||||
CodePointCharStream s = CharStreams.createWithUTF8(p);
|
||||
CharStream s = CharStreams.fromPath(p);
|
||||
assertEquals(7, s.size());
|
||||
assertEquals(0, s.index());
|
||||
assertEquals("hello \uD83C\uDF0E", s.toString());
|
||||
|
@ -75,11 +77,11 @@ public class TestCharStreams {
|
|||
}
|
||||
|
||||
@Test
|
||||
public void createWithBMPUTF8InputStreamHasExpectedSize() throws Exception {
|
||||
public void fromBMPUTF8InputStreamHasExpectedSize() throws Exception {
|
||||
Path p = folder.newFile().toPath();
|
||||
Files.write(p, "hello".getBytes(StandardCharsets.UTF_8));
|
||||
try (InputStream is = Files.newInputStream(p)) {
|
||||
CodePointCharStream s = CharStreams.createWithUTF8Stream(is);
|
||||
CharStream s = CharStreams.fromStream(is);
|
||||
assertEquals(5, s.size());
|
||||
assertEquals(0, s.index());
|
||||
assertEquals("hello", s.toString());
|
||||
|
@ -87,11 +89,11 @@ public class TestCharStreams {
|
|||
}
|
||||
|
||||
@Test
|
||||
public void createWithSMPUTF8InputStreamHasExpectedSize() throws Exception {
|
||||
public void fromSMPUTF8InputStreamHasExpectedSize() throws Exception {
|
||||
Path p = folder.newFile().toPath();
|
||||
Files.write(p, "hello \uD83C\uDF0E".getBytes(StandardCharsets.UTF_8));
|
||||
try (InputStream is = Files.newInputStream(p)) {
|
||||
CodePointCharStream s = CharStreams.createWithUTF8Stream(is);
|
||||
CharStream s = CharStreams.fromStream(is);
|
||||
assertEquals(7, s.size());
|
||||
assertEquals(0, s.index());
|
||||
assertEquals("hello \uD83C\uDF0E", s.toString());
|
||||
|
@ -99,11 +101,11 @@ public class TestCharStreams {
|
|||
}
|
||||
|
||||
@Test
|
||||
public void createWithBMPUTF8ChannelHasExpectedSize() throws Exception {
|
||||
public void fromBMPUTF8ChannelHasExpectedSize() throws Exception {
|
||||
Path p = folder.newFile().toPath();
|
||||
Files.write(p, "hello".getBytes(StandardCharsets.UTF_8));
|
||||
try (SeekableByteChannel c = Files.newByteChannel(p)) {
|
||||
CodePointCharStream s = CharStreams.createWithUTF8Channel(
|
||||
CharStream s = CharStreams.fromChannel(
|
||||
c, 4096, CodingErrorAction.REPLACE, "foo");
|
||||
assertEquals(5, s.size());
|
||||
assertEquals(0, s.index());
|
||||
|
@ -113,11 +115,11 @@ public class TestCharStreams {
|
|||
}
|
||||
|
||||
@Test
|
||||
public void createWithSMPUTF8ChannelHasExpectedSize() throws Exception {
|
||||
public void fromSMPUTF8ChannelHasExpectedSize() throws Exception {
|
||||
Path p = folder.newFile().toPath();
|
||||
Files.write(p, "hello \uD83C\uDF0E".getBytes(StandardCharsets.UTF_8));
|
||||
try (SeekableByteChannel c = Files.newByteChannel(p)) {
|
||||
CodePointCharStream s = CharStreams.createWithUTF8Channel(
|
||||
CharStream s = CharStreams.fromChannel(
|
||||
c, 4096, CodingErrorAction.REPLACE, "foo");
|
||||
assertEquals(7, s.size());
|
||||
assertEquals(0, s.index());
|
||||
|
@ -127,13 +129,13 @@ public class TestCharStreams {
|
|||
}
|
||||
|
||||
@Test
|
||||
public void createWithInvalidUTF8BytesChannelReplacesWithSubstCharInReplaceMode()
|
||||
public void fromInvalidUTF8BytesChannelReplacesWithSubstCharInReplaceMode()
|
||||
throws Exception {
|
||||
Path p = folder.newFile().toPath();
|
||||
byte[] toWrite = new byte[] { (byte)0xCA, (byte)0xFE, (byte)0xFE, (byte)0xED };
|
||||
Files.write(p, toWrite);
|
||||
try (SeekableByteChannel c = Files.newByteChannel(p)) {
|
||||
CodePointCharStream s = CharStreams.createWithUTF8Channel(
|
||||
CharStream s = CharStreams.fromChannel(
|
||||
c, 4096, CodingErrorAction.REPLACE, "foo");
|
||||
assertEquals(3, s.size());
|
||||
assertEquals(0, s.index());
|
||||
|
@ -142,22 +144,22 @@ public class TestCharStreams {
|
|||
}
|
||||
|
||||
@Test
|
||||
public void createWithInvalidUTF8BytesThrowsInReportMode() throws Exception {
|
||||
public void fromInvalidUTF8BytesThrowsInReportMode() throws Exception {
|
||||
Path p = folder.newFile().toPath();
|
||||
byte[] toWrite = new byte[] { (byte)0xCA, (byte)0xFE };
|
||||
Files.write(p, toWrite);
|
||||
try (SeekableByteChannel c = Files.newByteChannel(p)) {
|
||||
thrown.expect(CharacterCodingException.class);
|
||||
CharStreams.createWithUTF8Channel(c, 4096, CodingErrorAction.REPORT, "foo");
|
||||
CharStreams.fromChannel(c, 4096, CodingErrorAction.REPORT, "foo");
|
||||
}
|
||||
}
|
||||
|
||||
@Test
|
||||
public void createWithSMPUTF8SequenceStraddlingBufferBoundary() throws Exception {
|
||||
public void fromSMPUTF8SequenceStraddlingBufferBoundary() throws Exception {
|
||||
Path p = folder.newFile().toPath();
|
||||
Files.write(p, "hello \uD83C\uDF0E".getBytes(StandardCharsets.UTF_8));
|
||||
try (SeekableByteChannel c = Files.newByteChannel(p)) {
|
||||
CodePointCharStream s = CharStreams.createWithUTF8Channel(
|
||||
CharStream s = CharStreams.fromChannel(
|
||||
c,
|
||||
// Note this buffer size ensures the SMP code point
|
||||
// straddles the boundary of two buffers
|
||||
|
@ -169,4 +171,40 @@ public class TestCharStreams {
|
|||
assertEquals("hello \uD83C\uDF0E", s.toString());
|
||||
}
|
||||
}
|
||||
|
||||
@Test
|
||||
public void fromFileName() throws Exception {
|
||||
Path p = folder.newFile().toPath();
|
||||
Files.write(p, "hello \uD83C\uDF0E".getBytes(StandardCharsets.UTF_8));
|
||||
CharStream s = CharStreams.fromFileName(p.toString());
|
||||
assertEquals(7, s.size());
|
||||
assertEquals(0, s.index());
|
||||
assertEquals("hello \uD83C\uDF0E", s.toString());
|
||||
assertEquals(p.toString(), s.getSourceName());
|
||||
|
||||
}
|
||||
|
||||
@Test
|
||||
public void fromFileNameWithLatin1() throws Exception {
|
||||
Path p = folder.newFile().toPath();
|
||||
Files.write(p, "hello \u00CA\u00FE".getBytes(StandardCharsets.ISO_8859_1));
|
||||
CharStream s = CharStreams.fromFileName(p.toString(), StandardCharsets.ISO_8859_1);
|
||||
assertEquals(8, s.size());
|
||||
assertEquals(0, s.index());
|
||||
assertEquals("hello \u00CA\u00FE", s.toString());
|
||||
assertEquals(p.toString(), s.getSourceName());
|
||||
|
||||
}
|
||||
|
||||
@Test
|
||||
public void fromReader() throws Exception {
|
||||
Path p = folder.newFile().toPath();
|
||||
Files.write(p, "hello \uD83C\uDF0E".getBytes(StandardCharsets.UTF_8));
|
||||
try (Reader r = Files.newBufferedReader(p, StandardCharsets.UTF_8)) {
|
||||
CharStream s = CharStreams.fromReader(r);
|
||||
assertEquals(7, s.size());
|
||||
assertEquals(0, s.index());
|
||||
assertEquals("hello \uD83C\uDF0E", s.toString());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -26,21 +26,21 @@ public class TestCodePointCharStream {
|
|||
|
||||
@Test
|
||||
public void emptyBytesHasSize0() {
|
||||
CodePointCharStream s = CharStreams.createWithString("");
|
||||
CodePointCharStream s = CharStreams.fromString("");
|
||||
assertEquals(0, s.size());
|
||||
assertEquals(0, s.index());
|
||||
}
|
||||
|
||||
@Test
|
||||
public void emptyBytesLookAheadReturnsEOF() {
|
||||
CodePointCharStream s = CharStreams.createWithString("");
|
||||
CodePointCharStream s = CharStreams.fromString("");
|
||||
assertEquals(IntStream.EOF, s.LA(1));
|
||||
assertEquals(0, s.index());
|
||||
}
|
||||
|
||||
@Test
|
||||
public void consumingEmptyStreamShouldThrow() {
|
||||
CodePointCharStream s = CharStreams.createWithString("");
|
||||
CodePointCharStream s = CharStreams.fromString("");
|
||||
thrown.expect(IllegalStateException.class);
|
||||
thrown.expectMessage("cannot consume EOF");
|
||||
s.consume();
|
||||
|
@ -48,13 +48,13 @@ public class TestCodePointCharStream {
|
|||
|
||||
@Test
|
||||
public void singleLatinCodePointHasSize1() {
|
||||
CodePointCharStream s = CharStreams.createWithString("X");
|
||||
CodePointCharStream s = CharStreams.fromString("X");
|
||||
assertEquals(1, s.size());
|
||||
}
|
||||
|
||||
@Test
|
||||
public void consumingSingleLatinCodePointShouldMoveIndex() {
|
||||
CodePointCharStream s = CharStreams.createWithString("X");
|
||||
CodePointCharStream s = CharStreams.fromString("X");
|
||||
assertEquals(0, s.index());
|
||||
s.consume();
|
||||
assertEquals(1, s.index());
|
||||
|
@ -62,7 +62,7 @@ public class TestCodePointCharStream {
|
|||
|
||||
@Test
|
||||
public void consumingPastSingleLatinCodePointShouldThrow() {
|
||||
CodePointCharStream s = CharStreams.createWithString("X");
|
||||
CodePointCharStream s = CharStreams.fromString("X");
|
||||
s.consume();
|
||||
thrown.expect(IllegalStateException.class);
|
||||
thrown.expectMessage("cannot consume EOF");
|
||||
|
@ -71,14 +71,14 @@ public class TestCodePointCharStream {
|
|||
|
||||
@Test
|
||||
public void singleLatinCodePointLookAheadShouldReturnCodePoint() {
|
||||
CodePointCharStream s = CharStreams.createWithString("X");
|
||||
CodePointCharStream s = CharStreams.fromString("X");
|
||||
assertEquals('X', s.LA(1));
|
||||
assertEquals(0, s.index());
|
||||
}
|
||||
|
||||
@Test
|
||||
public void multipleLatinCodePointsLookAheadShouldReturnCodePoints() {
|
||||
CodePointCharStream s = CharStreams.createWithString("XYZ");
|
||||
CodePointCharStream s = CharStreams.fromString("XYZ");
|
||||
assertEquals('X', s.LA(1));
|
||||
assertEquals(0, s.index());
|
||||
assertEquals('Y', s.LA(2));
|
||||
|
@ -89,20 +89,20 @@ public class TestCodePointCharStream {
|
|||
|
||||
@Test
|
||||
public void singleLatinCodePointLookAheadPastEndShouldReturnEOF() {
|
||||
CodePointCharStream s = CharStreams.createWithString("X");
|
||||
CodePointCharStream s = CharStreams.fromString("X");
|
||||
assertEquals(IntStream.EOF, s.LA(2));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void singleCJKCodePointHasSize1() {
|
||||
CodePointCharStream s = CharStreams.createWithString("\u611B");
|
||||
CodePointCharStream s = CharStreams.fromString("\u611B");
|
||||
assertEquals(1, s.size());
|
||||
assertEquals(0, s.index());
|
||||
}
|
||||
|
||||
@Test
|
||||
public void consumingSingleCJKCodePointShouldMoveIndex() {
|
||||
CodePointCharStream s = CharStreams.createWithString("\u611B");
|
||||
CodePointCharStream s = CharStreams.fromString("\u611B");
|
||||
assertEquals(0, s.index());
|
||||
s.consume();
|
||||
assertEquals(1, s.index());
|
||||
|
@ -110,7 +110,7 @@ public class TestCodePointCharStream {
|
|||
|
||||
@Test
|
||||
public void consumingPastSingleCJKCodePointShouldThrow() {
|
||||
CodePointCharStream s = CharStreams.createWithString("\u611B");
|
||||
CodePointCharStream s = CharStreams.fromString("\u611B");
|
||||
s.consume();
|
||||
thrown.expect(IllegalStateException.class);
|
||||
thrown.expectMessage("cannot consume EOF");
|
||||
|
@ -119,21 +119,21 @@ public class TestCodePointCharStream {
|
|||
|
||||
@Test
|
||||
public void singleCJKCodePointLookAheadShouldReturnCodePoint() {
|
||||
CodePointCharStream s = CharStreams.createWithString("\u611B");
|
||||
CodePointCharStream s = CharStreams.fromString("\u611B");
|
||||
assertEquals(0x611B, s.LA(1));
|
||||
assertEquals(0, s.index());
|
||||
}
|
||||
|
||||
@Test
|
||||
public void singleCJKCodePointLookAheadPastEndShouldReturnEOF() {
|
||||
CodePointCharStream s = CharStreams.createWithString("\u611B");
|
||||
CodePointCharStream s = CharStreams.fromString("\u611B");
|
||||
assertEquals(IntStream.EOF, s.LA(2));
|
||||
assertEquals(0, s.index());
|
||||
}
|
||||
|
||||
@Test
|
||||
public void singleEmojiCodePointHasSize1() {
|
||||
CodePointCharStream s = CharStreams.createWithString(
|
||||
CodePointCharStream s = CharStreams.fromString(
|
||||
new StringBuilder().appendCodePoint(0x1F4A9).toString());
|
||||
assertEquals(1, s.size());
|
||||
assertEquals(0, s.index());
|
||||
|
@ -141,7 +141,7 @@ public class TestCodePointCharStream {
|
|||
|
||||
@Test
|
||||
public void consumingSingleEmojiCodePointShouldMoveIndex() {
|
||||
CodePointCharStream s = CharStreams.createWithString(
|
||||
CodePointCharStream s = CharStreams.fromString(
|
||||
new StringBuilder().appendCodePoint(0x1F4A9).toString());
|
||||
assertEquals(0, s.index());
|
||||
s.consume();
|
||||
|
@ -150,7 +150,7 @@ public class TestCodePointCharStream {
|
|||
|
||||
@Test
|
||||
public void consumingPastEndOfEmojiCodePointWithShouldThrow() {
|
||||
CodePointCharStream s = CharStreams.createWithString(
|
||||
CodePointCharStream s = CharStreams.fromString(
|
||||
new StringBuilder().appendCodePoint(0x1F4A9).toString());
|
||||
assertEquals(0, s.index());
|
||||
s.consume();
|
||||
|
@ -162,7 +162,7 @@ public class TestCodePointCharStream {
|
|||
|
||||
@Test
|
||||
public void singleEmojiCodePointLookAheadShouldReturnCodePoint() {
|
||||
CodePointCharStream s = CharStreams.createWithString(
|
||||
CodePointCharStream s = CharStreams.fromString(
|
||||
new StringBuilder().appendCodePoint(0x1F4A9).toString());
|
||||
assertEquals(0x1F4A9, s.LA(1));
|
||||
assertEquals(0, s.index());
|
||||
|
@ -170,7 +170,7 @@ public class TestCodePointCharStream {
|
|||
|
||||
@Test
|
||||
public void singleEmojiCodePointLookAheadPastEndShouldReturnEOF() {
|
||||
CodePointCharStream s = CharStreams.createWithString(
|
||||
CodePointCharStream s = CharStreams.fromString(
|
||||
new StringBuilder().appendCodePoint(0x1F4A9).toString());
|
||||
assertEquals(IntStream.EOF, s.LA(2));
|
||||
assertEquals(0, s.index());
|
||||
|
@ -178,19 +178,19 @@ public class TestCodePointCharStream {
|
|||
|
||||
@Test
|
||||
public void getTextWithLatin() {
|
||||
CodePointCharStream s = CharStreams.createWithString("0123456789");
|
||||
CodePointCharStream s = CharStreams.fromString("0123456789");
|
||||
assertEquals("34567", s.getText(Interval.of(3, 7)));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void getTextWithCJK() {
|
||||
CodePointCharStream s = CharStreams.createWithString("01234\u40946789");
|
||||
CodePointCharStream s = CharStreams.fromString("01234\u40946789");
|
||||
assertEquals("34\u409467", s.getText(Interval.of(3, 7)));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void getTextWithEmoji() {
|
||||
CodePointCharStream s = CharStreams.createWithString(
|
||||
CodePointCharStream s = CharStreams.fromString(
|
||||
new StringBuilder("01234")
|
||||
.appendCodePoint(0x1F522)
|
||||
.append("6789")
|
||||
|
@ -200,19 +200,19 @@ public class TestCodePointCharStream {
|
|||
|
||||
@Test
|
||||
public void toStringWithLatin() {
|
||||
CodePointCharStream s = CharStreams.createWithString("0123456789");
|
||||
CodePointCharStream s = CharStreams.fromString("0123456789");
|
||||
assertEquals("0123456789", s.toString());
|
||||
}
|
||||
|
||||
@Test
|
||||
public void toStringWithCJK() {
|
||||
CodePointCharStream s = CharStreams.createWithString("01234\u40946789");
|
||||
CodePointCharStream s = CharStreams.fromString("01234\u40946789");
|
||||
assertEquals("01234\u40946789", s.toString());
|
||||
}
|
||||
|
||||
@Test
|
||||
public void toStringWithEmoji() {
|
||||
CodePointCharStream s = CharStreams.createWithString(
|
||||
CodePointCharStream s = CharStreams.fromString(
|
||||
new StringBuilder("01234")
|
||||
.appendCodePoint(0x1F522)
|
||||
.append("6789")
|
||||
|
@ -222,19 +222,19 @@ public class TestCodePointCharStream {
|
|||
|
||||
@Test
|
||||
public void lookAheadWithLatin() {
|
||||
CodePointCharStream s = CharStreams.createWithString("0123456789");
|
||||
CodePointCharStream s = CharStreams.fromString("0123456789");
|
||||
assertEquals('5', s.LA(6));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void lookAheadWithCJK() {
|
||||
CodePointCharStream s = CharStreams.createWithString("01234\u40946789");
|
||||
CodePointCharStream s = CharStreams.fromString("01234\u40946789");
|
||||
assertEquals(0x4094, s.LA(6));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void lookAheadWithEmoji() {
|
||||
CodePointCharStream s = CharStreams.createWithString(
|
||||
CodePointCharStream s = CharStreams.fromString(
|
||||
new StringBuilder("01234")
|
||||
.appendCodePoint(0x1F522)
|
||||
.append("6789")
|
||||
|
@ -244,21 +244,21 @@ public class TestCodePointCharStream {
|
|||
|
||||
@Test
|
||||
public void seekWithLatin() {
|
||||
CodePointCharStream s = CharStreams.createWithString("0123456789");
|
||||
CodePointCharStream s = CharStreams.fromString("0123456789");
|
||||
s.seek(5);
|
||||
assertEquals('5', s.LA(1));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void seekWithCJK() {
|
||||
CodePointCharStream s = CharStreams.createWithString("01234\u40946789");
|
||||
CodePointCharStream s = CharStreams.fromString("01234\u40946789");
|
||||
s.seek(5);
|
||||
assertEquals(0x4094, s.LA(1));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void seekWithEmoji() {
|
||||
CodePointCharStream s = CharStreams.createWithString(
|
||||
CodePointCharStream s = CharStreams.fromString(
|
||||
new StringBuilder("01234")
|
||||
.appendCodePoint(0x1F522)
|
||||
.append("6789")
|
||||
|
@ -269,21 +269,21 @@ public class TestCodePointCharStream {
|
|||
|
||||
@Test
|
||||
public void lookBehindWithLatin() {
|
||||
CodePointCharStream s = CharStreams.createWithString("0123456789");
|
||||
CodePointCharStream s = CharStreams.fromString("0123456789");
|
||||
s.seek(6);
|
||||
assertEquals('5', s.LA(-1));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void lookBehindWithCJK() {
|
||||
CodePointCharStream s = CharStreams.createWithString("01234\u40946789");
|
||||
CodePointCharStream s = CharStreams.fromString("01234\u40946789");
|
||||
s.seek(6);
|
||||
assertEquals(0x4094, s.LA(-1));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void lookBehindWithEmoji() {
|
||||
CodePointCharStream s = CharStreams.createWithString(
|
||||
CodePointCharStream s = CharStreams.fromString(
|
||||
new StringBuilder("01234")
|
||||
.appendCodePoint(0x1F522)
|
||||
.append("6789")
|
||||
|
|
|
@ -7,19 +7,27 @@ package org.antlr.v4.runtime;
|
|||
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.io.InputStreamReader;
|
||||
import java.io.Reader;
|
||||
|
||||
import java.nio.ByteBuffer;
|
||||
import java.nio.IntBuffer;
|
||||
import java.nio.charset.Charset;
|
||||
import java.nio.charset.CodingErrorAction;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.nio.channels.Channels;
|
||||
import java.nio.channels.FileChannel;
|
||||
import java.nio.channels.ReadableByteChannel;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.nio.file.Paths;
|
||||
|
||||
/**
|
||||
* Utility class to create {@link CodePointCharStream}s from
|
||||
* various sources of Unicode data.
|
||||
* Utility class to create {@link CharStream}s from various sources of
|
||||
* string data.
|
||||
*
|
||||
* Main entry points are the factory methods {@code CharStreams.fromPath()},
|
||||
* {@code CharStreams.fromString()}, etc.
|
||||
*/
|
||||
public final class CharStreams {
|
||||
private static final int DEFAULT_BUFFER_SIZE = 4096;
|
||||
|
@ -28,14 +36,215 @@ public final class CharStreams {
|
|||
private CharStreams() { }
|
||||
|
||||
/**
|
||||
* Convenience method to create a {@link CodePointCharStream}
|
||||
* for the Unicode code points in a Java {@link String}.
|
||||
* Creates a {@link CharStream} given a path to a UTF-8
|
||||
* encoded file on disk.
|
||||
*
|
||||
* Reads the entire contents of the file into the result before returning.
|
||||
*/
|
||||
public static CodePointCharStream createWithString(String s) {
|
||||
return createWithString(s, IntStream.UNKNOWN_SOURCE_NAME);
|
||||
public static CharStream fromPath(Path path) throws IOException {
|
||||
return fromPath(path, StandardCharsets.UTF_8);
|
||||
}
|
||||
|
||||
public static CodePointCharStream createWithString(String s, String sourceName) {
|
||||
/**
|
||||
* Creates a {@link CharStream} given a path to a file on disk and the
|
||||
* charset of the bytes contained in the file.
|
||||
*
|
||||
* Reads the entire contents of the file into the result before returning.
|
||||
*
|
||||
* For sources encoded in UTF-8, supports the full Unicode code point
|
||||
* range.
|
||||
*
|
||||
* For other sources, only supports Unicode code points up to U+FFFF.
|
||||
*/
|
||||
public static CharStream fromPath(Path path, Charset charset) throws IOException {
|
||||
if (charset.equals(StandardCharsets.UTF_8)) {
|
||||
try (ReadableByteChannel channel = Files.newByteChannel(path)) {
|
||||
return fromChannel(
|
||||
channel,
|
||||
DEFAULT_BUFFER_SIZE,
|
||||
CodingErrorAction.REPLACE,
|
||||
path.toString());
|
||||
}
|
||||
} else {
|
||||
return new ANTLRFileStream(path.toString(), charset.toString());
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates a {@link CharStream} given a string containing a
|
||||
* path to a UTF-8 file on disk.
|
||||
*
|
||||
* Reads the entire contents of the file into the result before returning.
|
||||
*/
|
||||
public static CharStream fromFileName(String fileName) throws IOException {
|
||||
return fromPath(Paths.get(fileName), StandardCharsets.UTF_8);
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates a {@link CharStream} given a string containing a
|
||||
* path to a file on disk and the charset of the bytes
|
||||
* contained in the file.
|
||||
*
|
||||
* Reads the entire contents of the file into the result before returning.
|
||||
*
|
||||
* For sources encoded in UTF-8, supports the full Unicode code point
|
||||
* range.
|
||||
*
|
||||
* For other sources, only supports Unicode code points up to U+FFFF.
|
||||
*/
|
||||
public static CharStream fromFileName(String fileName, Charset charset) throws IOException {
|
||||
return fromPath(Paths.get(fileName), charset);
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Creates a {@link CharStream} given an opened {@link InputStream}
|
||||
* containing UTF-8 bytes.
|
||||
*
|
||||
* Reads the entire contents of the {@code InputStream} into
|
||||
* the result before returning, then closes the {@code InputStream}.
|
||||
*/
|
||||
public static CharStream fromStream(InputStream is) throws IOException {
|
||||
return fromStream(is, StandardCharsets.UTF_8);
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates a {@link CharStream} given an opened {@link InputStream} and the
|
||||
* charset of the bytes contained in the stream.
|
||||
*
|
||||
* Reads the entire contents of the {@code InputStream} into
|
||||
* the result before returning, then closes the {@code InputStream}.
|
||||
*
|
||||
* For sources encoded in UTF-8, supports the full Unicode code point
|
||||
* range.
|
||||
*
|
||||
* For other sources, only supports Unicode code points up to U+FFFF.
|
||||
*/
|
||||
public static CharStream fromStream(InputStream is, Charset charset) throws IOException {
|
||||
if (charset.equals(StandardCharsets.UTF_8)) {
|
||||
try (ReadableByteChannel channel = Channels.newChannel(is)) {
|
||||
return fromChannel(
|
||||
channel,
|
||||
DEFAULT_BUFFER_SIZE,
|
||||
CodingErrorAction.REPLACE,
|
||||
IntStream.UNKNOWN_SOURCE_NAME);
|
||||
}
|
||||
} else {
|
||||
try (InputStreamReader isr = new InputStreamReader(is, charset)) {
|
||||
return new ANTLRInputStream(isr);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates a {@link CharStream} given an opened {@link ReadableByteChannel}
|
||||
* containing UTF-8 bytes.
|
||||
*
|
||||
* Reads the entire contents of the {@code channel} into
|
||||
* the result before returning, then closes the {@code channel}.
|
||||
*/
|
||||
public static CharStream fromChannel(ReadableByteChannel channel) throws IOException {
|
||||
return fromChannel(channel, StandardCharsets.UTF_8);
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates a {@link CharStream} given an opened {@link ReadableByteChannel} and the
|
||||
* charset of the bytes contained in the channel.
|
||||
*
|
||||
* Reads the entire contents of the {@code channel} into
|
||||
* the result before returning, then closes the {@code channel}.
|
||||
*
|
||||
* For sources encoded in UTF-8, supports the full Unicode code point
|
||||
* range.
|
||||
*
|
||||
* For other sources, only supports Unicode code points up to U+FFFF.
|
||||
*/
|
||||
public static CharStream fromChannel(ReadableByteChannel channel, Charset charset) throws IOException {
|
||||
if (charset.equals(StandardCharsets.UTF_8)) {
|
||||
return fromChannel(
|
||||
channel,
|
||||
DEFAULT_BUFFER_SIZE,
|
||||
CodingErrorAction.REPLACE,
|
||||
IntStream.UNKNOWN_SOURCE_NAME);
|
||||
} else {
|
||||
try (InputStream is = Channels.newInputStream(channel);
|
||||
InputStreamReader isr = new InputStreamReader(Channels.newInputStream(channel), charset)) {
|
||||
return new ANTLRInputStream(isr);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates a {@link CharStream} given a {@link Reader}. Closes
|
||||
* the reader before returning.
|
||||
*/
|
||||
public static CodePointCharStream fromReader(Reader r) throws IOException {
|
||||
return fromReader(r, IntStream.UNKNOWN_SOURCE_NAME);
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates a {@link CharStream} given a {@link Reader} and its
|
||||
* source name. Closes the reader before returning.
|
||||
*/
|
||||
public static CodePointCharStream fromReader(Reader r, String sourceName) throws IOException {
|
||||
IntBuffer codePointBuffer = IntBuffer.allocate(DEFAULT_BUFFER_SIZE);
|
||||
int highSurrogate = -1;
|
||||
int curCodeUnit;
|
||||
try {
|
||||
while ((curCodeUnit = r.read()) != -1) {
|
||||
if (!codePointBuffer.hasRemaining()) {
|
||||
// Grow the code point buffer size by 2.
|
||||
IntBuffer newBuffer = IntBuffer.allocate(codePointBuffer.capacity() * 2);
|
||||
codePointBuffer.flip();
|
||||
newBuffer.put(codePointBuffer);
|
||||
codePointBuffer = newBuffer;
|
||||
}
|
||||
if (Character.isHighSurrogate((char) curCodeUnit)) {
|
||||
if (highSurrogate != -1) {
|
||||
// Dangling high surrogate followed by another high surrogate.
|
||||
codePointBuffer.put(highSurrogate);
|
||||
}
|
||||
highSurrogate = curCodeUnit;
|
||||
} else if (Character.isLowSurrogate((char) curCodeUnit)) {
|
||||
if (highSurrogate == -1) {
|
||||
// Low surrogate not preceded by high surrogate.
|
||||
codePointBuffer.put(curCodeUnit);
|
||||
} else {
|
||||
codePointBuffer.put(Character.toCodePoint((char) highSurrogate, (char) curCodeUnit));
|
||||
highSurrogate = -1;
|
||||
}
|
||||
} else {
|
||||
if (highSurrogate != -1) {
|
||||
// Dangling high surrogate followed by a non-surrogate.
|
||||
codePointBuffer.put(highSurrogate);
|
||||
highSurrogate = -1;
|
||||
}
|
||||
codePointBuffer.put(curCodeUnit);
|
||||
}
|
||||
}
|
||||
if (highSurrogate != -1) {
|
||||
// Dangling high surrogate at end of file.
|
||||
codePointBuffer.put(highSurrogate);
|
||||
}
|
||||
codePointBuffer.flip();
|
||||
return new CodePointCharStream(codePointBuffer, sourceName);
|
||||
} finally {
|
||||
r.close();
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates a {@link CharStream} given a {@link String}.
|
||||
*/
|
||||
public static CodePointCharStream fromString(String s) {
|
||||
return fromString(s, IntStream.UNKNOWN_SOURCE_NAME);
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates a {@link CharStream} given a {@link String} and the {@code sourceName}
|
||||
* from which it came.
|
||||
*/
|
||||
public static CodePointCharStream fromString(String s, String sourceName) {
|
||||
// Initial guess assumes no code points > U+FFFF: one code
|
||||
// point for each code unit in the string
|
||||
IntBuffer codePointBuffer = IntBuffer.allocate(s.length());
|
||||
|
@ -56,48 +265,39 @@ public final class CharStreams {
|
|||
return new CodePointCharStream(codePointBuffer, sourceName);
|
||||
}
|
||||
|
||||
public static CodePointCharStream createWithUTF8(Path path) throws IOException {
|
||||
try (ReadableByteChannel channel = Files.newByteChannel(path)) {
|
||||
return createWithUTF8Channel(
|
||||
channel,
|
||||
DEFAULT_BUFFER_SIZE,
|
||||
CodingErrorAction.REPLACE,
|
||||
path.toString());
|
||||
}
|
||||
}
|
||||
|
||||
public static CodePointCharStream createWithUTF8Stream(InputStream is) throws IOException {
|
||||
try (ReadableByteChannel channel = Channels.newChannel(is)) {
|
||||
return createWithUTF8Channel(
|
||||
channel,
|
||||
DEFAULT_BUFFER_SIZE,
|
||||
CodingErrorAction.REPLACE,
|
||||
IntStream.UNKNOWN_SOURCE_NAME);
|
||||
}
|
||||
}
|
||||
|
||||
public static CodePointCharStream createWithUTF8Channel(
|
||||
/**
|
||||
* Creates a {@link CharStream} given an opened {@link ReadableByteChannel}
|
||||
* containing UTF-8 bytes.
|
||||
*
|
||||
* Reads the entire contents of the {@code channel} into
|
||||
* the result before returning, then closes the {@code channel}.
|
||||
*/
|
||||
public static CodePointCharStream fromChannel(
|
||||
ReadableByteChannel channel,
|
||||
int bufferSize,
|
||||
CodingErrorAction decodingErrorAction,
|
||||
String sourceName
|
||||
) throws IOException {
|
||||
ByteBuffer utf8BytesIn = ByteBuffer.allocateDirect(bufferSize);
|
||||
IntBuffer codePointsOut = IntBuffer.allocate(bufferSize);
|
||||
boolean endOfInput = false;
|
||||
UTF8CodePointDecoder decoder = new UTF8CodePointDecoder(decodingErrorAction);
|
||||
while (!endOfInput) {
|
||||
int bytesRead = channel.read(utf8BytesIn);
|
||||
endOfInput = (bytesRead == -1);
|
||||
utf8BytesIn.flip();
|
||||
codePointsOut = decoder.decodeCodePointsFromBuffer(
|
||||
utf8BytesIn,
|
||||
codePointsOut,
|
||||
endOfInput);
|
||||
utf8BytesIn.compact();
|
||||
try {
|
||||
ByteBuffer utf8BytesIn = ByteBuffer.allocateDirect(bufferSize);
|
||||
IntBuffer codePointsOut = IntBuffer.allocate(bufferSize);
|
||||
boolean endOfInput = false;
|
||||
UTF8CodePointDecoder decoder = new UTF8CodePointDecoder(decodingErrorAction);
|
||||
while (!endOfInput) {
|
||||
int bytesRead = channel.read(utf8BytesIn);
|
||||
endOfInput = (bytesRead == -1);
|
||||
utf8BytesIn.flip();
|
||||
codePointsOut = decoder.decodeCodePointsFromBuffer(
|
||||
utf8BytesIn,
|
||||
codePointsOut,
|
||||
endOfInput);
|
||||
utf8BytesIn.compact();
|
||||
}
|
||||
codePointsOut.limit(codePointsOut.position());
|
||||
codePointsOut.flip();
|
||||
return new CodePointCharStream(codePointsOut, sourceName);
|
||||
} finally {
|
||||
channel.close();
|
||||
}
|
||||
codePointsOut.limit(codePointsOut.position());
|
||||
codePointsOut.flip();
|
||||
return new CodePointCharStream(codePointsOut, sourceName);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -381,7 +381,7 @@ public class TestATNLexerInterpreter extends BaseJavaToolTest {
|
|||
|
||||
protected void checkLexerMatches(LexerGrammar lg, String inputString, String expecting) {
|
||||
ATN atn = createATN(lg, true);
|
||||
CharStream input = CharStreams.createWithString(inputString);
|
||||
CharStream input = CharStreams.fromString(inputString);
|
||||
ATNState startState = atn.modeNameToStartState.get("DEFAULT_MODE");
|
||||
DOTGenerator dot = new DOTGenerator(lg);
|
||||
// System.out.println(dot.getDOT(startState, true));
|
||||
|
|
|
@ -161,7 +161,7 @@ public class TestUnicodeGrammar extends BaseJavaToolTest {
|
|||
String inputText) throws Exception {
|
||||
Grammar grammar = new Grammar(grammarText);
|
||||
LexerInterpreter lexEngine = grammar.createLexerInterpreter(
|
||||
CharStreams.createWithString(inputText));
|
||||
CharStreams.fromString(inputText));
|
||||
CommonTokenStream tokens = new CommonTokenStream(lexEngine);
|
||||
GrammarParserInterpreter parser = grammar.createGrammarParserInterpreter(tokens);
|
||||
ParseTree parseTree = parser.parse(grammar.rules.get(rootRule).index);
|
||||
|
|
|
@ -26,7 +26,6 @@ import java.lang.reflect.Constructor;
|
|||
import java.lang.reflect.InvocationTargetException;
|
||||
import java.lang.reflect.Method;
|
||||
import java.nio.charset.Charset;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.nio.file.Paths;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
@ -157,28 +156,12 @@ public class TestRig {
|
|||
|
||||
Charset charset = ( encoding == null ? Charset.defaultCharset () : Charset.forName(encoding) );
|
||||
if ( inputFiles.size()==0 ) {
|
||||
CharStream charStream;
|
||||
if ( charset.equals(StandardCharsets.UTF_8)) {
|
||||
charStream = CharStreams.createWithUTF8Stream(System.in);
|
||||
}
|
||||
else {
|
||||
try ( InputStreamReader r = new InputStreamReader(System.in, charset) ) {
|
||||
charStream = new ANTLRInputStream(r);
|
||||
}
|
||||
}
|
||||
CharStream charStream = CharStreams.fromStream(System.in, charset);
|
||||
process(lexer, parserClass, parser, charStream);
|
||||
return;
|
||||
}
|
||||
for (String inputFile : inputFiles) {
|
||||
CharStream charStream;
|
||||
if ( charset.equals(StandardCharsets.UTF_8) ) {
|
||||
charStream = CharStreams.createWithUTF8(Paths.get(inputFile));
|
||||
}
|
||||
else {
|
||||
try ( InputStreamReader r = new InputStreamReader(System.in, charset) ) {
|
||||
charStream = new ANTLRInputStream(r);
|
||||
}
|
||||
}
|
||||
CharStream charStream = CharStreams.fromPath(Paths.get(inputFile), charset);
|
||||
if ( inputFiles.size()>1 ) {
|
||||
System.err.println(inputFile);
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue