From 6dd9a3fbe3b3bd32be0e0bc0111dfdc78767c546 Mon Sep 17 00:00:00 2001
From: parrt
Date: Wed, 29 Mar 2017 14:19:37 -0700
Subject: [PATCH] respond to @bhamiltoncx comments

---
 doc/parsing-binary-files.md |  8 ++++----
 doc/unicode.md              | 18 ++++++++++++------
 2 files changed, 16 insertions(+), 10 deletions(-)

diff --git a/doc/parsing-binary-files.md b/doc/parsing-binary-files.md
index a23760045..d7e901776 100644
--- a/doc/parsing-binary-files.md
+++ b/doc/parsing-binary-files.md
@@ -42,7 +42,7 @@ public class WriteBinaryFile {
 	};
 
 	public static void main(String[] args) throws IOException {
-		Files.write(new File("resources/ips").toPath(), bytes);
+		Files.write(new File("/tmp/ips").toPath(), bytes);
 	}
 }
 ```
@@ -50,14 +50,14 @@ public class WriteBinaryFile {
 Now we need to create a stream of bytes satisfactory to ANTLR, which is as simple as:
 
 ```java
-ANTLRFileStream bytesAsChar = new ANTLRFileStream("resources/ips", "ISO-8859-1");
+CharStream bytesAsChar = CharStreams.fromFileName("/tmp/ips", StandardCharsets.ISO_8859_1);
 ```
 
 The `ISO-8859-1` encoding is just the 8-bit char encoding for LATIN-1, which effectively tells the stream to treat each byte as a character. That's what we want.
 
 Then we have the usual test rig:
 
 ```java
-//ANTLRFileStream bytesAsChar = new ANTLRFileStream("resources/ips", "ISO-8859-1"); DEPRECATED in 4.7
+//ANTLRFileStream bytesAsChar = new ANTLRFileStream("/tmp/ips", "ISO-8859-1"); DEPRECATED in 4.7
 CharStream bytesAsChar = CharStreams.fromFileName("/tmp/ips", StandardCharsets.ISO_8859_1);
 IPLexer lexer = new IPLexer(bytesAsChar);
 CommonTokenStream tokens = new CommonTokenStream(lexer);
@@ -127,7 +127,7 @@ class BinaryANTLRFileStream extends ANTLRFileStream {
 The new test code starts out like this:
 
 ```java
-ANTLRFileStream bytesAsChar = new BinaryANTLRFileStream("resources/ips");
+ANTLRFileStream bytesAsChar = new BinaryANTLRFileStream("/tmp/ips");
 IPLexer lexer = new IPLexer(bytesAsChar);
 ...
 ```
diff --git a/doc/unicode.md b/doc/unicode.md
index 19319d7ea..d1ed32842 100644
--- a/doc/unicode.md
+++ b/doc/unicode.md
@@ -4,10 +4,10 @@ Prior to ANTLR 4.7, generated lexers only supported part of the Unicode standard
 long as the input `CharStream` is opened using `CharStreams.fromPath()`,
 `CharStreams.fromFileName()`, etc... or the equivalent method for your runtime's
 language.
 
-The deprecated `ANTLRInputStream` and `ANTLRFileStream` APIs only support Unicode code points up to `U+FFFF`.
+The deprecated `ANTLRInputStream` and `ANTLRFileStream` *Java-target* APIs only support Unicode code points up to `U+FFFF`.
 
 A big shout out to Ben Hamilton (github bhamiltoncx) for his superhuman
-efforts across all targets to get true Unicode 3.1 support for U+10FFFF.
+efforts across all targets to get true support for U+10FFFF code points.
 
 ## Example
@@ -61,7 +61,7 @@ Code for **4.6** looked like this:
 
 ```java
-CharStream input = new ANTLRFileStream("myinputfile")
+CharStream input = new ANTLRFileStream("myinputfile");
 JavaLexer lexer = new JavaLexer(input);
 CommonTokenStream tokens = new CommonTokenStream(lexer);
 ```
 
@@ -77,7 +77,7 @@ CommonTokenStream tokens = new CommonTokenStream(lexer);
 Or, if you'd like to specify the file encoding:
 
 ```java
-CharStream input = CharStreams.fromFileName("inputfile", StandardCharsets.UTF_16);
+CharStream input = CharStreams.fromFileName("inputfile", Charset.forName("windows-1252"));
 ```
 
 ### Motivation
 
@@ -112,7 +112,13 @@ of unbuffered input.
 See the [ANTLR 4 book](https://www.amazon.com/Definitive-AN). These streams are useful for processing infinite streams *during the parse* and require that you manually buffer characters. Use `UnbufferedCharStream` and `UnbufferedTokenStream`.
 
 ```java
-CharStream input = new UnbufferedCharStream(is); CSVLexer lex = new CSVLexer(input); // copy text out of sliding buffer and store in tokens lex.setTokenFactory(new CommonTokenFactory(true)); TokenStream tokens = new UnbufferedTokenStream(lex); CSVParser parser = new CSVParser(tokens); parser.setBuildParseTree(false); parser.file();
+CharStream input = new UnbufferedCharStream(is);
+CSVLexer lex = new CSVLexer(input); // copy text out of sliding buffer and store in tokens
+lex.setTokenFactory(new CommonTokenFactory(true));
+TokenStream tokens = new UnbufferedTokenStream(lex);
+CSVParser parser = new CSVParser(tokens);
+parser.setBuildParseTree(false);
+parser.file();
 ```
 
 Your grammar that needs to have embedded actions that access the tokens as they are created, but before they disappear and are garbage collected. For example,
@@ -133,4 +139,4 @@ implementation of `CharStream.getText` in
 allows `Token.getText` to be called at any time regardless of the input stream
 implementation.
 
-*Currently, only Java and C# have these unbuffered streams implemented*.
\ No newline at end of file
+*Currently, only Java, C++, and C# have these unbuffered streams implemented*.
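
For readers following the migration this patch documents, here is a minimal, self-contained sketch of the 4.7-style rig for the binary-file example, assembled from the fragments above. `IPLexer`, the `/tmp/ips` path, and `StandardCharsets.ISO_8859_1` come from the patch itself; the class name `DumpIPTokens` and the choice to print the token stream (rather than invoke a generated parser) are assumptions made here to keep the sketch independent of the rest of the `IP` grammar.

```java
import java.io.IOException;
import java.nio.charset.StandardCharsets;

import org.antlr.v4.runtime.CharStream;
import org.antlr.v4.runtime.CharStreams;
import org.antlr.v4.runtime.CommonTokenStream;
import org.antlr.v4.runtime.Token;

public class DumpIPTokens {
    public static void main(String[] args) throws IOException {
        // ISO-8859-1 maps each byte 0x00-0xFF to the code point of the same value,
        // so every byte of the binary file arrives as one "character".
        CharStream bytesAsChar =
                CharStreams.fromFileName("/tmp/ips", StandardCharsets.ISO_8859_1);

        // IPLexer is the lexer generated from the IP grammar in parsing-binary-files.md.
        IPLexer lexer = new IPLexer(bytesAsChar);
        CommonTokenStream tokens = new CommonTokenStream(lexer);
        tokens.fill(); // run the lexer to EOF so all tokens are buffered

        for (Token t : tokens.getTokens()) {
            System.out.println(t);
        }
    }
}
```

If the grammar's start rule is available, the same `tokens` object can be handed to the generated parser exactly as in the test rig shown in the patch.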