From c3ed9a992db2ca63927bd77769097d38ff277af3 Mon Sep 17 00:00:00 2001 From: parrt Date: Wed, 29 Mar 2017 13:56:57 -0700 Subject: [PATCH] Add more about unbuffered streams. tweak style of code --- doc/unicode.md | 32 +++++++++++++++++-- .../v4/runtime/UnbufferedCharStream.java | 21 ++++++++---- 2 files changed, 44 insertions(+), 9 deletions(-) diff --git a/doc/unicode.md b/doc/unicode.md index 0996cc842..19319d7ea 100644 --- a/doc/unicode.md +++ b/doc/unicode.md @@ -98,13 +98,39 @@ on-the-fly compiler (JIT) is unable to perform the same optimizations so stick with either the old or the new streams, if performance is a primary concern. See the [extreme debugging and spelunking](https://github.com/antlr/antlr4/pull/1781) needed to identify this issue in our timing rig. -### Character Buffering +### Character Buffering, Unbuffered streams The ANTLR character streams still buffer all the input when you create -the stream, as they have done for ~20 years. If you need unbuffered +the stream, as they have done for ~20 years. + +If you need unbuffered access, please note that it becomes challenging to create parse trees. The parse tree has to point to tokens which will either point into a stale location in an unbuffered stream or you have to copy the characters out of the buffer into the token. That defeats the purpose of unbuffered input. See the [ANTLR 4 book](https://www.amazon.com/Definitive-ANTLR-4-Reference/dp/1934356999) "13.8 Unbuffered Character and Token Streams". Unbuffered streams are primarily -useful for processing infinite streams *during the parse* and require that you manually buffer characters. +useful for processing infinite streams *during the parse* and require that you manually buffer characters. Use `UnbufferedCharStream` and `UnbufferedTokenStream`. 
+ +```java +CharStream input = new UnbufferedCharStream(is); CSVLexer lex = new CSVLexer(input); // copy text out of sliding buffer and store in tokens lex.setTokenFactory(new CommonTokenFactory(true)); TokenStream tokens = new UnbufferedTokenStream(lex); CSVParser parser = new CSVParser(tokens); parser.setBuildParseTree(false); parser.file(); +``` + +Your grammar needs to have embedded actions that access the tokens as they are created, but before they disappear and are garbage collected. For example, + +``` +data : a=INT {int x = Integer.parseInt($a.text);} ; +``` + +From the code comments of `CommonTokenFactory`: + +> That `true` in `new CommonTokenFactory(true)` indicates whether `CommonToken.setText` should be called after +constructing tokens to explicitly set the text. This is useful for cases +where the input stream might not be able to provide arbitrary substrings +of text from the input after the lexer creates a token (e.g. the +implementation of `CharStream.getText` in +`UnbufferedCharStream` throws an +`UnsupportedOperationException`). Explicitly setting the token text +allows `Token.getText` to be called at any time regardless of the +input stream implementation. + +*Currently, only Java and C# have these unbuffered streams implemented*. \ No newline at end of file diff --git a/runtime/Java/src/org/antlr/v4/runtime/UnbufferedCharStream.java b/runtime/Java/src/org/antlr/v4/runtime/UnbufferedCharStream.java index 11b42d2a8..a2e062fb2 100644 --- a/runtime/Java/src/org/antlr/v4/runtime/UnbufferedCharStream.java +++ b/runtime/Java/src/org/antlr/v4/runtime/UnbufferedCharStream.java @@ -18,6 +18,9 @@ import java.util.Arrays; * for efficiency and also buffers while a mark exists (set by the * lookahead prediction in parser). "Unbuffered" here refers to fact * that it doesn't buffer all data, not that's it's on demand loading of char. + * + * As of 4.7, the buffer elements are ints not 16-bit chars to support + * U+10FFFF code points. 
*/ public class UnbufferedCharStream implements CharStream { /** @@ -153,25 +156,31 @@ public class UnbufferedCharStream implements CharStream { int c = nextChar(); if (c > Character.MAX_VALUE || c == IntStream.EOF) { add(c); - } else { + } + else { char ch = (char) c; if (Character.isLowSurrogate(ch)) { throw new RuntimeException("Invalid UTF-16 (low surrogate with no preceding high surrogate)"); - } else if (Character.isHighSurrogate(ch)) { + } + else if (Character.isHighSurrogate(ch)) { int lowSurrogate = nextChar(); if (lowSurrogate > Character.MAX_VALUE) { throw new RuntimeException("Invalid UTF-16 (high surrogate followed by code point > U+FFFF"); - } else if (lowSurrogate == IntStream.EOF) { + } + else if (lowSurrogate == IntStream.EOF) { throw new RuntimeException("Invalid UTF-16 (dangling high surrogate at end of file)"); - } else { + } + else { char lowSurrogateChar = (char) lowSurrogate; if (Character.isLowSurrogate(lowSurrogateChar)) { add(Character.toCodePoint(ch, lowSurrogateChar)); - } else { + } + else { throw new RuntimeException("Invalid UTF-16 (dangling high surrogate"); } } - } else { + } + else { add(c); } }