From c3ed9a992db2ca63927bd77769097d38ff277af3 Mon Sep 17 00:00:00 2001
From: parrt <parrt@cs.usfca.edu>
Date: Wed, 29 Mar 2017 13:56:57 -0700
Subject: [PATCH]  Add more about unbuffered streams. tweak style of code

---
 doc/unicode.md                                | 32 +++++++++++++++++--
 .../v4/runtime/UnbufferedCharStream.java      | 21 ++++++++----
 2 files changed, 44 insertions(+), 9 deletions(-)
diff --git a/doc/unicode.md b/doc/unicode.md
index 0996cc842..19319d7ea 100644
--- a/doc/unicode.md
+++ b/doc/unicode.md
@@ -98,13 +98,39 @@ on-the-fly compiler (JIT) is unable to perform the same optimizations
 so stick with either the old or the new streams, if performance is
 a primary concern. See the [extreme debugging and spelunking](https://github.com/antlr/antlr4/pull/1781) needed to identify this issue in our timing rig.
 
-### Character Buffering
+### Character Buffering, Unbuffered Streams
 
 The ANTLR character streams still buffer all the input when you create
-the stream, as they have done for ~20 years. If you need unbuffered
+the stream, as they have done for ~20 years. 
+
+If you need unbuffered
 access, please note that it becomes challenging to create
 parse trees. The parse tree has to point to tokens which will either
 point into a stale location in an unbuffered stream or you have to copy
 the characters out of the buffer into the token. That defeats the purpose
 of unbuffered input. See the [ANTLR 4 book](https://www.amazon.com/Definitive-ANTLR-4-Reference/dp/1934356999) "13.8 Unbuffered Character and Token Streams". Unbuffered streams are primarily
-useful for processing infinite streams *during the parse* and require that you manually buffer characters.
+useful for processing infinite streams *during the parse* and require that you manually buffer characters. Use `UnbufferedCharStream` and `UnbufferedTokenStream`.
+
+```java
+CharStream input = new UnbufferedCharStream(is); CSVLexer lex = new CSVLexer(input); /* copy text out of sliding buffer and store in tokens */ lex.setTokenFactory(new CommonTokenFactory(true)); TokenStream tokens = new UnbufferedTokenStream<CommonToken>(lex); CSVParser parser = new CSVParser(tokens); parser.setBuildParseTree(false); parser.file();
+```
+
+Your grammar then needs embedded actions that access the tokens as they are created, before they disappear and are garbage collected. For example,
+
+```
+data : a=INT {int x = Integer.parseInt($a.text);} ;
+```
+
+From the code comments of `CommonTokenFactory`:
+
+> That `true` in `new CommonTokenFactory(true)` indicates that `CommonToken.setText` should be called after
+constructing tokens to explicitly set the text. This is useful for cases
+where the input stream might not be able to provide arbitrary substrings
+of text from the input after the lexer creates a token (e.g. the
+implementation of `CharStream.getText` in
+`UnbufferedCharStream` throws an
+`UnsupportedOperationException`). Explicitly setting the token text
+allows `Token.getText` to be called at any time regardless of the
+input stream implementation.
+
+*Currently, only Java and C# have these unbuffered streams implemented*.
\ No newline at end of file
diff --git a/runtime/Java/src/org/antlr/v4/runtime/UnbufferedCharStream.java b/runtime/Java/src/org/antlr/v4/runtime/UnbufferedCharStream.java
index 11b42d2a8..a2e062fb2 100644
--- a/runtime/Java/src/org/antlr/v4/runtime/UnbufferedCharStream.java
+++ b/runtime/Java/src/org/antlr/v4/runtime/UnbufferedCharStream.java
@@ -18,6 +18,9 @@ import java.util.Arrays;
  *  for efficiency and also buffers while a mark exists (set by the
  *  lookahead prediction in parser). "Unbuffered" here refers to fact
  *  that it doesn't buffer all data, not that's it's on demand loading of char.
+ *
+ *  As of 4.7, the buffer elements are ints, not 16-bit chars, in order
+ *  to support code points up to U+10FFFF.
  */
 public class UnbufferedCharStream implements CharStream {
 	/**
@@ -153,25 +156,31 @@ public class UnbufferedCharStream implements CharStream {
 				int c = nextChar();
 				if (c > Character.MAX_VALUE || c == IntStream.EOF) {
 					add(c);
-				} else {
+				}
+				else {
 					char ch = (char) c;
 					if (Character.isLowSurrogate(ch)) {
 						throw new RuntimeException("Invalid UTF-16 (low surrogate with no preceding high surrogate)");
-					} else if (Character.isHighSurrogate(ch)) {
+					}
+					else if (Character.isHighSurrogate(ch)) {
 						int lowSurrogate = nextChar();
 						if (lowSurrogate > Character.MAX_VALUE) {
 							throw new RuntimeException("Invalid UTF-16 (high surrogate followed by code point > U+FFFF");
-						} else if (lowSurrogate == IntStream.EOF) {
+						}
+						else if (lowSurrogate == IntStream.EOF) {
 							throw new RuntimeException("Invalid UTF-16 (dangling high surrogate at end of file)");
-						} else {
+						}
+						else {
 							char lowSurrogateChar = (char) lowSurrogate;
 							if (Character.isLowSurrogate(lowSurrogateChar)) {
 								add(Character.toCodePoint(ch, lowSurrogateChar));
-							} else {
+							}
+							else {
 								throw new RuntimeException("Invalid UTF-16 (dangling high surrogate");
 							}
 						}
-					} else {
+					}
+					else {
 						add(c);
 					}
 				}