Merge pull request #1765 from bhamiltoncx/unicode-cleanup-and-doc

Tidy up CharStreams and add new doc/unicode.md
2017-03-16 17:03:06 -07:00 · 2017-03-16 17:03:06 -07:00 · 1f6a329692
parent b2e51e20b7 4f2168600d
commit 1f6a329692
10 changed files with 415 additions and 126 deletions
--- a/doc/faq/general.md
+++ b/doc/faq/general.md
@ -82,7 +82,7 @@ Make sure to use two-stage parsing. See example in [bug report](https://github.c

 ```Java

-CharStream input = new ANTLRFileStream(args[0]);
+CharStream input = CharStreams.fromPath(Paths.get(args[0]));
 ExprLexer lexer = new ExprLexer(input);
 CommonTokenStream tokens = new CommonTokenStream(lexer);
 ExprParser parser = new ExprParser(tokens);
--- a/doc/interpreters.md
+++ b/doc/interpreters.md
@ -30,7 +30,7 @@ public static ParseTree parse(String fileName,
    throws IOException
 {
    final Grammar g = Grammar.load(combinedGrammarFileName);
-    LexerInterpreter lexEngine = g.createLexerInterpreter(new ANTLRFileStream(fileName));
+    LexerInterpreter lexEngine = g.createLexerInterpreter(CharStreams.fromPath(Paths.get(fileName)));
    CommonTokenStream tokens = new CommonTokenStream(lexEngine);
    ParserInterpreter parser = g.createParserInterpreter(tokens);
    ParseTree t = parser.parse(g.getRule(startRule).index);
@ -58,7 +58,7 @@ public static ParseTree parse(String fileNameToParse,
 {
    final LexerGrammar lg = (LexerGrammar) Grammar.load(lexerGrammarFileName);
    final Grammar pg = Grammar.load(parserGrammarFileName, lg);
-    ANTLRFileStream input = new ANTLRFileStream(fileNameToParse);
+    CharStream input = CharStreams.fromPath(Paths.get(fileNameToParse));
    LexerInterpreter lexEngine = lg.createLexerInterpreter(input);
    CommonTokenStream tokens = new CommonTokenStream(lexEngine);
    ParserInterpreter parser = pg.createParserInterpreter(tokens);
--- a/doc/unicode.md
+++ b/doc/unicode.md
@ -0,0 +1,68 @@
+# Lexers and Unicode text
+
+Until ANTLR 4.7, generated lexers only supported part of the Unicode standard
+(code points up to `U+FFFF`).
+
+With ANTLR 4.7 and later, lexers as well as all languages' runtimes
+support the full range of Unicode code points up to `U+10FFFF`, as
+long as the input `CharStream` is opened using `CharStreams.fromPath()`
+or the equivalent method for your runtime's language.
+
+# Unicode Code Points in Lexer Grammars
+
+To refer to Unicode [code points](https://en.wikipedia.org/wiki/Code_point)
+in lexer grammars, use the `\u` string escape. For example, to create
+a lexer rule for a single Cyrillic character by creating a range from
+`U+0400` to `U+04FF`:
+
+```ANTLR
+CYRILLIC : '\u0400'..'\u04FF' ;
+```
+
+Unicode code points larger than `U+FFFF` must use the extended `\u{12345}` syntax.
+For example, to create a lexer rule for a selection of smiley faces
+from the [Emoticons Unicode block](http://www.unicode.org/charts/PDF/U1F600.pdf):
+
+```ANTLR
+EMOTICONS : ('\u{1F600}' | '\u{1F602}' | '\u{1F615}') ;
+```
+
+Finally, lexer char sets can include Unicode properties:
+
+```ANTLR
+EMOJI : [\p{Emoji}] ;
+JAPANESE : [\p{Script=Hiragana}\p{Script=Katakana}\p{Script=Han}] ;
+NOT_CYRILLIC : [\P{Script=Cyrillic}] ;
+```
+
+See [lexer-rules.md](lexer-rules.md#lexer-rule-elements) for more detail on Unicode
+escapes in lexer rules.
+
+# CharStreams and UTF-8
+
+If your lexer grammar contains code points larger than `U+FFFF`, your
+lexer client code must open the file using `CharStreams.fromPath()` or
+equivalent in your runtime's language, or input values larger than
+`U+FFFF` will *not* match.
+
+For backwards compatibility, the existing `ANTLRInputStream` and
+`ANTLRFileStream` APIs only support Unicode code points up to `U+FFFF`.
+
+The existing `TestRig` command-line interface supports all Unicode
+code points.
+
+# Example
+
+If you have generated a lexer named `UnicodeLexer`:
+
+```Java
+public static void main(String[] args) throws IOException {
+  CharStream charStream = CharStreams.fromPath(Paths.get(args[0]));
+  Lexer lexer = new UnicodeLexer(charStream);
+  CommonTokenStream tokens = new CommonTokenStream(lexer);
+  tokens.fill();
+  for (Token token : tokens.getTokens()) {
+    System.out.println("Got token: " + token.toString());
+  }
+}
+```
--- a/runtime-testsuite/test/org/antlr/v4/test/runtime/java/BaseJavaTest.java
+++ b/runtime-testsuite/test/org/antlr/v4/test/runtime/java/BaseJavaTest.java
@ -923,7 +923,7 @@ public class BaseJavaTest implements RuntimeTestSupport {
 			"\n" +
 			"public class Test {\n" +
 			"    public static void main(String[] args) throws Exception {\n" +
-			"        CharStream input = CharStreams.createWithUTF8(Paths.get(args[0]));\n" +
+			"        CharStream input = CharStreams.fromPath(Paths.get(args[0]));\n" +
 			"        <lexerName> lex = new <lexerName>(input);\n" +
 			"        CommonTokenStream tokens = new CommonTokenStream(lex);\n" +
 			"        <createParser>\n"+
@ -980,7 +980,7 @@ public class BaseJavaTest implements RuntimeTestSupport {
 			"\n" +
 			"public class Test {\n" +
 			"    public static void main(String[] args) throws Exception {\n" +
-			"        CharStream input = CharStreams.createWithUTF8(Paths.get(args[0]));\n" +
+			"        CharStream input = CharStreams.fromPath(Paths.get(args[0]));\n" +
 			"        <lexerName> lex = new <lexerName>(input);\n" +
 			"        CommonTokenStream tokens = new CommonTokenStream(lex);\n" +
 			"        tokens.fill();\n" +
--- a/runtime-testsuite/test/org/antlr/v4/test/runtime/java/TestCharStreams.java
+++ b/runtime-testsuite/test/org/antlr/v4/test/runtime/java/TestCharStreams.java
@ -10,6 +10,7 @@ import static org.junit.Assert.assertEquals;
 import java.io.File;
 import java.io.IOException;
 import java.io.InputStream;
+import java.io.Reader;

 import java.nio.channels.SeekableByteChannel;
 import java.nio.charset.CharacterCodingException;
@ -20,6 +21,7 @@ import java.nio.file.Path;

 import java.util.Arrays;

+import org.antlr.v4.runtime.CharStream;
 import org.antlr.v4.runtime.CharStreams;
 import org.antlr.v4.runtime.CodePointCharStream;

@ -36,16 +38,16 @@ public class TestCharStreams {
 	public ExpectedException thrown = ExpectedException.none();

 	@Test
-	public void createWithBMPStringHasExpectedSize() {
-		CodePointCharStream s = CharStreams.createWithString("hello");
+	public void fromBMPStringHasExpectedSize() {
+		CharStream s = CharStreams.fromString("hello");
 		assertEquals(5, s.size());
 		assertEquals(0, s.index());
 		assertEquals("hello", s.toString());
 	}

 	@Test
-	public void createWithSMPStringHasExpectedSize() {
-		CodePointCharStream s = CharStreams.createWithString(
+	public void fromSMPStringHasExpectedSize() {
+		CharStream s = CharStreams.fromString(
 				"hello \uD83C\uDF0E");
 		assertEquals(7, s.size());
 		assertEquals(0, s.index());
@ -53,10 +55,10 @@ public class TestCharStreams {
 	}

 	@Test
-	public void createWithBMPUTF8PathHasExpectedSize() throws Exception {
+	public void fromBMPUTF8PathHasExpectedSize() throws Exception {
 		Path p = folder.newFile().toPath();
 		Files.write(p, "hello".getBytes(StandardCharsets.UTF_8));
-		CodePointCharStream s = CharStreams.createWithUTF8(p);
+		CharStream s = CharStreams.fromPath(p);
 		assertEquals(5, s.size());
 		assertEquals(0, s.index());
 		assertEquals("hello", s.toString());
@ -64,10 +66,10 @@ public class TestCharStreams {
 	}

 	@Test
-	public void createWithSMPUTF8PathHasExpectedSize() throws Exception {
+	public void fromSMPUTF8PathHasExpectedSize() throws Exception {
 		Path p = folder.newFile().toPath();
 		Files.write(p, "hello \uD83C\uDF0E".getBytes(StandardCharsets.UTF_8));
-		CodePointCharStream s = CharStreams.createWithUTF8(p);
+		CharStream s = CharStreams.fromPath(p);
 		assertEquals(7, s.size());
 		assertEquals(0, s.index());
 		assertEquals("hello \uD83C\uDF0E", s.toString());
@ -75,11 +77,11 @@ public class TestCharStreams {
 	}

 	@Test
-	public void createWithBMPUTF8InputStreamHasExpectedSize() throws Exception {
+	public void fromBMPUTF8InputStreamHasExpectedSize() throws Exception {
 		Path p = folder.newFile().toPath();
 		Files.write(p, "hello".getBytes(StandardCharsets.UTF_8));
 		try (InputStream is = Files.newInputStream(p)) {
-			CodePointCharStream s = CharStreams.createWithUTF8Stream(is);
+			CharStream s = CharStreams.fromStream(is);
 			assertEquals(5, s.size());
 			assertEquals(0, s.index());
 			assertEquals("hello", s.toString());
@ -87,11 +89,11 @@ public class TestCharStreams {
 	}

 	@Test
-	public void createWithSMPUTF8InputStreamHasExpectedSize() throws Exception {
+	public void fromSMPUTF8InputStreamHasExpectedSize() throws Exception {
 		Path p = folder.newFile().toPath();
 		Files.write(p, "hello \uD83C\uDF0E".getBytes(StandardCharsets.UTF_8));
 		try (InputStream is = Files.newInputStream(p)) {
-			CodePointCharStream s = CharStreams.createWithUTF8Stream(is);
+			CharStream s = CharStreams.fromStream(is);
 			assertEquals(7, s.size());
 			assertEquals(0, s.index());
 			assertEquals("hello \uD83C\uDF0E", s.toString());
@ -99,11 +101,11 @@ public class TestCharStreams {
 	}

 	@Test
-	public void createWithBMPUTF8ChannelHasExpectedSize() throws Exception {
+	public void fromBMPUTF8ChannelHasExpectedSize() throws Exception {
 		Path p = folder.newFile().toPath();
 		Files.write(p, "hello".getBytes(StandardCharsets.UTF_8));
 		try (SeekableByteChannel c = Files.newByteChannel(p)) {
-			CodePointCharStream s = CharStreams.createWithUTF8Channel(
+			CharStream s = CharStreams.fromChannel(
 					c, 4096, CodingErrorAction.REPLACE, "foo");
 			assertEquals(5, s.size());
 			assertEquals(0, s.index());
@ -113,11 +115,11 @@ public class TestCharStreams {
 	}

 	@Test
-	public void createWithSMPUTF8ChannelHasExpectedSize() throws Exception {
+	public void fromSMPUTF8ChannelHasExpectedSize() throws Exception {
 		Path p = folder.newFile().toPath();
 		Files.write(p, "hello \uD83C\uDF0E".getBytes(StandardCharsets.UTF_8));
 		try (SeekableByteChannel c = Files.newByteChannel(p)) {
-			CodePointCharStream s = CharStreams.createWithUTF8Channel(
+			CharStream s = CharStreams.fromChannel(
 					c, 4096, CodingErrorAction.REPLACE, "foo");
 			assertEquals(7, s.size());
 			assertEquals(0, s.index());
@ -127,13 +129,13 @@ public class TestCharStreams {
 	}

 	@Test
-	public void createWithInvalidUTF8BytesChannelReplacesWithSubstCharInReplaceMode()
+	public void fromInvalidUTF8BytesChannelReplacesWithSubstCharInReplaceMode()
 		throws Exception {
 		Path p = folder.newFile().toPath();
 		byte[] toWrite = new byte[] { (byte)0xCA, (byte)0xFE, (byte)0xFE, (byte)0xED };
 		Files.write(p, toWrite);
 		try (SeekableByteChannel c = Files.newByteChannel(p)) {
-			CodePointCharStream s = CharStreams.createWithUTF8Channel(
+			CharStream s = CharStreams.fromChannel(
 					c, 4096, CodingErrorAction.REPLACE, "foo");
 			assertEquals(3, s.size());
 			assertEquals(0, s.index());
@ -142,22 +144,22 @@ public class TestCharStreams {
 	}

 	@Test
-	public void createWithInvalidUTF8BytesThrowsInReportMode() throws Exception {
+	public void fromInvalidUTF8BytesThrowsInReportMode() throws Exception {
 		Path p = folder.newFile().toPath();
 		byte[] toWrite = new byte[] { (byte)0xCA, (byte)0xFE };
 		Files.write(p, toWrite);
 		try (SeekableByteChannel c = Files.newByteChannel(p)) {
 			thrown.expect(CharacterCodingException.class);
-			CharStreams.createWithUTF8Channel(c, 4096, CodingErrorAction.REPORT, "foo");
+			CharStreams.fromChannel(c, 4096, CodingErrorAction.REPORT, "foo");
 		}
 	}

 	@Test
-	public void createWithSMPUTF8SequenceStraddlingBufferBoundary() throws Exception {
+	public void fromSMPUTF8SequenceStraddlingBufferBoundary() throws Exception {
 		Path p = folder.newFile().toPath();
 		Files.write(p, "hello \uD83C\uDF0E".getBytes(StandardCharsets.UTF_8));
 		try (SeekableByteChannel c = Files.newByteChannel(p)) {
-			CodePointCharStream s = CharStreams.createWithUTF8Channel(
+			CharStream s = CharStreams.fromChannel(
 					c,
 					// Note this buffer size ensures the SMP code point
 					// straddles the boundary of two buffers
@ -169,4 +171,40 @@ public class TestCharStreams {
 			assertEquals("hello \uD83C\uDF0E", s.toString());
 		}
 	}
+
+	/**
+	 * Reading a UTF-8 file by name yields one element per code point and
+	 * reports the file name as the source name.
+	 */
+	@Test
+	public void fromFileName() throws Exception {
+		final String expected = "hello \uD83C\uDF0E";
+		Path file = folder.newFile().toPath();
+		Files.write(file, expected.getBytes(StandardCharsets.UTF_8));
+		final String fileName = file.toString();
+		CharStream stream = CharStreams.fromFileName(fileName);
+		// 6 BMP characters plus one surrogate pair counted as a single code point.
+		assertEquals(7, stream.size());
+		assertEquals(0, stream.index());
+		assertEquals(expected, stream.toString());
+		assertEquals(fileName, stream.getSourceName());
+	}
+
+	/**
+	 * Reading an ISO-8859-1 file by name with an explicit charset decodes
+	 * the non-ASCII bytes and reports the file name as the source name.
+	 */
+	@Test
+	public void fromFileNameWithLatin1() throws Exception {
+		final String expected = "hello \u00CA\u00FE";
+		Path file = folder.newFile().toPath();
+		Files.write(file, expected.getBytes(StandardCharsets.ISO_8859_1));
+		final String fileName = file.toString();
+		CharStream stream = CharStreams.fromFileName(fileName, StandardCharsets.ISO_8859_1);
+		assertEquals(8, stream.size());
+		assertEquals(0, stream.index());
+		assertEquals(expected, stream.toString());
+		assertEquals(fileName, stream.getSourceName());
+	}
+
+	/**
+	 * Reading from a {@link Reader} combines a surrogate pair into a single
+	 * code point.
+	 */
+	@Test
+	public void fromReader() throws Exception {
+		final String expected = "hello \uD83C\uDF0E";
+		Path file = folder.newFile().toPath();
+		Files.write(file, expected.getBytes(StandardCharsets.UTF_8));
+		try (Reader utf8Reader = Files.newBufferedReader(file, StandardCharsets.UTF_8)) {
+			CharStream stream = CharStreams.fromReader(utf8Reader);
+			assertEquals(7, stream.size());
+			assertEquals(0, stream.index());
+			assertEquals(expected, stream.toString());
+		}
+	}
 }
--- a/runtime-testsuite/test/org/antlr/v4/test/runtime/java/TestCodePointCharStream.java
+++ b/runtime-testsuite/test/org/antlr/v4/test/runtime/java/TestCodePointCharStream.java
@ -26,21 +26,21 @@ public class TestCodePointCharStream {

 	@Test
 	public void emptyBytesHasSize0() {
-		CodePointCharStream s = CharStreams.createWithString("");
+		CodePointCharStream s = CharStreams.fromString("");
 		assertEquals(0, s.size());
 		assertEquals(0, s.index());
 	}

 	@Test
 	public void emptyBytesLookAheadReturnsEOF() {
-		CodePointCharStream s = CharStreams.createWithString("");
+		CodePointCharStream s = CharStreams.fromString("");
 		assertEquals(IntStream.EOF, s.LA(1));
 		assertEquals(0, s.index());
 	}

 	@Test
 	public void consumingEmptyStreamShouldThrow() {
-		CodePointCharStream s = CharStreams.createWithString("");
+		CodePointCharStream s = CharStreams.fromString("");
 		thrown.expect(IllegalStateException.class);
 		thrown.expectMessage("cannot consume EOF");
 		s.consume();
@ -48,13 +48,13 @@ public class TestCodePointCharStream {

 	@Test
 	public void singleLatinCodePointHasSize1() {
-		CodePointCharStream s = CharStreams.createWithString("X");
+		CodePointCharStream s = CharStreams.fromString("X");
 		assertEquals(1, s.size());
 	}

 	@Test
 	public void consumingSingleLatinCodePointShouldMoveIndex() {
-		CodePointCharStream s = CharStreams.createWithString("X");
+		CodePointCharStream s = CharStreams.fromString("X");
 		assertEquals(0, s.index());
 		s.consume();
 		assertEquals(1, s.index());
@ -62,7 +62,7 @@ public class TestCodePointCharStream {

 	@Test
 	public void consumingPastSingleLatinCodePointShouldThrow() {
-		CodePointCharStream s = CharStreams.createWithString("X");
+		CodePointCharStream s = CharStreams.fromString("X");
 		s.consume();
 		thrown.expect(IllegalStateException.class);
 		thrown.expectMessage("cannot consume EOF");
@ -71,14 +71,14 @@ public class TestCodePointCharStream {

 	@Test
 	public void singleLatinCodePointLookAheadShouldReturnCodePoint() {
-		CodePointCharStream s = CharStreams.createWithString("X");
+		CodePointCharStream s = CharStreams.fromString("X");
 		assertEquals('X', s.LA(1));
 		assertEquals(0, s.index());
 	}

 	@Test
 	public void multipleLatinCodePointsLookAheadShouldReturnCodePoints() {
-		CodePointCharStream s = CharStreams.createWithString("XYZ");
+		CodePointCharStream s = CharStreams.fromString("XYZ");
 		assertEquals('X', s.LA(1));
 		assertEquals(0, s.index());
 		assertEquals('Y', s.LA(2));
@ -89,20 +89,20 @@ public class TestCodePointCharStream {

 	@Test
 	public void singleLatinCodePointLookAheadPastEndShouldReturnEOF() {
-		CodePointCharStream s = CharStreams.createWithString("X");
+		CodePointCharStream s = CharStreams.fromString("X");
 		assertEquals(IntStream.EOF, s.LA(2));
 	}

 	@Test
 	public void singleCJKCodePointHasSize1() {
-		CodePointCharStream s = CharStreams.createWithString("\u611B");
+		CodePointCharStream s = CharStreams.fromString("\u611B");
 		assertEquals(1, s.size());
 		assertEquals(0, s.index());
 	}

 	@Test
 	public void consumingSingleCJKCodePointShouldMoveIndex() {
-		CodePointCharStream s = CharStreams.createWithString("\u611B");
+		CodePointCharStream s = CharStreams.fromString("\u611B");
 		assertEquals(0, s.index());
 		s.consume();
 		assertEquals(1, s.index());
@ -110,7 +110,7 @@ public class TestCodePointCharStream {

 	@Test
 	public void consumingPastSingleCJKCodePointShouldThrow() {
-		CodePointCharStream s = CharStreams.createWithString("\u611B");
+		CodePointCharStream s = CharStreams.fromString("\u611B");
 		s.consume();
 		thrown.expect(IllegalStateException.class);
 		thrown.expectMessage("cannot consume EOF");
@ -119,21 +119,21 @@ public class TestCodePointCharStream {

 	@Test
 	public void singleCJKCodePointLookAheadShouldReturnCodePoint() {
-		CodePointCharStream s = CharStreams.createWithString("\u611B");
+		CodePointCharStream s = CharStreams.fromString("\u611B");
 		assertEquals(0x611B, s.LA(1));
 		assertEquals(0, s.index());
 	}

 	@Test
 	public void singleCJKCodePointLookAheadPastEndShouldReturnEOF() {
-		CodePointCharStream s = CharStreams.createWithString("\u611B");
+		CodePointCharStream s = CharStreams.fromString("\u611B");
 		assertEquals(IntStream.EOF, s.LA(2));
 		assertEquals(0, s.index());
 	}

 	@Test
 	public void singleEmojiCodePointHasSize1() {
-		CodePointCharStream s = CharStreams.createWithString(
+		CodePointCharStream s = CharStreams.fromString(
 				new StringBuilder().appendCodePoint(0x1F4A9).toString());
 		assertEquals(1, s.size());
 		assertEquals(0, s.index());
@ -141,7 +141,7 @@ public class TestCodePointCharStream {

 	@Test
 	public void consumingSingleEmojiCodePointShouldMoveIndex() {
-		CodePointCharStream s = CharStreams.createWithString(
+		CodePointCharStream s = CharStreams.fromString(
 				new StringBuilder().appendCodePoint(0x1F4A9).toString());
 		assertEquals(0, s.index());
 		s.consume();
@ -150,7 +150,7 @@ public class TestCodePointCharStream {

 	@Test
 	public void consumingPastEndOfEmojiCodePointWithShouldThrow() {
-		CodePointCharStream s = CharStreams.createWithString(
+		CodePointCharStream s = CharStreams.fromString(
 				new StringBuilder().appendCodePoint(0x1F4A9).toString());
 		assertEquals(0, s.index());
 		s.consume();
@ -162,7 +162,7 @@ public class TestCodePointCharStream {

 	@Test
 	public void singleEmojiCodePointLookAheadShouldReturnCodePoint() {
-		CodePointCharStream s = CharStreams.createWithString(
+		CodePointCharStream s = CharStreams.fromString(
 				new StringBuilder().appendCodePoint(0x1F4A9).toString());
 		assertEquals(0x1F4A9, s.LA(1));
 		assertEquals(0, s.index());
@ -170,7 +170,7 @@ public class TestCodePointCharStream {

 	@Test
 	public void singleEmojiCodePointLookAheadPastEndShouldReturnEOF() {
-		CodePointCharStream s = CharStreams.createWithString(
+		CodePointCharStream s = CharStreams.fromString(
 				new StringBuilder().appendCodePoint(0x1F4A9).toString());
 		assertEquals(IntStream.EOF, s.LA(2));
 		assertEquals(0, s.index());
@ -178,19 +178,19 @@ public class TestCodePointCharStream {

 	@Test
 	public void getTextWithLatin() {
-		CodePointCharStream s = CharStreams.createWithString("0123456789");
+		CodePointCharStream s = CharStreams.fromString("0123456789");
 		assertEquals("34567", s.getText(Interval.of(3, 7)));
 	}

 	@Test
 	public void getTextWithCJK() {
-		CodePointCharStream s = CharStreams.createWithString("01234\u40946789");
+		CodePointCharStream s = CharStreams.fromString("01234\u40946789");
 		assertEquals("34\u409467", s.getText(Interval.of(3, 7)));
 	}

 	@Test
 	public void getTextWithEmoji() {
-		CodePointCharStream s = CharStreams.createWithString(
+		CodePointCharStream s = CharStreams.fromString(
 				new StringBuilder("01234")
 					.appendCodePoint(0x1F522)
 					.append("6789")
@ -200,19 +200,19 @@ public class TestCodePointCharStream {

 	@Test
 	public void toStringWithLatin() {
-		CodePointCharStream s = CharStreams.createWithString("0123456789");
+		CodePointCharStream s = CharStreams.fromString("0123456789");
 		assertEquals("0123456789", s.toString());
 	}

 	@Test
 	public void toStringWithCJK() {
-		CodePointCharStream s = CharStreams.createWithString("01234\u40946789");
+		CodePointCharStream s = CharStreams.fromString("01234\u40946789");
 		assertEquals("01234\u40946789", s.toString());
 	}

 	@Test
 	public void toStringWithEmoji() {
-		CodePointCharStream s = CharStreams.createWithString(
+		CodePointCharStream s = CharStreams.fromString(
 				new StringBuilder("01234")
 					.appendCodePoint(0x1F522)
 					.append("6789")
@ -222,19 +222,19 @@ public class TestCodePointCharStream {

 	@Test
 	public void lookAheadWithLatin() {
-		CodePointCharStream s = CharStreams.createWithString("0123456789");
+		CodePointCharStream s = CharStreams.fromString("0123456789");
 		assertEquals('5', s.LA(6));
 	}

 	@Test
 	public void lookAheadWithCJK() {
-		CodePointCharStream s = CharStreams.createWithString("01234\u40946789");
+		CodePointCharStream s = CharStreams.fromString("01234\u40946789");
 		assertEquals(0x4094, s.LA(6));
 	}

 	@Test
 	public void lookAheadWithEmoji() {
-		CodePointCharStream s = CharStreams.createWithString(
+		CodePointCharStream s = CharStreams.fromString(
 				new StringBuilder("01234")
 					.appendCodePoint(0x1F522)
 					.append("6789")
@ -244,21 +244,21 @@ public class TestCodePointCharStream {

 	@Test
 	public void seekWithLatin() {
-		CodePointCharStream s = CharStreams.createWithString("0123456789");
+		CodePointCharStream s = CharStreams.fromString("0123456789");
 		s.seek(5);
 		assertEquals('5', s.LA(1));
 	}

 	@Test
 	public void seekWithCJK() {
-		CodePointCharStream s = CharStreams.createWithString("01234\u40946789");
+		CodePointCharStream s = CharStreams.fromString("01234\u40946789");
 		s.seek(5);
 		assertEquals(0x4094, s.LA(1));
 	}

 	@Test
 	public void seekWithEmoji() {
-		CodePointCharStream s = CharStreams.createWithString(
+		CodePointCharStream s = CharStreams.fromString(
 				new StringBuilder("01234")
 					.appendCodePoint(0x1F522)
 					.append("6789")
@ -269,21 +269,21 @@ public class TestCodePointCharStream {

 	@Test
 	public void lookBehindWithLatin() {
-		CodePointCharStream s = CharStreams.createWithString("0123456789");
+		CodePointCharStream s = CharStreams.fromString("0123456789");
 		s.seek(6);
 		assertEquals('5', s.LA(-1));
 	}

 	@Test
 	public void lookBehindWithCJK() {
-		CodePointCharStream s = CharStreams.createWithString("01234\u40946789");
+		CodePointCharStream s = CharStreams.fromString("01234\u40946789");
 		s.seek(6);
 		assertEquals(0x4094, s.LA(-1));
 	}

 	@Test
 	public void lookBehindWithEmoji() {
-		CodePointCharStream s = CharStreams.createWithString(
+		CodePointCharStream s = CharStreams.fromString(
 				new StringBuilder("01234")
 					.appendCodePoint(0x1F522)
 					.append("6789")
--- a/runtime/Java/src/org/antlr/v4/runtime/CharStreams.java
+++ b/runtime/Java/src/org/antlr/v4/runtime/CharStreams.java
@ -7,19 +7,27 @@ package org.antlr.v4.runtime;

 import java.io.IOException;
 import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.io.Reader;

 import java.nio.ByteBuffer;
 import java.nio.IntBuffer;
+import java.nio.charset.Charset;
 import java.nio.charset.CodingErrorAction;
+import java.nio.charset.StandardCharsets;
 import java.nio.channels.Channels;
 import java.nio.channels.FileChannel;
 import java.nio.channels.ReadableByteChannel;
 import java.nio.file.Files;
 import java.nio.file.Path;
+import java.nio.file.Paths;

 /**
- * Utility class to create {@link CodePointCharStream}s from
- * various sources of Unicode data.
+ * Utility class to create {@link CharStream}s from various sources of
+ * string data.
+ *
+ * Main entry points are the factory methods {@code CharStreams.fromPath()},
+ * {@code CharStreams.fromString()}, etc.
 */
 public final class CharStreams {
 	private static final int DEFAULT_BUFFER_SIZE = 4096;
@ -28,14 +36,215 @@ public final class CharStreams {
 	private CharStreams() { }

 	/**
-	 * Convenience method to create a {@link CodePointCharStream}
-	 * for the Unicode code points in a Java {@link String}.
+	 * Creates a {@link CharStream} given a path to a UTF-8
+	 * encoded file on disk.
+	 *
+	 * Reads the entire contents of the file into the result before returning.
 	 */
-	public static CodePointCharStream createWithString(String s) {
-		return createWithString(s, IntStream.UNKNOWN_SOURCE_NAME);
+	public static CharStream fromPath(Path path) throws IOException {
+		return fromPath(path, StandardCharsets.UTF_8);
 	}

-	public static CodePointCharStream createWithString(String s, String sourceName) {
+	/**
+	 * Creates a {@link CharStream} given a path to a file on disk and the
+	 * charset of the bytes contained in the file.
+	 *
+	 * Reads the entire contents of the file into the result before returning.
+	 *
+	 * For sources encoded in UTF-8, supports the full Unicode code point
+	 * range.
+	 *
+	 * For other sources, only supports Unicode code points up to U+FFFF.
+	 *
+	 * @param path location of the file to read
+	 * @param charset charset used to decode the bytes of the file
+	 * @return a stream over the decoded contents of the file
+	 * @throws IOException if the file cannot be opened or read
+	 */
+	public static CharStream fromPath(Path path, Charset charset) throws IOException {
+		if (charset.equals(StandardCharsets.UTF_8)) {
+			try (ReadableByteChannel channel = Files.newByteChannel(path)) {
+				return fromChannel(
+					channel,
+					DEFAULT_BUFFER_SIZE,
+					CodingErrorAction.REPLACE,
+					path.toString());
+			}
+		} else {
+			// Pass the canonical charset name: Charset.toString() is only
+			// specified to return a descriptive string.
+			return new ANTLRFileStream(path.toString(), charset.name());
+		}
+	}
+
+	/**
+	 * Creates a {@link CharStream} from the UTF-8 encoded file named by
+	 * {@code fileName}.
+	 *
+	 * The whole file is read into the result before this method returns.
+	 */
+	public static CharStream fromFileName(String fileName) throws IOException {
+		final Path path = Paths.get(fileName);
+		return fromPath(path, StandardCharsets.UTF_8);
+	}
+
+	/**
+	 * Creates a {@link CharStream} from the file named by {@code fileName},
+	 * decoding its bytes with the given charset.
+	 *
+	 * The whole file is read into the result before this method returns.
+	 *
+	 * UTF-8 sources support the full Unicode code point range; sources in
+	 * any other charset only support code points up to U+FFFF.
+	 */
+	public static CharStream fromFileName(String fileName, Charset charset) throws IOException {
+		final Path path = Paths.get(fileName);
+		return fromPath(path, charset);
+	}
+
+
+	/**
+	 * Creates a {@link CharStream} given an opened {@link InputStream}
+	 * containing UTF-8 bytes.
+	 *
+	 * Reads the entire contents of the {@code InputStream} into
+	 * the result before returning, then closes the {@code InputStream}.
+	 *
+	 * @param is stream of UTF-8 encoded bytes
+	 * @return a stream over the decoded contents of {@code is}
+	 * @throws IOException if reading from {@code is} fails
+	 */
+	public static CharStream fromStream(InputStream is) throws IOException {
+		return fromStream(is, StandardCharsets.UTF_8);
+	}
+
+	/**
+	 * Creates a {@link CharStream} from an opened {@link InputStream} whose
+	 * bytes are encoded in the given charset.
+	 *
+	 * The whole stream is read into the result, and the {@code InputStream}
+	 * is closed, before this method returns.
+	 *
+	 * UTF-8 sources support the full Unicode code point range; sources in
+	 * any other charset only support code points up to U+FFFF.
+	 */
+	public static CharStream fromStream(InputStream is, Charset charset) throws IOException {
+		if (!charset.equals(StandardCharsets.UTF_8)) {
+			// Any charset other than UTF-8 is decoded through a Reader.
+			try (InputStreamReader reader = new InputStreamReader(is, charset)) {
+				return new ANTLRInputStream(reader);
+			}
+		}
+		try (ReadableByteChannel utf8Channel = Channels.newChannel(is)) {
+			return fromChannel(
+					utf8Channel,
+					DEFAULT_BUFFER_SIZE,
+					CodingErrorAction.REPLACE,
+					IntStream.UNKNOWN_SOURCE_NAME);
+		}
+	}
+
+	/**
+	 * Creates a {@link CharStream} from an opened {@link ReadableByteChannel}
+	 * that supplies UTF-8 bytes.
+	 *
+	 * The whole channel is read into the result, and the {@code channel}
+	 * is closed, before this method returns.
+	 */
+	public static CharStream fromChannel(ReadableByteChannel channel) throws IOException {
+		final Charset utf8 = StandardCharsets.UTF_8;
+		return fromChannel(channel, utf8);
+	}
+
+	/**
+	 * Creates a {@link CharStream} given an opened {@link ReadableByteChannel} and the
+	 * charset of the bytes contained in the channel.
+	 *
+	 * Reads the entire contents of the {@code channel} into
+	 * the result before returning, then closes the {@code channel}.
+	 *
+	 * For sources encoded in UTF-8, supports the full Unicode code point
+	 * range.
+	 *
+	 * For other sources, only supports Unicode code points up to U+FFFF.
+	 *
+	 * @param channel channel supplying the encoded bytes
+	 * @param charset charset used to decode the bytes of the channel
+	 * @return a stream over the decoded contents of {@code channel}
+	 * @throws IOException if reading from {@code channel} fails
+	 */
+	public static CharStream fromChannel(ReadableByteChannel channel, Charset charset) throws IOException {
+		if (charset.equals(StandardCharsets.UTF_8)) {
+			return fromChannel(
+					channel,
+					DEFAULT_BUFFER_SIZE,
+					CodingErrorAction.REPLACE,
+					IntStream.UNKNOWN_SOURCE_NAME);
+		} else {
+			// Wrap the channel exactly once and hand that same stream to the
+			// reader, so a single stream is opened and closed.
+			try (InputStream is = Channels.newInputStream(channel);
+			     InputStreamReader isr = new InputStreamReader(is, charset)) {
+				return new ANTLRInputStream(isr);
+			}
+		}
+	}
+
+	/**
+	 * Creates a {@link CodePointCharStream} from the contents of a
+	 * {@link Reader}, using {@link IntStream#UNKNOWN_SOURCE_NAME} as the
+	 * source name. The reader is closed before this method returns.
+	 */
+	public static CodePointCharStream fromReader(Reader r) throws IOException {
+		final String sourceName = IntStream.UNKNOWN_SOURCE_NAME;
+		return fromReader(r, sourceName);
+	}
+
+	/**
+	 * Creates a {@link CharStream} given a {@link Reader} and its
+	 * source name. Closes the reader before returning.
+	 *
+	 * Reads UTF-16 code units from the reader until end of input, combining
+	 * each high/low surrogate pair into a single code point. Unpaired
+	 * surrogates are stored unchanged as individual values.
+	 *
+	 * @param r reader supplying UTF-16 code units; closed before returning,
+	 *          even if reading fails
+	 * @param sourceName source name given to the resulting stream
+	 * @return a stream over the code points read from {@code r}
+	 * @throws IOException if reading from or closing {@code r} fails
+	 */
+	public static CodePointCharStream fromReader(Reader r, String sourceName) throws IOException {
+		IntBuffer codePointBuffer = IntBuffer.allocate(DEFAULT_BUFFER_SIZE);
+		int highSurrogate = -1;
+		int curCodeUnit;
+		try {
+			while ((curCodeUnit = r.read()) != -1) {
+				// One code unit can append up to two values (a dangling high
+				// surrogate plus the current non-surrogate), so keep at least
+				// two free slots to avoid a BufferOverflowException.
+				if (codePointBuffer.remaining() < 2) {
+					// Grow the code point buffer size by 2.
+					IntBuffer newBuffer = IntBuffer.allocate(codePointBuffer.capacity() * 2);
+					codePointBuffer.flip();
+					newBuffer.put(codePointBuffer);
+					codePointBuffer = newBuffer;
+				}
+				if (Character.isHighSurrogate((char) curCodeUnit)) {
+					if (highSurrogate != -1) {
+						// Dangling high surrogate followed by another high surrogate.
+						codePointBuffer.put(highSurrogate);
+					}
+					highSurrogate = curCodeUnit;
+				} else if (Character.isLowSurrogate((char) curCodeUnit)) {
+					if (highSurrogate == -1) {
+						// Low surrogate not preceded by high surrogate.
+						codePointBuffer.put(curCodeUnit);
+					} else {
+						codePointBuffer.put(Character.toCodePoint((char) highSurrogate, (char) curCodeUnit));
+						highSurrogate = -1;
+					}
+				} else {
+					if (highSurrogate != -1) {
+						// Dangling high surrogate followed by a non-surrogate.
+						codePointBuffer.put(highSurrogate);
+						highSurrogate = -1;
+					}
+					codePointBuffer.put(curCodeUnit);
+				}
+			}
+			if (highSurrogate != -1) {
+				// Dangling high surrogate at end of file. A slot is free here:
+				// the iteration that read it reserved space and stored nothing.
+				codePointBuffer.put(highSurrogate);
+			}
+			codePointBuffer.flip();
+			return new CodePointCharStream(codePointBuffer, sourceName);
+		} finally {
+			r.close();
+		}
+	}
+
+	/**
+	 * Creates a {@link CodePointCharStream} from the contents of a
+	 * {@link String}, using {@link IntStream#UNKNOWN_SOURCE_NAME} as the
+	 * source name.
+	 */
+	public static CodePointCharStream fromString(String s) {
+		final String sourceName = IntStream.UNKNOWN_SOURCE_NAME;
+		return fromString(s, sourceName);
+	}
+
+	/**
+	 * Creates a {@link CharStream} given a {@link String} and the {@code sourceName}
+	 * from which it came.
+	 */
+	public static CodePointCharStream fromString(String s, String sourceName) {
 		// Initial guess assumes no code points > U+FFFF: one code
 		// point for each code unit in the string
 		IntBuffer codePointBuffer = IntBuffer.allocate(s.length());
@ -56,48 +265,39 @@ public final class CharStreams {
 		return new CodePointCharStream(codePointBuffer, sourceName);
 	}

-	public static CodePointCharStream createWithUTF8(Path path) throws IOException {
-		try (ReadableByteChannel channel = Files.newByteChannel(path)) {
-			return createWithUTF8Channel(
-					channel,
-					DEFAULT_BUFFER_SIZE,
-					CodingErrorAction.REPLACE,
-					path.toString());
-		}
-	}
-
-	public static CodePointCharStream createWithUTF8Stream(InputStream is) throws IOException {
-		try (ReadableByteChannel channel = Channels.newChannel(is)) {
-			return createWithUTF8Channel(
-					channel,
-					DEFAULT_BUFFER_SIZE,
-					CodingErrorAction.REPLACE,
-					IntStream.UNKNOWN_SOURCE_NAME);
-		}
-	}
-
-	public static CodePointCharStream createWithUTF8Channel(
+	/**
+	 * Creates a {@link CharStream} given an opened {@link ReadableByteChannel}
+	 * containing UTF-8 bytes.
+	 *
+	 * <p>Reads the entire contents of the {@code channel} into
+	 * the result before returning, then closes the {@code channel}.
+	 * The {@code channel} is closed in a {@code finally} block, so it is
+	 * closed even when reading or decoding throws.
+	 *
+	 * @param channel the channel to read from until it reports end of input
+	 * @param bufferSize capacity of the direct byte buffer used for each
+	 *        read, and initial capacity of the code point buffer
+	 * @param decodingErrorAction passed to the {@code UTF8CodePointDecoder}
+	 *        that decodes the bytes
+	 * @param sourceName the source name given to the returned stream
+	 * @return a {@link CodePointCharStream} over the decoded code points
+	 * @throws IOException declared for failures while reading or closing
+	 *         the {@code channel}
+	 */
+	public static CodePointCharStream fromChannel(
 			ReadableByteChannel channel,
 			int bufferSize,
 			CodingErrorAction decodingErrorAction,
 			String sourceName
 	) throws IOException {
-		ByteBuffer utf8BytesIn = ByteBuffer.allocateDirect(bufferSize);
-		IntBuffer codePointsOut = IntBuffer.allocate(bufferSize);
-		boolean endOfInput = false;
-		UTF8CodePointDecoder decoder = new UTF8CodePointDecoder(decodingErrorAction);
-		while (!endOfInput) {
-			int bytesRead = channel.read(utf8BytesIn);
-			endOfInput = (bytesRead == -1);
-			utf8BytesIn.flip();
-			codePointsOut = decoder.decodeCodePointsFromBuffer(
-					utf8BytesIn,
-					codePointsOut,
-					endOfInput);
-			utf8BytesIn.compact();
+		try {
+			ByteBuffer utf8BytesIn = ByteBuffer.allocateDirect(bufferSize);
+			IntBuffer codePointsOut = IntBuffer.allocate(bufferSize);
+			boolean endOfInput = false;
+			UTF8CodePointDecoder decoder = new UTF8CodePointDecoder(decodingErrorAction);
+			while (!endOfInput) {
+				int bytesRead = channel.read(utf8BytesIn);
+				endOfInput = (bytesRead == -1);
+				utf8BytesIn.flip();
+				codePointsOut = decoder.decodeCodePointsFromBuffer(
+						utf8BytesIn,
+						codePointsOut,
+						endOfInput);
+				utf8BytesIn.compact();
+			}
+			codePointsOut.limit(codePointsOut.position());
+			codePointsOut.flip();
+			return new CodePointCharStream(codePointsOut, sourceName);
+		} finally {
+			channel.close();
 		}
-		codePointsOut.limit(codePointsOut.position());
-		codePointsOut.flip();
-		return new CodePointCharStream(codePointsOut, sourceName);
 	}
 }
--- a/tool-testsuite/test/org/antlr/v4/test/tool/TestATNLexerInterpreter.java
+++ b/tool-testsuite/test/org/antlr/v4/test/tool/TestATNLexerInterpreter.java
@ -381,7 +381,7 @@ public class TestATNLexerInterpreter extends BaseJavaToolTest {

 	protected void checkLexerMatches(LexerGrammar lg, String inputString, String expecting) {
 		ATN atn = createATN(lg, true);
-		CharStream input = CharStreams.createWithString(inputString);
+		CharStream input = CharStreams.fromString(inputString);
 		ATNState startState = atn.modeNameToStartState.get("DEFAULT_MODE");
 		DOTGenerator dot = new DOTGenerator(lg);
 //		System.out.println(dot.getDOT(startState, true));
--- a/tool-testsuite/test/org/antlr/v4/test/tool/TestUnicodeGrammar.java
+++ b/tool-testsuite/test/org/antlr/v4/test/tool/TestUnicodeGrammar.java
@ -161,7 +161,7 @@ public class TestUnicodeGrammar extends BaseJavaToolTest {
 			String inputText) throws Exception {
 		Grammar grammar = new Grammar(grammarText);
 		LexerInterpreter lexEngine = grammar.createLexerInterpreter(
-				CharStreams.createWithString(inputText));
+				CharStreams.fromString(inputText));
 		CommonTokenStream tokens = new CommonTokenStream(lexEngine);
 		GrammarParserInterpreter parser = grammar.createGrammarParserInterpreter(tokens);
 		ParseTree parseTree = parser.parse(grammar.rules.get(rootRule).index);
--- a/tool/src/org/antlr/v4/gui/TestRig.java
+++ b/tool/src/org/antlr/v4/gui/TestRig.java
@ -26,7 +26,6 @@ import java.lang.reflect.Constructor;
 import java.lang.reflect.InvocationTargetException;
 import java.lang.reflect.Method;
 import java.nio.charset.Charset;
-import java.nio.charset.StandardCharsets;
 import java.nio.file.Paths;
 import java.util.ArrayList;
 import java.util.List;
@ -157,28 +156,12 @@ public class TestRig {

 		Charset charset = ( encoding == null ? Charset.defaultCharset () : Charset.forName(encoding) );
 		if ( inputFiles.size()==0 ) {
-			CharStream charStream;
-			if ( charset.equals(StandardCharsets.UTF_8)) {
-				charStream = CharStreams.createWithUTF8Stream(System.in);
-			}
-			else {
-				try ( InputStreamReader r = new InputStreamReader(System.in, charset) ) {
-					charStream = new ANTLRInputStream(r);
-				}
-			}
+			CharStream charStream = CharStreams.fromStream(System.in, charset);
 			process(lexer, parserClass, parser, charStream);
 			return;
 		}
 		for (String inputFile : inputFiles) {
-			CharStream charStream;
-			if ( charset.equals(StandardCharsets.UTF_8) ) {
-				charStream = CharStreams.createWithUTF8(Paths.get(inputFile));
-			}
-			else {
-				try ( InputStreamReader r = new InputStreamReader(System.in, charset) ) {
-					charStream = new ANTLRInputStream(r);
-				}
-			}
+	                CharStream charStream = CharStreams.fromPath(Paths.get(inputFile), charset);
 			if ( inputFiles.size()>1 ) {
 				System.err.println(inputFile);
 			}