Merge branch 'master_upstream'

2017-02-23 10:57:31 +01:00 · 2017-02-23 10:57:31 +01:00 · cd6bb1f2a0
parent 0101770e5f ce09abb480
commit cd6bb1f2a0
13 changed files with 530 additions and 321 deletions
--- a/contributors.txt
+++ b/contributors.txt
@ -135,4 +135,6 @@ YYYY/MM/DD, github id, Full name, email
 2017/01/18, mshockwave, Bekket McClane, yihshyng223@gmail.com
 2017/02/10, lionelplessis, Lionel Plessis, lionelplessis@users.noreply.github.com
 2017/02/14, lecode-official, David Neumann, david.neumann@lecode.de
-2017/02/14, xied75, Dong Xie, xied75@gmail.com
+2017/02/14, xied75, Dong Xie, xied75@gmail.com
+2017/02/20, Thomasb81, Thomas Burg, thomasb81@gmail.com
+
--- a/pom.xml
+++ b/pom.xml
@ -78,6 +78,7 @@

  <modules>
    <module>runtime/Java</module>
+    <module>tool-codegen</module>
    <module>tool</module>
    <module>antlr4-maven-plugin</module>
    <module>tool-testsuite</module>
--- a/runtime/CSharp/runtime/CSharp/Antlr4.Runtime/Atn/ParserATNSimulator.cs
+++ b/runtime/CSharp/runtime/CSharp/Antlr4.Runtime/Atn/ParserATNSimulator.cs
@ -1,4 +1,4 @@
-/* Copyright (c) 2012-2016 The ANTLR Project. All rights reserved.
+/* Copyright (c) 2012-2016 The ANTLR Project. All rights reserved.
 * Use of this file is governed by the BSD 3-clause license that
 * can be found in the LICENSE.txt file in the project root.
 */
@ -257,8 +257,8 @@ namespace Antlr4.Runtime.Atn
 		 *  Don't keep around as it wastes huge amounts of memory. DoubleKeyMap
 		 *  isn't synchronized but we're ok since two threads shouldn't reuse same
 		 *  parser/atnsim object because it can only handle one input at a time.
-		 *  This maps graphs a and b to merged result c. (a,b)&rarr;c. We can avoid
-		 *  the merge if we ever see a and b again.  Note that (b,a)&rarr;c should
+		 *  This maps graphs a and b to merged result c. (a,b)→c. We can avoid
+		 *  the merge if we ever see a and b again.  Note that (b,a)→c should
 		 *  also be examined during cache lookup.
 		 */
 		protected MergeCache mergeCache;
@ -2047,7 +2047,7 @@ namespace Antlr4.Runtime.Atn
 		 we don't consider any conflicts that include alternative 2. So, we
 		 ignore the conflict between alts 1 and 2. We ignore a set of
 		 conflicting alts when there is an intersection with an alternative
-		 associated with a single alt state in the state&rarr;config-list map.
+		 associated with a single alt state in the state→config-list map.

 		 It's also the case that we might have two conflicting configurations but
 		 also a 3rd nonconflicting configuration for a different alternative:
--- a/runtime/CSharp/runtime/CSharp/Antlr4.Runtime/Misc/Pair.cs
+++ b/runtime/CSharp/runtime/CSharp/Antlr4.Runtime/Misc/Pair.cs
@ -2,48 +2,48 @@
 * Use of this file is governed by the BSD 3-clause license that
 * can be found in the LICENSE.txt file in the project root.
 */
-
 using System;
+
 namespace Antlr4.Runtime.Misc
 {
-
 	public class Pair<A, B>
 	{
-	public readonly A a;
-	public readonly B b;
+		public readonly A a;
+		public readonly B b;

-	public Pair(A a, B b)
-	{
-		this.a = a;
-		this.b = b;
-	}
-
-	public override bool Equals(Object obj)
-	{
-		if (obj == this)
+		public Pair(A a, B b)
 		{
-			return true;
-		}
-		else if (!(obj is Pair<A, B>)) {
-			return false;
+			this.a = a;
+			this.b = b;
 		}

-		Pair <A, B> other = (Pair <A, B>)obj;
-			return a==null ? other.a==null : a.Equals(other.b);
-	}
+		public override bool Equals(Object obj)
+		{
+			if (obj == this)
+			{
+				return true;
+			}
+			else if (!(obj is Pair<A, B>))
+			{
+				return false;
+			}

-	public override int GetHashCode()
-	{
-		int hash = MurmurHash.Initialize();
-		hash = MurmurHash.Update(hash, a);
-		hash = MurmurHash.Update(hash, b);
-		return MurmurHash.Finish(hash, 2);
-	}
+			Pair<A, B> other = (Pair<A, B>)obj;
+			return (a == null ? other.a == null : a.Equals(other.a)) &&
+				   (b == null ? other.b == null : b.Equals(other.b));
+		}

-	public override String ToString()
-	{
-		return String.Format("(%s, %s)", a, b);
-	}
-}
+		public override int GetHashCode()
+		{
+			int hash = MurmurHash.Initialize();
+			hash = MurmurHash.Update(hash, a);
+			hash = MurmurHash.Update(hash, b);
+			return MurmurHash.Finish(hash, 2);
+		}

-}
+		public override String ToString()
+		{
+			return String.Format("({0}, {1})", a, b);
+		}
+	}
+}
--- a/runtime/Cpp/runtime/src/TokenStreamRewriter.cpp
+++ b/runtime/Cpp/runtime/src/TokenStreamRewriter.cpp
@ -251,7 +251,7 @@ std::string TokenStreamRewriter::getText(const Interval &interval) {
 }

 std::string TokenStreamRewriter::getText(const std::string &programName, const Interval &interval) {
-  std::vector<TokenStreamRewriter::RewriteOperation*> rewrites = _programs[programName];
+  std::vector<TokenStreamRewriter::RewriteOperation*> &rewrites = _programs[programName];
  size_t start = interval.a;
  size_t stop = interval.b;

@ -305,7 +305,8 @@ std::string TokenStreamRewriter::getText(const std::string &programName, const I
 }

 std::unordered_map<size_t, TokenStreamRewriter::RewriteOperation*> TokenStreamRewriter::reduceToSingleOperationPerIndex(
-  std::vector<TokenStreamRewriter::RewriteOperation*> rewrites) {
+  std::vector<TokenStreamRewriter::RewriteOperation*> &rewrites) {
+

  // WALK REPLACES
  for (size_t i = 0; i < rewrites.size(); ++i) {
@ -402,7 +403,7 @@ std::unordered_map<size_t, TokenStreamRewriter::RewriteOperation*> TokenStreamRe
    }
    m[op->index] = op;
  }
-
+  
  return m;
 }

--- a/runtime/Cpp/runtime/src/TokenStreamRewriter.h
+++ b/runtime/Cpp/runtime/src/TokenStreamRewriter.h
@ -267,7 +267,7 @@ namespace antlr4 {
    ///
    ///  Return a map from token index to operation.
    /// </summary>
-    virtual std::unordered_map<size_t, RewriteOperation*> reduceToSingleOperationPerIndex(std::vector<RewriteOperation*> rewrites);
+    virtual std::unordered_map<size_t, RewriteOperation*> reduceToSingleOperationPerIndex(std::vector<RewriteOperation*> &rewrites);

    virtual std::string catOpText(std::string *a, std::string *b);

--- a/tool-codegen/pom.xml
+++ b/tool-codegen/pom.xml
@ -0,0 +1,72 @@
+<!--
+  ~ Copyright (c) 2012-2017 The ANTLR Project. All rights reserved.
+  ~ Use of this file is governed by the BSD 3-clause license that
+  ~ can be found in the LICENSE.txt file in the project root.
+  -->
+
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
+  <modelVersion>4.0.0</modelVersion>
+  <parent>
+    <groupId>org.antlr</groupId>
+    <artifactId>antlr4-master</artifactId>
+    <version>4.6.1-SNAPSHOT</version>
+  </parent>
+  <artifactId>antlr4-tool-codegen</artifactId>
+  <name>ANTLR 4 Tool Codegen</name>
+  <url>http://www.antlr.org</url>
+  <description>Codegen for the ANTLR 4 grammar compiler.</description>
+  <dependencies>
+    <dependency>
+      <groupId>org.antlr</groupId>
+      <artifactId>antlr4-runtime</artifactId>
+      <version>${project.version}</version>
+    </dependency>
+    <dependency>
+      <groupId>com.ibm.icu</groupId>
+      <artifactId>icu4j</artifactId>
+      <version>58.2</version>
+    </dependency>
+  </dependencies>
+  <build>
+    <sourceDirectory>src</sourceDirectory>
+    <plugins>
+      <plugin>
+        <groupId>com.webguys</groupId>
+        <artifactId>string-template-maven-plugin</artifactId>
+        <version>1.1</version>
+        <!-- this nonsense is for some reason needed to make this plugin
+             work with maven 3 -->
+        <dependencies>
+          <dependency>
+            <groupId>org.twdata.maven</groupId>
+            <artifactId>mojo-executor</artifactId>
+            <version>2.1.0</version>
+          </dependency>
+        </dependencies>
+        <configuration>
+          <templates>
+            <template>
+              <directory>${basedir}/src/main/string-template</directory>
+              <name>unicodedata</name>
+              <target>../tool/target/generated-sources/antlr4-tool-codegen/org/antlr/v4/unicode/UnicodeData.java</target>
+              <controller>
+                <className>org.antlr.v4.unicode.UnicodeDataTemplateController</className>
+                <sourceVersion>1.7</sourceVersion>
+                <targetVersion>1.7</targetVersion>
+                <method>getProperties</method>
+              </controller>
+            </template>
+          </templates>
+        </configuration>
+        <executions>
+          <execution>
+            <phase>generate-sources</phase>
+            <goals>
+              <goal>render</goal>
+            </goals>
+          </execution>
+        </executions>
+      </plugin>
+    </plugins>
+  </build>
+</project>
--- a/tool-codegen/src/main/string-template/unicodedata.st
+++ b/tool-codegen/src/main/string-template/unicodedata.st
@ -0,0 +1,58 @@
+unicodedata(propertyCodePointRanges, propertyAliases) ::= <<
+package org.antlr.v4.unicode;
+
+import java.util.Arrays;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Locale;
+import java.util.Map;
+
+import org.antlr.v4.runtime.misc.IntervalSet;
+import org.antlr.v4.runtime.misc.Interval;
+
+/**
+ * Code-generated utility class mapping Unicode properties to Unicode code point ranges.
+ */
+public abstract class UnicodeData {
+       private static final Map\<String, IntervalSet\> propertyCodePointRanges = new HashMap\<\>(<length(propertyCodePointRanges)>);
+       private static final Map\<String, String\> propertyAliases = new HashMap\<\>(<length(propertyAliases)>);
+
+       // Work around Java 64k bytecode method limit by splitting up static
+       // initialization into one method per Unicode property
+
+       <propertyCodePointRanges.keys:{ k | // Unicode code points with property "<k>"
+static private void addProperty<k>() {
+       List\<Interval\> intervals = Arrays.asList(
+               <propertyCodePointRanges.(k).intervals:{ interval | Interval.of(<interval.a>, <interval.b>)}; separator=",\n">
+       );
+       IntervalSet codePointRanges = new IntervalSet(intervals);
+       codePointRanges.setReadonly(true);
+       propertyCodePointRanges.put("<k>".toLowerCase(Locale.US), codePointRanges);
+\}}; separator="\n\n">
+
+       // Property aliases
+       static private void addPropertyAliases() {
+              <propertyAliases.keys:{ k | propertyAliases.put("<k>".toLowerCase(Locale.US), "<propertyAliases.(k)>".toLowerCase(Locale.US)); }; separator="\n">
+       }
+
+       // Put it all together
+       static {
+              <propertyCodePointRanges.keys:{ k | addProperty<k>(); }; separator="\n">
+              addPropertyAliases();
+       }
+
+       /**
+        * Given a Unicode property (general category code, binary property name, or script name),
+        * returns the {@link IntervalSet} of Unicode code point ranges which have that property.
+        */
+       public static IntervalSet getPropertyCodePoints(String propertyCodeOrAlias) {
+              String normalizedPropertyCodeOrAlias = propertyCodeOrAlias.toLowerCase(Locale.US);
+              IntervalSet result = propertyCodePointRanges.get(normalizedPropertyCodeOrAlias);
+              if (result == null) {
+                 String propertyCode = propertyAliases.get(normalizedPropertyCodeOrAlias);
+                 result = propertyCodePointRanges.get(propertyCode);
+              }
+              return result;
+       }
+}
+>>
--- a/tool-codegen/src/org/antlr/v4/unicode/UnicodeDataTemplateController.java
+++ b/tool-codegen/src/org/antlr/v4/unicode/UnicodeDataTemplateController.java
@ -0,0 +1,212 @@
+/*
+ * Copyright (c) 2012-2017 The ANTLR Project. All rights reserved.
+ * Use of this file is governed by the BSD 3-clause license that
+ * can be found in the LICENSE.txt file in the project root.
+ */
+
+package org.antlr.v4.unicode;
+
+import com.ibm.icu.lang.UCharacter;
+import com.ibm.icu.lang.UCharacterCategory;
+import com.ibm.icu.lang.UProperty;
+import com.ibm.icu.lang.UScript;
+import com.ibm.icu.text.UnicodeSet;
+import com.ibm.icu.util.RangeValueIterator;
+
+import org.antlr.v4.runtime.misc.IntervalSet;
+
+import java.util.LinkedHashMap;
+import java.util.Locale;
+import java.util.Map;
+
+/**
+ * StringTemplate controller used to generate parameters to feed
+ * to {@code unicodedata.st} to code-generate {@code UnicodeData.java},
+ * used by the tool for Unicode property escapes like {@code \p{Lu}}.
+ *
+ * Uses ICU to iterate over Unicode character categories, properties,
+ * and script codes, as well as aliases for those codes.
+ *
+ * This class exists in its own Maven module to avoid adding a
+ * dependency from the tool onto the (large) ICU runtime.
+ */
+public abstract class UnicodeDataTemplateController {
+	/**
+	 * Adds the range {@code start..finish} to the interval set stored under
+	 * {@code categoryName}. When the map has no set for that name yet, a new
+	 * one is created and registered before the range is added to it.
+	 */
+	private static void addIntervalForCategory(
+			Map<String, IntervalSet> categoryMap,
+			String categoryName,
+			int start,
+			int finish) {
+		IntervalSet known = categoryMap.get(categoryName);
+		if (known != null) {
+			known.add(start, finish);
+			return;
+		}
+		// First range seen for this name: register the set, then fill it.
+		IntervalSet created = new IntervalSet();
+		categoryMap.put(categoryName, created);
+		created.add(start, finish);
+	}
+
+	/**
+	 * Registers each name ICU returns for {@code property} as an alias of
+	 * {@code propertyName}, beginning with the long name and then trying
+	 * each following name choice until ICU rejects one.
+	 */
+	private static void addPropertyAliases(
+			Map<String, String> propertyAliases,
+			String propertyName,
+			int property) {
+		for (int nameChoice = UProperty.NameChoice.LONG; ; nameChoice++) {
+			String alias;
+			try {
+				alias = UCharacter.getPropertyName(property, nameChoice);
+			} catch (IllegalArgumentException noMoreNames) {
+				// The exception marks the end of the alias list.
+				return;
+			}
+			assert alias != null;
+			addPropertyAlias(propertyAliases, alias, propertyName);
+		}
+	}
+
+	/**
+	 * Records {@code alias} as another name for {@code propertyName}.
+	 * The alias is the map key, so a later call with the same alias
+	 * replaces the earlier mapping.
+	 */
+	private static void addPropertyAlias(
+			Map<String, String> propertyAliases,
+			String alias,
+			String propertyName) {
+		propertyAliases.put(alias, propertyName);
+	}
+
+	/**
+	 * Collects the two values handed to the template under the keys
+	 * {@code propertyCodePointRanges} and {@code propertyAliases}.
+	 * Both are filled, in order, from general categories, binary
+	 * properties, and scripts.
+	 *
+	 * @return an insertion-ordered map holding the code point ranges
+	 *         followed by the aliases
+	 */
+	public static Map<String, Object> getProperties() {
+		Map<String, Object> templateArguments = new LinkedHashMap<>();
+
+		Map<String, IntervalSet> codePointRanges = new LinkedHashMap<>();
+		addUnicodeCategoryCodesToCodePointRanges(codePointRanges);
+		addUnicodeBinaryPropertyCodesToCodePointRanges(codePointRanges);
+		addUnicodeScriptCodesToCodePointRanges(codePointRanges);
+		templateArguments.put("propertyCodePointRanges", codePointRanges);
+
+		Map<String, String> aliases = new LinkedHashMap<>();
+		addUnicodeCategoryCodesToNames(aliases);
+		addUnicodeBinaryPropertyCodesToNames(aliases);
+		addUnicodeScriptCodesToNames(aliases);
+		templateArguments.put("propertyAliases", aliases);
+
+		return templateArguments;
+	}
+
+	/**
+	 * Returns the short name ICU reports for {@code property}, or its long
+	 * name when the short-name lookup yields {@code null}.
+	 */
+	private static String getShortPropertyName(int property) {
+		String shortName = UCharacter.getPropertyName(property, UProperty.NameChoice.SHORT);
+		if (shortName != null) {
+			return shortName;
+		}
+		// A few properties only have long names, so use that instead.
+		return UCharacter.getPropertyName(property, UProperty.NameChoice.LONG);
+	}
+
+	/**
+	 * Walks the ranges from ICU's type iterator and files each one under
+	 * two keys: the short general category code, and the first letter of
+	 * that code, so that Ll, Lu, Lo, etc. also show up under L.
+	 */
+	private static void addUnicodeCategoryCodesToCodePointRanges(Map<String, IntervalSet> propertyCodePointRanges) {
+		RangeValueIterator typeRanges = UCharacter.getTypeIterator();
+		RangeValueIterator.Element current = new RangeValueIterator.Element();
+		while (typeRanges.next(current)) {
+			int first = current.start;
+			// limit - 1 is passed as the end of the range.
+			int last = current.limit - 1;
+			String fullCode = UCharacter.getPropertyValueName(
+					UProperty.GENERAL_CATEGORY_MASK,
+					1 << current.value,
+					UProperty.NameChoice.SHORT);
+			addIntervalForCategory(propertyCodePointRanges, fullCode, first, last);
+			String majorCode = fullCode.substring(0, 1);
+			addIntervalForCategory(propertyCodePointRanges, majorCode, first, last);
+		}
+	}
+
+	/**
+	 * Maps the long name and any further aliases of each general category
+	 * to that category's short code, then adds long names for the
+	 * one-letter grouped categories that the iterator never reports.
+	 */
+	private static void addUnicodeCategoryCodesToNames(Map<String, String> propertyAliases) {
+		RangeValueIterator iter = UCharacter.getTypeIterator();
+		RangeValueIterator.Element element = new RangeValueIterator.Element();
+		while (iter.next(element)) {
+			int generalCategoryMask = 1 << element.value;
+			String categoryName = UCharacter.getPropertyValueName(
+					UProperty.GENERAL_CATEGORY_MASK,
+					generalCategoryMask,
+					UProperty.NameChoice.SHORT);
+			int nameChoice = UProperty.NameChoice.LONG;
+			while (true) {
+				String alias;
+				try {
+					alias = UCharacter.getPropertyValueName(
+							UProperty.GENERAL_CATEGORY_MASK,
+							generalCategoryMask,
+							nameChoice);
+				} catch (IllegalArgumentException e) {
+					// No more aliases.
+					break;
+				}
+				assert alias != null;
+				addPropertyAlias(propertyAliases, alias, categoryName);
+				nameChoice++;
+			}
+		}
+		// Add the grouped (one-letter) categories.
+		// "Control" is the long name of Cc and is already mapped by the loop
+		// above, so it must not be redirected to C here; the long name of
+		// the C group is "Other".
+		addPropertyAlias(propertyAliases, "Other", "C");
+		addPropertyAlias(propertyAliases, "Letter", "L");
+		addPropertyAlias(propertyAliases, "Number", "N");
+		addPropertyAlias(propertyAliases, "Mark", "M");
+		addPropertyAlias(propertyAliases, "Punctuation", "P");
+		addPropertyAlias(propertyAliases, "Symbol", "S");
+		addPropertyAlias(propertyAliases, "Separator", "Z");
+		// Kept so existing uses of this spelling keep resolving to Z.
+		addPropertyAlias(propertyAliases, "Space", "Z");
+	}
+
+	/**
+	 * For every binary property from {@code BINARY_START} up to (not
+	 * including) {@code BINARY_LIMIT}, stores the ranges for which the
+	 * property has value 1 under the property's short name. Any set
+	 * already stored under that name is replaced.
+	 */
+	private static void addUnicodeBinaryPropertyCodesToCodePointRanges(Map<String, IntervalSet> propertyCodePointRanges) {
+		int property = UProperty.BINARY_START;
+		while (property < UProperty.BINARY_LIMIT) {
+			String shortName = getShortPropertyName(property);
+			IntervalSet codePoints = new IntervalSet();
+			UnicodeSet members = new UnicodeSet();
+			members.applyIntPropertyValue(property, 1);
+			for (UnicodeSet.EntryRange span : members.ranges()) {
+				codePoints.add(span.codepoint, span.codepointEnd);
+			}
+			propertyCodePointRanges.put(shortName, codePoints);
+			property++;
+		}
+	}
+
+	/**
+	 * Registers the aliases of every binary property, each pointing at
+	 * the short name used as that property's key.
+	 */
+	private static void addUnicodeBinaryPropertyCodesToNames(Map<String, String> propertyAliases) {
+		int property = UProperty.BINARY_START;
+		while (property < UProperty.BINARY_LIMIT) {
+			addPropertyAliases(propertyAliases, getShortPropertyName(property), property);
+			property++;
+		}
+	}
+
+	/**
+	 * For every value of the script property, adds the ranges having that
+	 * script to the set stored under the script's short name. A set is
+	 * created and registered when none exists, even if the script turns
+	 * out to have no ranges.
+	 */
+	private static void addUnicodeScriptCodesToCodePointRanges(Map<String, IntervalSet> propertyCodePointRanges) {
+		int script = UCharacter.getIntPropertyMinValue(UProperty.SCRIPT);
+		while (script <= UCharacter.getIntPropertyMaxValue(UProperty.SCRIPT)) {
+			UnicodeSet members = new UnicodeSet();
+			members.applyIntPropertyValue(UProperty.SCRIPT, script);
+			String shortName = UCharacter.getPropertyValueName(UProperty.SCRIPT, script, UProperty.NameChoice.SHORT);
+			IntervalSet codePoints = propertyCodePointRanges.get(shortName);
+			if (codePoints == null) {
+				codePoints = new IntervalSet();
+				propertyCodePointRanges.put(shortName, codePoints);
+			}
+			for (UnicodeSet.EntryRange span : members.ranges()) {
+				codePoints.add(span.codepoint, span.codepointEnd);
+			}
+			script++;
+		}
+	}
+
+	/**
+	 * Maps the long name and any further aliases of every script value
+	 * to that script's short name.
+	 */
+	private static void addUnicodeScriptCodesToNames(Map<String, String> propertyAliases) {
+		int script = UCharacter.getIntPropertyMinValue(UProperty.SCRIPT);
+		while (script <= UCharacter.getIntPropertyMaxValue(UProperty.SCRIPT)) {
+			String shortName = UCharacter.getPropertyValueName(UProperty.SCRIPT, script, UProperty.NameChoice.SHORT);
+			for (int nameChoice = UProperty.NameChoice.LONG; ; nameChoice++) {
+				String alias;
+				try {
+					alias = UCharacter.getPropertyValueName(UProperty.SCRIPT, script, nameChoice);
+				} catch (IllegalArgumentException noMoreNames) {
+					// The exception marks the end of this script's aliases.
+					break;
+				}
+				assert alias != null;
+				addPropertyAlias(propertyAliases, alias, shortName);
+			}
+			script++;
+		}
+	}
+}
--- a/tool-testsuite/test/org/antlr/v4/test/tool/TestUnicode.java
+++ b/tool-testsuite/test/org/antlr/v4/test/tool/TestUnicode.java
@ -1,80 +0,0 @@
-/*
- * Copyright (c) 2012-2017 The ANTLR Project. All rights reserved.
- * Use of this file is governed by the BSD 3-clause license that
- * can be found in the LICENSE.txt file in the project root.
- */
-
-package org.antlr.v4.test.tool;
-
-import java.util.Map;
-
-import org.antlr.v4.codegen.Unicode;
-import org.antlr.v4.runtime.misc.IntervalSet;
-
-import org.junit.Test;
-
-import static org.junit.Assert.assertFalse;
-import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.assertTrue;
-
-public class TestUnicode {
-	@Test
-	public void testUnicodeCategoryCodes() {
-		Map<String, IntervalSet> unicodeCategoryCodesToCodePointRanges = Unicode.getUnicodeCategoryCodesToCodePointRanges();
-		assertTrue(unicodeCategoryCodesToCodePointRanges.get("Lu").contains('X'));
-		assertFalse(unicodeCategoryCodesToCodePointRanges.get("Lu").contains('x'));
-		assertTrue(unicodeCategoryCodesToCodePointRanges.get("Ll").contains('x'));
-		assertFalse(unicodeCategoryCodesToCodePointRanges.get("Ll").contains('X'));
-		assertTrue(unicodeCategoryCodesToCodePointRanges.get("L").contains('X'));
-		assertTrue(unicodeCategoryCodesToCodePointRanges.get("L").contains('x'));
-		assertTrue(unicodeCategoryCodesToCodePointRanges.get("N").contains('0'));
-		assertTrue(unicodeCategoryCodesToCodePointRanges.get("Z").contains(' '));
-	}
-
-	@Test
-	public void testUnicodeCategoryCodesToNames() {
-		Map<String, String> unicodeCategoryCodesToNames = Unicode.getUnicodeCategoryCodesToNames();
-		assertEquals("Lowercase_Letter", unicodeCategoryCodesToNames.get("Ll"));
-		assertEquals("Letter", unicodeCategoryCodesToNames.get("L"));
-		assertEquals("Enclosing_Mark", unicodeCategoryCodesToNames.get("Me"));
-		assertEquals("Mark", unicodeCategoryCodesToNames.get("M"));
-	}
-
-	@Test
-	public void testUnicodeBinaryPropertyCodesToCodePointRanges() {
-		Map<String, IntervalSet> unicodeBinaryPropertyCodesToCodePointRanges = Unicode.getUnicodeBinaryPropertyCodesToCodePointRanges();
-		assertTrue(unicodeBinaryPropertyCodesToCodePointRanges.get("Emoji").contains(0x1F4A9));
-		assertFalse(unicodeBinaryPropertyCodesToCodePointRanges.get("Emoji").contains('X'));
-		assertTrue(unicodeBinaryPropertyCodesToCodePointRanges.get("alnum").contains('9'));
-		assertFalse(unicodeBinaryPropertyCodesToCodePointRanges.get("alnum").contains(0x1F4A9));
-		assertTrue(unicodeBinaryPropertyCodesToCodePointRanges.get("Dash").contains('-'));
-		assertTrue(unicodeBinaryPropertyCodesToCodePointRanges.get("Hex").contains('D'));
-		assertFalse(unicodeBinaryPropertyCodesToCodePointRanges.get("Hex").contains('Q'));
-	}
-
-	@Test
-	public void testUnicodeBinaryPropertyCodesToNames() {
-		Map<String, String> unicodeBinaryPropertyCodesToNames = Unicode.getUnicodeBinaryPropertyCodesToNames();
-		assertEquals("Ideographic", unicodeBinaryPropertyCodesToNames.get("Ideo"));
-		assertEquals("Soft_Dotted", unicodeBinaryPropertyCodesToNames.get("SD"));
-		assertEquals("Noncharacter_Code_Point", unicodeBinaryPropertyCodesToNames.get("NChar"));
-	}
-
-	@Test
-	public void testUnicodeScriptCodesToCodePointRanges() {
-		Map<String, IntervalSet> unicodeScriptCodesToCodePointRanges = Unicode.getUnicodeScriptCodesToCodePointRanges();
-		assertTrue(unicodeScriptCodesToCodePointRanges.get("Zyyy").contains('0'));
-		assertTrue(unicodeScriptCodesToCodePointRanges.get("Latn").contains('X'));
-		assertTrue(unicodeScriptCodesToCodePointRanges.get("Hani").contains(0x4E04));
-		assertTrue(unicodeScriptCodesToCodePointRanges.get("Cyrl").contains(0x0404));
-	}
-
-	@Test
-	public void testUnicodeScriptCodesToNames() {
-		Map<String, String> unicodeScriptCodesToNames = Unicode.getUnicodeScriptCodesToNames();
-		assertEquals("Common", unicodeScriptCodesToNames.get("Zyyy"));
-		assertEquals("Latin", unicodeScriptCodesToNames.get("Latn"));
-		assertEquals("Han", unicodeScriptCodesToNames.get("Hani"));
-		assertEquals("Cyrillic", unicodeScriptCodesToNames.get("Cyrl"));
-	}
-}
--- a/tool-testsuite/test/org/antlr/v4/test/tool/TestUnicodeData.java
+++ b/tool-testsuite/test/org/antlr/v4/test/tool/TestUnicodeData.java
@ -0,0 +1,125 @@
+/*
+ * Copyright (c) 2012-2017 The ANTLR Project. All rights reserved.
+ * Use of this file is governed by the BSD 3-clause license that
+ * can be found in the LICENSE.txt file in the project root.
+ */
+
+package org.antlr.v4.test.tool;
+
+import java.util.Map;
+
+import org.antlr.v4.unicode.UnicodeData;
+import org.antlr.v4.runtime.misc.IntervalSet;
+
+import org.junit.Test;
+import org.junit.Rule;
+import org.junit.rules.ExpectedException;
+
+import static org.junit.Assert.assertFalse;
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertTrue;
+
+/**
+ * Tests for {@code UnicodeData.getPropertyCodePoints}, covering lookups by
+ * general category, binary property, and script, by short code and by alias.
+ */
+public class TestUnicodeData {
+	// Lets a test declare the exception type and message it expects.
+	@Rule
+	public ExpectedException thrown = ExpectedException.none();
+
+	/** General category codes checked against ASCII characters. */
+	@Test
+	public void testUnicodeGeneralCategoriesLatin() {
+		assertTrue(UnicodeData.getPropertyCodePoints("Lu").contains('X'));
+		assertFalse(UnicodeData.getPropertyCodePoints("Lu").contains('x'));
+		assertTrue(UnicodeData.getPropertyCodePoints("Ll").contains('x'));
+		assertFalse(UnicodeData.getPropertyCodePoints("Ll").contains('X'));
+		assertTrue(UnicodeData.getPropertyCodePoints("L").contains('X'));
+		assertTrue(UnicodeData.getPropertyCodePoints("L").contains('x'));
+		assertTrue(UnicodeData.getPropertyCodePoints("N").contains('0'));
+		assertTrue(UnicodeData.getPropertyCodePoints("Z").contains(' '));
+	}
+
+	/** General category codes checked against non-ASCII {@code char} values. */
+	@Test
+	public void testUnicodeGeneralCategoriesBMP() {
+		assertTrue(UnicodeData.getPropertyCodePoints("Lu").contains('\u1E3A'));
+		assertFalse(UnicodeData.getPropertyCodePoints("Lu").contains('\u1E3B'));
+		assertTrue(UnicodeData.getPropertyCodePoints("Ll").contains('\u1E3B'));
+		assertFalse(UnicodeData.getPropertyCodePoints("Ll").contains('\u1E3A'));
+		assertTrue(UnicodeData.getPropertyCodePoints("L").contains('\u1E3A'));
+		assertTrue(UnicodeData.getPropertyCodePoints("L").contains('\u1E3B'));
+		assertTrue(UnicodeData.getPropertyCodePoints("N").contains('\u1BB0'));
+		assertFalse(UnicodeData.getPropertyCodePoints("N").contains('\u1E3A'));
+		assertTrue(UnicodeData.getPropertyCodePoints("Z").contains('\u2028'));
+		assertFalse(UnicodeData.getPropertyCodePoints("Z").contains('\u1E3A'));
+	}
+
+	/** General category codes checked against code points above U+FFFF. */
+	@Test
+	public void testUnicodeGeneralCategoriesSMP() {
+		assertTrue(UnicodeData.getPropertyCodePoints("Lu").contains(0x1D5D4));
+		assertFalse(UnicodeData.getPropertyCodePoints("Lu").contains(0x1D770));
+		assertTrue(UnicodeData.getPropertyCodePoints("Ll").contains(0x1D770));
+		assertFalse(UnicodeData.getPropertyCodePoints("Ll").contains(0x1D5D4));
+		assertTrue(UnicodeData.getPropertyCodePoints("L").contains(0x1D5D4));
+		assertTrue(UnicodeData.getPropertyCodePoints("L").contains(0x1D770));
+		assertTrue(UnicodeData.getPropertyCodePoints("N").contains(0x11C50));
+		assertFalse(UnicodeData.getPropertyCodePoints("N").contains(0x1D5D4));
+	}
+
+	/** Long general category names are accepted in place of the short codes. */
+	@Test
+	public void testUnicodeCategoryAliases() {
+		assertTrue(UnicodeData.getPropertyCodePoints("Lowercase_Letter").contains('x'));
+		assertFalse(UnicodeData.getPropertyCodePoints("Lowercase_Letter").contains('X'));
+		assertTrue(UnicodeData.getPropertyCodePoints("Letter").contains('x'));
+		assertFalse(UnicodeData.getPropertyCodePoints("Letter").contains('0'));
+		assertTrue(UnicodeData.getPropertyCodePoints("Enclosing_Mark").contains(0x20E2));
+		assertFalse(UnicodeData.getPropertyCodePoints("Enclosing_Mark").contains('x'));
+	}
+
+	/** Binary properties looked up by name, with one member and one non-member each where given. */
+	@Test
+	public void testUnicodeBinaryProperties() {
+		assertTrue(UnicodeData.getPropertyCodePoints("Emoji").contains(0x1F4A9));
+		assertFalse(UnicodeData.getPropertyCodePoints("Emoji").contains('X'));
+		assertTrue(UnicodeData.getPropertyCodePoints("alnum").contains('9'));
+		assertFalse(UnicodeData.getPropertyCodePoints("alnum").contains(0x1F4A9));
+		assertTrue(UnicodeData.getPropertyCodePoints("Dash").contains('-'));
+		assertTrue(UnicodeData.getPropertyCodePoints("Hex").contains('D'));
+		assertFalse(UnicodeData.getPropertyCodePoints("Hex").contains('Q'));
+	}
+
+	/** Binary properties looked up through other names for the same property. */
+	@Test
+	public void testUnicodeBinaryPropertyAliases() {
+		assertTrue(UnicodeData.getPropertyCodePoints("Ideo").contains('\u611B'));
+		assertFalse(UnicodeData.getPropertyCodePoints("Ideo").contains('X'));
+		assertTrue(UnicodeData.getPropertyCodePoints("Soft_Dotted").contains('\u0456'));
+		assertFalse(UnicodeData.getPropertyCodePoints("Soft_Dotted").contains('X'));
+		assertTrue(UnicodeData.getPropertyCodePoints("Noncharacter_Code_Point").contains('\uFFFF'));
+		assertFalse(UnicodeData.getPropertyCodePoints("Noncharacter_Code_Point").contains('X'));
+	}
+
+	/** Scripts looked up by their four-letter codes. */
+	@Test
+	public void testUnicodeScripts() {
+		assertTrue(UnicodeData.getPropertyCodePoints("Zyyy").contains('0'));
+		assertTrue(UnicodeData.getPropertyCodePoints("Latn").contains('X'));
+		assertTrue(UnicodeData.getPropertyCodePoints("Hani").contains(0x4E04));
+		assertTrue(UnicodeData.getPropertyCodePoints("Cyrl").contains(0x0404));
+	}
+
+	/** The same script checks as above, using the scripts' long names. */
+	@Test
+	public void testUnicodeScriptAliases() {
+		assertTrue(UnicodeData.getPropertyCodePoints("Common").contains('0'));
+		assertTrue(UnicodeData.getPropertyCodePoints("Latin").contains('X'));
+		assertTrue(UnicodeData.getPropertyCodePoints("Han").contains(0x4E04));
+		assertTrue(UnicodeData.getPropertyCodePoints("Cyrillic").contains(0x0404));
+	}
+
+	/** Property codes and aliases are matched regardless of letter case. */
+	@Test
+	public void testPropertyCaseInsensitivity() {
+		assertTrue(UnicodeData.getPropertyCodePoints("l").contains('x'));
+		assertFalse(UnicodeData.getPropertyCodePoints("l").contains('0'));
+		assertTrue(UnicodeData.getPropertyCodePoints("common").contains('0'));
+		assertTrue(UnicodeData.getPropertyCodePoints("Alnum").contains('0'));
+	}
+
+	/** Adding to a returned set is expected to fail with an IllegalStateException. */
+	@Test
+	public void modifyingUnicodeDataShouldThrow() {
+		// Expectations must be registered before the call that throws.
+		thrown.expect(IllegalStateException.class);
+		thrown.expectMessage("can't alter readonly IntervalSet");
+		UnicodeData.getPropertyCodePoints("L").add(0x12345);
+	}
+}
--- a/tool/pom.xml
+++ b/tool/pom.xml
@ -42,11 +42,6 @@
 			<artifactId>javax.json</artifactId>
 			<version>1.0.4</version>
 		</dependency>
-		<dependency>
-			<groupId>com.ibm.icu</groupId>
-			<artifactId>icu4j</artifactId>
-			<version>58.2</version>
-		</dependency>
 	</dependencies>

  <build>
@ -85,6 +80,23 @@
 					</execution>
 				</executions>
 			</plugin>
+                        <plugin> <!-- include code-generated sources -->
+                          <groupId>org.codehaus.mojo</groupId>
+                          <artifactId>build-helper-maven-plugin</artifactId>
+                          <executions>
+                            <execution>
+                              <phase>generate-sources</phase>
+                              <goals>
+                                <goal>add-source</goal>
+                              </goals>
+                              <configuration>
+                                <sources>
+                                  <source>${project.build.directory}/generated-sources/antlr4-tool-codegen</source>
+                                </sources>
+                              </configuration>
+                            </execution>
+                          </executions>
+                        </plugin>
 			<plugin> <!-- this makes a fat jar with all dependencies -->
 				<groupId>org.apache.maven.plugins</groupId>
 				<artifactId>maven-shade-plugin</artifactId>
--- a/tool/src/org/antlr/v4/codegen/Unicode.java
+++ b/tool/src/org/antlr/v4/codegen/Unicode.java
@ -1,194 +0,0 @@
-/*
- * Copyright (c) 2012-2017 The ANTLR Project. All rights reserved.
- * Use of this file is governed by the BSD 3-clause license that
- * can be found in the LICENSE.txt file in the project root.
- */
-
-package org.antlr.v4.codegen;
-
-import com.ibm.icu.lang.UCharacter;
-import com.ibm.icu.lang.UCharacterCategory;
-import com.ibm.icu.lang.UProperty;
-import com.ibm.icu.lang.UScript;
-import com.ibm.icu.text.UnicodeSet;
-import com.ibm.icu.util.RangeValueIterator;
-
-import org.antlr.v4.runtime.misc.IntervalSet;
-
-import java.util.Map;
-import java.util.LinkedHashMap;
-
-/**
- * Utility class for calculating {@link IntervalSet}s for various
- * Unicode categories and properties.
- */
-public abstract class Unicode {
-	private static void addIntervalForCategory(
-			Map<String, IntervalSet> categoryMap,
-			String categoryName,
-			int start,
-			int finish) {
-		IntervalSet intervalSet = categoryMap.get(categoryName);
-		if (intervalSet == null) {
-			intervalSet = new IntervalSet();
-			categoryMap.put(categoryName, intervalSet);
-		}
-		intervalSet.add(start, finish);
-	}
-
-	private static String getShortPropertyName(int property) {
-		String propertyName = UCharacter.getPropertyName(property, UProperty.NameChoice.SHORT);
-		// For some reason, a few properties only have long names.
-		if (propertyName == null) {
-			propertyName = UCharacter.getPropertyName(property, UProperty.NameChoice.LONG);
-		}
-		return propertyName;
-	}
-
-	/**
-	 * Returns a map of (Unicode general category code: [0-4, 10-20, 5000-6000], ...)
-	 * pairs mapping Unicode general category codes to the {@link IntervalSet} containing
-	 * the Unicode code points in that general category.
-	 *
-	 * Note that a code point belongs to exactly one general category.
-	 *
-	 * @see <a href="http://unicode.org/reports/tr44/#General_Category_Values">UAX #44: General Category Values</a>
-	 */
-	public static Map<String, IntervalSet> getUnicodeCategoryCodesToCodePointRanges() {
-		Map<String, IntervalSet> result = new LinkedHashMap<>();
-		RangeValueIterator iter = UCharacter.getTypeIterator();
-		RangeValueIterator.Element element = new RangeValueIterator.Element();
-		while (iter.next(element)) {
-			String categoryName = UCharacter.getPropertyValueName(
-					UProperty.GENERAL_CATEGORY_MASK,
-					1 << element.value,
-					UProperty.NameChoice.SHORT);
-			addIntervalForCategory(result, categoryName, element.start, element.limit - 1);
-			// Add short category so Ll, Lu, Lo, etc. all show up under L
-			String shortCategoryName = categoryName.substring(0, 1);
-			addIntervalForCategory(result, shortCategoryName, element.start, element.limit - 1);
-		}
-		return result;
-	}
-
-	/**
-	 * Returns a map of (Unicode general category code: name, ...) pairs
-	 * mapping Unicode general category codes to their human-readable names.
-	 *
-	 * @see <a href="http://unicode.org/reports/tr44/#General_Category_Values">UAX #44: General Category Values</a>
-	 */
-	public static Map<String, String> getUnicodeCategoryCodesToNames() {
-		Map<String, String> result = new LinkedHashMap<>();
-		RangeValueIterator iter = UCharacter.getTypeIterator();
-		RangeValueIterator.Element element = new RangeValueIterator.Element();
-		while (iter.next(element)) {
-			String categoryName = UCharacter.getPropertyValueName(
-					UProperty.GENERAL_CATEGORY_MASK,
-					1 << element.value,
-					UProperty.NameChoice.SHORT);
-			String longCategoryName = UCharacter.getPropertyValueName(
-					UProperty.GENERAL_CATEGORY_MASK,
-					1 << element.value,
-					UProperty.NameChoice.LONG);
-			result.put(categoryName, longCategoryName);
-		}
-		// Add short categories
-		result.put("C", "Control");
-		result.put("L", "Letter");
-		result.put("N", "Number");
-		result.put("M", "Mark");
-		result.put("P", "Punctuation");
-		result.put("S", "Symbol");
-		result.put("Z", "Space");
-		return result;
-	}
-
-	/**
-	 * Returns a map of (Unicode binary property code: [0-4, 10-20, 5000-6000], ...)
-	 * pairs mapping Unicode binary property codes to the {@link IntervalSet} containing
-	 * the Unicode code points which have that binary property set to a true value.
-	 *
-	 * @see <a href="http://unicode.org/reports/tr44/#Property_List_Table">UAX #44: Property List Table</a>
-	 */
-	public static Map<String, IntervalSet> getUnicodeBinaryPropertyCodesToCodePointRanges() {
-		Map<String, IntervalSet> result = new LinkedHashMap<>();
-		for (int property = UProperty.BINARY_START;
-		     property < UProperty.BINARY_LIMIT;
-		     property++) {
-			String propertyName = getShortPropertyName(property);
-			IntervalSet intervalSet = new IntervalSet();
-			result.put(propertyName, intervalSet);
-			UnicodeSet set = new UnicodeSet();
-			set.applyIntPropertyValue(property, 1);
-			for (UnicodeSet.EntryRange range : set.ranges()) {
-				intervalSet.add(range.codepoint, range.codepointEnd);
-			}
-		}
-		return result;
-	}
-
-	/**
-	 * Returns a map of (Unicode binary property code: name, ...) pairs
-	 * mapping Unicode binary property codes to their human-readable names.
-	 *
-	 * @see <a href="http://unicode.org/reports/tr44/#Property_List_Table">UAX #44: Property List Table</a>
-	 */
-	public static Map<String, String> getUnicodeBinaryPropertyCodesToNames() {
-		Map<String, String> result = new LinkedHashMap<>();
-		for (int property = UProperty.BINARY_START;
-		     property < UProperty.BINARY_LIMIT;
-		     property++) {
-			String propertyName = getShortPropertyName(property);
-			String longPropertyName = UCharacter.getPropertyName(property, UProperty.NameChoice.LONG);
-			result.put(propertyName, longPropertyName);
-		}
-		return result;
-	}
-
-	/**
-	 * Returns a map of (Unicode script code: [0-4, 10-20, 5000-6000], ...)
-	 * pairs mapping Unicode script codes to the {@link IntervalSet} containing
-	 * the Unicode code points which use that script.
-	 *
-	 * Note that some code points belong to multiple scripts.
-	 *
-	 * @see <a href="https://en.wikipedia.org/wiki/Script_(Unicode)#Table_of_scripts_in_Unicode">Table of scripts in Unicode</a>
-	 */
-	public static Map<String, IntervalSet> getUnicodeScriptCodesToCodePointRanges() {
-		Map<String, IntervalSet> result = new LinkedHashMap<>();
-		for (int script = UCharacter.getIntPropertyMinValue(UProperty.SCRIPT);
-		     script <= UCharacter.getIntPropertyMaxValue(UProperty.SCRIPT);
-		     script++) {
-			UnicodeSet set = new UnicodeSet();
-			set.applyIntPropertyValue(UProperty.SCRIPT, script);
-			String scriptName = UCharacter.getPropertyValueName(UProperty.SCRIPT, script, UProperty.NameChoice.SHORT);
-			IntervalSet intervalSet = result.get(scriptName);
-			if (intervalSet == null) {
-				intervalSet = new IntervalSet();
-				result.put(scriptName, intervalSet);
-			}
-			for (UnicodeSet.EntryRange range : set.ranges()) {
-				intervalSet.add(range.codepoint, range.codepointEnd);
-			}
-		}
-		return result;
-	}
-
-	/**
-	 * Returns a map of (Unicode script code: name, ...) pairs
-	 * mapping Unicode script codes to their human-readable names.
-	 *
-	 * @see <a href="https://en.wikipedia.org/wiki/Script_(Unicode)#Table_of_scripts_in_Unicode">Table of scripts in Unicode</a>
-	 */
-	public static Map<String, String> getUnicodeScriptCodesToNames() {
-		Map<String, String> result = new LinkedHashMap<>();
-		for (int script = UCharacter.getIntPropertyMinValue(UProperty.SCRIPT);
-		     script <= UCharacter.getIntPropertyMaxValue(UProperty.SCRIPT);
-		     script++) {
-			String propertyName = UCharacter.getPropertyValueName(UProperty.SCRIPT, script, UProperty.NameChoice.SHORT);
-			String longPropertyName = UCharacter.getPropertyValueName(UProperty.SCRIPT, script, UProperty.NameChoice.LONG);
-			result.put(propertyName, longPropertyName);
-		}
-		return result;
-	}
-}