diff --git a/pom.xml b/pom.xml index 6fae71d24..112ba3338 100644 --- a/pom.xml +++ b/pom.xml @@ -78,6 +78,7 @@ runtime/Java + tool-codegen tool antlr4-maven-plugin tool-testsuite diff --git a/tool-codegen/pom.xml b/tool-codegen/pom.xml new file mode 100644 index 000000000..478ff0f36 --- /dev/null +++ b/tool-codegen/pom.xml @@ -0,0 +1,72 @@ + + + + 4.0.0 + + org.antlr + antlr4-master + 4.6.1-SNAPSHOT + + antlr4-tool-codegen + ANTLR 4 Tool Codegen + http://www.antlr.org + Codegen for the ANTLR 4 grammar compiler. + + + org.antlr + antlr4-runtime + ${project.version} + + + com.ibm.icu + icu4j + 58.2 + + + + src + + + com.webguys + string-template-maven-plugin + 1.1 + + + + org.twdata.maven + mojo-executor + 2.1.0 + + + + + + + + + + generate-sources + + render + + + + + + + diff --git a/tool-codegen/src/main/string-template/unicodedata.st b/tool-codegen/src/main/string-template/unicodedata.st new file mode 100644 index 000000000..3930498af --- /dev/null +++ b/tool-codegen/src/main/string-template/unicodedata.st @@ -0,0 +1,58 @@ +unicodedata(propertyCodePointRanges, propertyAliases) ::= << +package org.antlr.v4.unicode; + +import java.util.Arrays; +import java.util.HashMap; +import java.util.List; +import java.util.Locale; +import java.util.Map; + +import org.antlr.v4.runtime.misc.IntervalSet; +import org.antlr.v4.runtime.misc.Interval; + +/** + * Code-generated utility class mapping Unicode properties to Unicode code point ranges. + */ +public abstract class UnicodeData { + private static final Map\ propertyCodePointRanges = new HashMap\<\>(); + private static final Map\ propertyAliases = new HashMap\<\>(); + + // Work around Java 64k bytecode method limit by splitting up static + // initialization into one method per Unicode property + + " +static private void addProperty() { + List\ intervals = Arrays.asList( + , )}; separator=",\n"> + ); + IntervalSet codePointRanges = new IntervalSet(intervals); + codePointRanges.setReadonly(true); + propertyCodePointRanges.put("".toLowerCase(Locale.US), codePointRanges); +\}}; separator="\n\n"> + + // Property aliases + static private void addPropertyAliases() { + ".toLowerCase(Locale.US), "".toLowerCase(Locale.US)); }; separator="\n"> + } + + // Put it all together + static { + (); }; separator="\n"> + addPropertyAliases(); + } + + /** + * Given a Unicode property (general category code, binary property name, or script name), + * returns the {@link IntervalSet} of Unicode code point ranges which have that property. + */ + public static IntervalSet getPropertyCodePoints(String propertyCodeOrAlias) { + String normalizedPropertyCodeOrAlias = propertyCodeOrAlias.toLowerCase(Locale.US); + IntervalSet result = propertyCodePointRanges.get(normalizedPropertyCodeOrAlias); + if (result == null) { + String propertyCode = propertyAliases.get(normalizedPropertyCodeOrAlias); + result = propertyCodePointRanges.get(propertyCode); + } + return result; + } +} +>> diff --git a/tool-codegen/src/org/antlr/v4/unicode/UnicodeDataTemplateController.java b/tool-codegen/src/org/antlr/v4/unicode/UnicodeDataTemplateController.java new file mode 100644 index 000000000..111ec04d2 --- /dev/null +++ b/tool-codegen/src/org/antlr/v4/unicode/UnicodeDataTemplateController.java @@ -0,0 +1,212 @@ +/* + * Copyright (c) 2012-2017 The ANTLR Project. All rights reserved. + * Use of this file is governed by the BSD 3-clause license that + * can be found in the LICENSE.txt file in the project root. + */ + +package org.antlr.v4.unicode; + +import com.ibm.icu.lang.UCharacter; +import com.ibm.icu.lang.UCharacterCategory; +import com.ibm.icu.lang.UProperty; +import com.ibm.icu.lang.UScript; +import com.ibm.icu.text.UnicodeSet; +import com.ibm.icu.util.RangeValueIterator; + +import org.antlr.v4.runtime.misc.IntervalSet; + +import java.util.LinkedHashMap; +import java.util.Locale; +import java.util.Map; + +/** + * StringTemplate controller used to generate parameters to feed + * to {@code unicodedata.st} to code-generate {@code UnicodeData.java}, + * used by the tool for Unicode property escapes like {@code \\p\{Lu\}}. + * + * Uses ICU to iterate over Unicode character categories, properties, + * and script codes, as well as aliases for those codes. + * + * This class exists in its own Maven module to avoid adding a + * dependency from the tool onto the (large) ICU runtime. + */ +public abstract class UnicodeDataTemplateController { + private static void addIntervalForCategory( + Map categoryMap, + String categoryName, + int start, + int finish) { + IntervalSet intervalSet = categoryMap.get(categoryName); + if (intervalSet == null) { + intervalSet = new IntervalSet(); + categoryMap.put(categoryName, intervalSet); + } + intervalSet.add(start, finish); + } + + private static void addPropertyAliases( + Map propertyAliases, + String propertyName, + int property) { + int nameChoice = UProperty.NameChoice.LONG; + while (true) { + String alias; + try { + alias = UCharacter.getPropertyName(property, nameChoice); + } catch (IllegalArgumentException e) { + // No more aliases. + break; + } + assert alias != null; + addPropertyAlias(propertyAliases, alias, propertyName); + nameChoice++; + } + } + + private static void addPropertyAlias( + Map propertyAliases, + String alias, + String propertyName) { + propertyAliases.put(alias, propertyName); + } + + public static Map getProperties() { + Map propertyCodePointRanges = new LinkedHashMap<>(); + addUnicodeCategoryCodesToCodePointRanges(propertyCodePointRanges); + addUnicodeBinaryPropertyCodesToCodePointRanges(propertyCodePointRanges); + addUnicodeScriptCodesToCodePointRanges(propertyCodePointRanges); + + Map propertyAliases = new LinkedHashMap<>(); + addUnicodeCategoryCodesToNames(propertyAliases); + addUnicodeBinaryPropertyCodesToNames(propertyAliases); + addUnicodeScriptCodesToNames(propertyAliases); + + Map properties = new LinkedHashMap<>(); + properties.put("propertyCodePointRanges", propertyCodePointRanges); + properties.put("propertyAliases", propertyAliases); + return properties; + } + + private static String getShortPropertyName(int property) { + String propertyName = UCharacter.getPropertyName(property, UProperty.NameChoice.SHORT); + // For some reason, a few properties only have long names. + if (propertyName == null) { + propertyName = UCharacter.getPropertyName(property, UProperty.NameChoice.LONG); + } + return propertyName; + } + + private static void addUnicodeCategoryCodesToCodePointRanges(Map propertyCodePointRanges) { + RangeValueIterator iter = UCharacter.getTypeIterator(); + RangeValueIterator.Element element = new RangeValueIterator.Element(); + while (iter.next(element)) { + String categoryName = UCharacter.getPropertyValueName( + UProperty.GENERAL_CATEGORY_MASK, + 1 << element.value, + UProperty.NameChoice.SHORT); + addIntervalForCategory(propertyCodePointRanges, categoryName, element.start, element.limit - 1); + // Add short category so Ll, Lu, Lo, etc. all show up under L + String shortCategoryName = categoryName.substring(0, 1); + addIntervalForCategory(propertyCodePointRanges, shortCategoryName, element.start, element.limit - 1); + } + } + + private static void addUnicodeCategoryCodesToNames(Map propertyAliases) { + RangeValueIterator iter = UCharacter.getTypeIterator(); + RangeValueIterator.Element element = new RangeValueIterator.Element(); + while (iter.next(element)) { + int generalCategoryMask = 1 << element.value; + String categoryName = UCharacter.getPropertyValueName( + UProperty.GENERAL_CATEGORY_MASK, + generalCategoryMask, + UProperty.NameChoice.SHORT); + int nameChoice = UProperty.NameChoice.LONG; + while (true) { + String alias; + try { + alias = UCharacter.getPropertyValueName( + UProperty.GENERAL_CATEGORY_MASK, + generalCategoryMask, + nameChoice); + } catch (IllegalArgumentException e) { + // No more aliases. + break; + } + assert alias != null; + addPropertyAlias(propertyAliases, alias, categoryName); + nameChoice++; + } + } + // Add short categories + addPropertyAlias(propertyAliases, "Control", "C"); + addPropertyAlias(propertyAliases, "Letter", "L"); + addPropertyAlias(propertyAliases, "Number", "N"); + addPropertyAlias(propertyAliases, "Mark", "M"); + addPropertyAlias(propertyAliases, "Punctuation", "P"); + addPropertyAlias(propertyAliases, "Symbol", "S"); + addPropertyAlias(propertyAliases, "Space", "Z"); + } + + private static void addUnicodeBinaryPropertyCodesToCodePointRanges(Map propertyCodePointRanges) { + for (int property = UProperty.BINARY_START; + property < UProperty.BINARY_LIMIT; + property++) { + String propertyName = getShortPropertyName(property); + IntervalSet intervalSet = new IntervalSet(); + UnicodeSet unicodeSet = new UnicodeSet(); + unicodeSet.applyIntPropertyValue(property, 1); + for (UnicodeSet.EntryRange range : unicodeSet.ranges()) { + intervalSet.add(range.codepoint, range.codepointEnd); + } + propertyCodePointRanges.put(propertyName, intervalSet); + } + } + + private static void addUnicodeBinaryPropertyCodesToNames(Map propertyAliases) { + for (int property = UProperty.BINARY_START; + property < UProperty.BINARY_LIMIT; + property++) { + String propertyName = getShortPropertyName(property); + addPropertyAliases(propertyAliases, propertyName, property); + } + } + + private static void addUnicodeScriptCodesToCodePointRanges(Map propertyCodePointRanges) { + for (int script = UCharacter.getIntPropertyMinValue(UProperty.SCRIPT); + script <= UCharacter.getIntPropertyMaxValue(UProperty.SCRIPT); + script++) { + UnicodeSet set = new UnicodeSet(); + set.applyIntPropertyValue(UProperty.SCRIPT, script); + String scriptName = UCharacter.getPropertyValueName(UProperty.SCRIPT, script, UProperty.NameChoice.SHORT); + IntervalSet intervalSet = propertyCodePointRanges.get(scriptName); + if (intervalSet == null) { + intervalSet = new IntervalSet(); + propertyCodePointRanges.put(scriptName, intervalSet); + } + for (UnicodeSet.EntryRange range : set.ranges()) { + intervalSet.add(range.codepoint, range.codepointEnd); + } + } + } + + private static void addUnicodeScriptCodesToNames(Map propertyAliases) { + for (int script = UCharacter.getIntPropertyMinValue(UProperty.SCRIPT); + script <= UCharacter.getIntPropertyMaxValue(UProperty.SCRIPT); + script++) { + String propertyName = UCharacter.getPropertyValueName(UProperty.SCRIPT, script, UProperty.NameChoice.SHORT); + int nameChoice = UProperty.NameChoice.LONG; + String alias; + while (true) { + try { + alias = UCharacter.getPropertyValueName(UProperty.SCRIPT, script, nameChoice); + } catch (IllegalArgumentException e) { + // No more aliases. + break; + } + assert alias != null; + addPropertyAlias(propertyAliases, alias, propertyName); + nameChoice++; + } + } + } +} diff --git a/tool-testsuite/test/org/antlr/v4/test/tool/TestUnicode.java b/tool-testsuite/test/org/antlr/v4/test/tool/TestUnicode.java deleted file mode 100644 index 195bb9cde..000000000 --- a/tool-testsuite/test/org/antlr/v4/test/tool/TestUnicode.java +++ /dev/null @@ -1,80 +0,0 @@ -/* - * Copyright (c) 2012-2017 The ANTLR Project. All rights reserved. - * Use of this file is governed by the BSD 3-clause license that - * can be found in the LICENSE.txt file in the project root. - */ - -package org.antlr.v4.test.tool; - -import java.util.Map; - -import org.antlr.v4.codegen.Unicode; -import org.antlr.v4.runtime.misc.IntervalSet; - -import org.junit.Test; - -import static org.junit.Assert.assertFalse; -import static org.junit.Assert.assertEquals; -import static org.junit.Assert.assertTrue; - -public class TestUnicode { - @Test - public void testUnicodeCategoryCodes() { - Map unicodeCategoryCodesToCodePointRanges = Unicode.getUnicodeCategoryCodesToCodePointRanges(); - assertTrue(unicodeCategoryCodesToCodePointRanges.get("Lu").contains('X')); - assertFalse(unicodeCategoryCodesToCodePointRanges.get("Lu").contains('x')); - assertTrue(unicodeCategoryCodesToCodePointRanges.get("Ll").contains('x')); - assertFalse(unicodeCategoryCodesToCodePointRanges.get("Ll").contains('X')); - assertTrue(unicodeCategoryCodesToCodePointRanges.get("L").contains('X')); - assertTrue(unicodeCategoryCodesToCodePointRanges.get("L").contains('x')); - assertTrue(unicodeCategoryCodesToCodePointRanges.get("N").contains('0')); - assertTrue(unicodeCategoryCodesToCodePointRanges.get("Z").contains(' ')); - } - - @Test - public void testUnicodeCategoryCodesToNames() { - Map unicodeCategoryCodesToNames = Unicode.getUnicodeCategoryCodesToNames(); - assertEquals("Lowercase_Letter", unicodeCategoryCodesToNames.get("Ll")); - assertEquals("Letter", unicodeCategoryCodesToNames.get("L")); - assertEquals("Enclosing_Mark", unicodeCategoryCodesToNames.get("Me")); - assertEquals("Mark", unicodeCategoryCodesToNames.get("M")); - } - - @Test - public void testUnicodeBinaryPropertyCodesToCodePointRanges() { - Map unicodeBinaryPropertyCodesToCodePointRanges = Unicode.getUnicodeBinaryPropertyCodesToCodePointRanges(); - assertTrue(unicodeBinaryPropertyCodesToCodePointRanges.get("Emoji").contains(0x1F4A9)); - assertFalse(unicodeBinaryPropertyCodesToCodePointRanges.get("Emoji").contains('X')); - assertTrue(unicodeBinaryPropertyCodesToCodePointRanges.get("alnum").contains('9')); - assertFalse(unicodeBinaryPropertyCodesToCodePointRanges.get("alnum").contains(0x1F4A9)); - assertTrue(unicodeBinaryPropertyCodesToCodePointRanges.get("Dash").contains('-')); - assertTrue(unicodeBinaryPropertyCodesToCodePointRanges.get("Hex").contains('D')); - assertFalse(unicodeBinaryPropertyCodesToCodePointRanges.get("Hex").contains('Q')); - } - - @Test - public void testUnicodeBinaryPropertyCodesToNames() { - Map unicodeBinaryPropertyCodesToNames = Unicode.getUnicodeBinaryPropertyCodesToNames(); - assertEquals("Ideographic", unicodeBinaryPropertyCodesToNames.get("Ideo")); - assertEquals("Soft_Dotted", unicodeBinaryPropertyCodesToNames.get("SD")); - assertEquals("Noncharacter_Code_Point", unicodeBinaryPropertyCodesToNames.get("NChar")); - } - - @Test - public void testUnicodeScriptCodesToCodePointRanges() { - Map unicodeScriptCodesToCodePointRanges = Unicode.getUnicodeScriptCodesToCodePointRanges(); - assertTrue(unicodeScriptCodesToCodePointRanges.get("Zyyy").contains('0')); - assertTrue(unicodeScriptCodesToCodePointRanges.get("Latn").contains('X')); - assertTrue(unicodeScriptCodesToCodePointRanges.get("Hani").contains(0x4E04)); - assertTrue(unicodeScriptCodesToCodePointRanges.get("Cyrl").contains(0x0404)); - } - - @Test - public void testUnicodeScriptCodesToNames() { - Map unicodeScriptCodesToNames = Unicode.getUnicodeScriptCodesToNames(); - assertEquals("Common", unicodeScriptCodesToNames.get("Zyyy")); - assertEquals("Latin", unicodeScriptCodesToNames.get("Latn")); - assertEquals("Han", unicodeScriptCodesToNames.get("Hani")); - assertEquals("Cyrillic", unicodeScriptCodesToNames.get("Cyrl")); - } -} diff --git a/tool-testsuite/test/org/antlr/v4/test/tool/TestUnicodeData.java b/tool-testsuite/test/org/antlr/v4/test/tool/TestUnicodeData.java new file mode 100644 index 000000000..b6b873fb8 --- /dev/null +++ b/tool-testsuite/test/org/antlr/v4/test/tool/TestUnicodeData.java @@ -0,0 +1,125 @@ +/* + * Copyright (c) 2012-2017 The ANTLR Project. All rights reserved. + * Use of this file is governed by the BSD 3-clause license that + * can be found in the LICENSE.txt file in the project root. + */ + +package org.antlr.v4.test.tool; + +import java.util.Map; + +import org.antlr.v4.unicode.UnicodeData; +import org.antlr.v4.runtime.misc.IntervalSet; + +import org.junit.Test; +import org.junit.Rule; +import org.junit.rules.ExpectedException; + +import static org.junit.Assert.assertFalse; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertTrue; + +public class TestUnicodeData { + @Rule + public ExpectedException thrown = ExpectedException.none(); + + @Test + public void testUnicodeGeneralCategoriesLatin() { + assertTrue(UnicodeData.getPropertyCodePoints("Lu").contains('X')); + assertFalse(UnicodeData.getPropertyCodePoints("Lu").contains('x')); + assertTrue(UnicodeData.getPropertyCodePoints("Ll").contains('x')); + assertFalse(UnicodeData.getPropertyCodePoints("Ll").contains('X')); + assertTrue(UnicodeData.getPropertyCodePoints("L").contains('X')); + assertTrue(UnicodeData.getPropertyCodePoints("L").contains('x')); + assertTrue(UnicodeData.getPropertyCodePoints("N").contains('0')); + assertTrue(UnicodeData.getPropertyCodePoints("Z").contains(' ')); + } + + @Test + public void testUnicodeGeneralCategoriesBMP() { + assertTrue(UnicodeData.getPropertyCodePoints("Lu").contains('\u1E3A')); + assertFalse(UnicodeData.getPropertyCodePoints("Lu").contains('\u1E3B')); + assertTrue(UnicodeData.getPropertyCodePoints("Ll").contains('\u1E3B')); + assertFalse(UnicodeData.getPropertyCodePoints("Ll").contains('\u1E3A')); + assertTrue(UnicodeData.getPropertyCodePoints("L").contains('\u1E3A')); + assertTrue(UnicodeData.getPropertyCodePoints("L").contains('\u1E3B')); + assertTrue(UnicodeData.getPropertyCodePoints("N").contains('\u1BB0')); + assertFalse(UnicodeData.getPropertyCodePoints("N").contains('\u1E3A')); + assertTrue(UnicodeData.getPropertyCodePoints("Z").contains('\u2028')); + assertFalse(UnicodeData.getPropertyCodePoints("Z").contains('\u1E3A')); + } + + @Test + public void testUnicodeGeneralCategoriesSMP() { + assertTrue(UnicodeData.getPropertyCodePoints("Lu").contains(0x1D5D4)); + assertFalse(UnicodeData.getPropertyCodePoints("Lu").contains(0x1D770)); + assertTrue(UnicodeData.getPropertyCodePoints("Ll").contains(0x1D770)); + assertFalse(UnicodeData.getPropertyCodePoints("Ll").contains(0x1D5D4)); + assertTrue(UnicodeData.getPropertyCodePoints("L").contains(0x1D5D4)); + assertTrue(UnicodeData.getPropertyCodePoints("L").contains(0x1D770)); + assertTrue(UnicodeData.getPropertyCodePoints("N").contains(0x11C50)); + assertFalse(UnicodeData.getPropertyCodePoints("N").contains(0x1D5D4)); + } + + @Test + public void testUnicodeCategoryAliases() { + assertTrue(UnicodeData.getPropertyCodePoints("Lowercase_Letter").contains('x')); + assertFalse(UnicodeData.getPropertyCodePoints("Lowercase_Letter").contains('X')); + assertTrue(UnicodeData.getPropertyCodePoints("Letter").contains('x')); + assertFalse(UnicodeData.getPropertyCodePoints("Letter").contains('0')); + assertTrue(UnicodeData.getPropertyCodePoints("Enclosing_Mark").contains(0x20E2)); + assertFalse(UnicodeData.getPropertyCodePoints("Enclosing_Mark").contains('x')); + } + + @Test + public void testUnicodeBinaryProperties() { + assertTrue(UnicodeData.getPropertyCodePoints("Emoji").contains(0x1F4A9)); + assertFalse(UnicodeData.getPropertyCodePoints("Emoji").contains('X')); + assertTrue(UnicodeData.getPropertyCodePoints("alnum").contains('9')); + assertFalse(UnicodeData.getPropertyCodePoints("alnum").contains(0x1F4A9)); + assertTrue(UnicodeData.getPropertyCodePoints("Dash").contains('-')); + assertTrue(UnicodeData.getPropertyCodePoints("Hex").contains('D')); + assertFalse(UnicodeData.getPropertyCodePoints("Hex").contains('Q')); + } + + @Test + public void testUnicodeBinaryPropertyAliases() { + assertTrue(UnicodeData.getPropertyCodePoints("Ideo").contains('\u611B')); + assertFalse(UnicodeData.getPropertyCodePoints("Ideo").contains('X')); + assertTrue(UnicodeData.getPropertyCodePoints("Soft_Dotted").contains('\u0456')); + assertFalse(UnicodeData.getPropertyCodePoints("Soft_Dotted").contains('X')); + assertTrue(UnicodeData.getPropertyCodePoints("Noncharacter_Code_Point").contains('\uFFFF')); + assertFalse(UnicodeData.getPropertyCodePoints("Noncharacter_Code_Point").contains('X')); + } + + @Test + public void testUnicodeScripts() { + assertTrue(UnicodeData.getPropertyCodePoints("Zyyy").contains('0')); + assertTrue(UnicodeData.getPropertyCodePoints("Latn").contains('X')); + assertTrue(UnicodeData.getPropertyCodePoints("Hani").contains(0x4E04)); + assertTrue(UnicodeData.getPropertyCodePoints("Cyrl").contains(0x0404)); + } + + @Test + public void testUnicodeScriptAliases() { + assertTrue(UnicodeData.getPropertyCodePoints("Common").contains('0')); + assertTrue(UnicodeData.getPropertyCodePoints("Latin").contains('X')); + assertTrue(UnicodeData.getPropertyCodePoints("Han").contains(0x4E04)); + assertTrue(UnicodeData.getPropertyCodePoints("Cyrillic").contains(0x0404)); + } + + @Test + public void testPropertyCaseInsensitivity() { + assertTrue(UnicodeData.getPropertyCodePoints("l").contains('x')); + assertFalse(UnicodeData.getPropertyCodePoints("l").contains('0')); + assertTrue(UnicodeData.getPropertyCodePoints("common").contains('0')); + assertTrue(UnicodeData.getPropertyCodePoints("Alnum").contains('0')); + } + + @Test + public void modifyingUnicodeDataShouldThrow() { + thrown.expect(IllegalStateException.class); + thrown.expectMessage("can't alter readonly IntervalSet"); + UnicodeData.getPropertyCodePoints("L").add(0x12345); + } +} diff --git a/tool/pom.xml b/tool/pom.xml index 4b04ee177..fc30236a6 100644 --- a/tool/pom.xml +++ b/tool/pom.xml @@ -42,11 +42,6 @@ javax.json 1.0.4 - - com.ibm.icu - icu4j - 58.2 - @@ -85,6 +80,23 @@ + + org.codehaus.mojo + build-helper-maven-plugin + + + generate-sources + + add-source + + + + ${project.build.directory}/generated-sources/antlr4-tool-codegen + + + + + org.apache.maven.plugins maven-shade-plugin diff --git a/tool/src/org/antlr/v4/codegen/Unicode.java b/tool/src/org/antlr/v4/codegen/Unicode.java deleted file mode 100644 index a1360e293..000000000 --- a/tool/src/org/antlr/v4/codegen/Unicode.java +++ /dev/null @@ -1,194 +0,0 @@ -/* - * Copyright (c) 2012-2017 The ANTLR Project. All rights reserved. - * Use of this file is governed by the BSD 3-clause license that - * can be found in the LICENSE.txt file in the project root. - */ - -package org.antlr.v4.codegen; - -import com.ibm.icu.lang.UCharacter; -import com.ibm.icu.lang.UCharacterCategory; -import com.ibm.icu.lang.UProperty; -import com.ibm.icu.lang.UScript; -import com.ibm.icu.text.UnicodeSet; -import com.ibm.icu.util.RangeValueIterator; - -import org.antlr.v4.runtime.misc.IntervalSet; - -import java.util.Map; -import java.util.LinkedHashMap; - -/** - * Utility class for calculating {@link IntervalSet}s for various - * Unicode categories and properties. - */ -public abstract class Unicode { - private static void addIntervalForCategory( - Map categoryMap, - String categoryName, - int start, - int finish) { - IntervalSet intervalSet = categoryMap.get(categoryName); - if (intervalSet == null) { - intervalSet = new IntervalSet(); - categoryMap.put(categoryName, intervalSet); - } - intervalSet.add(start, finish); - } - - private static String getShortPropertyName(int property) { - String propertyName = UCharacter.getPropertyName(property, UProperty.NameChoice.SHORT); - // For some reason, a few properties only have long names. - if (propertyName == null) { - propertyName = UCharacter.getPropertyName(property, UProperty.NameChoice.LONG); - } - return propertyName; - } - - /** - * Returns a map of (Unicode general category code: [0-4, 10-20, 5000-6000], ...) - * pairs mapping Unicode general category codes to the {@link IntervalSet} containing - * the Unicode code points in that general category. - * - * Note that a code point belongs to exactly one general category. - * - * {@see http://unicode.org/reports/tr44/#General_Category_Values} - */ - public static Map getUnicodeCategoryCodesToCodePointRanges() { - Map result = new LinkedHashMap<>(); - RangeValueIterator iter = UCharacter.getTypeIterator(); - RangeValueIterator.Element element = new RangeValueIterator.Element(); - while (iter.next(element)) { - String categoryName = UCharacter.getPropertyValueName( - UProperty.GENERAL_CATEGORY_MASK, - 1 << element.value, - UProperty.NameChoice.SHORT); - addIntervalForCategory(result, categoryName, element.start, element.limit - 1); - // Add short category so Ll, Lu, Lo, etc. all show up under L - String shortCategoryName = categoryName.substring(0, 1); - addIntervalForCategory(result, shortCategoryName, element.start, element.limit - 1); - } - return result; - } - - /** - * Returns a map of (Unicode general category code: name, ...) pairs - * mapping Unicode general category codes to their human-readable names. - * - * {@see http://unicode.org/reports/tr44/#General_Category_Values} - */ - public static Map getUnicodeCategoryCodesToNames() { - Map result = new LinkedHashMap<>(); - RangeValueIterator iter = UCharacter.getTypeIterator(); - RangeValueIterator.Element element = new RangeValueIterator.Element(); - while (iter.next(element)) { - String categoryName = UCharacter.getPropertyValueName( - UProperty.GENERAL_CATEGORY_MASK, - 1 << element.value, - UProperty.NameChoice.SHORT); - String longCategoryName = UCharacter.getPropertyValueName( - UProperty.GENERAL_CATEGORY_MASK, - 1 << element.value, - UProperty.NameChoice.LONG); - result.put(categoryName, longCategoryName); - } - // Add short categories - result.put("C", "Control"); - result.put("L", "Letter"); - result.put("N", "Number"); - result.put("M", "Mark"); - result.put("P", "Punctuation"); - result.put("S", "Symbol"); - result.put("Z", "Space"); - return result; - } - - /** - * Returns a map of (Unicode binary property code: [0-4, 10-20, 5000-6000], ...) - * pairs mapping Unicode binary property codes to the {@link IntervalSet} containing - * the Unicode code points which have that binary property set to a true value. - * - * {@see http://unicode.org/reports/tr44/#Property_List_Table} - */ - public static Map getUnicodeBinaryPropertyCodesToCodePointRanges() { - Map result = new LinkedHashMap<>(); - for (int property = UProperty.BINARY_START; - property < UProperty.BINARY_LIMIT; - property++) { - String propertyName = getShortPropertyName(property); - IntervalSet intervalSet = new IntervalSet(); - result.put(propertyName, intervalSet); - UnicodeSet set = new UnicodeSet(); - set.applyIntPropertyValue(property, 1); - for (UnicodeSet.EntryRange range : set.ranges()) { - intervalSet.add(range.codepoint, range.codepointEnd); - } - } - return result; - } - - /** - * Returns a map of (Unicode general category code: name, ...) pairs - * mapping Unicode binary property codes to their human-readable names. - * - * {@see http://unicode.org/reports/tr44/#Property_List_Table} - */ - public static Map getUnicodeBinaryPropertyCodesToNames() { - Map result = new LinkedHashMap<>(); - for (int property = UProperty.BINARY_START; - property < UProperty.BINARY_LIMIT; - property++) { - String propertyName = getShortPropertyName(property); - String longPropertyName = UCharacter.getPropertyName(property, UProperty.NameChoice.LONG); - result.put(propertyName, longPropertyName); - } - return result; - } - - /** - * Returns a map of (Unicode script code: [0-4, 10-20, 5000-6000], ...) - * pairs mapping Unicode script codes to the {@link IntervalSet} containing - * the Unicode code points which use that script. - * - * Note that some code points belong to multiple scripts. - * - * {@see https://en.wikipedia.org/wiki/Script_(Unicode)#Table_of_scripts_in_Unicode} - */ - public static Map getUnicodeScriptCodesToCodePointRanges() { - Map result = new LinkedHashMap<>(); - for (int script = UCharacter.getIntPropertyMinValue(UProperty.SCRIPT); - script <= UCharacter.getIntPropertyMaxValue(UProperty.SCRIPT); - script++) { - UnicodeSet set = new UnicodeSet(); - set.applyIntPropertyValue(UProperty.SCRIPT, script); - String scriptName = UCharacter.getPropertyValueName(UProperty.SCRIPT, script, UProperty.NameChoice.SHORT); - IntervalSet intervalSet = result.get(scriptName); - if (intervalSet == null) { - intervalSet = new IntervalSet(); - result.put(scriptName, intervalSet); - } - for (UnicodeSet.EntryRange range : set.ranges()) { - intervalSet.add(range.codepoint, range.codepointEnd); - } - } - return result; - } - - /** - * Returns a map of (Unicode script code: name, ...) pairs - * mapping Unicode script codes to their human-readable names. - * - * {@see https://en.wikipedia.org/wiki/Script_(Unicode)#Table_of_scripts_in_Unicode} - */ - public static Map getUnicodeScriptCodesToNames() { - Map result = new LinkedHashMap<>(); - for (int script = UCharacter.getIntPropertyMinValue(UProperty.SCRIPT); - script <= UCharacter.getIntPropertyMaxValue(UProperty.SCRIPT); - script++) { - String propertyName = UCharacter.getPropertyValueName(UProperty.SCRIPT, script, UProperty.NameChoice.SHORT); - String longPropertyName = UCharacter.getPropertyValueName(UProperty.SCRIPT, script, UProperty.NameChoice.LONG); - result.put(propertyName, longPropertyName); - } - return result; - } -}