From f384ef5fb49ad5e7d6b9b5f4d44e70021fd2c3d4 Mon Sep 17 00:00:00 2001
From: Ben Hamilton
Date: Tue, 21 Feb 2017 12:26:08 -0800
Subject: [PATCH] New utility class Unicode

---
 .../org/antlr/v4/test/tool/TestUnicode.java   |  82 ++++++++
 tool/pom.xml                                  |   5 +
 tool/src/org/antlr/v4/codegen/Unicode.java    | 202 ++++++++++++++++++
 3 files changed, 289 insertions(+)
 create mode 100644 tool-testsuite/test/org/antlr/v4/test/tool/TestUnicode.java
 create mode 100644 tool/src/org/antlr/v4/codegen/Unicode.java

diff --git a/tool-testsuite/test/org/antlr/v4/test/tool/TestUnicode.java b/tool-testsuite/test/org/antlr/v4/test/tool/TestUnicode.java
new file mode 100644
index 000000000..195bb9cde
--- /dev/null
+++ b/tool-testsuite/test/org/antlr/v4/test/tool/TestUnicode.java
@@ -0,0 +1,82 @@
+/*
+ * Copyright (c) 2012-2017 The ANTLR Project. All rights reserved.
+ * Use of this file is governed by the BSD 3-clause license that
+ * can be found in the LICENSE.txt file in the project root.
+ */
+
+package org.antlr.v4.test.tool;
+
+import java.util.Map;
+
+import org.antlr.v4.codegen.Unicode;
+import org.antlr.v4.runtime.misc.IntervalSet;
+
+import org.junit.Test;
+
+import static org.junit.Assert.assertFalse;
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertTrue;
+
+public class TestUnicode {
+	@Test
+	public void testUnicodeCategoryCodes() {
+		Map<String, IntervalSet> unicodeCategoryCodesToCodePointRanges = Unicode.getUnicodeCategoryCodesToCodePointRanges();
+		assertTrue(unicodeCategoryCodesToCodePointRanges.get("Lu").contains('X'));
+		assertFalse(unicodeCategoryCodesToCodePointRanges.get("Lu").contains('x'));
+		assertTrue(unicodeCategoryCodesToCodePointRanges.get("Ll").contains('x'));
+		assertFalse(unicodeCategoryCodesToCodePointRanges.get("Ll").contains('X'));
+		assertTrue(unicodeCategoryCodesToCodePointRanges.get("L").contains('X'));
+		assertTrue(unicodeCategoryCodesToCodePointRanges.get("L").contains('x'));
+		assertTrue(unicodeCategoryCodesToCodePointRanges.get("N").contains('0'));
+		assertTrue(unicodeCategoryCodesToCodePointRanges.get("Z").contains(' '));
+	}
+
+	@Test
+	public void testUnicodeCategoryCodesToNames() {
+		Map<String, String> unicodeCategoryCodesToNames = Unicode.getUnicodeCategoryCodesToNames();
+		assertEquals("Lowercase_Letter", unicodeCategoryCodesToNames.get("Ll"));
+		assertEquals("Letter", unicodeCategoryCodesToNames.get("L"));
+		assertEquals("Enclosing_Mark", unicodeCategoryCodesToNames.get("Me"));
+		assertEquals("Mark", unicodeCategoryCodesToNames.get("M"));
+		assertEquals("Other", unicodeCategoryCodesToNames.get("C"));
+		assertEquals("Separator", unicodeCategoryCodesToNames.get("Z"));
+	}
+
+	@Test
+	public void testUnicodeBinaryPropertyCodesToCodePointRanges() {
+		Map<String, IntervalSet> unicodeBinaryPropertyCodesToCodePointRanges = Unicode.getUnicodeBinaryPropertyCodesToCodePointRanges();
+		assertTrue(unicodeBinaryPropertyCodesToCodePointRanges.get("Emoji").contains(0x1F4A9));
+		assertFalse(unicodeBinaryPropertyCodesToCodePointRanges.get("Emoji").contains('X'));
+		assertTrue(unicodeBinaryPropertyCodesToCodePointRanges.get("alnum").contains('9'));
+		assertFalse(unicodeBinaryPropertyCodesToCodePointRanges.get("alnum").contains(0x1F4A9));
+		assertTrue(unicodeBinaryPropertyCodesToCodePointRanges.get("Dash").contains('-'));
+		assertTrue(unicodeBinaryPropertyCodesToCodePointRanges.get("Hex").contains('D'));
+		assertFalse(unicodeBinaryPropertyCodesToCodePointRanges.get("Hex").contains('Q'));
+	}
+
+	@Test
+	public void testUnicodeBinaryPropertyCodesToNames() {
+		Map<String, String> unicodeBinaryPropertyCodesToNames = Unicode.getUnicodeBinaryPropertyCodesToNames();
+		assertEquals("Ideographic", unicodeBinaryPropertyCodesToNames.get("Ideo"));
+		assertEquals("Soft_Dotted", unicodeBinaryPropertyCodesToNames.get("SD"));
+		assertEquals("Noncharacter_Code_Point", unicodeBinaryPropertyCodesToNames.get("NChar"));
+	}
+
+	@Test
+	public void testUnicodeScriptCodesToCodePointRanges() {
+		Map<String, IntervalSet> unicodeScriptCodesToCodePointRanges = Unicode.getUnicodeScriptCodesToCodePointRanges();
+		assertTrue(unicodeScriptCodesToCodePointRanges.get("Zyyy").contains('0'));
+		assertTrue(unicodeScriptCodesToCodePointRanges.get("Latn").contains('X'));
+		assertTrue(unicodeScriptCodesToCodePointRanges.get("Hani").contains(0x4E04));
+		assertTrue(unicodeScriptCodesToCodePointRanges.get("Cyrl").contains(0x0404));
+	}
+
+	@Test
+	public void testUnicodeScriptCodesToNames() {
+		Map<String, String> unicodeScriptCodesToNames = Unicode.getUnicodeScriptCodesToNames();
+		assertEquals("Common", unicodeScriptCodesToNames.get("Zyyy"));
+		assertEquals("Latin", unicodeScriptCodesToNames.get("Latn"));
+		assertEquals("Han", unicodeScriptCodesToNames.get("Hani"));
+		assertEquals("Cyrillic", unicodeScriptCodesToNames.get("Cyrl"));
+	}
+}
diff --git a/tool/pom.xml b/tool/pom.xml
index 0f1e5a0c5..4b04ee177 100644
--- a/tool/pom.xml
+++ b/tool/pom.xml
@@ -42,6 +42,11 @@
       <artifactId>javax.json</artifactId>
       <version>1.0.4</version>
     </dependency>
+    <dependency>
+      <groupId>com.ibm.icu</groupId>
+      <artifactId>icu4j</artifactId>
+      <version>58.2</version>
+    </dependency>
   </dependencies>
 
   <build>
diff --git a/tool/src/org/antlr/v4/codegen/Unicode.java b/tool/src/org/antlr/v4/codegen/Unicode.java
new file mode 100644
index 000000000..a1360e293
--- /dev/null
+++ b/tool/src/org/antlr/v4/codegen/Unicode.java
@@ -0,0 +1,202 @@
+/*
+ * Copyright (c) 2012-2017 The ANTLR Project. All rights reserved.
+ * Use of this file is governed by the BSD 3-clause license that
+ * can be found in the LICENSE.txt file in the project root.
+ */
+
+package org.antlr.v4.codegen;
+
+import com.ibm.icu.lang.UCharacter;
+import com.ibm.icu.lang.UCharacterCategory;
+import com.ibm.icu.lang.UProperty;
+import com.ibm.icu.lang.UScript;
+import com.ibm.icu.text.UnicodeSet;
+import com.ibm.icu.util.RangeValueIterator;
+
+import org.antlr.v4.runtime.misc.IntervalSet;
+
+import java.util.Map;
+import java.util.LinkedHashMap;
+
+/**
+ * Utility class for calculating {@link IntervalSet}s for various
+ * Unicode categories and properties.
+ */
+public abstract class Unicode {
+	/**
+	 * Adds the code point range {@code start..finish} to the {@link IntervalSet}
+	 * stored under {@code categoryName}, creating that set on first use.
+	 */
+	private static void addIntervalForCategory(
+		Map<String, IntervalSet> categoryMap,
+		String categoryName,
+		int start,
+		int finish) {
+		IntervalSet intervalSet = categoryMap.get(categoryName);
+		if (intervalSet == null) {
+			intervalSet = new IntervalSet();
+			categoryMap.put(categoryName, intervalSet);
+		}
+		intervalSet.add(start, finish);
+	}
+
+	/**
+	 * Returns the short name of the given ICU property, falling back to
+	 * its long name when ICU reports no short name.
+	 */
+	private static String getShortPropertyName(int property) {
+		String propertyName = UCharacter.getPropertyName(property, UProperty.NameChoice.SHORT);
+		// For some reason, a few properties only have long names.
+		if (propertyName == null) {
+			propertyName = UCharacter.getPropertyName(property, UProperty.NameChoice.LONG);
+		}
+		return propertyName;
+	}
+
+	/**
+	 * Returns a map of (Unicode general category code: [0-4, 10-20, 5000-6000], ...)
+	 * pairs mapping Unicode general category codes to the {@link IntervalSet} containing
+	 * the Unicode code points in that general category.
+	 *
+	 * Note that a code point belongs to exactly one general category.
+	 *
+	 * @see <a href="http://unicode.org/reports/tr44/#General_Category_Values">UAX #44: General Category Values</a>
+	 */
+	public static Map<String, IntervalSet> getUnicodeCategoryCodesToCodePointRanges() {
+		Map<String, IntervalSet> result = new LinkedHashMap<>();
+		RangeValueIterator iter = UCharacter.getTypeIterator();
+		RangeValueIterator.Element element = new RangeValueIterator.Element();
+		while (iter.next(element)) {
+			String categoryName = UCharacter.getPropertyValueName(
+				UProperty.GENERAL_CATEGORY_MASK,
+				1 << element.value, // the mask property is keyed by bit mask, not by the raw category value
+				UProperty.NameChoice.SHORT);
+			addIntervalForCategory(result, categoryName, element.start, element.limit - 1);
+			// Add short category so Ll, Lu, Lo, etc. all show up under L
+			String shortCategoryName = categoryName.substring(0, 1);
+			addIntervalForCategory(result, shortCategoryName, element.start, element.limit - 1);
+		}
+		return result;
+	}
+
+	/**
+	 * Returns a map of (Unicode general category code: name, ...) pairs
+	 * mapping Unicode general category codes to their human-readable names.
+	 *
+	 * @see <a href="http://unicode.org/reports/tr44/#General_Category_Values">UAX #44: General Category Values</a>
+	 */
+	public static Map<String, String> getUnicodeCategoryCodesToNames() {
+		Map<String, String> result = new LinkedHashMap<>();
+		RangeValueIterator iter = UCharacter.getTypeIterator();
+		RangeValueIterator.Element element = new RangeValueIterator.Element();
+		while (iter.next(element)) {
+			String categoryName = UCharacter.getPropertyValueName(
+				UProperty.GENERAL_CATEGORY_MASK,
+				1 << element.value, // the mask property is keyed by bit mask, not by the raw category value
+				UProperty.NameChoice.SHORT);
+			String longCategoryName = UCharacter.getPropertyValueName(
+				UProperty.GENERAL_CATEGORY_MASK,
+				1 << element.value,
+				UProperty.NameChoice.LONG);
+			result.put(categoryName, longCategoryName);
+		}
+		// Add the one-letter grouping categories, named as in UAX #44 ("Control" is Cc, not C)
+		result.put("C", "Other");
+		result.put("L", "Letter");
+		result.put("N", "Number");
+		result.put("M", "Mark");
+		result.put("P", "Punctuation");
+		result.put("S", "Symbol");
+		result.put("Z", "Separator");
+		return result;
+	}
+
+	/**
+	 * Returns a map of (Unicode binary property code: [0-4, 10-20, 5000-6000], ...)
+	 * pairs mapping Unicode binary property codes to the {@link IntervalSet} containing
+	 * the Unicode code points which have that binary property set to a true value.
+	 *
+	 * @see <a href="http://unicode.org/reports/tr44/#Property_List_Table">UAX #44: Property List Table</a>
+	 */
+	public static Map<String, IntervalSet> getUnicodeBinaryPropertyCodesToCodePointRanges() {
+		Map<String, IntervalSet> result = new LinkedHashMap<>();
+		for (int property = UProperty.BINARY_START;
+		     property < UProperty.BINARY_LIMIT;
+		     property++) {
+			String propertyName = getShortPropertyName(property);
+			IntervalSet intervalSet = new IntervalSet();
+			result.put(propertyName, intervalSet);
+			UnicodeSet set = new UnicodeSet();
+			set.applyIntPropertyValue(property, 1); // 1 selects code points where the binary property is true
+			for (UnicodeSet.EntryRange range : set.ranges()) {
+				intervalSet.add(range.codepoint, range.codepointEnd);
+			}
+		}
+		return result;
+	}
+
+	/**
+	 * Returns a map of (Unicode binary property code: name, ...) pairs
+	 * mapping Unicode binary property codes to their human-readable names.
+	 *
+	 * @see <a href="http://unicode.org/reports/tr44/#Property_List_Table">UAX #44: Property List Table</a>
+	 */
+	public static Map<String, String> getUnicodeBinaryPropertyCodesToNames() {
+		Map<String, String> result = new LinkedHashMap<>();
+		for (int property = UProperty.BINARY_START;
+		     property < UProperty.BINARY_LIMIT;
+		     property++) {
+			String propertyName = getShortPropertyName(property);
+			String longPropertyName = UCharacter.getPropertyName(property, UProperty.NameChoice.LONG);
+			result.put(propertyName, longPropertyName);
+		}
+		return result;
+	}
+
+	/**
+	 * Returns a map of (Unicode script code: [0-4, 10-20, 5000-6000], ...)
+	 * pairs mapping Unicode script codes to the {@link IntervalSet} containing
+	 * the Unicode code points which use that script.
+	 *
+	 * Note that only the Script property is consulted (not Script_Extensions).
+	 *
+	 * @see <a href="https://en.wikipedia.org/wiki/Script_(Unicode)#Table_of_scripts_in_Unicode">Table of scripts in Unicode</a>
+	 */
+	public static Map<String, IntervalSet> getUnicodeScriptCodesToCodePointRanges() {
+		Map<String, IntervalSet> result = new LinkedHashMap<>();
+		for (int script = UCharacter.getIntPropertyMinValue(UProperty.SCRIPT);
+		     script <= UCharacter.getIntPropertyMaxValue(UProperty.SCRIPT);
+		     script++) {
+			UnicodeSet set = new UnicodeSet();
+			set.applyIntPropertyValue(UProperty.SCRIPT, script);
+			String scriptName = UCharacter.getPropertyValueName(UProperty.SCRIPT, script, UProperty.NameChoice.SHORT);
+			IntervalSet intervalSet = result.get(scriptName);
+			if (intervalSet == null) {
+				intervalSet = new IntervalSet();
+				result.put(scriptName, intervalSet);
+			}
+			for (UnicodeSet.EntryRange range : set.ranges()) {
+				intervalSet.add(range.codepoint, range.codepointEnd);
+			}
+		}
+		return result;
+	}
+
+	/**
+	 * Returns a map of (Unicode script code: name, ...) pairs
+	 * mapping Unicode script codes to their human-readable names.
+	 *
+	 * @see <a href="https://en.wikipedia.org/wiki/Script_(Unicode)#Table_of_scripts_in_Unicode">Table of scripts in Unicode</a>
+	 */
+	public static Map<String, String> getUnicodeScriptCodesToNames() {
+		Map<String, String> result = new LinkedHashMap<>();
+		for (int script = UCharacter.getIntPropertyMinValue(UProperty.SCRIPT);
+		     script <= UCharacter.getIntPropertyMaxValue(UProperty.SCRIPT);
+		     script++) {
+			String propertyName = UCharacter.getPropertyValueName(UProperty.SCRIPT, script, UProperty.NameChoice.SHORT);
+			String longPropertyName = UCharacter.getPropertyValueName(UProperty.SCRIPT, script, UProperty.NameChoice.LONG);
+			result.put(propertyName, longPropertyName);
+		}
+		return result;
+	}
+}