Merge pull request #1687 from bhamiltoncx/unicode-codegen
Use code generation in new tool-codegen build step instead of making tool depend on ICU
This commit is contained in:
commit
698d6db2ab
1
pom.xml
1
pom.xml
|
@ -78,6 +78,7 @@
|
|||
|
||||
<modules>
|
||||
<module>runtime/Java</module>
|
||||
<module>tool-codegen</module>
|
||||
<module>tool</module>
|
||||
<module>antlr4-maven-plugin</module>
|
||||
<module>tool-testsuite</module>
|
||||
|
|
|
@ -0,0 +1,72 @@
|
|||
<!--
|
||||
~ Copyright (c) 2012-2017 The ANTLR Project. All rights reserved.
|
||||
~ Use of this file is governed by the BSD 3-clause license that
|
||||
~ can be found in the LICENSE.txt file in the project root.
|
||||
-->
|
||||
|
||||
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
  <modelVersion>4.0.0</modelVersion>

  <parent>
    <groupId>org.antlr</groupId>
    <artifactId>antlr4-master</artifactId>
    <version>4.6.1-SNAPSHOT</version>
  </parent>

  <artifactId>antlr4-tool-codegen</artifactId>
  <name>ANTLR 4 Tool Codegen</name>
  <url>http://www.antlr.org</url>
  <description>Codegen for the ANTLR 4 grammar compiler.</description>

  <dependencies>
    <dependency>
      <groupId>org.antlr</groupId>
      <artifactId>antlr4-runtime</artifactId>
      <version>${project.version}</version>
    </dependency>
    <!-- ICU is needed only by this code-generation module. -->
    <dependency>
      <groupId>com.ibm.icu</groupId>
      <artifactId>icu4j</artifactId>
      <version>58.2</version>
    </dependency>
  </dependencies>

  <build>
    <sourceDirectory>src</sourceDirectory>
    <plugins>
      <!-- Renders the "unicodedata" template during the generate-sources phase. -->
      <plugin>
        <groupId>com.webguys</groupId>
        <artifactId>string-template-maven-plugin</artifactId>
        <version>1.1</version>
        <!-- Explicit plugin dependency on mojo-executor: required for this
             plugin to work with Maven 3. -->
        <dependencies>
          <dependency>
            <groupId>org.twdata.maven</groupId>
            <artifactId>mojo-executor</artifactId>
            <version>2.1.0</version>
          </dependency>
        </dependencies>
        <configuration>
          <templates>
            <template>
              <directory>${basedir}/src/main/string-template</directory>
              <name>unicodedata</name>
              <!-- The rendered file is written into the tool module's generated-sources tree. -->
              <target>../tool/target/generated-sources/antlr4-tool-codegen/org/antlr/v4/unicode/UnicodeData.java</target>
              <!-- The controller's getProperties method supplies the template parameters. -->
              <controller>
                <className>org.antlr.v4.unicode.UnicodeDataTemplateController</className>
                <sourceVersion>1.7</sourceVersion>
                <targetVersion>1.7</targetVersion>
                <method>getProperties</method>
              </controller>
            </template>
          </templates>
        </configuration>
        <executions>
          <execution>
            <phase>generate-sources</phase>
            <goals>
              <goal>render</goal>
            </goals>
          </execution>
        </executions>
      </plugin>
    </plugins>
  </build>
</project>
|
|
@ -0,0 +1,58 @@
|
|||
unicodedata(propertyCodePointRanges, propertyAliases) ::= <<
package org.antlr.v4.unicode;

import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;

import org.antlr.v4.runtime.misc.IntervalSet;
import org.antlr.v4.runtime.misc.Interval;

/**
 * Code-generated lookup table from Unicode property names (and their aliases)
 * to the code point ranges that carry the property.
 */
public abstract class UnicodeData {
	private static final Map\<String, IntervalSet\> propertyCodePointRanges = new HashMap\<\>(<length(propertyCodePointRanges)>);
	private static final Map\<String, String\> propertyAliases = new HashMap\<\>(<length(propertyAliases)>);

	// One initializer method is emitted per Unicode property so that no
	// single method runs into Java's 64k bytecode-per-method limit.

	<propertyCodePointRanges.keys:{ k | // Unicode code points with property "<k>"
private static void addProperty<k>() {
	List\<Interval\> ranges = Arrays.asList(
		<propertyCodePointRanges.(k).intervals:{ interval | Interval.of(<interval.a>, <interval.b>)}; separator=",\n">
	);
	IntervalSet rangeSet = new IntervalSet(ranges);
	rangeSet.setReadonly(true);
	propertyCodePointRanges.put("<k>".toLowerCase(Locale.US), rangeSet);
\}}; separator="\n\n">

	// Alias name to property code; both sides are stored lower-cased
	private static void addPropertyAliases() {
		<propertyAliases.keys:{ k | propertyAliases.put("<k>".toLowerCase(Locale.US), "<propertyAliases.(k)>".toLowerCase(Locale.US)); }; separator="\n">
	}

	// Populate both tables when the class is loaded
	static {
		<propertyCodePointRanges.keys:{ k | addProperty<k>(); }; separator="\n">
		addPropertyAliases();
	}

	/**
	 * Looks up the {@link IntervalSet} of Unicode code point ranges for a
	 * Unicode property (general category code, binary property name, or
	 * script name). The name is matched case-insensitively, first as a
	 * property code and then as an alias; {@code null} is returned when
	 * neither lookup succeeds.
	 */
	public static IntervalSet getPropertyCodePoints(String propertyCodeOrAlias) {
		String key = propertyCodeOrAlias.toLowerCase(Locale.US);
		IntervalSet direct = propertyCodePointRanges.get(key);
		if (direct != null) {
			return direct;
		}
		return propertyCodePointRanges.get(propertyAliases.get(key));
	}
}
>>
|
|
@ -0,0 +1,212 @@
|
|||
/*
|
||||
* Copyright (c) 2012-2017 The ANTLR Project. All rights reserved.
|
||||
* Use of this file is governed by the BSD 3-clause license that
|
||||
* can be found in the LICENSE.txt file in the project root.
|
||||
*/
|
||||
|
||||
package org.antlr.v4.unicode;
|
||||
|
||||
import com.ibm.icu.lang.UCharacter;
|
||||
import com.ibm.icu.lang.UCharacterCategory;
|
||||
import com.ibm.icu.lang.UProperty;
|
||||
import com.ibm.icu.lang.UScript;
|
||||
import com.ibm.icu.text.UnicodeSet;
|
||||
import com.ibm.icu.util.RangeValueIterator;
|
||||
|
||||
import org.antlr.v4.runtime.misc.IntervalSet;
|
||||
|
||||
import java.util.LinkedHashMap;
|
||||
import java.util.Locale;
|
||||
import java.util.Map;
|
||||
|
||||
/**
|
||||
* StringTemplate controller used to generate parameters to feed
|
||||
* to {@code unicodedata.st} to code-generate {@code UnicodeData.java},
|
||||
* used by the tool for Unicode property escapes like {@code \\p\{Lu\}}.
|
||||
*
|
||||
* Uses ICU to iterate over Unicode character categories, properties,
|
||||
* and script codes, as well as aliases for those codes.
|
||||
*
|
||||
* This class exists in its own Maven module to avoid adding a
|
||||
* dependency from the tool onto the (large) ICU runtime.
|
||||
*/
|
||||
public abstract class UnicodeDataTemplateController {
|
||||
private static void addIntervalForCategory(
|
||||
Map<String, IntervalSet> categoryMap,
|
||||
String categoryName,
|
||||
int start,
|
||||
int finish) {
|
||||
IntervalSet intervalSet = categoryMap.get(categoryName);
|
||||
if (intervalSet == null) {
|
||||
intervalSet = new IntervalSet();
|
||||
categoryMap.put(categoryName, intervalSet);
|
||||
}
|
||||
intervalSet.add(start, finish);
|
||||
}
|
||||
|
||||
private static void addPropertyAliases(
|
||||
Map<String, String> propertyAliases,
|
||||
String propertyName,
|
||||
int property) {
|
||||
int nameChoice = UProperty.NameChoice.LONG;
|
||||
while (true) {
|
||||
String alias;
|
||||
try {
|
||||
alias = UCharacter.getPropertyName(property, nameChoice);
|
||||
} catch (IllegalArgumentException e) {
|
||||
// No more aliases.
|
||||
break;
|
||||
}
|
||||
assert alias != null;
|
||||
addPropertyAlias(propertyAliases, alias, propertyName);
|
||||
nameChoice++;
|
||||
}
|
||||
}
|
||||
|
||||
private static void addPropertyAlias(
|
||||
Map<String, String> propertyAliases,
|
||||
String alias,
|
||||
String propertyName) {
|
||||
propertyAliases.put(alias, propertyName);
|
||||
}
|
||||
|
||||
public static Map<String, Object> getProperties() {
|
||||
Map<String, IntervalSet> propertyCodePointRanges = new LinkedHashMap<>();
|
||||
addUnicodeCategoryCodesToCodePointRanges(propertyCodePointRanges);
|
||||
addUnicodeBinaryPropertyCodesToCodePointRanges(propertyCodePointRanges);
|
||||
addUnicodeScriptCodesToCodePointRanges(propertyCodePointRanges);
|
||||
|
||||
Map<String, String> propertyAliases = new LinkedHashMap<>();
|
||||
addUnicodeCategoryCodesToNames(propertyAliases);
|
||||
addUnicodeBinaryPropertyCodesToNames(propertyAliases);
|
||||
addUnicodeScriptCodesToNames(propertyAliases);
|
||||
|
||||
Map<String, Object> properties = new LinkedHashMap<>();
|
||||
properties.put("propertyCodePointRanges", propertyCodePointRanges);
|
||||
properties.put("propertyAliases", propertyAliases);
|
||||
return properties;
|
||||
}
|
||||
|
||||
private static String getShortPropertyName(int property) {
|
||||
String propertyName = UCharacter.getPropertyName(property, UProperty.NameChoice.SHORT);
|
||||
// For some reason, a few properties only have long names.
|
||||
if (propertyName == null) {
|
||||
propertyName = UCharacter.getPropertyName(property, UProperty.NameChoice.LONG);
|
||||
}
|
||||
return propertyName;
|
||||
}
|
||||
|
||||
private static void addUnicodeCategoryCodesToCodePointRanges(Map<String, IntervalSet> propertyCodePointRanges) {
|
||||
RangeValueIterator iter = UCharacter.getTypeIterator();
|
||||
RangeValueIterator.Element element = new RangeValueIterator.Element();
|
||||
while (iter.next(element)) {
|
||||
String categoryName = UCharacter.getPropertyValueName(
|
||||
UProperty.GENERAL_CATEGORY_MASK,
|
||||
1 << element.value,
|
||||
UProperty.NameChoice.SHORT);
|
||||
addIntervalForCategory(propertyCodePointRanges, categoryName, element.start, element.limit - 1);
|
||||
// Add short category so Ll, Lu, Lo, etc. all show up under L
|
||||
String shortCategoryName = categoryName.substring(0, 1);
|
||||
addIntervalForCategory(propertyCodePointRanges, shortCategoryName, element.start, element.limit - 1);
|
||||
}
|
||||
}
|
||||
|
||||
private static void addUnicodeCategoryCodesToNames(Map<String, String> propertyAliases) {
|
||||
RangeValueIterator iter = UCharacter.getTypeIterator();
|
||||
RangeValueIterator.Element element = new RangeValueIterator.Element();
|
||||
while (iter.next(element)) {
|
||||
int generalCategoryMask = 1 << element.value;
|
||||
String categoryName = UCharacter.getPropertyValueName(
|
||||
UProperty.GENERAL_CATEGORY_MASK,
|
||||
generalCategoryMask,
|
||||
UProperty.NameChoice.SHORT);
|
||||
int nameChoice = UProperty.NameChoice.LONG;
|
||||
while (true) {
|
||||
String alias;
|
||||
try {
|
||||
alias = UCharacter.getPropertyValueName(
|
||||
UProperty.GENERAL_CATEGORY_MASK,
|
||||
generalCategoryMask,
|
||||
nameChoice);
|
||||
} catch (IllegalArgumentException e) {
|
||||
// No more aliases.
|
||||
break;
|
||||
}
|
||||
assert alias != null;
|
||||
addPropertyAlias(propertyAliases, alias, categoryName);
|
||||
nameChoice++;
|
||||
}
|
||||
}
|
||||
// Add short categories
|
||||
addPropertyAlias(propertyAliases, "Control", "C");
|
||||
addPropertyAlias(propertyAliases, "Letter", "L");
|
||||
addPropertyAlias(propertyAliases, "Number", "N");
|
||||
addPropertyAlias(propertyAliases, "Mark", "M");
|
||||
addPropertyAlias(propertyAliases, "Punctuation", "P");
|
||||
addPropertyAlias(propertyAliases, "Symbol", "S");
|
||||
addPropertyAlias(propertyAliases, "Space", "Z");
|
||||
}
|
||||
|
||||
private static void addUnicodeBinaryPropertyCodesToCodePointRanges(Map<String, IntervalSet> propertyCodePointRanges) {
|
||||
for (int property = UProperty.BINARY_START;
|
||||
property < UProperty.BINARY_LIMIT;
|
||||
property++) {
|
||||
String propertyName = getShortPropertyName(property);
|
||||
IntervalSet intervalSet = new IntervalSet();
|
||||
UnicodeSet unicodeSet = new UnicodeSet();
|
||||
unicodeSet.applyIntPropertyValue(property, 1);
|
||||
for (UnicodeSet.EntryRange range : unicodeSet.ranges()) {
|
||||
intervalSet.add(range.codepoint, range.codepointEnd);
|
||||
}
|
||||
propertyCodePointRanges.put(propertyName, intervalSet);
|
||||
}
|
||||
}
|
||||
|
||||
private static void addUnicodeBinaryPropertyCodesToNames(Map<String, String> propertyAliases) {
|
||||
for (int property = UProperty.BINARY_START;
|
||||
property < UProperty.BINARY_LIMIT;
|
||||
property++) {
|
||||
String propertyName = getShortPropertyName(property);
|
||||
addPropertyAliases(propertyAliases, propertyName, property);
|
||||
}
|
||||
}
|
||||
|
||||
private static void addUnicodeScriptCodesToCodePointRanges(Map<String, IntervalSet> propertyCodePointRanges) {
|
||||
for (int script = UCharacter.getIntPropertyMinValue(UProperty.SCRIPT);
|
||||
script <= UCharacter.getIntPropertyMaxValue(UProperty.SCRIPT);
|
||||
script++) {
|
||||
UnicodeSet set = new UnicodeSet();
|
||||
set.applyIntPropertyValue(UProperty.SCRIPT, script);
|
||||
String scriptName = UCharacter.getPropertyValueName(UProperty.SCRIPT, script, UProperty.NameChoice.SHORT);
|
||||
IntervalSet intervalSet = propertyCodePointRanges.get(scriptName);
|
||||
if (intervalSet == null) {
|
||||
intervalSet = new IntervalSet();
|
||||
propertyCodePointRanges.put(scriptName, intervalSet);
|
||||
}
|
||||
for (UnicodeSet.EntryRange range : set.ranges()) {
|
||||
intervalSet.add(range.codepoint, range.codepointEnd);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private static void addUnicodeScriptCodesToNames(Map<String, String> propertyAliases) {
|
||||
for (int script = UCharacter.getIntPropertyMinValue(UProperty.SCRIPT);
|
||||
script <= UCharacter.getIntPropertyMaxValue(UProperty.SCRIPT);
|
||||
script++) {
|
||||
String propertyName = UCharacter.getPropertyValueName(UProperty.SCRIPT, script, UProperty.NameChoice.SHORT);
|
||||
int nameChoice = UProperty.NameChoice.LONG;
|
||||
String alias;
|
||||
while (true) {
|
||||
try {
|
||||
alias = UCharacter.getPropertyValueName(UProperty.SCRIPT, script, nameChoice);
|
||||
} catch (IllegalArgumentException e) {
|
||||
// No more aliases.
|
||||
break;
|
||||
}
|
||||
assert alias != null;
|
||||
addPropertyAlias(propertyAliases, alias, propertyName);
|
||||
nameChoice++;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
|
@ -1,80 +0,0 @@
|
|||
/*
|
||||
* Copyright (c) 2012-2017 The ANTLR Project. All rights reserved.
|
||||
* Use of this file is governed by the BSD 3-clause license that
|
||||
* can be found in the LICENSE.txt file in the project root.
|
||||
*/
|
||||
|
||||
package org.antlr.v4.test.tool;
|
||||
|
||||
import java.util.Map;
|
||||
|
||||
import org.antlr.v4.codegen.Unicode;
|
||||
import org.antlr.v4.runtime.misc.IntervalSet;
|
||||
|
||||
import org.junit.Test;
|
||||
|
||||
import static org.junit.Assert.assertFalse;
|
||||
import static org.junit.Assert.assertEquals;
|
||||
import static org.junit.Assert.assertTrue;
|
||||
|
||||
public class TestUnicode {
|
||||
@Test
|
||||
public void testUnicodeCategoryCodes() {
|
||||
Map<String, IntervalSet> unicodeCategoryCodesToCodePointRanges = Unicode.getUnicodeCategoryCodesToCodePointRanges();
|
||||
assertTrue(unicodeCategoryCodesToCodePointRanges.get("Lu").contains('X'));
|
||||
assertFalse(unicodeCategoryCodesToCodePointRanges.get("Lu").contains('x'));
|
||||
assertTrue(unicodeCategoryCodesToCodePointRanges.get("Ll").contains('x'));
|
||||
assertFalse(unicodeCategoryCodesToCodePointRanges.get("Ll").contains('X'));
|
||||
assertTrue(unicodeCategoryCodesToCodePointRanges.get("L").contains('X'));
|
||||
assertTrue(unicodeCategoryCodesToCodePointRanges.get("L").contains('x'));
|
||||
assertTrue(unicodeCategoryCodesToCodePointRanges.get("N").contains('0'));
|
||||
assertTrue(unicodeCategoryCodesToCodePointRanges.get("Z").contains(' '));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testUnicodeCategoryCodesToNames() {
|
||||
Map<String, String> unicodeCategoryCodesToNames = Unicode.getUnicodeCategoryCodesToNames();
|
||||
assertEquals("Lowercase_Letter", unicodeCategoryCodesToNames.get("Ll"));
|
||||
assertEquals("Letter", unicodeCategoryCodesToNames.get("L"));
|
||||
assertEquals("Enclosing_Mark", unicodeCategoryCodesToNames.get("Me"));
|
||||
assertEquals("Mark", unicodeCategoryCodesToNames.get("M"));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testUnicodeBinaryPropertyCodesToCodePointRanges() {
|
||||
Map<String, IntervalSet> unicodeBinaryPropertyCodesToCodePointRanges = Unicode.getUnicodeBinaryPropertyCodesToCodePointRanges();
|
||||
assertTrue(unicodeBinaryPropertyCodesToCodePointRanges.get("Emoji").contains(0x1F4A9));
|
||||
assertFalse(unicodeBinaryPropertyCodesToCodePointRanges.get("Emoji").contains('X'));
|
||||
assertTrue(unicodeBinaryPropertyCodesToCodePointRanges.get("alnum").contains('9'));
|
||||
assertFalse(unicodeBinaryPropertyCodesToCodePointRanges.get("alnum").contains(0x1F4A9));
|
||||
assertTrue(unicodeBinaryPropertyCodesToCodePointRanges.get("Dash").contains('-'));
|
||||
assertTrue(unicodeBinaryPropertyCodesToCodePointRanges.get("Hex").contains('D'));
|
||||
assertFalse(unicodeBinaryPropertyCodesToCodePointRanges.get("Hex").contains('Q'));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testUnicodeBinaryPropertyCodesToNames() {
|
||||
Map<String, String> unicodeBinaryPropertyCodesToNames = Unicode.getUnicodeBinaryPropertyCodesToNames();
|
||||
assertEquals("Ideographic", unicodeBinaryPropertyCodesToNames.get("Ideo"));
|
||||
assertEquals("Soft_Dotted", unicodeBinaryPropertyCodesToNames.get("SD"));
|
||||
assertEquals("Noncharacter_Code_Point", unicodeBinaryPropertyCodesToNames.get("NChar"));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testUnicodeScriptCodesToCodePointRanges() {
|
||||
Map<String, IntervalSet> unicodeScriptCodesToCodePointRanges = Unicode.getUnicodeScriptCodesToCodePointRanges();
|
||||
assertTrue(unicodeScriptCodesToCodePointRanges.get("Zyyy").contains('0'));
|
||||
assertTrue(unicodeScriptCodesToCodePointRanges.get("Latn").contains('X'));
|
||||
assertTrue(unicodeScriptCodesToCodePointRanges.get("Hani").contains(0x4E04));
|
||||
assertTrue(unicodeScriptCodesToCodePointRanges.get("Cyrl").contains(0x0404));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testUnicodeScriptCodesToNames() {
|
||||
Map<String, String> unicodeScriptCodesToNames = Unicode.getUnicodeScriptCodesToNames();
|
||||
assertEquals("Common", unicodeScriptCodesToNames.get("Zyyy"));
|
||||
assertEquals("Latin", unicodeScriptCodesToNames.get("Latn"));
|
||||
assertEquals("Han", unicodeScriptCodesToNames.get("Hani"));
|
||||
assertEquals("Cyrillic", unicodeScriptCodesToNames.get("Cyrl"));
|
||||
}
|
||||
}
|
|
@ -0,0 +1,125 @@
|
|||
/*
|
||||
* Copyright (c) 2012-2017 The ANTLR Project. All rights reserved.
|
||||
* Use of this file is governed by the BSD 3-clause license that
|
||||
* can be found in the LICENSE.txt file in the project root.
|
||||
*/
|
||||
|
||||
package org.antlr.v4.test.tool;
|
||||
|
||||
import java.util.Map;
|
||||
|
||||
import org.antlr.v4.unicode.UnicodeData;
|
||||
import org.antlr.v4.runtime.misc.IntervalSet;
|
||||
|
||||
import org.junit.Test;
|
||||
import org.junit.Rule;
|
||||
import org.junit.rules.ExpectedException;
|
||||
|
||||
import static org.junit.Assert.assertFalse;
|
||||
import static org.junit.Assert.assertEquals;
|
||||
import static org.junit.Assert.assertTrue;
|
||||
|
||||
public class TestUnicodeData {
|
||||
@Rule
|
||||
public ExpectedException thrown = ExpectedException.none();
|
||||
|
||||
@Test
|
||||
public void testUnicodeGeneralCategoriesLatin() {
|
||||
assertTrue(UnicodeData.getPropertyCodePoints("Lu").contains('X'));
|
||||
assertFalse(UnicodeData.getPropertyCodePoints("Lu").contains('x'));
|
||||
assertTrue(UnicodeData.getPropertyCodePoints("Ll").contains('x'));
|
||||
assertFalse(UnicodeData.getPropertyCodePoints("Ll").contains('X'));
|
||||
assertTrue(UnicodeData.getPropertyCodePoints("L").contains('X'));
|
||||
assertTrue(UnicodeData.getPropertyCodePoints("L").contains('x'));
|
||||
assertTrue(UnicodeData.getPropertyCodePoints("N").contains('0'));
|
||||
assertTrue(UnicodeData.getPropertyCodePoints("Z").contains(' '));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testUnicodeGeneralCategoriesBMP() {
|
||||
assertTrue(UnicodeData.getPropertyCodePoints("Lu").contains('\u1E3A'));
|
||||
assertFalse(UnicodeData.getPropertyCodePoints("Lu").contains('\u1E3B'));
|
||||
assertTrue(UnicodeData.getPropertyCodePoints("Ll").contains('\u1E3B'));
|
||||
assertFalse(UnicodeData.getPropertyCodePoints("Ll").contains('\u1E3A'));
|
||||
assertTrue(UnicodeData.getPropertyCodePoints("L").contains('\u1E3A'));
|
||||
assertTrue(UnicodeData.getPropertyCodePoints("L").contains('\u1E3B'));
|
||||
assertTrue(UnicodeData.getPropertyCodePoints("N").contains('\u1BB0'));
|
||||
assertFalse(UnicodeData.getPropertyCodePoints("N").contains('\u1E3A'));
|
||||
assertTrue(UnicodeData.getPropertyCodePoints("Z").contains('\u2028'));
|
||||
assertFalse(UnicodeData.getPropertyCodePoints("Z").contains('\u1E3A'));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testUnicodeGeneralCategoriesSMP() {
|
||||
assertTrue(UnicodeData.getPropertyCodePoints("Lu").contains(0x1D5D4));
|
||||
assertFalse(UnicodeData.getPropertyCodePoints("Lu").contains(0x1D770));
|
||||
assertTrue(UnicodeData.getPropertyCodePoints("Ll").contains(0x1D770));
|
||||
assertFalse(UnicodeData.getPropertyCodePoints("Ll").contains(0x1D5D4));
|
||||
assertTrue(UnicodeData.getPropertyCodePoints("L").contains(0x1D5D4));
|
||||
assertTrue(UnicodeData.getPropertyCodePoints("L").contains(0x1D770));
|
||||
assertTrue(UnicodeData.getPropertyCodePoints("N").contains(0x11C50));
|
||||
assertFalse(UnicodeData.getPropertyCodePoints("N").contains(0x1D5D4));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testUnicodeCategoryAliases() {
|
||||
assertTrue(UnicodeData.getPropertyCodePoints("Lowercase_Letter").contains('x'));
|
||||
assertFalse(UnicodeData.getPropertyCodePoints("Lowercase_Letter").contains('X'));
|
||||
assertTrue(UnicodeData.getPropertyCodePoints("Letter").contains('x'));
|
||||
assertFalse(UnicodeData.getPropertyCodePoints("Letter").contains('0'));
|
||||
assertTrue(UnicodeData.getPropertyCodePoints("Enclosing_Mark").contains(0x20E2));
|
||||
assertFalse(UnicodeData.getPropertyCodePoints("Enclosing_Mark").contains('x'));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testUnicodeBinaryProperties() {
|
||||
assertTrue(UnicodeData.getPropertyCodePoints("Emoji").contains(0x1F4A9));
|
||||
assertFalse(UnicodeData.getPropertyCodePoints("Emoji").contains('X'));
|
||||
assertTrue(UnicodeData.getPropertyCodePoints("alnum").contains('9'));
|
||||
assertFalse(UnicodeData.getPropertyCodePoints("alnum").contains(0x1F4A9));
|
||||
assertTrue(UnicodeData.getPropertyCodePoints("Dash").contains('-'));
|
||||
assertTrue(UnicodeData.getPropertyCodePoints("Hex").contains('D'));
|
||||
assertFalse(UnicodeData.getPropertyCodePoints("Hex").contains('Q'));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testUnicodeBinaryPropertyAliases() {
|
||||
assertTrue(UnicodeData.getPropertyCodePoints("Ideo").contains('\u611B'));
|
||||
assertFalse(UnicodeData.getPropertyCodePoints("Ideo").contains('X'));
|
||||
assertTrue(UnicodeData.getPropertyCodePoints("Soft_Dotted").contains('\u0456'));
|
||||
assertFalse(UnicodeData.getPropertyCodePoints("Soft_Dotted").contains('X'));
|
||||
assertTrue(UnicodeData.getPropertyCodePoints("Noncharacter_Code_Point").contains('\uFFFF'));
|
||||
assertFalse(UnicodeData.getPropertyCodePoints("Noncharacter_Code_Point").contains('X'));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testUnicodeScripts() {
|
||||
assertTrue(UnicodeData.getPropertyCodePoints("Zyyy").contains('0'));
|
||||
assertTrue(UnicodeData.getPropertyCodePoints("Latn").contains('X'));
|
||||
assertTrue(UnicodeData.getPropertyCodePoints("Hani").contains(0x4E04));
|
||||
assertTrue(UnicodeData.getPropertyCodePoints("Cyrl").contains(0x0404));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testUnicodeScriptAliases() {
|
||||
assertTrue(UnicodeData.getPropertyCodePoints("Common").contains('0'));
|
||||
assertTrue(UnicodeData.getPropertyCodePoints("Latin").contains('X'));
|
||||
assertTrue(UnicodeData.getPropertyCodePoints("Han").contains(0x4E04));
|
||||
assertTrue(UnicodeData.getPropertyCodePoints("Cyrillic").contains(0x0404));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testPropertyCaseInsensitivity() {
|
||||
assertTrue(UnicodeData.getPropertyCodePoints("l").contains('x'));
|
||||
assertFalse(UnicodeData.getPropertyCodePoints("l").contains('0'));
|
||||
assertTrue(UnicodeData.getPropertyCodePoints("common").contains('0'));
|
||||
assertTrue(UnicodeData.getPropertyCodePoints("Alnum").contains('0'));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void modifyingUnicodeDataShouldThrow() {
|
||||
thrown.expect(IllegalStateException.class);
|
||||
thrown.expectMessage("can't alter readonly IntervalSet");
|
||||
UnicodeData.getPropertyCodePoints("L").add(0x12345);
|
||||
}
|
||||
}
|
22
tool/pom.xml
22
tool/pom.xml
|
@ -42,11 +42,6 @@
|
|||
<artifactId>javax.json</artifactId>
|
||||
<version>1.0.4</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>com.ibm.icu</groupId>
|
||||
<artifactId>icu4j</artifactId>
|
||||
<version>58.2</version>
|
||||
</dependency>
|
||||
</dependencies>
|
||||
|
||||
<build>
|
||||
|
@ -85,6 +80,23 @@
|
|||
</execution>
|
||||
</executions>
|
||||
</plugin>
|
||||
<!-- Adds the directory holding code-generated sources as an extra source root. -->
<plugin>
  <groupId>org.codehaus.mojo</groupId>
  <artifactId>build-helper-maven-plugin</artifactId>
  <executions>
    <execution>
      <phase>generate-sources</phase>
      <goals>
        <goal>add-source</goal>
      </goals>
      <configuration>
        <sources>
          <source>${project.build.directory}/generated-sources/antlr4-tool-codegen</source>
        </sources>
      </configuration>
    </execution>
  </executions>
</plugin>
|
||||
<plugin> <!-- this makes a fat jar with all dependencies -->
|
||||
<groupId>org.apache.maven.plugins</groupId>
|
||||
<artifactId>maven-shade-plugin</artifactId>
|
||||
|
|
|
@ -1,194 +0,0 @@
|
|||
/*
|
||||
* Copyright (c) 2012-2017 The ANTLR Project. All rights reserved.
|
||||
* Use of this file is governed by the BSD 3-clause license that
|
||||
* can be found in the LICENSE.txt file in the project root.
|
||||
*/
|
||||
|
||||
package org.antlr.v4.codegen;
|
||||
|
||||
import com.ibm.icu.lang.UCharacter;
|
||||
import com.ibm.icu.lang.UCharacterCategory;
|
||||
import com.ibm.icu.lang.UProperty;
|
||||
import com.ibm.icu.lang.UScript;
|
||||
import com.ibm.icu.text.UnicodeSet;
|
||||
import com.ibm.icu.util.RangeValueIterator;
|
||||
|
||||
import org.antlr.v4.runtime.misc.IntervalSet;
|
||||
|
||||
import java.util.Map;
|
||||
import java.util.LinkedHashMap;
|
||||
|
||||
/**
|
||||
* Utility class for calculating {@link IntervalSet}s for various
|
||||
* Unicode categories and properties.
|
||||
*/
|
||||
public abstract class Unicode {
|
||||
private static void addIntervalForCategory(
|
||||
Map<String, IntervalSet> categoryMap,
|
||||
String categoryName,
|
||||
int start,
|
||||
int finish) {
|
||||
IntervalSet intervalSet = categoryMap.get(categoryName);
|
||||
if (intervalSet == null) {
|
||||
intervalSet = new IntervalSet();
|
||||
categoryMap.put(categoryName, intervalSet);
|
||||
}
|
||||
intervalSet.add(start, finish);
|
||||
}
|
||||
|
||||
private static String getShortPropertyName(int property) {
|
||||
String propertyName = UCharacter.getPropertyName(property, UProperty.NameChoice.SHORT);
|
||||
// For some reason, a few properties only have long names.
|
||||
if (propertyName == null) {
|
||||
propertyName = UCharacter.getPropertyName(property, UProperty.NameChoice.LONG);
|
||||
}
|
||||
return propertyName;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns a map of (Unicode general category code: [0-4, 10-20, 5000-6000], ...)
|
||||
* pairs mapping Unicode general category codes to the {@link IntervalSet} containing
|
||||
* the Unicode code points in that general category.
|
||||
*
|
||||
* Note that a code point belongs to exactly one general category.
|
||||
*
|
||||
* {@see http://unicode.org/reports/tr44/#General_Category_Values}
|
||||
*/
|
||||
public static Map<String, IntervalSet> getUnicodeCategoryCodesToCodePointRanges() {
|
||||
Map<String, IntervalSet> result = new LinkedHashMap<>();
|
||||
RangeValueIterator iter = UCharacter.getTypeIterator();
|
||||
RangeValueIterator.Element element = new RangeValueIterator.Element();
|
||||
while (iter.next(element)) {
|
||||
String categoryName = UCharacter.getPropertyValueName(
|
||||
UProperty.GENERAL_CATEGORY_MASK,
|
||||
1 << element.value,
|
||||
UProperty.NameChoice.SHORT);
|
||||
addIntervalForCategory(result, categoryName, element.start, element.limit - 1);
|
||||
// Add short category so Ll, Lu, Lo, etc. all show up under L
|
||||
String shortCategoryName = categoryName.substring(0, 1);
|
||||
addIntervalForCategory(result, shortCategoryName, element.start, element.limit - 1);
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns a map of (Unicode general category code: name, ...) pairs
|
||||
* mapping Unicode general category codes to their human-readable names.
|
||||
*
|
||||
* {@see http://unicode.org/reports/tr44/#General_Category_Values}
|
||||
*/
|
||||
public static Map<String, String> getUnicodeCategoryCodesToNames() {
|
||||
Map<String, String> result = new LinkedHashMap<>();
|
||||
RangeValueIterator iter = UCharacter.getTypeIterator();
|
||||
RangeValueIterator.Element element = new RangeValueIterator.Element();
|
||||
while (iter.next(element)) {
|
||||
String categoryName = UCharacter.getPropertyValueName(
|
||||
UProperty.GENERAL_CATEGORY_MASK,
|
||||
1 << element.value,
|
||||
UProperty.NameChoice.SHORT);
|
||||
String longCategoryName = UCharacter.getPropertyValueName(
|
||||
UProperty.GENERAL_CATEGORY_MASK,
|
||||
1 << element.value,
|
||||
UProperty.NameChoice.LONG);
|
||||
result.put(categoryName, longCategoryName);
|
||||
}
|
||||
// Add short categories
|
||||
result.put("C", "Control");
|
||||
result.put("L", "Letter");
|
||||
result.put("N", "Number");
|
||||
result.put("M", "Mark");
|
||||
result.put("P", "Punctuation");
|
||||
result.put("S", "Symbol");
|
||||
result.put("Z", "Space");
|
||||
return result;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns a map of (Unicode binary property code: [0-4, 10-20, 5000-6000], ...)
|
||||
* pairs mapping Unicode binary property codes to the {@link IntervalSet} containing
|
||||
* the Unicode code points which have that binary property set to a true value.
|
||||
*
|
||||
* {@see http://unicode.org/reports/tr44/#Property_List_Table}
|
||||
*/
|
||||
public static Map<String, IntervalSet> getUnicodeBinaryPropertyCodesToCodePointRanges() {
|
||||
Map<String, IntervalSet> result = new LinkedHashMap<>();
|
||||
for (int property = UProperty.BINARY_START;
|
||||
property < UProperty.BINARY_LIMIT;
|
||||
property++) {
|
||||
String propertyName = getShortPropertyName(property);
|
||||
IntervalSet intervalSet = new IntervalSet();
|
||||
result.put(propertyName, intervalSet);
|
||||
UnicodeSet set = new UnicodeSet();
|
||||
set.applyIntPropertyValue(property, 1);
|
||||
for (UnicodeSet.EntryRange range : set.ranges()) {
|
||||
intervalSet.add(range.codepoint, range.codepointEnd);
|
||||
}
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns a map of (Unicode general category code: name, ...) pairs
|
||||
* mapping Unicode binary property codes to their human-readable names.
|
||||
*
|
||||
* {@see http://unicode.org/reports/tr44/#Property_List_Table}
|
||||
*/
|
||||
public static Map<String, String> getUnicodeBinaryPropertyCodesToNames() {
|
||||
Map<String, String> result = new LinkedHashMap<>();
|
||||
for (int property = UProperty.BINARY_START;
|
||||
property < UProperty.BINARY_LIMIT;
|
||||
property++) {
|
||||
String propertyName = getShortPropertyName(property);
|
||||
String longPropertyName = UCharacter.getPropertyName(property, UProperty.NameChoice.LONG);
|
||||
result.put(propertyName, longPropertyName);
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns a map of (Unicode script code: [0-4, 10-20, 5000-6000], ...)
|
||||
* pairs mapping Unicode script codes to the {@link IntervalSet} containing
|
||||
* the Unicode code points which use that script.
|
||||
*
|
||||
* Note that some code points belong to multiple scripts.
|
||||
*
|
||||
* {@see https://en.wikipedia.org/wiki/Script_(Unicode)#Table_of_scripts_in_Unicode}
|
||||
*/
|
||||
public static Map<String, IntervalSet> getUnicodeScriptCodesToCodePointRanges() {
|
||||
Map<String, IntervalSet> result = new LinkedHashMap<>();
|
||||
for (int script = UCharacter.getIntPropertyMinValue(UProperty.SCRIPT);
|
||||
script <= UCharacter.getIntPropertyMaxValue(UProperty.SCRIPT);
|
||||
script++) {
|
||||
UnicodeSet set = new UnicodeSet();
|
||||
set.applyIntPropertyValue(UProperty.SCRIPT, script);
|
||||
String scriptName = UCharacter.getPropertyValueName(UProperty.SCRIPT, script, UProperty.NameChoice.SHORT);
|
||||
IntervalSet intervalSet = result.get(scriptName);
|
||||
if (intervalSet == null) {
|
||||
intervalSet = new IntervalSet();
|
||||
result.put(scriptName, intervalSet);
|
||||
}
|
||||
for (UnicodeSet.EntryRange range : set.ranges()) {
|
||||
intervalSet.add(range.codepoint, range.codepointEnd);
|
||||
}
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns a map of (Unicode script code: name, ...) pairs
|
||||
* mapping Unicode script codes to their human-readable names.
|
||||
*
|
||||
* {@see https://en.wikipedia.org/wiki/Script_(Unicode)#Table_of_scripts_in_Unicode}
|
||||
*/
|
||||
public static Map<String, String> getUnicodeScriptCodesToNames() {
|
||||
Map<String, String> result = new LinkedHashMap<>();
|
||||
for (int script = UCharacter.getIntPropertyMinValue(UProperty.SCRIPT);
|
||||
script <= UCharacter.getIntPropertyMaxValue(UProperty.SCRIPT);
|
||||
script++) {
|
||||
String propertyName = UCharacter.getPropertyValueName(UProperty.SCRIPT, script, UProperty.NameChoice.SHORT);
|
||||
String longPropertyName = UCharacter.getPropertyValueName(UProperty.SCRIPT, script, UProperty.NameChoice.LONG);
|
||||
result.put(propertyName, longPropertyName);
|
||||
}
|
||||
return result;
|
||||
}
|
||||
}
|
Loading…
Reference in New Issue