forked from jasder/antlr
Merge branch 'master_upstream'
This commit is contained in:
commit
14991143d4
|
@ -55,6 +55,12 @@ matrix:
|
|||
- os: linux
|
||||
jdk: oraclejdk7
|
||||
env: TARGET=python3
|
||||
addons:
|
||||
apt:
|
||||
sources:
|
||||
- deadsnakes # source required so it finds the package definition below
|
||||
packages:
|
||||
- python3.5
|
||||
- os: linux
|
||||
jdk: oraclejdk7
|
||||
env: TARGET=javascript
|
||||
|
|
|
@ -2,8 +2,4 @@
|
|||
|
||||
set -euo pipefail
|
||||
|
||||
sudo apt-key adv --keyserver hkp://keyserver.ubuntu.com:80 --recv-keys 3FA7E0328081BFF6A14DA29AA6A19B38D3D831EF
|
||||
sudo add-apt-repository ppa:fkrull/deadsnakes -y
|
||||
sudo apt-get update -qq
|
||||
sudo apt-get install -qq python3.5
|
||||
python3 --version
|
||||
|
|
|
@ -70,7 +70,7 @@ namespace tree {
|
|||
/// </summary>
|
||||
/// <returns> The default value returned by visitor methods. </returns>
|
||||
virtual antlrcpp::Any defaultResult() {
|
||||
return 0;
|
||||
return nullptr; // support isNotNull
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
|
|
|
@ -0,0 +1,50 @@
|
|||
/*
|
||||
* Copyright (c) 2012-2017 The ANTLR Project. All rights reserved.
|
||||
* Use of this file is governed by the BSD 3-clause license that
|
||||
* can be found in the LICENSE.txt file in the project root.
|
||||
*/
|
||||
|
||||
package org.antlr.v4.runtime.atn;
|
||||
|
||||
import org.antlr.v4.runtime.misc.IntervalSet;
|
||||
|
||||
/**
|
||||
* Utility class to create {@link AtomTransition}, {@link RangeTransition},
|
||||
* and {@link SetTransition} appropriately based on the range of the input.
|
||||
*
|
||||
* To keep the serialized ATN size small, we only inline atom and
|
||||
* range transitions for Unicode code points <= U+FFFF.
|
||||
*
|
||||
* Whenever we encounter a Unicode code point > U+FFFF, we represent that
|
||||
* as a set transition (even if it is logically an atom or a range).
|
||||
*/
|
||||
public abstract class CodePointTransitions {
|
||||
/**
|
||||
* If {@code codePoint} is <= U+FFFF, returns a new {@link AtomTransition}.
|
||||
* Otherwise, returns a new {@link SetTransition}.
|
||||
*/
|
||||
public static Transition createWithCodePoint(ATNState target, int codePoint) {
|
||||
if (Character.isSupplementaryCodePoint(codePoint)) {
|
||||
return new SetTransition(target, IntervalSet.of(codePoint));
|
||||
} else {
|
||||
return new AtomTransition(target, codePoint);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* If {@code codePointFrom} and {@code codePointTo} are both
|
||||
* <= U+FFFF, returns a new {@link RangeTransition}.
|
||||
* Otherwise, returns a new {@link SetTransition}.
|
||||
*/
|
||||
public static Transition createWithCodePointRange(
|
||||
ATNState target,
|
||||
int codePointFrom,
|
||||
int codePointTo) {
|
||||
if (Character.isSupplementaryCodePoint(codePointFrom) ||
|
||||
Character.isSupplementaryCodePoint(codePointTo)) {
|
||||
return new SetTransition(target, IntervalSet.of(codePointFrom, codePointTo));
|
||||
} else {
|
||||
return new RangeTransition(target, codePointFrom, codePointTo);
|
||||
}
|
||||
}
|
||||
}
|
|
@ -0,0 +1,80 @@
|
|||
/*
|
||||
* Copyright (c) 2012-2017 The ANTLR Project. All rights reserved.
|
||||
* Use of this file is governed by the BSD 3-clause license that
|
||||
* can be found in the LICENSE.txt file in the project root.
|
||||
*/
|
||||
|
||||
package org.antlr.v4.test.tool;
|
||||
|
||||
import java.util.Map;
|
||||
|
||||
import org.antlr.v4.codegen.Unicode;
|
||||
import org.antlr.v4.runtime.misc.IntervalSet;
|
||||
|
||||
import org.junit.Test;
|
||||
|
||||
import static org.junit.Assert.assertFalse;
|
||||
import static org.junit.Assert.assertEquals;
|
||||
import static org.junit.Assert.assertTrue;
|
||||
|
||||
public class TestUnicode {
|
||||
@Test
|
||||
public void testUnicodeCategoryCodes() {
|
||||
Map<String, IntervalSet> unicodeCategoryCodesToCodePointRanges = Unicode.getUnicodeCategoryCodesToCodePointRanges();
|
||||
assertTrue(unicodeCategoryCodesToCodePointRanges.get("Lu").contains('X'));
|
||||
assertFalse(unicodeCategoryCodesToCodePointRanges.get("Lu").contains('x'));
|
||||
assertTrue(unicodeCategoryCodesToCodePointRanges.get("Ll").contains('x'));
|
||||
assertFalse(unicodeCategoryCodesToCodePointRanges.get("Ll").contains('X'));
|
||||
assertTrue(unicodeCategoryCodesToCodePointRanges.get("L").contains('X'));
|
||||
assertTrue(unicodeCategoryCodesToCodePointRanges.get("L").contains('x'));
|
||||
assertTrue(unicodeCategoryCodesToCodePointRanges.get("N").contains('0'));
|
||||
assertTrue(unicodeCategoryCodesToCodePointRanges.get("Z").contains(' '));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testUnicodeCategoryCodesToNames() {
|
||||
Map<String, String> unicodeCategoryCodesToNames = Unicode.getUnicodeCategoryCodesToNames();
|
||||
assertEquals("Lowercase_Letter", unicodeCategoryCodesToNames.get("Ll"));
|
||||
assertEquals("Letter", unicodeCategoryCodesToNames.get("L"));
|
||||
assertEquals("Enclosing_Mark", unicodeCategoryCodesToNames.get("Me"));
|
||||
assertEquals("Mark", unicodeCategoryCodesToNames.get("M"));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testUnicodeBinaryPropertyCodesToCodePointRanges() {
|
||||
Map<String, IntervalSet> unicodeBinaryPropertyCodesToCodePointRanges = Unicode.getUnicodeBinaryPropertyCodesToCodePointRanges();
|
||||
assertTrue(unicodeBinaryPropertyCodesToCodePointRanges.get("Emoji").contains(0x1F4A9));
|
||||
assertFalse(unicodeBinaryPropertyCodesToCodePointRanges.get("Emoji").contains('X'));
|
||||
assertTrue(unicodeBinaryPropertyCodesToCodePointRanges.get("alnum").contains('9'));
|
||||
assertFalse(unicodeBinaryPropertyCodesToCodePointRanges.get("alnum").contains(0x1F4A9));
|
||||
assertTrue(unicodeBinaryPropertyCodesToCodePointRanges.get("Dash").contains('-'));
|
||||
assertTrue(unicodeBinaryPropertyCodesToCodePointRanges.get("Hex").contains('D'));
|
||||
assertFalse(unicodeBinaryPropertyCodesToCodePointRanges.get("Hex").contains('Q'));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testUnicodeBinaryPropertyCodesToNames() {
|
||||
Map<String, String> unicodeBinaryPropertyCodesToNames = Unicode.getUnicodeBinaryPropertyCodesToNames();
|
||||
assertEquals("Ideographic", unicodeBinaryPropertyCodesToNames.get("Ideo"));
|
||||
assertEquals("Soft_Dotted", unicodeBinaryPropertyCodesToNames.get("SD"));
|
||||
assertEquals("Noncharacter_Code_Point", unicodeBinaryPropertyCodesToNames.get("NChar"));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testUnicodeScriptCodesToCodePointRanges() {
|
||||
Map<String, IntervalSet> unicodeScriptCodesToCodePointRanges = Unicode.getUnicodeScriptCodesToCodePointRanges();
|
||||
assertTrue(unicodeScriptCodesToCodePointRanges.get("Zyyy").contains('0'));
|
||||
assertTrue(unicodeScriptCodesToCodePointRanges.get("Latn").contains('X'));
|
||||
assertTrue(unicodeScriptCodesToCodePointRanges.get("Hani").contains(0x4E04));
|
||||
assertTrue(unicodeScriptCodesToCodePointRanges.get("Cyrl").contains(0x0404));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testUnicodeScriptCodesToNames() {
|
||||
Map<String, String> unicodeScriptCodesToNames = Unicode.getUnicodeScriptCodesToNames();
|
||||
assertEquals("Common", unicodeScriptCodesToNames.get("Zyyy"));
|
||||
assertEquals("Latin", unicodeScriptCodesToNames.get("Latn"));
|
||||
assertEquals("Han", unicodeScriptCodesToNames.get("Hani"));
|
||||
assertEquals("Cyrillic", unicodeScriptCodesToNames.get("Cyrl"));
|
||||
}
|
||||
}
|
|
@ -42,6 +42,11 @@
|
|||
<artifactId>javax.json</artifactId>
|
||||
<version>1.0.4</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>com.ibm.icu</groupId>
|
||||
<artifactId>icu4j</artifactId>
|
||||
<version>58.2</version>
|
||||
</dependency>
|
||||
</dependencies>
|
||||
|
||||
<build>
|
||||
|
|
|
@ -10,6 +10,7 @@ import org.antlr.v4.runtime.atn.ATN;
|
|||
import org.antlr.v4.runtime.atn.ATNState;
|
||||
import org.antlr.v4.runtime.atn.AtomTransition;
|
||||
import org.antlr.v4.runtime.atn.BlockEndState;
|
||||
import org.antlr.v4.runtime.atn.CodePointTransitions;
|
||||
import org.antlr.v4.runtime.atn.DecisionState;
|
||||
import org.antlr.v4.runtime.atn.EpsilonTransition;
|
||||
import org.antlr.v4.runtime.atn.NotSetTransition;
|
||||
|
@ -116,11 +117,11 @@ public class ATNOptimizer {
|
|||
Transition newTransition;
|
||||
if (matchSet.getIntervals().size() == 1) {
|
||||
if (matchSet.size() == 1) {
|
||||
newTransition = new AtomTransition(blockEndState, matchSet.getMinElement());
|
||||
newTransition = CodePointTransitions.createWithCodePoint(blockEndState, matchSet.getMinElement());
|
||||
}
|
||||
else {
|
||||
Interval matchInterval = matchSet.getIntervals().get(0);
|
||||
newTransition = new RangeTransition(blockEndState, matchInterval.a, matchInterval.b);
|
||||
newTransition = CodePointTransitions.createWithCodePointRange(blockEndState, matchInterval.a, matchInterval.b);
|
||||
}
|
||||
}
|
||||
else {
|
||||
|
|
|
@ -17,6 +17,7 @@ import org.antlr.v4.runtime.atn.ATN;
|
|||
import org.antlr.v4.runtime.atn.ATNState;
|
||||
import org.antlr.v4.runtime.atn.ActionTransition;
|
||||
import org.antlr.v4.runtime.atn.AtomTransition;
|
||||
import org.antlr.v4.runtime.atn.CodePointTransitions;
|
||||
import org.antlr.v4.runtime.atn.LexerAction;
|
||||
import org.antlr.v4.runtime.atn.LexerChannelAction;
|
||||
import org.antlr.v4.runtime.atn.LexerCustomAction;
|
||||
|
@ -255,7 +256,7 @@ public class LexerATNFactory extends ParserATNFactory {
|
|||
int t1 = CharSupport.getCharValueFromGrammarCharLiteral(a.getText());
|
||||
int t2 = CharSupport.getCharValueFromGrammarCharLiteral(b.getText());
|
||||
checkRange(a, b, t1, t2);
|
||||
left.addTransition(new RangeTransition(right, t1, t2));
|
||||
left.addTransition(CodePointTransitions.createWithCodePointRange(right, t1, t2));
|
||||
a.atnState = left;
|
||||
b.atnState = left;
|
||||
return new Handle(left, right);
|
||||
|
@ -301,7 +302,7 @@ public class LexerATNFactory extends ParserATNFactory {
|
|||
Transition transition;
|
||||
if (set.getIntervals().size() == 1) {
|
||||
Interval interval = set.getIntervals().get(0);
|
||||
transition = new RangeTransition(right, interval.a, interval.b);
|
||||
transition = CodePointTransitions.createWithCodePointRange(right, interval.a, interval.b);
|
||||
} else {
|
||||
transition = new SetTransition(right, set);
|
||||
}
|
||||
|
@ -356,7 +357,7 @@ public class LexerATNFactory extends ParserATNFactory {
|
|||
for (int i = 0; i < n; ) {
|
||||
right = newState(stringLiteralAST);
|
||||
int codePoint = chars.codePointAt(i);
|
||||
prev.addTransition(new AtomTransition(right, codePoint));
|
||||
prev.addTransition(CodePointTransitions.createWithCodePoint(right, codePoint));
|
||||
prev = right;
|
||||
i += Character.charCount(codePoint);
|
||||
}
|
||||
|
|
|
@ -0,0 +1,194 @@
|
|||
/*
|
||||
* Copyright (c) 2012-2017 The ANTLR Project. All rights reserved.
|
||||
* Use of this file is governed by the BSD 3-clause license that
|
||||
* can be found in the LICENSE.txt file in the project root.
|
||||
*/
|
||||
|
||||
package org.antlr.v4.codegen;
|
||||
|
||||
import com.ibm.icu.lang.UCharacter;
|
||||
import com.ibm.icu.lang.UCharacterCategory;
|
||||
import com.ibm.icu.lang.UProperty;
|
||||
import com.ibm.icu.lang.UScript;
|
||||
import com.ibm.icu.text.UnicodeSet;
|
||||
import com.ibm.icu.util.RangeValueIterator;
|
||||
|
||||
import org.antlr.v4.runtime.misc.IntervalSet;
|
||||
|
||||
import java.util.Map;
|
||||
import java.util.LinkedHashMap;
|
||||
|
||||
/**
|
||||
* Utility class for calculating {@link IntervalSet}s for various
|
||||
* Unicode categories and properties.
|
||||
*/
|
||||
public abstract class Unicode {
|
||||
private static void addIntervalForCategory(
|
||||
Map<String, IntervalSet> categoryMap,
|
||||
String categoryName,
|
||||
int start,
|
||||
int finish) {
|
||||
IntervalSet intervalSet = categoryMap.get(categoryName);
|
||||
if (intervalSet == null) {
|
||||
intervalSet = new IntervalSet();
|
||||
categoryMap.put(categoryName, intervalSet);
|
||||
}
|
||||
intervalSet.add(start, finish);
|
||||
}
|
||||
|
||||
private static String getShortPropertyName(int property) {
|
||||
String propertyName = UCharacter.getPropertyName(property, UProperty.NameChoice.SHORT);
|
||||
// For some reason, a few properties only have long names.
|
||||
if (propertyName == null) {
|
||||
propertyName = UCharacter.getPropertyName(property, UProperty.NameChoice.LONG);
|
||||
}
|
||||
return propertyName;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns a map of (Unicode general category code: [0-4, 10-20, 5000-6000], ...)
|
||||
* pairs mapping Unicode general category codes to the {@link IntervalSet} containing
|
||||
* the Unicode code points in that general category.
|
||||
*
|
||||
* Note that a code point belongs to exactly one general category.
|
||||
*
|
||||
* {@see http://unicode.org/reports/tr44/#General_Category_Values}
|
||||
*/
|
||||
public static Map<String, IntervalSet> getUnicodeCategoryCodesToCodePointRanges() {
|
||||
Map<String, IntervalSet> result = new LinkedHashMap<>();
|
||||
RangeValueIterator iter = UCharacter.getTypeIterator();
|
||||
RangeValueIterator.Element element = new RangeValueIterator.Element();
|
||||
while (iter.next(element)) {
|
||||
String categoryName = UCharacter.getPropertyValueName(
|
||||
UProperty.GENERAL_CATEGORY_MASK,
|
||||
1 << element.value,
|
||||
UProperty.NameChoice.SHORT);
|
||||
addIntervalForCategory(result, categoryName, element.start, element.limit - 1);
|
||||
// Add short category so Ll, Lu, Lo, etc. all show up under L
|
||||
String shortCategoryName = categoryName.substring(0, 1);
|
||||
addIntervalForCategory(result, shortCategoryName, element.start, element.limit - 1);
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns a map of (Unicode general category code: name, ...) pairs
|
||||
* mapping Unicode general category codes to their human-readable names.
|
||||
*
|
||||
* {@see http://unicode.org/reports/tr44/#General_Category_Values}
|
||||
*/
|
||||
public static Map<String, String> getUnicodeCategoryCodesToNames() {
|
||||
Map<String, String> result = new LinkedHashMap<>();
|
||||
RangeValueIterator iter = UCharacter.getTypeIterator();
|
||||
RangeValueIterator.Element element = new RangeValueIterator.Element();
|
||||
while (iter.next(element)) {
|
||||
String categoryName = UCharacter.getPropertyValueName(
|
||||
UProperty.GENERAL_CATEGORY_MASK,
|
||||
1 << element.value,
|
||||
UProperty.NameChoice.SHORT);
|
||||
String longCategoryName = UCharacter.getPropertyValueName(
|
||||
UProperty.GENERAL_CATEGORY_MASK,
|
||||
1 << element.value,
|
||||
UProperty.NameChoice.LONG);
|
||||
result.put(categoryName, longCategoryName);
|
||||
}
|
||||
// Add short categories
|
||||
result.put("C", "Control");
|
||||
result.put("L", "Letter");
|
||||
result.put("N", "Number");
|
||||
result.put("M", "Mark");
|
||||
result.put("P", "Punctuation");
|
||||
result.put("S", "Symbol");
|
||||
result.put("Z", "Space");
|
||||
return result;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns a map of (Unicode binary property code: [0-4, 10-20, 5000-6000], ...)
|
||||
* pairs mapping Unicode binary property codes to the {@link IntervalSet} containing
|
||||
* the Unicode code points which have that binary property set to a true value.
|
||||
*
|
||||
* {@see http://unicode.org/reports/tr44/#Property_List_Table}
|
||||
*/
|
||||
public static Map<String, IntervalSet> getUnicodeBinaryPropertyCodesToCodePointRanges() {
|
||||
Map<String, IntervalSet> result = new LinkedHashMap<>();
|
||||
for (int property = UProperty.BINARY_START;
|
||||
property < UProperty.BINARY_LIMIT;
|
||||
property++) {
|
||||
String propertyName = getShortPropertyName(property);
|
||||
IntervalSet intervalSet = new IntervalSet();
|
||||
result.put(propertyName, intervalSet);
|
||||
UnicodeSet set = new UnicodeSet();
|
||||
set.applyIntPropertyValue(property, 1);
|
||||
for (UnicodeSet.EntryRange range : set.ranges()) {
|
||||
intervalSet.add(range.codepoint, range.codepointEnd);
|
||||
}
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns a map of (Unicode general category code: name, ...) pairs
|
||||
* mapping Unicode binary property codes to their human-readable names.
|
||||
*
|
||||
* {@see http://unicode.org/reports/tr44/#Property_List_Table}
|
||||
*/
|
||||
public static Map<String, String> getUnicodeBinaryPropertyCodesToNames() {
|
||||
Map<String, String> result = new LinkedHashMap<>();
|
||||
for (int property = UProperty.BINARY_START;
|
||||
property < UProperty.BINARY_LIMIT;
|
||||
property++) {
|
||||
String propertyName = getShortPropertyName(property);
|
||||
String longPropertyName = UCharacter.getPropertyName(property, UProperty.NameChoice.LONG);
|
||||
result.put(propertyName, longPropertyName);
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns a map of (Unicode script code: [0-4, 10-20, 5000-6000], ...)
|
||||
* pairs mapping Unicode script codes to the {@link IntervalSet} containing
|
||||
* the Unicode code points which use that script.
|
||||
*
|
||||
* Note that some code points belong to multiple scripts.
|
||||
*
|
||||
* {@see https://en.wikipedia.org/wiki/Script_(Unicode)#Table_of_scripts_in_Unicode}
|
||||
*/
|
||||
public static Map<String, IntervalSet> getUnicodeScriptCodesToCodePointRanges() {
|
||||
Map<String, IntervalSet> result = new LinkedHashMap<>();
|
||||
for (int script = UCharacter.getIntPropertyMinValue(UProperty.SCRIPT);
|
||||
script <= UCharacter.getIntPropertyMaxValue(UProperty.SCRIPT);
|
||||
script++) {
|
||||
UnicodeSet set = new UnicodeSet();
|
||||
set.applyIntPropertyValue(UProperty.SCRIPT, script);
|
||||
String scriptName = UCharacter.getPropertyValueName(UProperty.SCRIPT, script, UProperty.NameChoice.SHORT);
|
||||
IntervalSet intervalSet = result.get(scriptName);
|
||||
if (intervalSet == null) {
|
||||
intervalSet = new IntervalSet();
|
||||
result.put(scriptName, intervalSet);
|
||||
}
|
||||
for (UnicodeSet.EntryRange range : set.ranges()) {
|
||||
intervalSet.add(range.codepoint, range.codepointEnd);
|
||||
}
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns a map of (Unicode script code: name, ...) pairs
|
||||
* mapping Unicode script codes to their human-readable names.
|
||||
*
|
||||
* {@see https://en.wikipedia.org/wiki/Script_(Unicode)#Table_of_scripts_in_Unicode}
|
||||
*/
|
||||
public static Map<String, String> getUnicodeScriptCodesToNames() {
|
||||
Map<String, String> result = new LinkedHashMap<>();
|
||||
for (int script = UCharacter.getIntPropertyMinValue(UProperty.SCRIPT);
|
||||
script <= UCharacter.getIntPropertyMaxValue(UProperty.SCRIPT);
|
||||
script++) {
|
||||
String propertyName = UCharacter.getPropertyValueName(UProperty.SCRIPT, script, UProperty.NameChoice.SHORT);
|
||||
String longPropertyName = UCharacter.getPropertyValueName(UProperty.SCRIPT, script, UProperty.NameChoice.LONG);
|
||||
result.put(propertyName, longPropertyName);
|
||||
}
|
||||
return result;
|
||||
}
|
||||
}
|
Loading…
Reference in New Issue