forked from jasder/antlr
Also support Unicode enumerated properties via \p{East_Asian_Width=Ambiguous} escape
This commit is contained in:
parent
b8c74be628
commit
6007132e2e
|
@ -60,11 +60,11 @@ Match that character or sequence of characters. E.g., ’while’ or ’=’.</t
|
|||
<td>[char set]</td><td>
|
||||
<p>Match one of the characters specified in the character set. Interpret <tt>x-y</tt> as the set of characters between range <tt>x</tt> and <tt>y</tt>, inclusively. The following escaped characters are interpreted as single special characters: <tt>\n</tt>, <tt>\r</tt>, <tt>\b</tt>, <tt>\t</tt>, <tt>\f</tt>, <tt>\uXXXX</tt>, and <tt>\u{XXXXXX}</tt>. To get <tt>]</tt>, <tt>\</tt>, or <tt>-</tt> you must escape them with <tt>\</tt>.</p>
|
||||
|
||||
<p>You can also include all characters matching Unicode properties (general category, boolean, script, or block) with <tt>\p{PropertyName}</tt>. (You can invert the test with <tt>\P{PropertyName}</tt>).</p>
|
||||
<p>You can also include all characters matching Unicode properties (general category, boolean, or enumerated including scripts and blocks) with <tt>\p{PropertyName}</tt> or <tt>\p{EnumProperty=Value}</tt>. (You can invert the test with <tt>\P{PropertyName}</tt> or <tt>\P{EnumProperty=Value}</tt>).</p>
|
||||
|
||||
<p>For a list of valid Unicode property names, see <a href="http://unicode.org/reports/tr44/#Properties">Unicode Standard Annex #44</a>. (ANTLR also supports <a href="http://unicode.org/reports/tr44/#General_Category_Values">short and long Unicode general category names</a> like <tt>\p{Lu}</tt>, <tt>\p{Z}</tt>, and <tt>\p{Symbol}</tt>.)</p>
|
||||
<p>For a list of valid Unicode property names, see <a href="http://unicode.org/reports/tr44/#Properties">Unicode Standard Annex #44</a>. (ANTLR also supports <a href="http://unicode.org/reports/tr44/#General_Category_Values">short and long Unicode general category names and values</a> like <tt>\p{Lu}</tt>, <tt>\p{Z}</tt>, <tt>\p{Symbol}</tt>, <tt>\p{Blk=Latin_1_Sup}</tt>, and <tt>\p{Block=Latin_1_Supplement}</tt>.)</p>
|
||||
|
||||
<p>Property names include <a href="http://www.unicode.org/Public/UCD/latest/ucd/Blocks.txt">Unicode block names</a> prefixed with <tt>In</tt> (they overlap with script names) and with spaces changed to <tt>_</tt>. For example: <tt>\p{InLatin_1_Supplement}</tt>, <tt>\p{InYijing_Hexagram_Symbols}</tt>, and <tt>\p{InAncient_Greek_Numbers}</tt>.</p>
|
||||
<p>As a shortcut for <tt>\p{Block=Latin_1_Supplement}</tt>, you can refer to blocks using <a href="http://www.unicode.org/Public/UCD/latest/ucd/Blocks.txt">Unicode block names</a> prefixed with <tt>In</tt> and with spaces changed to <tt>_</tt>. For example: <tt>\p{InLatin_1_Supplement}</tt>, <tt>\p{InYijing_Hexagram_Symbols}</tt>, and <tt>\p{InAncient_Greek_Numbers}</tt>.</p>
|
||||
|
||||
<p>Property names are <b>case-insensitive</b>, and <tt>_</tt> and <tt>-</tt> are treated identically.</p>
|
||||
|
||||
|
@ -77,7 +77,7 @@ UNICODE_WS : [\p{White_Space}] -> skip; // match all Unicode whitespace
|
|||
|
||||
ID : [a-zA-Z] [a-zA-Z0-9]* ; // match usual identifier spec
|
||||
|
||||
UNICODE_ID : [\p{Alpha}] [\p{Alnum}]* ; // match full Unicode alphabetic ids
|
||||
UNICODE_ID : [\p{Alpha}\p{General_Category=Other_Letter}] [\p{Alnum}\p{General_Category=Other_Letter}]* ; // match full Unicode alphabetic ids
|
||||
|
||||
EMOJI : [\u{1F4A9}\u{1F926}] ; // note Unicode code points > U+FFFF
|
||||
|
||||
|
|
|
@ -21,7 +21,7 @@ public abstract class UnicodeData {
|
|||
// initialization into one method per Unicode property
|
||||
|
||||
<propertyCodePointRanges.keys:{ k | // Unicode code points with property "<k>"
|
||||
static private void addProperty<k>() {
|
||||
static private void addProperty<i>() {
|
||||
List\<Interval\> intervals = Arrays.asList(
|
||||
<propertyCodePointRanges.(k).intervals:{ interval | Interval.of(<interval.a>, <interval.b>)}; separator=",\n">
|
||||
);
|
||||
|
@ -37,7 +37,7 @@ static private void addProperty<k>() {
|
|||
|
||||
// Put it all together
|
||||
static {
|
||||
<propertyCodePointRanges.keys:{ k | addProperty<k>(); }; separator="\n">
|
||||
<propertyCodePointRanges.keys:{ k | addProperty<i>(); }; separator="\n">
|
||||
addPropertyAliases();
|
||||
}
|
||||
|
||||
|
|
|
@ -74,14 +74,14 @@ public abstract class UnicodeDataTemplateController {
|
|||
Map<String, IntervalSet> propertyCodePointRanges = new LinkedHashMap<>();
|
||||
addUnicodeCategoryCodesToCodePointRanges(propertyCodePointRanges);
|
||||
addUnicodeBinaryPropertyCodesToCodePointRanges(propertyCodePointRanges);
|
||||
addUnicodeScriptCodesToCodePointRanges(propertyCodePointRanges);
|
||||
addUnicodeBlocksToCodePointRanges(propertyCodePointRanges);
|
||||
addUnicodeIntPropertyCodesToCodePointRanges(propertyCodePointRanges);
|
||||
|
||||
Map<String, String> propertyAliases = new LinkedHashMap<>();
|
||||
addUnicodeCategoryCodesToNames(propertyAliases);
|
||||
addUnicodeBinaryPropertyCodesToNames(propertyAliases);
|
||||
addUnicodeScriptCodesToNames(propertyAliases);
|
||||
addUnicodeBlocksToNames(propertyAliases);
|
||||
addUnicodeIntPropertyCodesToNames(propertyAliases);
|
||||
|
||||
Map<String, Object> properties = new LinkedHashMap<>();
|
||||
properties.put("propertyCodePointRanges", propertyCodePointRanges);
|
||||
|
@ -191,20 +191,22 @@ public abstract class UnicodeDataTemplateController {
|
|||
}
|
||||
}
|
||||
|
||||
private static void addUnicodeScriptCodesToCodePointRanges(Map<String, IntervalSet> propertyCodePointRanges) {
|
||||
addIntPropertyRanges(UProperty.SCRIPT, "", propertyCodePointRanges);
|
||||
}
|
||||
|
||||
private static void addUnicodeBlocksToCodePointRanges(Map<String, IntervalSet> propertyCodePointRanges) {
|
||||
addIntPropertyRanges(UProperty.BLOCK, "In", propertyCodePointRanges);
|
||||
private static void addUnicodeIntPropertyCodesToCodePointRanges(Map<String, IntervalSet> propertyCodePointRanges) {
|
||||
for (int property = UProperty.INT_START;
|
||||
property < UProperty.INT_LIMIT;
|
||||
property++) {
|
||||
String propertyName = getShortPropertyName(property);
|
||||
addIntPropertyRanges(property, propertyName + "=", propertyCodePointRanges);
|
||||
}
|
||||
}
|
||||
|
||||
private static void addIntPropertyAliases(int property, String namePrefix, Map<String, String> propertyAliases) {
|
||||
String propertyName = getShortPropertyName(property);
|
||||
for (int propertyValue = UCharacter.getIntPropertyMinValue(property);
|
||||
propertyValue <= UCharacter.getIntPropertyMaxValue(property);
|
||||
propertyValue++) {
|
||||
String propertyName = namePrefix + UCharacter.getPropertyValueName(property, propertyValue, UProperty.NameChoice.SHORT);
|
||||
int nameChoice = UProperty.NameChoice.LONG;
|
||||
String aliasTarget = propertyName + "=" + UCharacter.getPropertyValueName(property, propertyValue, UProperty.NameChoice.SHORT);
|
||||
int nameChoice = UProperty.NameChoice.SHORT;
|
||||
String alias;
|
||||
while (true) {
|
||||
try {
|
||||
|
@ -214,7 +216,7 @@ public abstract class UnicodeDataTemplateController {
|
|||
break;
|
||||
}
|
||||
assert alias != null;
|
||||
addPropertyAlias(propertyAliases, alias, propertyName);
|
||||
addPropertyAlias(propertyAliases, alias, aliasTarget);
|
||||
nameChoice++;
|
||||
}
|
||||
}
|
||||
|
@ -227,4 +229,23 @@ public abstract class UnicodeDataTemplateController {
|
|||
private static void addUnicodeBlocksToNames(Map<String, String> propertyAliases) {
|
||||
addIntPropertyAliases(UProperty.BLOCK, "In", propertyAliases);
|
||||
}
|
||||
|
||||
private static void addUnicodeIntPropertyCodesToNames(Map<String, String> propertyAliases) {
|
||||
for (int property = UProperty.INT_START;
|
||||
property < UProperty.INT_LIMIT;
|
||||
property++) {
|
||||
int nameChoice = UProperty.NameChoice.SHORT + 1;
|
||||
while (true) {
|
||||
String propertyNameAlias;
|
||||
try {
|
||||
propertyNameAlias = UCharacter.getPropertyName(property, nameChoice);
|
||||
} catch (IllegalArgumentException e) {
|
||||
// No more aliases.
|
||||
break;
|
||||
}
|
||||
addIntPropertyAliases(property, propertyNameAlias + "=", propertyAliases);
|
||||
nameChoice++;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -100,6 +100,14 @@ public class TestUnicodeData {
|
|||
assertTrue(UnicodeData.getPropertyCodePoints("Cyrl").contains(0x0404));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testUnicodeScriptEquals() {
|
||||
assertTrue(UnicodeData.getPropertyCodePoints("Script=Zyyy").contains('0'));
|
||||
assertTrue(UnicodeData.getPropertyCodePoints("Script=Latn").contains('X'));
|
||||
assertTrue(UnicodeData.getPropertyCodePoints("Script=Hani").contains(0x4E04));
|
||||
assertTrue(UnicodeData.getPropertyCodePoints("Script=Cyrl").contains(0x0404));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testUnicodeScriptAliases() {
|
||||
assertTrue(UnicodeData.getPropertyCodePoints("Common").contains('0'));
|
||||
|
@ -116,12 +124,48 @@ public class TestUnicodeData {
|
|||
assertTrue(UnicodeData.getPropertyCodePoints("InMisc_Pictographs").contains(0x1F4A9));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testUnicodeBlockEquals() {
|
||||
assertTrue(UnicodeData.getPropertyCodePoints("Block=ASCII").contains('0'));
|
||||
assertTrue(UnicodeData.getPropertyCodePoints("Block=CJK").contains(0x4E04));
|
||||
assertTrue(UnicodeData.getPropertyCodePoints("Block=Cyrillic").contains(0x0404));
|
||||
assertTrue(UnicodeData.getPropertyCodePoints("Block=Misc_Pictographs").contains(0x1F4A9));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testUnicodeBlockAliases() {
|
||||
assertTrue(UnicodeData.getPropertyCodePoints("InBasic_Latin").contains('0'));
|
||||
assertTrue(UnicodeData.getPropertyCodePoints("InMiscellaneous_Mathematical_Symbols_B").contains(0x29BE));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testEnumeratedPropertyEquals() {
|
||||
assertTrue(
|
||||
"U+1F481 INFORMATION DESK PERSON is an emoji modifier base",
|
||||
UnicodeData.getPropertyCodePoints("Grapheme_Cluster_Break=E_Base").contains(0x1F481));
|
||||
|
||||
assertFalse(
|
||||
"U+1F47E ALIEN MONSTER is not an emoji modifier",
|
||||
UnicodeData.getPropertyCodePoints("Grapheme_Cluster_Break=E_Base").contains(0x1F47E));
|
||||
|
||||
assertTrue(
|
||||
"U+0E33 THAI CHARACTER SARA AM is a spacing mark",
|
||||
UnicodeData.getPropertyCodePoints("Grapheme_Cluster_Break=E_Base").contains(0x1F481));
|
||||
|
||||
assertFalse(
|
||||
"U+1038 MYANMAR SIGN VISARGA is not a spacing mark",
|
||||
UnicodeData.getPropertyCodePoints("Grapheme_Cluster_Break=E_Base").contains(0x1038));
|
||||
|
||||
assertTrue(
|
||||
"U+00A1 INVERTED EXCLAMATION MARK has ambiguous East Asian Width",
|
||||
UnicodeData.getPropertyCodePoints("East_Asian_Width=Ambiguous").contains(0x00A1));
|
||||
|
||||
assertFalse(
|
||||
"U+00A2 CENT SIGN does not have ambiguous East Asian Width",
|
||||
UnicodeData.getPropertyCodePoints("East_Asian_Width=Ambiguous").contains(0x00A2));
|
||||
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testPropertyCaseInsensitivity() {
|
||||
assertTrue(UnicodeData.getPropertyCodePoints("l").contains('x'));
|
||||
|
|
|
@ -18,6 +18,8 @@ import java.util.Objects;
|
|||
* \\u{10ABCD}
|
||||
* \\p{Foo}
|
||||
* \\P{Bar}
|
||||
* \\p{Baz=Blech}
|
||||
* \\P{Baz=Blech}
|
||||
*/
|
||||
public abstract class EscapeSequenceParsing {
|
||||
public static class Result {
|
||||
|
|
Loading…
Reference in New Issue