Also support Unicode enumerated properties via \p{East_Asian_Width=Ambiguous} escape

This commit is contained in:
Ben Hamilton 2017-03-08 09:43:50 -08:00
parent b8c74be628
commit 6007132e2e
5 changed files with 84 additions and 17 deletions

View File

@ -60,11 +60,11 @@ Match that character or sequence of characters. E.g., while or =.</t
<td>[char set]</td><td>
<p>Match one of the characters specified in the character set. Interpret <tt>x-y</tt> as the set of characters between range <tt>x</tt> and <tt>y</tt>, inclusively. The following escaped characters are interpreted as single special characters: <tt>\n</tt>, <tt>\r</tt>, <tt>\b</tt>, <tt>\t</tt>, <tt>\f</tt>, <tt>\uXXXX</tt>, and <tt>\u{XXXXXX}</tt>. To get <tt>]</tt>, <tt>\</tt>, or <tt>-</tt> you must escape them with <tt>\</tt>.</p>
<p>You can also include all characters matching Unicode properties (general category, boolean, script, or block) with <tt>\p{PropertyName}</tt>. (You can invert the test with <tt>\P{PropertyName}</tt>).</p>
<p>You can also include all characters matching Unicode properties (general category, boolean, or enumerated including scripts and blocks) with <tt>\p{PropertyName}</tt> or <tt>\p{EnumProperty=Value}</tt>. (You can invert the test with <tt>\P{PropertyName}</tt> or <tt>\P{EnumProperty=Value}</tt>).</p>
<p>For a list of valid Unicode property names, see <a href="http://unicode.org/reports/tr44/#Properties">Unicode Standard Annex #44</a>. (ANTLR also supports <a href="http://unicode.org/reports/tr44/#General_Category_Values">short and long Unicode general category names</a> like <tt>\p{Lu}</tt>, <tt>\p{Z}</tt>, and <tt>\p{Symbol}</tt>.)</p>
<p>For a list of valid Unicode property names, see <a href="http://unicode.org/reports/tr44/#Properties">Unicode Standard Annex #44</a>. (ANTLR also supports <a href="http://unicode.org/reports/tr44/#General_Category_Values">short and long Unicode general category names and values</a> like <tt>\p{Lu}</tt>, <tt>\p{Z}</tt>, <tt>\p{Symbol}</tt>, <tt>\p{Blk=Latin_1_Sup}</tt>, and <tt>\p{Block=Latin_1_Supplement}</tt>.)</p>
<p>Property names include <a href="http://www.unicode.org/Public/UCD/latest/ucd/Blocks.txt">Unicode block names</a> prefixed with <tt>In</tt> (they overlap with script names) and with spaces changed to <tt>_</tt>. For example: <tt>\p{InLatin_1_Supplement}</tt>, <tt>\p{InYijing_Hexagram_Symbols}</tt>, and <tt>\p{InAncient_Greek_Numbers}</tt>.</p>
<p>As a shortcut for <tt>\p{Block=Latin_1_Supplement}</tt>, you can refer to blocks using <a href="http://www.unicode.org/Public/UCD/latest/ucd/Blocks.txt">Unicode block names</a> prefixed with <tt>In</tt> and with spaces changed to <tt>_</tt>. For example: <tt>\p{InLatin_1_Supplement}</tt>, <tt>\p{InYijing_Hexagram_Symbols}</tt>, and <tt>\p{InAncient_Greek_Numbers}</tt>.</p>
<p>Property names are <b>case-insensitive</b>, and <tt>_</tt> and <tt>-</tt> are treated identically.</p>
@ -77,7 +77,7 @@ UNICODE_WS : [\p{White_Space}] -> skip; // match all Unicode whitespace
ID : [a-zA-Z] [a-zA-Z0-9]* ; // match usual identifier spec
UNICODE_ID : [\p{Alpha}] [\p{Alnum}]* ; // match full Unicode alphabetic ids
UNICODE_ID : [\p{Alpha}\p{General_Category=Other_Letter}] [\p{Alnum}\p{General_Category=Other_Letter}]* ; // match full Unicode alphabetic ids
EMOJI : [\u{1F4A9}\u{1F926}] ; // note Unicode code points > U+FFFF

View File

@ -21,7 +21,7 @@ public abstract class UnicodeData {
// initialization into one method per Unicode property
<propertyCodePointRanges.keys:{ k | // Unicode code points with property "<k>"
static private void addProperty<k>() {
static private void addProperty<i>() {
List\<Interval\> intervals = Arrays.asList(
<propertyCodePointRanges.(k).intervals:{ interval | Interval.of(<interval.a>, <interval.b>)}; separator=",\n">
);
@ -37,7 +37,7 @@ static private void addProperty<k>() {
// Put it all together
static {
<propertyCodePointRanges.keys:{ k | addProperty<k>(); }; separator="\n">
<propertyCodePointRanges.keys:{ k | addProperty<i>(); }; separator="\n">
addPropertyAliases();
}

View File

@ -74,14 +74,14 @@ public abstract class UnicodeDataTemplateController {
Map<String, IntervalSet> propertyCodePointRanges = new LinkedHashMap<>();
addUnicodeCategoryCodesToCodePointRanges(propertyCodePointRanges);
addUnicodeBinaryPropertyCodesToCodePointRanges(propertyCodePointRanges);
addUnicodeScriptCodesToCodePointRanges(propertyCodePointRanges);
addUnicodeBlocksToCodePointRanges(propertyCodePointRanges);
addUnicodeIntPropertyCodesToCodePointRanges(propertyCodePointRanges);
Map<String, String> propertyAliases = new LinkedHashMap<>();
addUnicodeCategoryCodesToNames(propertyAliases);
addUnicodeBinaryPropertyCodesToNames(propertyAliases);
addUnicodeScriptCodesToNames(propertyAliases);
addUnicodeBlocksToNames(propertyAliases);
addUnicodeIntPropertyCodesToNames(propertyAliases);
Map<String, Object> properties = new LinkedHashMap<>();
properties.put("propertyCodePointRanges", propertyCodePointRanges);
@ -191,20 +191,22 @@ public abstract class UnicodeDataTemplateController {
}
}
private static void addUnicodeScriptCodesToCodePointRanges(Map<String, IntervalSet> propertyCodePointRanges) {
addIntPropertyRanges(UProperty.SCRIPT, "", propertyCodePointRanges);
}
private static void addUnicodeBlocksToCodePointRanges(Map<String, IntervalSet> propertyCodePointRanges) {
addIntPropertyRanges(UProperty.BLOCK, "In", propertyCodePointRanges);
/**
 * Walks every property id in {@code [UProperty.INT_START, UProperty.INT_LIMIT)} and
 * hands each one to {@code addIntPropertyRanges}, using the property's short name
 * followed by {@code "="} as the name prefix.
 *
 * @param propertyCodePointRanges map passed through to {@code addIntPropertyRanges}
 */
private static void addUnicodeIntPropertyCodesToCodePointRanges(Map<String, IntervalSet> propertyCodePointRanges) {
    int property = UProperty.INT_START;
    while (property < UProperty.INT_LIMIT) {
        // Prefix has the form "<shortPropertyName>=".
        String prefix = getShortPropertyName(property) + "=";
        addIntPropertyRanges(property, prefix, propertyCodePointRanges);
        property++;
    }
}
private static void addIntPropertyAliases(int property, String namePrefix, Map<String, String> propertyAliases) {
String propertyName = getShortPropertyName(property);
for (int propertyValue = UCharacter.getIntPropertyMinValue(property);
propertyValue <= UCharacter.getIntPropertyMaxValue(property);
propertyValue++) {
String propertyName = namePrefix + UCharacter.getPropertyValueName(property, propertyValue, UProperty.NameChoice.SHORT);
int nameChoice = UProperty.NameChoice.LONG;
String aliasTarget = propertyName + "=" + UCharacter.getPropertyValueName(property, propertyValue, UProperty.NameChoice.SHORT);
int nameChoice = UProperty.NameChoice.SHORT;
String alias;
while (true) {
try {
@ -214,7 +216,7 @@ public abstract class UnicodeDataTemplateController {
break;
}
assert alias != null;
addPropertyAlias(propertyAliases, alias, propertyName);
addPropertyAlias(propertyAliases, alias, aliasTarget);
nameChoice++;
}
}
@ -227,4 +229,23 @@ public abstract class UnicodeDataTemplateController {
/**
 * Delegates to {@code addIntPropertyAliases} for {@code UProperty.BLOCK},
 * passing {@code "In"} as the name prefix.
 *
 * @param propertyAliases alias map passed through to {@code addIntPropertyAliases}
 */
private static void addUnicodeBlocksToNames(Map<String, String> propertyAliases) {
addIntPropertyAliases(UProperty.BLOCK, "In", propertyAliases);
}
/**
 * For every property id in {@code [UProperty.INT_START, UProperty.INT_LIMIT)},
 * enumerates the property's names starting at {@code NameChoice.SHORT + 1} and
 * calls {@code addIntPropertyAliases} once per name, using {@code "<name>="} as
 * the prefix. Enumeration for a property stops when
 * {@code UCharacter.getPropertyName} throws {@link IllegalArgumentException}.
 *
 * @param propertyAliases alias map passed through to {@code addIntPropertyAliases}
 */
private static void addUnicodeIntPropertyCodesToNames(Map<String, String> propertyAliases) {
    for (int property = UProperty.INT_START; property < UProperty.INT_LIMIT; property++) {
        // Open-ended loop: the only exit is the exception thrown for an
        // out-of-range name choice.
        for (int nameChoice = UProperty.NameChoice.SHORT + 1; ; nameChoice++) {
            String propertyNameAlias;
            try {
                propertyNameAlias = UCharacter.getPropertyName(property, nameChoice);
            } catch (IllegalArgumentException e) {
                // No more aliases.
                break;
            }
            addIntPropertyAliases(property, propertyNameAlias + "=", propertyAliases);
        }
    }
}
}

View File

@ -100,6 +100,14 @@ public class TestUnicodeData {
assertTrue(UnicodeData.getPropertyCodePoints("Cyrl").contains(0x0404));
}
// Looks up scripts through the explicit "Script=<code>" form, using
// four-letter script codes, and checks one code point per script.
@Test
public void testUnicodeScriptEquals() {
assertTrue(UnicodeData.getPropertyCodePoints("Script=Zyyy").contains('0'));
assertTrue(UnicodeData.getPropertyCodePoints("Script=Latn").contains('X'));
assertTrue(UnicodeData.getPropertyCodePoints("Script=Hani").contains(0x4E04));
assertTrue(UnicodeData.getPropertyCodePoints("Script=Cyrl").contains(0x0404));
}
@Test
public void testUnicodeScriptAliases() {
assertTrue(UnicodeData.getPropertyCodePoints("Common").contains('0'));
@ -116,12 +124,48 @@ public class TestUnicodeData {
assertTrue(UnicodeData.getPropertyCodePoints("InMisc_Pictographs").contains(0x1F4A9));
}
// Looks up blocks through the explicit "Block=<name>" form and checks one
// code point per block.
@Test
public void testUnicodeBlockEquals() {
assertTrue(UnicodeData.getPropertyCodePoints("Block=ASCII").contains('0'));
assertTrue(UnicodeData.getPropertyCodePoints("Block=CJK").contains(0x4E04));
assertTrue(UnicodeData.getPropertyCodePoints("Block=Cyrillic").contains(0x0404));
assertTrue(UnicodeData.getPropertyCodePoints("Block=Misc_Pictographs").contains(0x1F4A9));
}
// Looks up blocks through "In"-prefixed block names and checks one code
// point per block.
@Test
public void testUnicodeBlockAliases() {
assertTrue(UnicodeData.getPropertyCodePoints("InBasic_Latin").contains('0'));
assertTrue(UnicodeData.getPropertyCodePoints("InMiscellaneous_Mathematical_Symbols_B").contains(0x29BE));
}
// Exercises the "EnumProperty=Value" lookup form with one positive and one
// negative code point for each of three property values.
@Test
public void testEnumeratedPropertyEquals() {
    assertTrue(
        "U+1F481 INFORMATION DESK PERSON is an emoji modifier base",
        UnicodeData.getPropertyCodePoints("Grapheme_Cluster_Break=E_Base").contains(0x1F481));
    assertFalse(
        "U+1F47E ALIEN MONSTER is not an emoji modifier base",
        UnicodeData.getPropertyCodePoints("Grapheme_Cluster_Break=E_Base").contains(0x1F47E));
    // The spacing-mark checks must query SpacingMark with the code points named
    // in their messages; previously they re-queried E_Base (and U+1F481), so the
    // spacing-mark value was never actually tested.
    assertTrue(
        "U+0E33 THAI CHARACTER SARA AM is a spacing mark",
        UnicodeData.getPropertyCodePoints("Grapheme_Cluster_Break=SpacingMark").contains(0x0E33));
    assertFalse(
        "U+1038 MYANMAR SIGN VISARGA is not a spacing mark",
        UnicodeData.getPropertyCodePoints("Grapheme_Cluster_Break=SpacingMark").contains(0x1038));
    assertTrue(
        "U+00A1 INVERTED EXCLAMATION MARK has ambiguous East Asian Width",
        UnicodeData.getPropertyCodePoints("East_Asian_Width=Ambiguous").contains(0x00A1));
    assertFalse(
        "U+00A2 CENT SIGN does not have ambiguous East Asian Width",
        UnicodeData.getPropertyCodePoints("East_Asian_Width=Ambiguous").contains(0x00A2));
}
@Test
public void testPropertyCaseInsensitivity() {
assertTrue(UnicodeData.getPropertyCodePoints("l").contains('x'));

View File

@ -18,6 +18,8 @@ import java.util.Objects;
* \\u{10ABCD}
* \\p{Foo}
* \\P{Bar}
* \\p{Baz=Blech}
* \\P{Baz=Blech}
*/
public abstract class EscapeSequenceParsing {
public static class Result {