Also support Unicode enumerated properties via \p{East_Asian_Width=Ambiguous} escape

2017-03-08 09:43:50 -08:00 · 2017-03-08 09:43:50 -08:00 · 6007132e2e
parent b8c74be628
commit 6007132e2e
5 changed files with 84 additions and 17 deletions
--- a/doc/lexer-rules.md
+++ b/doc/lexer-rules.md
@ -60,11 +60,11 @@ Match that character or sequence of characters. E.g., ’while’ or ’=’.</t
 <td>[char set]</td><td>
 <p>Match one of the characters specified in the character set. Interpret <tt>x-y</tt> as the set of characters between range <tt>x</tt> and <tt>y</tt>, inclusively. The following escaped characters are interpreted as single special characters: <tt>\n</tt>, <tt>\r</tt>, <tt>\b</tt>, <tt>\t</tt>, <tt>\f</tt>, <tt>\uXXXX</tt>, and <tt>\u{XXXXXX}</tt>. To get <tt>]</tt>, <tt>\</tt>, or <tt>-</tt> you must escape them with <tt>\</tt>.</p>

-<p>You can also include all characters matching Unicode properties (general category, boolean, script, or block) with <tt>\p{PropertyName}</tt>. (You can invert the test with <tt>\P{PropertyName}</tt>).</p>
+<p>You can also include all characters matching Unicode properties (general category, boolean, or enumerated properties, including scripts and blocks) with <tt>\p{PropertyName}</tt> or <tt>\p{EnumProperty=Value}</tt>. (You can invert the test with <tt>\P{PropertyName}</tt> or <tt>\P{EnumProperty=Value}</tt>).</p>

-<p>For a list of valid Unicode property names, see <a href="http://unicode.org/reports/tr44/#Properties">Unicode Standard Annex #44</a>. (ANTLR also supports <a href="http://unicode.org/reports/tr44/#General_Category_Values">short and long Unicode general category names</a> like <tt>\p{Lu}</tt>, <tt>\p{Z}</tt>, and <tt>\p{Symbol}</tt>.)</p>
+<p>For a list of valid Unicode property names, see <a href="http://unicode.org/reports/tr44/#Properties">Unicode Standard Annex #44</a>. (ANTLR also supports <a href="http://unicode.org/reports/tr44/#General_Category_Values">short and long Unicode general category names and values</a> like <tt>\p{Lu}</tt>, <tt>\p{Z}</tt>, <tt>\p{Symbol}</tt>, <tt>\p{Blk=Latin_1_Sup}</tt>, and <tt>\p{Block=Latin_1_Supplement}</tt>.)</p>

-<p>Property names include <a href="http://www.unicode.org/Public/UCD/latest/ucd/Blocks.txt">Unicode block names</a> prefixed with <tt>In</tt> (they overlap with script names) and with spaces changed to <tt>_</tt>. For example: <tt>\p{InLatin_1_Supplement}</tt>, <tt>\p{InYijing_Hexagram_Symbols}</tt>, and <tt>\p{InAncient_Greek_Numbers}</tt>.</p>
+<p>As a shortcut for <tt>\p{Block=Latin_1_Supplement}</tt>, you can refer to blocks using <a href="http://www.unicode.org/Public/UCD/latest/ucd/Blocks.txt">Unicode block names</a> prefixed with <tt>In</tt> and with spaces changed to <tt>_</tt>. For example: <tt>\p{InLatin_1_Supplement}</tt>, <tt>\p{InYijing_Hexagram_Symbols}</tt>, and <tt>\p{InAncient_Greek_Numbers}</tt>.</p>

 <p>Property names are <b>case-insensitive</b>, and <tt>_</tt> and <tt>-</tt> are treated identically</p>

@ -77,7 +77,7 @@ UNICODE_WS : [\p{White_Space}] -> skip; // match all Unicode whitespace

 ID : [a-zA-Z] [a-zA-Z0-9]* ; // match usual identifier spec

-UNICODE_ID : [\p{Alpha}] [\p{Alnum}]* ; // match full Unicode alphabetic ids
+UNICODE_ID : [\p{Alpha}\p{General_Category=Other_Letter}] [\p{Alnum}\p{General_Category=Other_Letter}]* ; // match full Unicode alphabetic ids

 EMOJI : [\u{1F4A9}\u{1F926}] ; // note Unicode code points > U+FFFF

--- a/tool-codegen/src/main/string-template/unicodedata.st
+++ b/tool-codegen/src/main/string-template/unicodedata.st
@ -21,7 +21,7 @@ public abstract class UnicodeData {
       // initialization into one method per Unicode property

       <propertyCodePointRanges.keys:{ k | // Unicode code points with property "<k>"
-static private void addProperty<k>() {
+static private void addProperty<i>() {
       List\<Interval\> intervals = Arrays.asList(
               <propertyCodePointRanges.(k).intervals:{ interval | Interval.of(<interval.a>, <interval.b>)}; separator=",\n">
       );
@ -37,7 +37,7 @@ static private void addProperty<k>() {

       // Put it all together
       static {
-              <propertyCodePointRanges.keys:{ k | addProperty<k>(); }; separator="\n">
+              <propertyCodePointRanges.keys:{ k | addProperty<i>(); }; separator="\n">
              addPropertyAliases();
       }

--- a/tool-codegen/src/org/antlr/v4/unicode/UnicodeDataTemplateController.java
+++ b/tool-codegen/src/org/antlr/v4/unicode/UnicodeDataTemplateController.java
@ -74,14 +74,14 @@ public abstract class UnicodeDataTemplateController {
 		Map<String, IntervalSet> propertyCodePointRanges = new LinkedHashMap<>();
 		addUnicodeCategoryCodesToCodePointRanges(propertyCodePointRanges);
 		addUnicodeBinaryPropertyCodesToCodePointRanges(propertyCodePointRanges);
-		addUnicodeScriptCodesToCodePointRanges(propertyCodePointRanges);
-		addUnicodeBlocksToCodePointRanges(propertyCodePointRanges);
+		addUnicodeIntPropertyCodesToCodePointRanges(propertyCodePointRanges);

 		Map<String, String> propertyAliases = new LinkedHashMap<>();
 		addUnicodeCategoryCodesToNames(propertyAliases);
 		addUnicodeBinaryPropertyCodesToNames(propertyAliases);
 		addUnicodeScriptCodesToNames(propertyAliases);
 		addUnicodeBlocksToNames(propertyAliases);
+		addUnicodeIntPropertyCodesToNames(propertyAliases);

 		Map<String, Object> properties = new LinkedHashMap<>();
 		properties.put("propertyCodePointRanges", propertyCodePointRanges);
@ -191,20 +191,22 @@ public abstract class UnicodeDataTemplateController {
 		}
 	}

-	private static void addUnicodeScriptCodesToCodePointRanges(Map<String, IntervalSet> propertyCodePointRanges) {
-		addIntPropertyRanges(UProperty.SCRIPT, "", propertyCodePointRanges);
-	}
-
-	private static void addUnicodeBlocksToCodePointRanges(Map<String, IntervalSet> propertyCodePointRanges) {
-		addIntPropertyRanges(UProperty.BLOCK, "In", propertyCodePointRanges);
+	private static void addUnicodeIntPropertyCodesToCodePointRanges(Map<String, IntervalSet> propertyCodePointRanges) {
+		for (int property = UProperty.INT_START;
+		     property < UProperty.INT_LIMIT;
+		     property++) {
+			String propertyName = getShortPropertyName(property);
+			addIntPropertyRanges(property, propertyName + "=", propertyCodePointRanges);
+		}
 	}

 	private static void addIntPropertyAliases(int property, String namePrefix, Map<String, String> propertyAliases) {
+		String propertyName = getShortPropertyName(property);
 		for (int propertyValue = UCharacter.getIntPropertyMinValue(property);
 		     propertyValue <= UCharacter.getIntPropertyMaxValue(property);
 		     propertyValue++) {
-			String propertyName = namePrefix + UCharacter.getPropertyValueName(property, propertyValue, UProperty.NameChoice.SHORT);
-			int nameChoice = UProperty.NameChoice.LONG;
+			String aliasTarget = propertyName + "=" + UCharacter.getPropertyValueName(property, propertyValue, UProperty.NameChoice.SHORT);
+			int nameChoice = UProperty.NameChoice.SHORT;
 			String alias;
 			while (true) {
 				try {
@ -214,7 +216,7 @@ public abstract class UnicodeDataTemplateController {
 					break;
 				}
 				assert alias != null;
-				addPropertyAlias(propertyAliases, alias, propertyName);
+				addPropertyAlias(propertyAliases, alias, aliasTarget);
 				nameChoice++;
 			}
 		}
@ -227,4 +229,23 @@ public abstract class UnicodeDataTemplateController {
 	private static void addUnicodeBlocksToNames(Map<String, String> propertyAliases) {
 		addIntPropertyAliases(UProperty.BLOCK, "In", propertyAliases);
 	}
+
+	private static void addUnicodeIntPropertyCodesToNames(Map<String, String> propertyAliases) { // registers "<property name alias>=<value alias>" entries for every int-valued property
+		for (int property = UProperty.INT_START;
+		     property < UProperty.INT_LIMIT;
+		     property++) {
+			int nameChoice = UProperty.NameChoice.SHORT + 1; // begins one past SHORT; NOTE(review): value aliases prefixed with the short property name are therefore not added here - confirm intended
+			while (true) {
+				String propertyNameAlias;
+				try {
+					propertyNameAlias = UCharacter.getPropertyName(property, nameChoice);
+				} catch (IllegalArgumentException e) {
+					// No more aliases.
+					break;
+				}
+				addIntPropertyAliases(property, propertyNameAlias + "=", propertyAliases); // one pass over all values of this property per property-name alias
+				nameChoice++;
+			}
+		}
+	}
 }
--- a/tool-testsuite/test/org/antlr/v4/test/tool/TestUnicodeData.java
+++ b/tool-testsuite/test/org/antlr/v4/test/tool/TestUnicodeData.java
@ -100,6 +100,14 @@ public class TestUnicodeData {
 		assertTrue(UnicodeData.getPropertyCodePoints("Cyrl").contains(0x0404));
 	}

+	@Test
+	public void testUnicodeScriptEquals() {
+		assertTrue(UnicodeData.getPropertyCodePoints("Script=Zyyy").contains('0'));
+		assertTrue(UnicodeData.getPropertyCodePoints("Script=Latn").contains('X'));
+		assertTrue(UnicodeData.getPropertyCodePoints("Script=Hani").contains(0x4E04));
+		assertTrue(UnicodeData.getPropertyCodePoints("Script=Cyrl").contains(0x0404));
+	}
+
 	@Test
 	public void testUnicodeScriptAliases() {
 		assertTrue(UnicodeData.getPropertyCodePoints("Common").contains('0'));
@ -116,12 +124,48 @@ public class TestUnicodeData {
 		assertTrue(UnicodeData.getPropertyCodePoints("InMisc_Pictographs").contains(0x1F4A9));
 	}

+	@Test
+	public void testUnicodeBlockEquals() {
+		assertTrue(UnicodeData.getPropertyCodePoints("Block=ASCII").contains('0'));
+		assertTrue(UnicodeData.getPropertyCodePoints("Block=CJK").contains(0x4E04));
+		assertTrue(UnicodeData.getPropertyCodePoints("Block=Cyrillic").contains(0x0404));
+		assertTrue(UnicodeData.getPropertyCodePoints("Block=Misc_Pictographs").contains(0x1F4A9));
+	}
+
 	@Test
 	public void testUnicodeBlockAliases() {
 		assertTrue(UnicodeData.getPropertyCodePoints("InBasic_Latin").contains('0'));
 		assertTrue(UnicodeData.getPropertyCodePoints("InMiscellaneous_Mathematical_Symbols_B").contains(0x29BE));
 	}

+	@Test
+	public void testEnumeratedPropertyEquals() { // Property=Value lookups for enumerated properties other than script and block
+		assertTrue(
+				"U+1F481 INFORMATION DESK PERSON is an emoji modifier base",
+				UnicodeData.getPropertyCodePoints("Grapheme_Cluster_Break=E_Base").contains(0x1F481));
+
+		assertFalse(
+				"U+1F47E ALIEN MONSTER is not an emoji modifier base",
+				UnicodeData.getPropertyCodePoints("Grapheme_Cluster_Break=E_Base").contains(0x1F47E));
+		// Second value of the same property: the lookup must depend on the value, not only on the property name.
+		assertTrue(
+				"U+0E33 THAI CHARACTER SARA AM is a spacing mark",
+				UnicodeData.getPropertyCodePoints("Grapheme_Cluster_Break=SpacingMark").contains(0x0E33));
+
+		assertFalse(
+				"U+1038 MYANMAR SIGN VISARGA is not a spacing mark",
+				UnicodeData.getPropertyCodePoints("Grapheme_Cluster_Break=SpacingMark").contains(0x1038));
+		// A different enumerated property.
+		assertTrue(
+				"U+00A1 INVERTED EXCLAMATION MARK has ambiguous East Asian Width",
+				UnicodeData.getPropertyCodePoints("East_Asian_Width=Ambiguous").contains(0x00A1));
+
+		assertFalse(
+				"U+00A2 CENT SIGN does not have ambiguous East Asian Width",
+				UnicodeData.getPropertyCodePoints("East_Asian_Width=Ambiguous").contains(0x00A2));
+
+	}
+
 	@Test
 	public void testPropertyCaseInsensitivity() {
 		assertTrue(UnicodeData.getPropertyCodePoints("l").contains('x'));
--- a/tool/src/org/antlr/v4/misc/EscapeSequenceParsing.java
+++ b/tool/src/org/antlr/v4/misc/EscapeSequenceParsing.java
@ -18,6 +18,8 @@ import java.util.Objects;
 *   \\u{10ABCD}
 *   \\p{Foo}
 *   \\P{Bar}
+ *   \\p{Baz=Blech}
+ *   \\P{Baz=Blech}
 */
 public abstract class EscapeSequenceParsing {
 	public static class Result {