From 3219f1c77d60c98c2ca2dd7e76bade291733ea5a Mon Sep 17 00:00:00 2001 From: Ben Hamilton Date: Wed, 8 Mar 2017 16:39:27 -0800 Subject: [PATCH] Extended Pictographic --- .../ExtendedPictographic-Parsed.txt | 126 +++++++ .../ExtendedPictographic.txt | 309 ++++++++++++++++++ scripts/parse-extended-pictographic/README.md | 8 + scripts/parse-extended-pictographic/parse.py | 21 ++ .../antlr/v4/test/tool/TestUnicodeData.java | 10 + .../UnicodeDataTemplateController.java | 155 ++++++++- 6 files changed, 626 insertions(+), 3 deletions(-) create mode 100644 scripts/parse-extended-pictographic/ExtendedPictographic-Parsed.txt create mode 100644 scripts/parse-extended-pictographic/ExtendedPictographic.txt create mode 100644 scripts/parse-extended-pictographic/README.md create mode 100755 scripts/parse-extended-pictographic/parse.py diff --git a/scripts/parse-extended-pictographic/ExtendedPictographic-Parsed.txt b/scripts/parse-extended-pictographic/ExtendedPictographic-Parsed.txt new file mode 100644 index 000000000..5195cfc72 --- /dev/null +++ b/scripts/parse-extended-pictographic/ExtendedPictographic-Parsed.txt @@ -0,0 +1,126 @@ +set.add(0x1F774, 0x1F77F); +set.add(0x2700, 0x2701); +set.add(0x2703, 0x2704); +set.add(0x270E); +set.add(0x2710, 0x2711); +set.add(0x2765, 0x2767); +set.add(0x1F030, 0x1F093); +set.add(0x1F094, 0x1F09F); +set.add(0x1F10D, 0x1F10F); +set.add(0x1F12F); +set.add(0x1F16C, 0x1F16F); +set.add(0x1F1AD, 0x1F1E5); +set.add(0x1F260, 0x1F265); +set.add(0x1F203, 0x1F20F); +set.add(0x1F23C, 0x1F23F); +set.add(0x1F249, 0x1F24F); +set.add(0x1F252, 0x1F25F); +set.add(0x1F266, 0x1F2FF); +set.add(0x1F7D5, 0x1F7FF); +set.add(0x1F000, 0x1F003); +set.add(0x1F005, 0x1F02B); +set.add(0x1F02C, 0x1F02F); +set.add(0x1F322, 0x1F323); +set.add(0x1F394, 0x1F395); +set.add(0x1F398); +set.add(0x1F39C, 0x1F39D); +set.add(0x1F3F1, 0x1F3F2); +set.add(0x1F3F6); +set.add(0x1F4FE); +set.add(0x1F53E, 0x1F548); +set.add(0x1F54F); +set.add(0x1F568, 0x1F56E); +set.add(0x1F571, 0x1F572); +set.add(0x1F57B, 0x1F586); +set.add(0x1F588, 0x1F589); +set.add(0x1F58E, 0x1F58F); +set.add(0x1F591, 0x1F594); +set.add(0x1F597, 0x1F5A3); +set.add(0x1F5A6, 0x1F5A7); +set.add(0x1F5A9, 0x1F5B0); +set.add(0x1F5B3, 0x1F5BB); +set.add(0x1F5BD, 0x1F5C1); +set.add(0x1F5C5, 0x1F5D0); +set.add(0x1F5D4, 0x1F5DB); +set.add(0x1F5DF, 0x1F5E0); +set.add(0x1F5E2); +set.add(0x1F5E4, 0x1F5E7); +set.add(0x1F5E9, 0x1F5EE); +set.add(0x1F5F0, 0x1F5F2); +set.add(0x1F5F4, 0x1F5F9); +set.add(0x2605); +set.add(0x2607, 0x260D); +set.add(0x260F, 0x2610); +set.add(0x2612); +set.add(0x2616, 0x2617); +set.add(0x2619, 0x261C); +set.add(0x261E, 0x261F); +set.add(0x2621); +set.add(0x2624, 0x2625); +set.add(0x2627, 0x2629); +set.add(0x262B, 0x262D); +set.add(0x2630, 0x2637); +set.add(0x263B, 0x2647); +set.add(0x2654, 0x265F); +set.add(0x2661, 0x2662); +set.add(0x2664); +set.add(0x2667); +set.add(0x2669, 0x267A); +set.add(0x267C, 0x267E); +set.add(0x2680, 0x2691); +set.add(0x2695); +set.add(0x2698); +set.add(0x269A); +set.add(0x269D, 0x269F); +set.add(0x26A2, 0x26A9); +set.add(0x26AC, 0x26AF); +set.add(0x26B2, 0x26BC); +set.add(0x26BF, 0x26C3); +set.add(0x26C6, 0x26C7); +set.add(0x26C9, 0x26CD); +set.add(0x26D0); +set.add(0x26D2); +set.add(0x26D5, 0x26E8); +set.add(0x26EB, 0x26EF); +set.add(0x26F6); +set.add(0x26FB, 0x26FC); +set.add(0x26FE, 0x26FF); +set.add(0x2388); +set.add(0x1FA00, 0x1FFFD); +set.add(0x1F0A0, 0x1F0AE); +set.add(0x1F0B1, 0x1F0BF); +set.add(0x1F0C1, 0x1F0CF); +set.add(0x1F0D1, 0x1F0F5); +set.add(0x1F0AF, 0x1F0B0); +set.add(0x1F0C0); +set.add(0x1F0D0); +set.add(0x1F0F6, 0x1F0FF); +set.add(0x1F80C, 0x1F80F); +set.add(0x1F848, 0x1F84F); +set.add(0x1F85A, 0x1F85F); +set.add(0x1F888, 0x1F88F); +set.add(0x1F8AE, 0x1F8FF); +set.add(0x1F900, 0x1F90B); +set.add(0x1F91F); +set.add(0x1F928, 0x1F92F); +set.add(0x1F931, 0x1F932); +set.add(0x1F94C); +set.add(0x1F95F, 0x1F96B); +set.add(0x1F992, 0x1F997); +set.add(0x1F9D0, 0x1F9E6); +set.add(0x1F90C, 0x1F90F); +set.add(0x1F93F); +set.add(0x1F94D, 0x1F94F); +set.add(0x1F96C, 0x1F97F); +set.add(0x1F998, 0x1F9BF); +set.add(0x1F9C1, 0x1F9CF); +set.add(0x1F9E7, 0x1F9FF); +set.add(0x1F6C6, 0x1F6CA); +set.add(0x1F6D3, 0x1F6D4); +set.add(0x1F6E6, 0x1F6E8); +set.add(0x1F6EA); +set.add(0x1F6F1, 0x1F6F2); +set.add(0x1F6F7, 0x1F6F8); +set.add(0x1F6D5, 0x1F6DF); +set.add(0x1F6ED, 0x1F6EF); +set.add(0x1F6F9, 0x1F6FF); diff --git a/scripts/parse-extended-pictographic/ExtendedPictographic.txt b/scripts/parse-extended-pictographic/ExtendedPictographic.txt new file mode 100644 index 000000000..a84c87e01 --- /dev/null +++ b/scripts/parse-extended-pictographic/ExtendedPictographic.txt @@ -0,0 +1,309 @@ +# ExtendedPictographic.txt +# Date: 2016-10-12 +# © 2016 Unicode®, Inc. +# For terms of use, see http://www.unicode.org/terms_of_use.html +# +# Extended_Pictographic (EP) is a binary code point property for characters that are pictographic +# (or otherwise similar in kind to characters with the Emoji property) +# and used to customize segmentation (UAX #29 and UAX #14) so that possible future emoji zwj sequences +# will not break grapheme clusters, words, or lines. It also includes unassigned codepoints that +# are in blocks intended for use for emoji characters, added to the Unicode 9.0 Linebreak property. +# +# For usage information, see http://unicode.org/reports/tr35/#Property_Data +# +# Format +# +# This file follows the format used for UCD files. +# Field 1 indicates that the character(s) listed in Field 0 have the value ExtendedPictographic=Yes +# All other characters have the value ExtendedPictographic=No +# +# The files contain additional comments to aid in understanding the context of the property values. +# +# 1. Blocks are included if they contain any emoji characters, even if there are no EP characters. +# +# 2. EP code points may include unassigned values. Those are based on the Line_Break assignments that +# were added in Unicode 9.0 to future-proof ZWJ linebreak behavior. +# +# 3. Comment lines are included to describe the content of the blocks for comparison. For example: +# emoji=33 : [✂✅✈-✍✏✒✔✖✝✡✨✳✴❄❇❌❎❓-❕❗❣❤➕-➗➡➰➿] +# EP=10 : [✀✁✃✄✎✐✑❥-❧] +# other=149 : [✆✇✓✕✗-✜✞-✠✢-✧✩-✲✵-❃❅❆❈-❋❍❏-❒❖❘-❢❨-➔➘-➠➢-➯➱-➾] +# otherCn=2 +# +# Meaning +# • emoji: the number of Emoji=Yes characters in that block, +# • EP: the number of ExtendedPictographic in the block (with details below), and +# • other: the number of other characters in the block (non-Emoji, non-EP), +# • otherCn: the number of other code points in the block (non-Emoji, non-EP, not characters). +# +# The characters can be listed out in detail by using any implementation that supports UnicodeSet, +# such as ​http://unicode.org/cldr/utility/list-unicodeset.jsp. +# The lines are omitted if empty: for example, Transport_And_Map_Symbols have no "other" characters. +# +# ===== + +# Alchemical_Symbols +# EP=12 : [🝴-🝿] +# other=116 : [🜀-🝳] + +U+1F774..U+1F77F ; ExtendedPictographic # GC=Cn +# count=12 + +# Arrows +# emoji=8 : [↔-↙↩↪] +# other=104 : [←-↓↚-↨↫-⇿] + +# CJK_Symbols_And_Punctuation +# emoji=2 : [〰〽] +# other=62 : [ -〯〱-〼〾〿] + +# Dingbats +# emoji=33 : [✂✅✈-✍✏✒✔✖✝✡✨✳✴❄❇❌❎❓-❕❗❣❤➕-➗➡➰➿] +# EP=10 : [✀✁✃✄✎✐✑❥-❧] +# other=149 : [✆✇✓✕✗-✜✞-✠✢-✧✩-✲✵-❃❅❆❈-❋❍❏-❒❖❘-❢❨-➔➘-➠➢-➯➱-➾] + +U+2700..U+2701 ; ExtendedPictographic # [✀✁] BLACK SAFETY SCISSORS .. UPPER BLADE SCISSORS +U+2703..U+2704 ; ExtendedPictographic # [✃✄] LOWER BLADE SCISSORS .. WHITE SCISSORS +U+270E ; ExtendedPictographic # [✎] LOWER RIGHT PENCIL +U+2710..U+2711 ; ExtendedPictographic # [✐✑] UPPER RIGHT PENCIL .. WHITE NIB +U+2765..U+2767 ; ExtendedPictographic # [❥-❧] ROTATED HEAVY BLACK HEART BULLET .. ROTATED FLORAL HEART BULLET +# count=10 + +# Domino_Tiles +# EP=112 : [🀰-🂟] + +U+1F030..U+1F093 ; ExtendedPictographic # [🀰-🂓] DOMINO TILE HORIZONTAL BACK .. DOMINO TILE VERTICAL-06-06 +U+1F094..U+1F09F ; ExtendedPictographic # GC=Cn +# count=112 + +# Emoticons +# emoji=80 : [😀-🙏] + +# Enclosed_Alphanumerics +# emoji=1 : [Ⓜ] +# other=159 : [①-ⓁⓃ-⓿] + +# Enclosed_Alphanumeric_Supplement +# emoji=15 : [🅰🅱🅾🅿🆎🆑-🆚] +# EP=65 : [🄍-🄏🄯🅬-🅯🆭-🇥] +# other=176 : [🄀-🄌🄐-🄮🄰-🅫🅲-🅽🆀-🆍🆏🆐🆛-🆬🇦-🇿] + +U+1F10D..U+1F10F ; ExtendedPictographic # GC=Cn +U+1F12F ; ExtendedPictographic # GC=Cn +U+1F16C..U+1F16F ; ExtendedPictographic # GC=Cn +U+1F1AD..U+1F1E5 ; ExtendedPictographic # GC=Cn +# count=65 + +# Enclosed_CJK_Letters_And_Months +# emoji=2 : [㊗㊙] +# other=252 : [㈀-㈞㈠-㊖㊘㊚-㋾] + +# Enclosed_Ideographic_Supplement +# emoji=15 : [🈁🈂🈚🈯🈲-🈺🉐🉑] +# EP=198 : [🈃-🈏🈼-🈿🉉-🉏🉒-🋿] +# other=43 : [🈀🈐-🈙🈛-🈮🈰🈱🈻🉀-🉈] + +U+1F260..U+1F265 ; ExtendedPictographic # [🉠-🉥] ROUNDED SYMBOL FOR FU .. ROUNDED SYMBOL FOR CAI +U+1F203..U+1F20F ; ExtendedPictographic # GC=Cn +U+1F23C..U+1F23F ; ExtendedPictographic # GC=Cn +U+1F249..U+1F24F ; ExtendedPictographic # GC=Cn +U+1F252..U+1F25F ; ExtendedPictographic # GC=Cn +U+1F266..U+1F2FF ; ExtendedPictographic # GC=Cn +# count=198 + +# Geometric_Shapes +# emoji=8 : [▪▫▶◀◻-◾] +# other=88 : [■-▩▬-▵▷-▿◁-◺◿] + +# Geometric_Shapes_Extended +# EP=43 : [🟕-🟿] +# other=85 : [🞀-🟔] + +U+1F7D5..U+1F7FF ; ExtendedPictographic # GC=Cn +# count=43 + +# Latin_1_Supplement +# emoji=2 : [©®] +# other=126 : [€-¨ª-­¯-ÿ] + +# Letterlike_Symbols +# emoji=2 : [™ℹ] +# other=78 : [℀-℡℣-ℸ℺-⅏] + +# Mahjong_Tiles +# emoji=1 : [🀄] +# EP=47 : [🀀-🀃🀅-🀯] + +U+1F000..U+1F003 ; ExtendedPictographic # [🀀-🀃] MAHJONG TILE EAST WIND .. MAHJONG TILE NORTH WIND +U+1F005..U+1F02B ; ExtendedPictographic # [🀅-🀫] MAHJONG TILE GREEN DRAGON .. MAHJONG TILE BACK +U+1F02C..U+1F02F ; ExtendedPictographic # GC=Cn +# count=47 + +# Miscellaneous_Symbols_And_Arrows +# emoji=7 : [⬅-⬇⬛⬜⭐⭕] +# other=200 : [⬀-⬄⬈-⬚⬝-⭏⭑-⭔⭖-⭳⭶-⮕⮘-⮹⮽-⯈⯊-⯒⯬-⯯] + +# Miscellaneous_Symbols_And_Pictographs +# emoji=637 : [🌀-🌡🌤-🎓🎖🎗🎙-🎛🎞-🏰🏳-🏵🏷-📽📿-🔽🕉-🕎🕐-🕧🕯🕰🕳-🕺🖇🖊-🖍🖐🖕🖖🖤🖥🖨🖱🖲🖼🗂-🗄🗑-🗓🗜-🗞🗡🗣🗨🗯🗳🗺-🗿] +# EP=131 : [🌢🌣🎔🎕🎘🎜🎝🏱🏲🏶📾🔾-🕈🕏🕨-🕮🕱🕲🕻-🖆🖈🖉🖎🖏🖑-🖔🖗-🖣🖦🖧🖩-🖰🖳-🖻🖽-🗁🗅-🗐🗔-🗛🗟🗠🗢🗤-🗧🗩-🗮🗰-🗲🗴-🗹] + +U+1F322..U+1F323 ; ExtendedPictographic # [🌢🌣] BLACK DROPLET .. WHITE SUN +U+1F394..U+1F395 ; ExtendedPictographic # [🎔🎕] HEART WITH TIP ON THE LEFT .. BOUQUET OF FLOWERS +U+1F398 ; ExtendedPictographic # [🎘] MUSICAL KEYBOARD WITH JACKS +U+1F39C..U+1F39D ; ExtendedPictographic # [🎜🎝] BEAMED ASCENDING MUSICAL NOTES .. BEAMED DESCENDING MUSICAL NOTES +U+1F3F1..U+1F3F2 ; ExtendedPictographic # [🏱🏲] WHITE PENNANT .. BLACK PENNANT +U+1F3F6 ; ExtendedPictographic # [🏶] BLACK ROSETTE +U+1F4FE ; ExtendedPictographic # [📾] PORTABLE STEREO +U+1F53E..U+1F548 ; ExtendedPictographic # [🔾-🕈] LOWER RIGHT SHADOWED WHITE CIRCLE .. CELTIC CROSS +U+1F54F ; ExtendedPictographic # [🕏] BOWL OF HYGIEIA +U+1F568..U+1F56E ; ExtendedPictographic # [🕨-🕮] RIGHT SPEAKER .. BOOK +U+1F571..U+1F572 ; ExtendedPictographic # [🕱🕲] BLACK SKULL AND CROSSBONES .. NO PIRACY +U+1F57B..U+1F586 ; ExtendedPictographic # [🕻-🖆] LEFT HAND TELEPHONE RECEIVER .. PEN OVER STAMPED ENVELOPE +U+1F588..U+1F589 ; ExtendedPictographic # [🖈🖉] BLACK PUSHPIN .. LOWER LEFT PENCIL +U+1F58E..U+1F58F ; ExtendedPictographic # [🖎🖏] LEFT WRITING HAND .. TURNED OK HAND SIGN +U+1F591..U+1F594 ; ExtendedPictographic # [🖑-🖔] REVERSED RAISED HAND WITH FINGERS SPLAYED .. REVERSED VICTORY HAND +U+1F597..U+1F5A3 ; ExtendedPictographic # [🖗-🖣] WHITE DOWN POINTING LEFT HAND INDEX .. BLACK DOWN POINTING BACKHAND INDEX +U+1F5A6..U+1F5A7 ; ExtendedPictographic # [🖦🖧] KEYBOARD AND MOUSE .. THREE NETWORKED COMPUTERS +U+1F5A9..U+1F5B0 ; ExtendedPictographic # [🖩-🖰] POCKET CALCULATOR .. TWO BUTTON MOUSE +U+1F5B3..U+1F5BB ; ExtendedPictographic # [🖳-🖻] OLD PERSONAL COMPUTER .. DOCUMENT WITH PICTURE +U+1F5BD..U+1F5C1 ; ExtendedPictographic # [🖽-🗁] FRAME WITH TILES .. OPEN FOLDER +U+1F5C5..U+1F5D0 ; ExtendedPictographic # [🗅-🗐] EMPTY NOTE .. PAGES +U+1F5D4..U+1F5DB ; ExtendedPictographic # [🗔-🗛] DESKTOP WINDOW .. DECREASE FONT SIZE SYMBOL +U+1F5DF..U+1F5E0 ; ExtendedPictographic # [🗟🗠] PAGE WITH CIRCLED TEXT .. STOCK CHART +U+1F5E2 ; ExtendedPictographic # [🗢] LIPS +U+1F5E4..U+1F5E7 ; ExtendedPictographic # [🗤-🗧] THREE RAYS ABOVE .. THREE RAYS RIGHT +U+1F5E9..U+1F5EE ; ExtendedPictographic # [🗩-🗮] RIGHT SPEECH BUBBLE .. LEFT ANGER BUBBLE +U+1F5F0..U+1F5F2 ; ExtendedPictographic # [🗰-🗲] MOOD BUBBLE .. LIGHTNING MOOD +U+1F5F4..U+1F5F9 ; ExtendedPictographic # [🗴-🗹] BALLOT SCRIPT X .. BALLOT BOX WITH BOLD CHECK +# count=131 + +# Miscellaneous_Symbols +# emoji=80 : [☀-☄☎☑☔☕☘☝☠☢☣☦☪☮☯☸-☺♀♂♈-♓♠♣♥♦♨♻♿⚒-⚗⚙⚛⚜⚠⚡⚪⚫⚰⚱⚽⚾⛄⛅⛈⛎⛏⛑⛓⛔⛩⛪⛰-⛵⛷-⛺⛽] +# EP=177 : [★☇-☍☏☐☒☖☗☙-☜☞☟☡☤☥☧-☩☫-☭☰-☷☻-♇♔-♟♡♢♤♧♩-♺♼-♾⚀-⚑⚕⚘⚚⚝-⚟⚢-⚩⚬-⚯⚲-⚼⚿-⛃⛆⛇⛉-⛍⛐⛒⛕-⛨⛫-⛯⛶⛻⛼⛾⛿] +# other=2 : [☆☓] + +U+2605 ; ExtendedPictographic # [★] BLACK STAR +U+2607..U+260D ; ExtendedPictographic # [☇-☍] LIGHTNING .. OPPOSITION +U+260F..U+2610 ; ExtendedPictographic # [☏☐] WHITE TELEPHONE .. BALLOT BOX +U+2612 ; ExtendedPictographic # [☒] BALLOT BOX WITH X +U+2616..U+2617 ; ExtendedPictographic # [☖☗] WHITE SHOGI PIECE .. BLACK SHOGI PIECE +U+2619..U+261C ; ExtendedPictographic # [☙-☜] REVERSED ROTATED FLORAL HEART BULLET .. WHITE LEFT POINTING INDEX +U+261E..U+261F ; ExtendedPictographic # [☞☟] WHITE RIGHT POINTING INDEX .. WHITE DOWN POINTING INDEX +U+2621 ; ExtendedPictographic # [☡] CAUTION SIGN +U+2624..U+2625 ; ExtendedPictographic # [☤☥] CADUCEUS .. ANKH +U+2627..U+2629 ; ExtendedPictographic # [☧-☩] CHI RHO .. CROSS OF JERUSALEM +U+262B..U+262D ; ExtendedPictographic # [☫-☭] FARSI SYMBOL .. HAMMER AND SICKLE +U+2630..U+2637 ; ExtendedPictographic # [☰-☷] TRIGRAM FOR HEAVEN .. TRIGRAM FOR EARTH +U+263B..U+2647 ; ExtendedPictographic # [☻-♇] BLACK SMILING FACE .. PLUTO +U+2654..U+265F ; ExtendedPictographic # [♔-♟] WHITE CHESS KING .. BLACK CHESS PAWN +U+2661..U+2662 ; ExtendedPictographic # [♡♢] WHITE HEART SUIT .. WHITE DIAMOND SUIT +U+2664 ; ExtendedPictographic # [♤] WHITE SPADE SUIT +U+2667 ; ExtendedPictographic # [♧] WHITE CLUB SUIT +U+2669..U+267A ; ExtendedPictographic # [♩-♺] QUARTER NOTE .. RECYCLING SYMBOL FOR GENERIC MATERIALS +U+267C..U+267E ; ExtendedPictographic # [♼-♾] RECYCLED PAPER SYMBOL .. PERMANENT PAPER SIGN +U+2680..U+2691 ; ExtendedPictographic # [⚀-⚑] DIE FACE-1 .. BLACK FLAG +U+2695 ; ExtendedPictographic # [⚕] STAFF OF AESCULAPIUS +U+2698 ; ExtendedPictographic # [⚘] FLOWER +U+269A ; ExtendedPictographic # [⚚] STAFF OF HERMES +U+269D..U+269F ; ExtendedPictographic # [⚝-⚟] OUTLINED WHITE STAR .. THREE LINES CONVERGING LEFT +U+26A2..U+26A9 ; ExtendedPictographic # [⚢-⚩] DOUBLED FEMALE SIGN .. HORIZONTAL MALE WITH STROKE SIGN +U+26AC..U+26AF ; ExtendedPictographic # [⚬-⚯] MEDIUM SMALL WHITE CIRCLE .. UNMARRIED PARTNERSHIP SYMBOL +U+26B2..U+26BC ; ExtendedPictographic # [⚲-⚼] NEUTER .. SESQUIQUADRATE +U+26BF..U+26C3 ; ExtendedPictographic # [⚿-⛃] SQUARED KEY .. BLACK DRAUGHTS KING +U+26C6..U+26C7 ; ExtendedPictographic # [⛆⛇] RAIN .. BLACK SNOWMAN +U+26C9..U+26CD ; ExtendedPictographic # [⛉-⛍] TURNED WHITE SHOGI PIECE .. DISABLED CAR +U+26D0 ; ExtendedPictographic # [⛐] CAR SLIDING +U+26D2 ; ExtendedPictographic # [⛒] CIRCLED CROSSING LANES +U+26D5..U+26E8 ; ExtendedPictographic # [⛕-⛨] ALTERNATE ONE-WAY LEFT WAY TRAFFIC .. BLACK CROSS ON SHIELD +U+26EB..U+26EF ; ExtendedPictographic # [⛫-⛯] CASTLE .. MAP SYMBOL FOR LIGHTHOUSE +U+26F6 ; ExtendedPictographic # [⛶] SQUARE FOUR CORNERS +U+26FB..U+26FC ; ExtendedPictographic # [⛻⛼] JAPANESE BANK SYMBOL .. HEADSTONE GRAVEYARD SYMBOL +U+26FE..U+26FF ; ExtendedPictographic # [⛾⛿] CUP ON BLACK SQUARE .. WHITE FLAG WITH HORIZONTAL MIDDLE BLACK STRIPE +# count=177 + +# Miscellaneous_Technical +# emoji=18 : [⌚⌛⌨⏏⏩-⏳⏸-⏺] +# EP=1 : [⎈] +# other=237 : [⌀-⌙⌜-⌧〈-⎇⎉-⏎⏐-⏨⏴-⏷⏻-⏿] + +U+2388 ; ExtendedPictographic # [⎈] HELM SYMBOL +# count=1 + +# No_Block +# EP=1534 : [🨀-🿽] + +U+1FA00..U+1FFFD ; ExtendedPictographic # GC=Cn +# count=1534 + +# Playing_Cards +# emoji=1 : [🃏] +# EP=96 : [🂠-🃿] + +U+1F0A0..U+1F0AE ; ExtendedPictographic # [🂠-🂮] PLAYING CARD BACK .. PLAYING CARD KING OF SPADES +U+1F0B1..U+1F0BF ; ExtendedPictographic # [🂱-🂿] PLAYING CARD ACE OF HEARTS .. PLAYING CARD RED JOKER +U+1F0C1..U+1F0CF ; ExtendedPictographic # [🃁-🃏] PLAYING CARD ACE OF DIAMONDS .. PLAYING CARD BLACK JOKER +U+1F0D1..U+1F0F5 ; ExtendedPictographic # [🃑-🃵] PLAYING CARD ACE OF CLUBS .. PLAYING CARD TRUMP-21 +U+1F0AF..U+1F0B0 ; ExtendedPictographic # GC=Cn +U+1F0C0 ; ExtendedPictographic # GC=Cn +U+1F0D0 ; ExtendedPictographic # GC=Cn +U+1F0F6..U+1F0FF ; ExtendedPictographic # GC=Cn +# count=96 + +# General_Punctuation +# emoji=2 : [‼⁉] +# other=109 : [ -※‽-⁈⁊-⁤⁦-] + +# Supplemental_Arrows_B +# emoji=2 : [⤴⤵] +# other=126 : [⤀-⤳⤶-⥿] + +# Supplemental_Arrows_C +# EP=108 : [🠌-🠏🡈-🡏🡚-🡟🢈-🢏🢮-🣿] +# other=148 : [🠀-🠋🠐-🡇🡐-🡙🡠-🢇🢐-🢭] + +U+1F80C..U+1F80F ; ExtendedPictographic # GC=Cn +U+1F848..U+1F84F ; ExtendedPictographic # GC=Cn +U+1F85A..U+1F85F ; ExtendedPictographic # GC=Cn +U+1F888..U+1F88F ; ExtendedPictographic # GC=Cn +U+1F8AE..U+1F8FF ; ExtendedPictographic # GC=Cn +# count=108 + +# Supplemental_Symbols_And_Pictographs +# emoji=134 : [🤐-🤺🤼-🤾🥀-🥅🥇-🥌🥐-🥫🦀-🦗🧀🧐-🧦] +# EP=174 : [🤀-🤏🤟🤨-🤯🤱🤲🤿🥌-🥏🥟-🥿🦒-🦿🧁-🧿] +# other=2 : [🤻🥆] + +U+1F900..U+1F90B ; ExtendedPictographic # [🤀-🤋] CIRCLED CROSS FORMEE WITH FOUR DOTS .. DOWNWARD FACING NOTCHED HOOK WITH DOT +U+1F91F ; ExtendedPictographic # [🤟] I LOVE YOU HAND SIGN +U+1F928..U+1F92F ; ExtendedPictographic # [🤨-🤯] FACE WITH ONE EYEBROW RAISED .. SHOCKED FACE WITH EXPLODING HEAD +U+1F931..U+1F932 ; ExtendedPictographic # [🤱🤲] BREAST-FEEDING .. PALMS UP TOGETHER +U+1F94C ; ExtendedPictographic # [🥌] CURLING STONE +U+1F95F..U+1F96B ; ExtendedPictographic # [🥟-🥫] DUMPLING .. CANNED FOOD +U+1F992..U+1F997 ; ExtendedPictographic # [🦒-🦗] GIRAFFE FACE .. CRICKET +U+1F9D0..U+1F9E6 ; ExtendedPictographic # [🧐-🧦] FACE WITH MONOCLE .. SOCKS +U+1F90C..U+1F90F ; ExtendedPictographic # GC=Cn +U+1F93F ; ExtendedPictographic # GC=Cn +U+1F94D..U+1F94F ; ExtendedPictographic # GC=Cn +U+1F96C..U+1F97F ; ExtendedPictographic # GC=Cn +U+1F998..U+1F9BF ; ExtendedPictographic # GC=Cn +U+1F9C1..U+1F9CF ; ExtendedPictographic # GC=Cn +U+1F9E7..U+1F9FF ; ExtendedPictographic # GC=Cn +# count=174 + +# Transport_And_Map_Symbols +# emoji=94 : [🚀-🛅🛋-🛒🛠-🛥🛩🛫🛬🛰🛳-🛸] +# EP=36 : [🛆-🛊🛓-🛟🛦-🛨🛪🛭-🛯🛱🛲🛷-🛿] + +U+1F6C6..U+1F6CA ; ExtendedPictographic # [🛆-🛊] TRIANGLE WITH ROUNDED CORNERS .. GIRLS SYMBOL +U+1F6D3..U+1F6D4 ; ExtendedPictographic # [🛓🛔] STUPA .. PAGODA +U+1F6E6..U+1F6E8 ; ExtendedPictographic # [🛦-🛨] UP-POINTING MILITARY AIRPLANE .. UP-POINTING SMALL AIRPLANE +U+1F6EA ; ExtendedPictographic # [🛪] NORTHEAST-POINTING AIRPLANE +U+1F6F1..U+1F6F2 ; ExtendedPictographic # [🛱🛲] ONCOMING FIRE ENGINE .. DIESEL LOCOMOTIVE +U+1F6F7..U+1F6F8 ; ExtendedPictographic # [🛷🛸] SLED .. FLYING SAUCER +U+1F6D5..U+1F6DF ; ExtendedPictographic # GC=Cn +U+1F6ED..U+1F6EF ; ExtendedPictographic # GC=Cn +U+1F6F9..U+1F6FF ; ExtendedPictographic # GC=Cn +# count=36 + +# total_count=2744 +# EOF diff --git a/scripts/parse-extended-pictographic/README.md b/scripts/parse-extended-pictographic/README.md new file mode 100644 index 000000000..10be275e3 --- /dev/null +++ b/scripts/parse-extended-pictographic/README.md @@ -0,0 +1,8 @@ +README for scripts/extended-pictographic +=========== + +This directory contains the Unicode UTS #35 `ExtendedPictographic.txt` data file, +intended to be parsed by the script `parse.py` to produce `ExtendedPictographic-Parsed.txt`. + +This produces a series of `IntervalSet` entries to be consumed by +`UnicodeDataTemplateController`. diff --git a/scripts/parse-extended-pictographic/parse.py b/scripts/parse-extended-pictographic/parse.py new file mode 100755 index 000000000..751695ee2 --- /dev/null +++ b/scripts/parse-extended-pictographic/parse.py @@ -0,0 +1,21 @@ +from __future__ import print_function +import codecs +import re +import sys + +def main(input, output): + code_point_re = re.compile(r'^U\+([0-9a-fA-F]+)\s*;\s*ExtendedPictographic.*$') + code_point_range_re = re.compile(r'^U\+([0-9a-fA-F]+)\.\.U\+([0-9a-fA-F]+)\s*;\s*ExtendedPictographic.*$') + + for line in input: + m = code_point_re.match(line) + if m: + print('set.add(0x' + m.group(1) + ');', file=output) + else: + m = code_point_range_re.match(line) + if m: + print('set.add(0x' + m.group(1) + ', 0x' + m.group(2) + ');', file=output) + +if __name__ == '__main__': + with codecs.open(sys.argv[1], 'r', 'utf-8') as f: + main(f, sys.stdout) diff --git a/tool-testsuite/test/org/antlr/v4/test/tool/TestUnicodeData.java b/tool-testsuite/test/org/antlr/v4/test/tool/TestUnicodeData.java index 74c2521b1..723d2499a 100644 --- a/tool-testsuite/test/org/antlr/v4/test/tool/TestUnicodeData.java +++ b/tool-testsuite/test/org/antlr/v4/test/tool/TestUnicodeData.java @@ -166,6 +166,16 @@ public class TestUnicodeData { } + @Test + public void extendedPictographic() { + assertTrue( + "U+1F588 BLACK PUSHPIN is in Extended Pictographic", + UnicodeData.getPropertyCodePoints("Extended_Pictographic").contains(0x1F588)); + assertFalse( + "0 is not in Extended Pictographic", + UnicodeData.getPropertyCodePoints("Extended_Pictographic").contains('0')); + } + @Test public void testPropertyCaseInsensitivity() { assertTrue(UnicodeData.getPropertyCodePoints("l").contains('x')); diff --git a/tool/src/org/antlr/v4/unicode/UnicodeDataTemplateController.java b/tool/src/org/antlr/v4/unicode/UnicodeDataTemplateController.java index 720427f57..4a8fe3d08 100644 --- a/tool/src/org/antlr/v4/unicode/UnicodeDataTemplateController.java +++ b/tool/src/org/antlr/v4/unicode/UnicodeDataTemplateController.java @@ -75,6 +75,7 @@ public abstract class UnicodeDataTemplateController { addUnicodeCategoryCodesToCodePointRanges(propertyCodePointRanges); addUnicodeBinaryPropertyCodesToCodePointRanges(propertyCodePointRanges); addUnicodeIntPropertyCodesToCodePointRanges(propertyCodePointRanges); + addTR35ExtendedPictographicPropertyCodesToCodePointRanges(propertyCodePointRanges); Map propertyAliases = new LinkedHashMap<>(); addUnicodeCategoryCodesToNames(propertyAliases); @@ -82,6 +83,7 @@ public abstract class UnicodeDataTemplateController { addUnicodeScriptCodesToNames(propertyAliases); addUnicodeBlocksToNames(propertyAliases); addUnicodeIntPropertyCodesToNames(propertyAliases); + propertyAliases.put("EP", "Extended_Pictographic"); Map properties = new LinkedHashMap<>(); properties.put("propertyCodePointRanges", propertyCodePointRanges); @@ -185,9 +187,13 @@ public abstract class UnicodeDataTemplateController { intervalSet = new IntervalSet(); propertyCodePointRanges.put(propertyName, intervalSet); } - for (UnicodeSet.EntryRange range : set.ranges()) { - intervalSet.add(range.codepoint, range.codepointEnd); - } + addUnicodeSetToIntervalSet(set, intervalSet); + } + } + + private static void addUnicodeSetToIntervalSet(UnicodeSet unicodeSet, IntervalSet intervalSet) { + for (UnicodeSet.EntryRange range : unicodeSet.ranges()) { + intervalSet.add(range.codepoint, range.codepointEnd); } } @@ -200,6 +206,149 @@ public abstract class UnicodeDataTemplateController { } } + private static void addTR35ExtendedPictographicPropertyCodesToCodePointRanges(Map propertyCodePointRanges) { + IntervalSet set = new IntervalSet(); + // Generated using scripts/parse-extended-pictographic/parse.py + set.add(0x1F774, 0x1F77F); + set.add(0x2700, 0x2701); + set.add(0x2703, 0x2704); + set.add(0x270E); + set.add(0x2710, 0x2711); + set.add(0x2765, 0x2767); + set.add(0x1F030, 0x1F093); + set.add(0x1F094, 0x1F09F); + set.add(0x1F10D, 0x1F10F); + set.add(0x1F12F); + set.add(0x1F16C, 0x1F16F); + set.add(0x1F1AD, 0x1F1E5); + set.add(0x1F260, 0x1F265); + set.add(0x1F203, 0x1F20F); + set.add(0x1F23C, 0x1F23F); + set.add(0x1F249, 0x1F24F); + set.add(0x1F252, 0x1F25F); + set.add(0x1F266, 0x1F2FF); + set.add(0x1F7D5, 0x1F7FF); + set.add(0x1F000, 0x1F003); + set.add(0x1F005, 0x1F02B); + set.add(0x1F02C, 0x1F02F); + set.add(0x1F322, 0x1F323); + set.add(0x1F394, 0x1F395); + set.add(0x1F398); + set.add(0x1F39C, 0x1F39D); + set.add(0x1F3F1, 0x1F3F2); + set.add(0x1F3F6); + set.add(0x1F4FE); + set.add(0x1F53E, 0x1F548); + set.add(0x1F54F); + set.add(0x1F568, 0x1F56E); + set.add(0x1F571, 0x1F572); + set.add(0x1F57B, 0x1F586); + set.add(0x1F588, 0x1F589); + set.add(0x1F58E, 0x1F58F); + set.add(0x1F591, 0x1F594); + set.add(0x1F597, 0x1F5A3); + set.add(0x1F5A6, 0x1F5A7); + set.add(0x1F5A9, 0x1F5B0); + set.add(0x1F5B3, 0x1F5BB); + set.add(0x1F5BD, 0x1F5C1); + set.add(0x1F5C5, 0x1F5D0); + set.add(0x1F5D4, 0x1F5DB); + set.add(0x1F5DF, 0x1F5E0); + set.add(0x1F5E2); + set.add(0x1F5E4, 0x1F5E7); + set.add(0x1F5E9, 0x1F5EE); + set.add(0x1F5F0, 0x1F5F2); + set.add(0x1F5F4, 0x1F5F9); + set.add(0x2605); + set.add(0x2607, 0x260D); + set.add(0x260F, 0x2610); + set.add(0x2612); + set.add(0x2616, 0x2617); + set.add(0x2619, 0x261C); + set.add(0x261E, 0x261F); + set.add(0x2621); + set.add(0x2624, 0x2625); + set.add(0x2627, 0x2629); + set.add(0x262B, 0x262D); + set.add(0x2630, 0x2637); + set.add(0x263B, 0x2647); + set.add(0x2654, 0x265F); + set.add(0x2661, 0x2662); + set.add(0x2664); + set.add(0x2667); + set.add(0x2669, 0x267A); + set.add(0x267C, 0x267E); + set.add(0x2680, 0x2691); + set.add(0x2695); + set.add(0x2698); + set.add(0x269A); + set.add(0x269D, 0x269F); + set.add(0x26A2, 0x26A9); + set.add(0x26AC, 0x26AF); + set.add(0x26B2, 0x26BC); + set.add(0x26BF, 0x26C3); + set.add(0x26C6, 0x26C7); + set.add(0x26C9, 0x26CD); + set.add(0x26D0); + set.add(0x26D2); + set.add(0x26D5, 0x26E8); + set.add(0x26EB, 0x26EF); + set.add(0x26F6); + set.add(0x26FB, 0x26FC); + set.add(0x26FE, 0x26FF); + set.add(0x2388); + set.add(0x1FA00, 0x1FFFD); + set.add(0x1F0A0, 0x1F0AE); + set.add(0x1F0B1, 0x1F0BF); + set.add(0x1F0C1, 0x1F0CF); + set.add(0x1F0D1, 0x1F0F5); + set.add(0x1F0AF, 0x1F0B0); + set.add(0x1F0C0); + set.add(0x1F0D0); + set.add(0x1F0F6, 0x1F0FF); + set.add(0x1F80C, 0x1F80F); + set.add(0x1F848, 0x1F84F); + set.add(0x1F85A, 0x1F85F); + set.add(0x1F888, 0x1F88F); + set.add(0x1F8AE, 0x1F8FF); + set.add(0x1F900, 0x1F90B); + set.add(0x1F91F); + set.add(0x1F928, 0x1F92F); + set.add(0x1F931, 0x1F932); + set.add(0x1F94C); + set.add(0x1F95F, 0x1F96B); + set.add(0x1F992, 0x1F997); + set.add(0x1F9D0, 0x1F9E6); + set.add(0x1F90C, 0x1F90F); + set.add(0x1F93F); + set.add(0x1F94D, 0x1F94F); + set.add(0x1F96C, 0x1F97F); + set.add(0x1F998, 0x1F9BF); + set.add(0x1F9C1, 0x1F9CF); + set.add(0x1F9E7, 0x1F9FF); + set.add(0x1F6C6, 0x1F6CA); + set.add(0x1F6D3, 0x1F6D4); + set.add(0x1F6E6, 0x1F6E8); + set.add(0x1F6EA); + set.add(0x1F6F1, 0x1F6F2); + set.add(0x1F6F7, 0x1F6F8); + set.add(0x1F6D5, 0x1F6DF); + set.add(0x1F6ED, 0x1F6EF); + set.add(0x1F6F9, 0x1F6FF); + propertyCodePointRanges.put("Extended_Pictographic", set); + + UnicodeSet emojiRKUnicodeSet = new UnicodeSet("[\\p{GCB=Regional_Indicator}\\*#0-9\\u00a9\\u00ae\\u2122\\u3030\\u303d]"); + IntervalSet emojiRKIntervalSet = new IntervalSet(); + addUnicodeSetToIntervalSet(emojiRKUnicodeSet, emojiRKIntervalSet); + propertyCodePointRanges.put("EmojiRK", emojiRKIntervalSet); + + UnicodeSet emojiNRKUnicodeSet = new UnicodeSet("[\\p{Emoji=Yes}]"); + emojiNRKUnicodeSet.removeAll(emojiRKUnicodeSet); + IntervalSet emojiNRKIntervalSet = new IntervalSet(); + addUnicodeSetToIntervalSet(emojiNRKUnicodeSet, emojiNRKIntervalSet); + propertyCodePointRanges.put("EmojiNRK", emojiNRKIntervalSet); + } + private static void addIntPropertyAliases(int property, String namePrefix, Map propertyAliases) { String propertyName = getShortPropertyName(property); for (int propertyValue = UCharacter.getIntPropertyMinValue(property);