From 8a211abb70f9227b2bae1351409f44bb895f1b88 Mon Sep 17 00:00:00 2001 From: TianYuan Date: Fri, 20 Aug 2021 09:55:30 +0000 Subject: [PATCH] add some quantifiers for num.py --- examples/text_frontend/README.md | 2 +- parakeet/frontend/cn_normalization/num.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/examples/text_frontend/README.md b/examples/text_frontend/README.md index e60f013..d41e428 100644 --- a/examples/text_frontend/README.md +++ b/examples/text_frontend/README.md @@ -17,4 +17,4 @@ Run the command below to get the results of test. ``` The `avg WER` of g2p is: 0.027495061517943988 -The `avg CER` of text normalization is: 0.0061629764893859846 +The `avg CER` of text normalization is: 0.006391234877881762 diff --git a/parakeet/frontend/cn_normalization/num.py b/parakeet/frontend/cn_normalization/num.py index 7cc36d7..e07dd80 100644 --- a/parakeet/frontend/cn_normalization/num.py +++ b/parakeet/frontend/cn_normalization/num.py @@ -28,7 +28,7 @@ UNITS = OrderedDict({ 8: '亿', }) -COM_QUANTIFIERS = '(匹|张|座|回|场|尾|条|个|首|阙|阵|网|炮|顶|丘|棵|只|支|袭|辆|挑|担|颗|壳|窠|曲|墙|群|腔|砣|座|客|贯|扎|捆|刀|令|打|手|罗|坡|山|岭|江|溪|钟|队|单|双|对|出|口|头|脚|板|跳|枝|件|贴|针|线|管|名|位|身|堂|课|本|页|家|户|层|丝|毫|厘|分|钱|两|斤|担|铢|石|钧|锱|忽|(千|毫|微)克|毫|厘|分|寸|尺|丈|里|寻|常|铺|程|(千|分|厘|毫|微)米|撮|勺|合|升|斗|石|盘|碗|碟|叠|桶|笼|盆|盒|杯|钟|斛|锅|簋|篮|盘|桶|罐|瓶|壶|卮|盏|箩|箱|煲|啖|袋|钵|年|月|日|季|刻|时|周|天|秒|分|旬|纪|岁|世|更|夜|春|夏|秋|冬|代|伏|辈|丸|泡|粒|颗|幢|堆|条|根|支|道|面|片|张|颗|块|元|(亿|千万|百万|万|千|百)|(亿|千万|百万|万|千|百|美|)元|(亿|千万|百万|万|千|百|)块|角|毛|分)' +COM_QUANTIFIERS = '(朵|匹|张|座|回|场|尾|条|个|首|阙|阵|网|炮|顶|丘|棵|只|支|袭|辆|挑|担|颗|壳|窠|曲|墙|群|腔|砣|座|客|贯|扎|捆|刀|令|打|手|罗|坡|山|岭|江|溪|钟|队|单|双|对|出|口|头|脚|板|跳|枝|件|贴|针|线|管|名|位|身|堂|课|本|页|家|户|层|丝|毫|厘|分|钱|两|斤|担|铢|石|钧|锱|忽|(千|毫|微)克|毫|厘|(公)分|分|寸|尺|丈|里|寻|常|铺|程|(千|分|厘|毫|微)米|米|撮|勺|合|升|斗|石|盘|碗|碟|叠|桶|笼|盆|盒|杯|钟|斛|锅|簋|篮|盘|桶|罐|瓶|壶|卮|盏|箩|箱|煲|啖|袋|钵|年|月|日|季|刻|时|周|天|秒|分|旬|纪|岁|世|更|夜|春|夏|秋|冬|代|伏|辈|丸|泡|粒|颗|幢|堆|条|根|支|道|面|片|张|颗|块|元|(亿|千万|百万|万|千|百)|(亿|千万|百万|万|千|百|美|)元|(亿|千万|百万|万|千|百|)块|角|毛|分)' # 分数表达式 RE_FRAC = re.compile(r'(-?)(\d+)/(\d+)') @@ -74,7 +74,7 @@ def replace_negative_num(match: re.Match) -> str: # 编号-无符号整形 # 00078 -RE_DEFAULT_NUM = re.compile(r'\d{4}\d*') +RE_DEFAULT_NUM = re.compile(r'\d{3}\d*') def replace_default_num(match: re.Match):