simplify text processing code and update notebook

This commit is contained in:
chenfeiyu 2021-05-13 17:06:34 +08:00
parent 6a1fb158d9
commit e1a7c296fe
5 changed files with 19 additions and 196 deletions

View File

@ -13,9 +13,17 @@
# limitations under the License.
from typing import List, Tuple
from pypinyin import lazy_pinyin, Style
from preprocess_transcription import split_syllable
from chinese_text_to_pinyin import convert_to_pinyin
from chinese_phonology import split_syllable
def convert_to_pinyin(text: str) -> List[str]:
    """Convert text into a list of pinyin syllables.

    The work is delegated to ``lazy_pinyin`` with ``Style.TONE3`` and
    ``neutral_tone_with_five=True``. Characters that are not Chinese, and
    thus cannot be converted to pinyin, are split out as separate items.
    """
    return lazy_pinyin(text, style=Style.TONE3, neutral_tone_with_five=True)
def convert_sentence(text: str) -> List[Tuple[str]]:

View File

@ -1,158 +0,0 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
A pinyin to phone transcription system for chinese.
Syllables are split into an initial and a final. 'er' is also treated as a special symbol.
Tones are extracted and attached to finals.
"""
import re
# Initials of Mandarin Chinese.
# The zero initial is not listed.
_initials = set(
    "b p m f d t n l g k h j q x zh ch sh r z c s".split())
# Finals of Mandarin Chinese.
# Symbols with different pronunciations are distinguished,
#   e.g. i -> {i, ii, iii}
# Abbreviated spellings are expanded,
#   e.g. iu -> iou, ui -> uei, un -> uen, bo -> b uo
# Some symbols are transcribed according to the zhuyin scheme,
#   e.g. in -> ien, ong -> ueng, iong -> veng
# ü is always replaced by v.
_finals = {
    # variants of i
    'ii', 'iii',
    # finals without a medial
    'a', 'o', 'e', 'ea', 'ai', 'ei', 'ao', 'ou',
    'an', 'en', 'ang', 'eng', 'er',
    # finals starting with i
    'i', 'ia', 'io', 'ie', 'iai', 'iao', 'iou',
    'ian', 'ien', 'iang', 'ieng',
    # finals starting with u
    'u', 'ua', 'uo', 'uai', 'uei', 'uan', 'uen', 'uang', 'ueng',
    # finals starting with v
    'v', 've', 'van', 'ven', 'veng',
}
# Er-hua marker, kept as a phone of its own.
# example: tour2 -> phone: t ou &r, tone: 0 2 5
_ernized_symbol = {'&r'}
# Special tokens.
_specials = {'<pad>', '<unk>'}
# Pause marks; a different dataset may need a different set here.
_pauses = {"%", "$"}
# Phone inventory: initials, finals, the er-hua marker, special tokens and
# pause marks combined.
_phones = set().union(_initials, _finals, _ernized_symbol, _specials, _pauses)
# Tone inventory:
#   '0'        no tone, used for initials
#   '1'..'4'   the four tones of Chinese
#   '5'        neutral tone
#   '<pad>'    special token for padding
#   '<unk>'    special token for an unknown tone (not expected to occur)
_tones = {'<pad>', '<unk>'} | {str(tone) for tone in range(6)}
def ernized(syllable):
    """Tell whether a syllable carries an er-hua suffix.

    The syllable is expected to end with its tone character (e.g.
    ``"tour2"``), so the character examined is the second-to-last one.

    Args:
        syllable: the syllable string to inspect.

    Returns:
        True when the second-to-last character is ``r`` and the syllable
        does not start with ``er``; False otherwise, including for strings
        shorter than two characters.
    """
    # Pause marks and empty strings have no second-to-last character;
    # indexing [-2] on them would raise IndexError.
    if len(syllable) < 2:
        return False
    return syllable[:2] != "er" and syllable[-2] == 'r'
def convert(syllable):
    """Rewrite a toneless pinyin syllable into the expanded phone spelling.

    A trailing ``o`` after b/p/m/f is expanded to ``uo`` first. After that a
    fixed list of substring substitutions is applied one after another.
    Later rules see the output of earlier ones, so the order of the list
    must not be changed.

    Args:
        syllable: pinyin syllable without its tone character.

    Returns:
        The rewritten syllable string.
    """
    # expansion of o -> uo after b, p, m, f
    rewritten = re.sub(r"([bpmf])o$", r"\1uo", syllable)

    ordered_rules = (
        # expansion for iong, ong
        ("iong", "veng"),
        ("ong", "ueng"),
        # expansion for ing, in
        ("ing", "ieng"),
        ("in", "ien"),
        # expansion for un, ui, iu
        ("un", "uen"),
        ("ui", "uei"),
        ("iu", "iou"),
        # variants of i
        ("zi", "zii"),
        ("ci", "cii"),
        ("si", "sii"),
        ("zhi", "zhiii"),
        ("chi", "chiii"),
        ("shi", "shiii"),
        ("ri", "riii"),
        # y preceding i, u
        ("yi", "i"),
        ("yu", "v"),
        ("y", "i"),
        # w
        ("wu", "u"),
        ("w", "u"),
        # v following j, q, x
        ("ju", "jv"),
        ("qu", "qv"),
        ("xu", "xv"),
    )
    for old, new in ordered_rules:
        rewritten = rewritten.replace(old, new)
    return rewritten
def split_syllable(syllable: str):
    """Split a syllable into a list of phones and a parallel list of tones.

    A pause symbol is returned as a single phone with tone ``'0'``. For any
    other syllable the last character is taken as the tone and the rest is
    passed through ``convert``. If the converted text starts with a known
    initial, that initial becomes a phone of its own with tone ``'0'`` and
    the remainder carries the tone. Otherwise the whole converted text is
    one phone carrying the tone.

    Args:
        syllable: a pause symbol, or a syllable ending in its tone character.

    Returns:
        A ``(phones, tones)`` pair of equally long lists of strings.
    """
    if syllable in _pauses:
        # phone, tone
        return [syllable], ['0']

    tone = syllable[-1]
    body = convert(syllable[:-1])

    # Try the two-letter initials (zh, ch, sh) before the one-letter ones.
    if body[:2] in _initials:
        width = 2
    elif body[0] in _initials:
        width = 1
    else:
        width = 0

    if width == 0:
        return [body], [tone]
    return [body[:width], body[width:]], ['0', tone]

View File

@ -1,26 +0,0 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import List
from pypinyin import lazy_pinyin, Style
def convert_to_pinyin(text: str) -> List[str]:
    """Convert text into a list of pinyin syllables.

    The work is delegated to ``lazy_pinyin`` with ``Style.TONE3`` and
    ``neutral_tone_with_five=True``. Characters that are not Chinese, and
    thus cannot be converted to pinyin, are split out as separate items.
    """
    return lazy_pinyin(text, style=Style.TONE3, neutral_tone_with_five=True)

File diff suppressed because one or more lines are too long

View File

@ -26,7 +26,6 @@ from parakeet.utils import display
from config import get_cfg_defaults
@paddle.no_grad()
def main(config, args):
paddle.set_device(args.device)