refine code, fix typos.

This commit is contained in:
chenfeiyu 2021-07-01 18:51:35 +08:00
parent 96b8e44015
commit 47a9ab3a0b
3 changed files with 64 additions and 26 deletions

View File

@ -11,6 +11,15 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
"""Generate lexicon and symbols for Mandarin Chinese phonology.
The lexicon is used for Montreal Forced Aligner.
Note that syllables are used as words in this lexicon, since syllables rather
than words are used in transcriptions produced by `reorganize_baker.py`.
We make this choice to better leverage other Chinese text-to-pinyin
tools like pypinyin. This is the convention for G2P in Chinese.
"""
import re import re
import argparse import argparse
@ -32,55 +41,68 @@ SPECIALS = ['sil', 'sp']
def rule(C, V, R, T): def rule(C, V, R, T):
"""Generate a syllable given the initial, the final, erhua indicator, and tone.
Orthographical rules for pinyin are applied. (special case for y, w, ui, un, iu)
Note that in this system, 'ü' is always written as 'v' when it appears in phonemes, but is converted to
'u' in syllables when certain conditions are satisfied.
'i' is distinguished when it appears in phonemes, and is separated into 3 categories: 'i', 'ii' and 'iii'.
Erhua can possibly be applied to every final, except for finals that already end with 'r'.
When a syllable is impossible or does not have any characters with this pronunciation, return None
to filter it out.
"""
# 不可拼的音节, ii 只能和 z, c, s 拼 # 不可拼的音节, ii 只能和 z, c, s 拼
if V in ["ii"] and (C not in ['z', 'c', 's']): if V in ["ii"] and (C not in ['z', 'c', 's']):
return return None
# iii 只能和 zh, ch, sh, r 拼 # iii 只能和 zh, ch, sh, r 拼
if V in ['iii'] and (C not in ['zh', 'ch', 'sh', 'r']): if V in ['iii'] and (C not in ['zh', 'ch', 'sh', 'r']):
return return None
# 齐齿呼或者撮口呼不能和 f, g, k, h, zh, ch, sh, r, z, c, s # 齐齿呼或者撮口呼不能和 f, g, k, h, zh, ch, sh, r, z, c, s
if (V not in ['ii', 'iii']) and V[0] in ['i', 'v'] and ( if (V not in ['ii', 'iii']) and V[0] in ['i', 'v'] and (
C in ['f', 'g', 'k', 'h', 'zh', 'ch', 'sh', 'r', 'z', 'c', 's']): C in ['f', 'g', 'k', 'h', 'zh', 'ch', 'sh', 'r', 'z', 'c', 's']):
return return None
# 撮口呼只能和 j, q, x l, n 拼 # 撮口呼只能和 j, q, x l, n 拼
if V.startswith("v"): if V.startswith("v"):
# v, ve 只能和 j ,q , x, n, l 拼 # v, ve 只能和 j ,q , x, n, l 拼
if V in ['v', 've']: if V in ['v', 've']:
if C not in ['j', 'q', 'x', 'n', 'l', '']: if C not in ['j', 'q', 'x', 'n', 'l', '']:
return return None
# 其他只能和 j, q, x 拼 # 其他只能和 j, q, x 拼
else: else:
if C not in ['j', 'q', 'x', '']: if C not in ['j', 'q', 'x', '']:
return return None
# j, q, x 只能和齐齿呼或者撮口呼拼 # j, q, x 只能和齐齿呼或者撮口呼拼
if (C in ['j', 'q', 'x']) and not ( if (C in ['j', 'q', 'x']) and not (
(V not in ['ii', 'iii']) and V[0] in ['i', 'v']): (V not in ['ii', 'iii']) and V[0] in ['i', 'v']):
return return None
# b, p ,m, f 不能和合口呼拼,除了 u 之外 # b, p ,m, f 不能和合口呼拼,除了 u 之外
# b, p, m, f 不能和撮口呼拼 # b, p, m, f 不能和撮口呼拼
if (C in ['b', 'p', 'm', 'f']) and ((V[0] in ['u', 'v'] and V != "u") or if (C in ['b', 'p', 'm', 'f']) and ((V[0] in ['u', 'v'] and V != "u") or
V == 'ong'): V == 'ong'):
return return None
# ua, uai, uang 不能和 d, t, n, l, r, z, c, s 拼 # ua, uai, uang 不能和 d, t, n, l, r, z, c, s 拼
if V in ['ua', 'uai', 'uang' if V in ['ua', 'uai', 'uang'
] and C in ['d', 't', 'n', 'l', 'r', 'z', 'c', 's']: ] and C in ['d', 't', 'n', 'l', 'r', 'z', 'c', 's']:
return return None
# sh 和 ong 不能拼 # sh 和 ong 不能拼
if V == 'ong' and C in ['sh']: if V == 'ong' and C in ['sh']:
return return None
# o 和 gkh, zh ch sh r z c s 不能拼 # o 和 gkh, zh ch sh r z c s 不能拼
if V == "o" and C in [ if V == "o" and C in [
'd', 't', 'n', 'g', 'k', 'h', 'zh', 'ch', 'sh', 'r', 'z', 'c', 's' 'd', 't', 'n', 'g', 'k', 'h', 'zh', 'ch', 'sh', 'r', 'z', 'c', 's'
]: ]:
return return None
# ueng 只是 weng 这个 ad-hoc 其他情况下都是 ong # ueng 只是 weng 这个 ad-hoc 其他情况下都是 ong
if V == 'ueng' and C != '': if V == 'ueng' and C != '':
@ -88,7 +110,7 @@ def rule(C, V, R, T):
# 非儿化的 er 只能单独存在 # 非儿化的 er 只能单独存在
if V == 'er' and C != '': if V == 'er' and C != '':
return return None
if C == '': if C == '':
if V in ["i", "in", "ing"]: if V in ["i", "in", "ing"]:
@ -118,7 +140,7 @@ def rule(C, V, R, T):
# Filter er 不能再儿化 # Filter er 不能再儿化
if result.endswith('r') and R == 'r': if result.endswith('r') and R == 'r':
return return None
# ii and iii, change back to i # ii and iii, change back to i
result = re.sub(r'i+', 'i', result) result = re.sub(r'i+', 'i', result)
@ -127,13 +149,13 @@ def rule(C, V, R, T):
return result return result
def generate_lexicon(with_tone=False, with_r=False): def generate_lexicon(with_tone=False, with_erhua=False):
# generate lexicon withou tone and erhua """Generate lexicon for Mandarin Chinese."""
syllables = OrderedDict() syllables = OrderedDict()
for C in [''] + INITIALS: for C in [''] + INITIALS:
for V in FINALS: for V in FINALS:
for R in [''] if not with_r else ['', 'r']: for R in [''] if not with_erhua else ['', 'r']:
for T in [''] if not with_tone else ['1', '2', '3', '4', '5']: for T in [''] if not with_tone else ['1', '2', '3', '4', '5']:
result = rule(C, V, R, T) result = rule(C, V, R, T)
if result: if result:
@ -142,11 +164,12 @@ def generate_lexicon(with_tone=False, with_r=False):
def generate_symbols(lexicon): def generate_symbols(lexicon):
"""Generate phoneme list for a lexicon."""
symbols = set() symbols = set()
for p in SPECIALS: for p in SPECIALS:
symbols.add(p) symbols.add(p)
for syllable, phonems in lexicon.items(): for syllable, phonemes in lexicon.items():
phonemes = phonems.split() phonemes = phonemes.split()
for p in phonemes: for p in phonemes:
symbols.add(p) symbols.add(p)
return sorted(list(symbols)) return sorted(list(symbols))

View File

@ -11,16 +11,27 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
"""Script to reorganize Baker dataset so as to use Montreal Force
Aligner to align transcription and audio.
Please refer to https://montreal-forced-aligner.readthedocs.io/en/latest/data_prep.html
for more details about Montreal Forced Aligner's requirements on the corpus.
For scripts to reorganize other corpora, please refer to
https://github.com/MontrealCorpusTools/MFA-reorganization-scripts
for more details.
"""
from typing import Union
from concurrent.futures import ThreadPoolExecutor
from pathlib import Path
import soundfile as sf
import librosa
from tqdm import tqdm
import os import os
import shutil import shutil
import argparse import argparse
from typing import Union
from pathlib import Path
from concurrent.futures import ThreadPoolExecutor
import librosa
import soundfile as sf
from tqdm import tqdm
def get_transcripts(path: Union[str, Path]): def get_transcripts(path: Union[str, Path]):
@ -54,7 +65,7 @@ def reorganize_baker(root_dir: Union[str, Path],
transcriptions = get_transcripts(transcript_path) transcriptions = get_transcripts(transcript_path)
wave_dir = root_dir / "Wave" wave_dir = root_dir / "Wave"
wav_paths = list(wave_dir.glob("*.wav")) wav_paths = sorted(list(wave_dir.glob("*.wav")))
output_dir = Path(output_dir).expanduser() output_dir = Path(output_dir).expanduser()
assert wave_dir != output_dir, "Don't use an the original wav's directory as output_dir" assert wave_dir != output_dir, "Don't use an the original wav's directory as output_dir"

View File

@ -11,7 +11,9 @@ fi
if [ ! -d $EXP_DIR/baker_corpus ]; then if [ ! -d $EXP_DIR/baker_corpus ]; then
echo "reorganizing baker corpus..." echo "reorganizing baker corpus..."
python local/recorganize_baker.py --root-dir=~/datasets/BZNSYP --output-dir=$EXP_DIR/baker_corpus --resample-audio python local/recorganize_baker.py --root-dir=~/datasets/BZNSYP --output-dir=$EXP_DIR/baker_corpus --resample-audio
echo "reorganization done." echo "reorganization done. Check output in $EXP_DIR/baker_corpus."
echo "audio files are resampled to 16kHz"
echo "transcription for each audio file is saved with the same namd in $EXP_DIR/baker_corpus "
fi fi
echo "detecting oov..." echo "detecting oov..."
@ -37,7 +39,9 @@ export PATH="$MFA_DOWNLOAD_DIR/montreal-forced-aligner/bin"
if [ ! -d "$EXP_DIR/baker_alignment" ]; then if [ ! -d "$EXP_DIR/baker_alignment" ]; then
echo "Start MFA training..." echo "Start MFA training..."
mfa_train_and_align $EXP_DIR/baker_corpus "$EXP_DIR/$LEXICON_NAME.lexicon" $EXP_DIR/baker_alignment -o $EXP_DIR/baker_model --clean --verbose --temp_directory exp/.mfa_train_and_align mfa_train_and_align $EXP_DIR/baker_corpus "$EXP_DIR/$LEXICON_NAME.lexicon" $EXP_DIR/baker_alignment -o $EXP_DIR/baker_model --clean --verbose --temp_directory exp/.mfa_train_and_align
echo "training done! \nresults: $EXP_DIR/baker_alignment \nmodel: $EXP_DIR/baker_model\n" echo "training done!"
echo "results: $EXP_DIR/baker_alignment"
echo "model: $EXP_DIR/baker_model"
fi fi