diff --git a/examples/use_mfa/local/generate_lexicon.py b/examples/use_mfa/local/generate_lexicon.py index 5fb00e5..1791e7b 100644 --- a/examples/use_mfa/local/generate_lexicon.py +++ b/examples/use_mfa/local/generate_lexicon.py @@ -11,6 +11,15 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +"""Generate lexicon and symbols for Mandarin Chinese phonology. +The lexicon is used for the Montreal Forced Aligner. + +Note that syllables are used as words in this lexicon, since syllables rather +than words are used in the transcriptions produced by `reorganize_baker.py`. + +We make this choice to better leverage existing Chinese text-to-pinyin +tools such as pypinyin. This is the convention for G2P in Chinese. +""" import re import argparse @@ -32,55 +41,68 @@ SPECIALS = ['sil', 'sp'] def rule(C, V, R, T): + """Generate a syllable given the initial, the final, erhua indicator, and tone. + Orthographic rules for pinyin are applied (special cases for y, w, ui, un, iu). + + Note that in this system, 'ü' is always written as 'v' when it appears in phonemes, but is converted to + 'u' in syllables when certain conditions are satisfied. + + 'i' is distinguished when it appears in phonemes, and is separated into 3 categories: 'i', 'ii' and 'iii'. + + Erhua can be applied to every final, except for finals that already end with 'r'. + + When a syllable is impossible or does not have any characters with this pronunciation, return None + to filter it out. 
+ """ # 不可拼的音节, ii 只能和 z, c, s 拼 if V in ["ii"] and (C not in ['z', 'c', 's']): - return + return None # iii 只能和 zh, ch, sh, r 拼 if V in ['iii'] and (C not in ['zh', 'ch', 'sh', 'r']): - return + return None # 齐齿呼或者撮口呼不能和 f, g, k, h, zh, ch, sh, r, z, c, s if (V not in ['ii', 'iii']) and V[0] in ['i', 'v'] and ( C in ['f', 'g', 'k', 'h', 'zh', 'ch', 'sh', 'r', 'z', 'c', 's']): - return + return None # 撮口呼只能和 j, q, x l, n 拼 if V.startswith("v"): # v, ve 只能和 j ,q , x, n, l 拼 if V in ['v', 've']: if C not in ['j', 'q', 'x', 'n', 'l', '']: - return + return None # 其他只能和 j, q, x 拼 else: if C not in ['j', 'q', 'x', '']: - return + return None # j, q, x 只能和齐齿呼或者撮口呼拼 if (C in ['j', 'q', 'x']) and not ( (V not in ['ii', 'iii']) and V[0] in ['i', 'v']): - return + return None # b, p ,m, f 不能和合口呼拼,除了 u 之外 # bm p, m, f 不能和撮口呼拼 if (C in ['b', 'p', 'm', 'f']) and ((V[0] in ['u', 'v'] and V != "u") or V == 'ong'): - return + return None # ua, uai, uang 不能和 d, t, n, l, r, z, c, s 拼 if V in ['ua', 'uai', 'uang' ] and C in ['d', 't', 'n', 'l', 'r', 'z', 'c', 's']: - return + return None # sh 和 ong 不能拼 if V == 'ong' and C in ['sh']: - return + return None # o 和 gkh, zh ch sh r z c s 不能拼 if V == "o" and C in [ 'd', 't', 'n', 'g', 'k', 'h', 'zh', 'ch', 'sh', 'r', 'z', 'c', 's' ]: - return + return None # ueng 只是 weng 这个 ad-hoc 其他情况下都是 ong if V == 'ueng' and C != '': @@ -88,7 +110,7 @@ def rule(C, V, R, T): # 非儿化的 er 只能单独存在 if V == 'er' and C != '': - return + return None if C == '': if V in ["i", "in", "ing"]: @@ -118,7 +140,7 @@ def rule(C, V, R, T): # Filter er 不能再儿化 if result.endswith('r') and R == 'r': - return + return None # ii and iii, change back to i result = re.sub(r'i+', 'i', result) @@ -127,13 +149,13 @@ def rule(C, V, R, T): return result -def generate_lexicon(with_tone=False, with_r=False): - # generate lexicon withou tone and erhua +def generate_lexicon(with_tone=False, with_erhua=False): + """Generate lexicon for Mandarin Chinese.""" syllables = OrderedDict() for C in 
[''] + INITIALS: for V in FINALS: - for R in [''] if not with_r else ['', 'r']: + for R in [''] if not with_erhua else ['', 'r']: for T in [''] if not with_tone else ['1', '2', '3', '4', '5']: result = rule(C, V, R, T) if result: @@ -142,11 +164,12 @@ def generate_lexicon(with_tone=False, with_r=False): def generate_symbols(lexicon): + """Generate phoneme list for a lexicon.""" symbols = set() for p in SPECIALS: symbols.add(p) - for syllable, phonems in lexicon.items(): - phonemes = phonems.split() + for syllable, phonemes in lexicon.items(): + phonemes = phonemes.split() for p in phonemes: symbols.add(p) return sorted(list(symbols)) diff --git a/examples/use_mfa/local/recorganize_baker.py b/examples/use_mfa/local/reorganize_baker.py similarity index 88% rename from examples/use_mfa/local/recorganize_baker.py rename to examples/use_mfa/local/reorganize_baker.py index b1f6b14..fb41751 100644 --- a/examples/use_mfa/local/recorganize_baker.py +++ b/examples/use_mfa/local/reorganize_baker.py @@ -11,16 +11,27 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +"""Script to reorganize the Baker dataset so as to use Montreal Forced +Aligner to align transcription and audio. + +Please refer to https://montreal-forced-aligner.readthedocs.io/en/latest/data_prep.html +for more details about Montreal Forced Aligner's requirements on the corpus. + +For scripts to reorganize other corpora, please refer to + https://github.com/MontrealCorpusTools/MFA-reorganization-scripts +for more details. 
+""" -from typing import Union -from concurrent.futures import ThreadPoolExecutor -from pathlib import Path -import soundfile as sf -import librosa -from tqdm import tqdm import os import shutil import argparse +from typing import Union +from pathlib import Path +from concurrent.futures import ThreadPoolExecutor + +import librosa +import soundfile as sf +from tqdm import tqdm def get_transcripts(path: Union[str, Path]): @@ -54,7 +65,7 @@ def reorganize_baker(root_dir: Union[str, Path], transcriptions = get_transcripts(transcript_path) wave_dir = root_dir / "Wave" - wav_paths = list(wave_dir.glob("*.wav")) + wav_paths = sorted(list(wave_dir.glob("*.wav"))) output_dir = Path(output_dir).expanduser() assert wave_dir != output_dir, "Don't use an the original wav's directory as output_dir" diff --git a/examples/use_mfa/run.sh b/examples/use_mfa/run.sh index 445a90a..c339408 100644 --- a/examples/use_mfa/run.sh +++ b/examples/use_mfa/run.sh @@ -11,7 +11,9 @@ fi if [ ! -d $EXP_DIR/baker_corpus ]; then echo "reorganizing baker corpus..." python local/recorganize_baker.py --root-dir=~/datasets/BZNSYP --output-dir=$EXP_DIR/baker_corpus --resample-audio - echo "reorganization done." + echo "reorganization done. Check output in $EXP_DIR/baker_corpus." + echo "audio files are resampled to 16kHz" + echo "transcription for each audio file is saved with the same namd in $EXP_DIR/baker_corpus " fi echo "detecting oov..." @@ -37,7 +39,9 @@ export PATH="$MFA_DOWNLOAD_DIR/montreal-forced-aligner/bin" if [ ! -d "$EXP_DIR/baker_alignment" ]; then echo "Start MFA training..." mfa_train_and_align $EXP_DIR/baker_corpus "$EXP_DIR/$LEXICON_NAME.lexicon" $EXP_DIR/baker_alignment -o $EXP_DIR/baker_model --clean --verbose --temp_directory exp/.mfa_train_and_align - echo "training done! \nresults: $EXP_DIR/baker_alignment \nmodel: $EXP_DIR/baker_model\n" + echo "training done!" + echo "results: $EXP_DIR/baker_alignment" + echo "model: $EXP_DIR/baker_model" fi