# File: examples/use_mfa/local/detect_oov.py
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Report out-of-vocabulary (OOV) tokens of a corpus with respect to a lexicon."""

import argparse
from collections import OrderedDict
from pathlib import Path
import logging


def detect_oov(corpus_dir, lexicon_path, transcription_pattern="*.lab"):
    """Log every transcription token that is missing from the lexicon.

    Args:
        corpus_dir: Directory that is globbed for transcription files.
        lexicon_path: Text file with one entry per line; the first
            whitespace-separated field of a line is the entry (syllable) and
            the remainder of the line is its phoneme sequence.
        transcription_pattern: Glob pattern, relative to ``corpus_dir``, that
            selects the transcription files.

    Returns:
        A list of ``(relative_path, token)`` tuples, one per OOV occurrence,
        in the same order in which the warnings are logged (files sorted by
        path, tokens in reading order).
    """
    corpus_dir = Path(corpus_dir)

    lexicon = OrderedDict()
    with open(lexicon_path, 'rt', encoding='utf-8') as f:
        for line in f:
            fields = line.split(maxsplit=1)
            if not fields:
                # Blank line: nothing to register (unpacking it used to crash).
                continue
            # An entry without phonemes still counts as "in vocabulary".
            lexicon[fields[0]] = fields[1] if len(fields) > 1 else ''

    oovs = []
    # Sorted so that the report is reproducible across runs and file systems.
    for fp in sorted(corpus_dir.glob(transcription_pattern)):
        relative_path = fp.relative_to(corpus_dir)
        for token in fp.read_text(encoding='utf-8').split():
            if token not in lexicon:
                logging.warning("%s has OOV %s .", relative_path, token)
                oovs.append((relative_path, token))
    return oovs


if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="detect oov in a corpus given a lexicon")
    parser.add_argument(
        "corpus_dir", type=str, help="corpus dir for MFA alignment.")
    parser.add_argument("lexicon_path", type=str, help="dictionary to use.")
    parser.add_argument(
        "--pattern",
        type=str,
        default="*.lab",
        help="glob pattern of the transcription files in corpus_dir.")
    args = parser.parse_args()
    print(args)

    detect_oov(args.corpus_dir, args.lexicon_path, args.pattern)
# File: examples/use_mfa/local/generate_lexicon.py
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Generate lexicon and symbols for Mandarin Chinese phonology.
The lexicon is used for Montreal Forced Aligner.

Note that syllables are used as words in this lexicon, since syllables rather
than words are used in transcriptions produced by `reorganize_baker.py`.

We make this choice to better leverage other software for Chinese text to
pinyin conversion, like pypinyin. This is the convention for G2P in Chinese.
"""

import re
import argparse
from collections import OrderedDict

INITIALS = [
    'b', 'p', 'm', 'f', 'd', 't', 'n', 'l', 'g', 'k', 'h', 'zh', 'ch', 'sh',
    'r', 'z', 'c', 's', 'j', 'q', 'x'
]

FINALS = [
    'a', 'ai', 'ao', 'an', 'ang', 'e', 'er', 'ei', 'en', 'eng', 'o', 'ou',
    'ong', 'ii', 'iii', 'i', 'ia', 'iao', 'ian', 'iang', 'ie', 'io', 'iou',
    'iong', 'in', 'ing', 'u', 'ua', 'uai', 'uan', 'uang', 'uei', 'uo', 'uen',
    'ueng', 'v', 've', 'van', 'vn'
]

SPECIALS = ['sil', 'sp']


def rule(C, V, R, T):
    """Generate a syllable given the initial, the final, erhua indicator, and tone.

    Orthographical rules for pinyin are applied (special cases for y, w, ui,
    un, iu).

    Note that in this system 'ü' is always written as 'v' when it appears in a
    phoneme, but is converted to 'u' in syllables when certain conditions are
    satisfied.

    'i' is distinguished when it appears in phonemes and is separated into
    three categories: 'i', 'ii' and 'iii'.

    Erhua may be applied to every final, except for results that already end
    with 'r'.

    Args:
        C: Initial, one of ``INITIALS`` or ``''`` for the zero initial.
        V: Final, one of ``FINALS`` (must be non-empty).
        R: Erhua indicator, ``''`` or ``'r'``.
        T: Tone suffix, ``''`` or a tone digit string.

    Returns:
        The pinyin spelling of the syllable with ``R`` and ``T`` appended, or
        ``None`` when the combination is filtered out as impossible or unused.
    """
    # Unspellable syllables: 'ii' only combines with z, c, s.
    if V == 'ii' and C not in ('z', 'c', 's'):
        return None
    # 'iii' only combines with zh, ch, sh, r.
    if V == 'iii' and C not in ('zh', 'ch', 'sh', 'r'):
        return None

    # Finals that really start with i or ü ('ii'/'iii' only look like it).
    front_final = V not in ('ii', 'iii') and V[0] in ('i', 'v')

    # i-/ü-initial finals cannot follow f, g, k, h, zh, ch, sh, r, z, c, s.
    if front_final and C in ('f', 'g', 'k', 'h', 'zh', 'ch', 'sh', 'r', 'z',
                             'c', 's'):
        return None

    # ü-initial finals only combine with j, q, x, l, n (or the zero initial).
    if V.startswith('v'):
        if V in ('v', 've'):
            # v and ve combine with j, q, x, n, l.
            if C not in ('j', 'q', 'x', 'n', 'l', ''):
                return None
        elif C not in ('j', 'q', 'x', ''):
            # The remaining ü finals only combine with j, q, x.
            return None

    # j, q, x only combine with i-/ü-initial finals.
    if C in ('j', 'q', 'x') and not front_final:
        return None

    # b, p, m, f cannot combine with u-initial finals other than plain 'u',
    # nor with ü-initial finals, nor with 'ong'.
    if C in ('b', 'p', 'm', 'f') and (
            (V[0] in ('u', 'v') and V != 'u') or V == 'ong'):
        return None

    # ua, uai, uang cannot combine with d, t, n, l, r, z, c, s.
    if V in ('ua', 'uai', 'uang') and C in ('d', 't', 'n', 'l', 'r', 'z', 'c',
                                            's'):
        return None

    # sh and ong cannot combine.
    if V == 'ong' and C == 'sh':
        return None

    # o cannot combine with d, t, n, g, k, h, zh, ch, sh, r, z, c, s.
    if V == 'o' and C in ('d', 't', 'n', 'g', 'k', 'h', 'zh', 'ch', 'sh', 'r',
                          'z', 'c', 's'):
        return None

    # 'ueng' is an ad-hoc final that only exists as 'weng'; with an initial it
    # is always 'ong'.
    if V == 'ueng' and C != '':
        return None

    # Non-erhua 'er' only exists on its own.
    if V == 'er' and C != '':
        return None

    if C == '':
        # Zero initial: spell with y / w / yu.
        if V in ('i', 'in', 'ing'):
            C = 'y'
        elif V == 'u':
            C = 'w'
        elif V.startswith('i') and V not in ('ii', 'iii'):
            C = 'y'
            V = V[1:]
        elif V.startswith('u'):
            C = 'w'
            V = V[1:]
        elif V.startswith('v'):
            C = 'yu'
            V = V[1:]
    else:
        # After j, q, x the letter ü is written as u.
        if C in ('j', 'q', 'x') and V.startswith('v'):
            V = V.replace('v', 'u')
        # Abbreviated spellings after an initial.
        if V == 'iou':
            V = 'iu'
        elif V == 'uei':
            V = 'ui'
        elif V == 'uen':
            V = 'un'
    result = C + V

    # A syllable already ending with 'r' cannot take erhua again.
    if result.endswith('r') and R == 'r':
        return None

    # 'ii' and 'iii' are spelled back as a plain 'i'.
    result = re.sub(r'i+', 'i', result)

    return result + R + T


def generate_lexicon(with_tone=False, with_erhua=False):
    """Generate lexicon for Mandarin Chinese.

    Args:
        with_tone: When true, generate every syllable with tones 1 to 5.
        with_erhua: When true, additionally generate the erhua ('r') variants.

    Returns:
        An ``OrderedDict`` that maps each syllable accepted by ``rule`` to its
        phoneme string ``'<initial> <final><erhua><tone>'``. For the zero
        initial the phoneme string starts with a space.
    """
    erhua_marks = ['', 'r'] if with_erhua else ['']
    tones = ['1', '2', '3', '4', '5'] if with_tone else ['']

    syllables = OrderedDict()
    for C in [''] + INITIALS:
        for V in FINALS:
            for R in erhua_marks:
                for T in tones:
                    result = rule(C, V, R, T)
                    if result:
                        syllables[result] = f'{C} {V}{R}{T}'
    return syllables


def generate_symbols(lexicon):
    """Generate the phoneme list for a lexicon.

    Args:
        lexicon: Mapping from syllable to a whitespace-separated phoneme
            string.

    Returns:
        The sorted list of distinct phonemes used in ``lexicon`` plus the
        entries of ``SPECIALS``.
    """
    symbols = set(SPECIALS)
    for phonemes in lexicon.values():
        symbols.update(phonemes.split())
    return sorted(symbols)


if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="Generate lexicon for Chinese pinyin to phoneme for MFA")
    parser.add_argument("output", type=str, help="Path to save lexicon.")
    parser.add_argument(
        "--with-tone", action="store_true", help="whether to consider tone.")
    parser.add_argument(
        "--with-r", action="store_true", help="whether to consider erhua.")
    args = parser.parse_args()

    lexicon = generate_lexicon(args.with_tone, args.with_r)
    symbols = generate_symbols(lexicon)

    with open(args.output + ".lexicon", 'wt', encoding='utf-8') as f:
        for k, v in lexicon.items():
            f.write(f"{k} {v}\n")

    with open(args.output + ".symbols", 'wt', encoding='utf-8') as f:
        for s in symbols:
            f.write(s + "\n")

    print("Done!")
# File: examples/use_mfa/local/reorganize_baker.py
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Script to reorganize Baker dataset so as to use Montreal Forced
Aligner to align transcription and audio.

Please refer to https://montreal-forced-aligner.readthedocs.io/en/latest/data_prep.html
for more details about Montreal Forced Aligner's requirements on corpus.

For scripts to reorganize other corpus, please refer to
 https://github.com/MontrealCorpusTools/MFA-reorganization-scripts
for more details.
"""

import os
import shutil
import argparse
from typing import Union
from pathlib import Path
from concurrent.futures import ThreadPoolExecutor

# NOTE: librosa, soundfile and tqdm are imported inside the functions that
# use them, so that `get_transcripts` and the argument validation stay usable
# without the audio / progress-bar packages installed.


def get_transcripts(path: Union[str, Path]):
    """Parse a prosody-labeling file into ``{sentence_id: toneless pinyin}``.

    The file is read as consecutive line pairs: the first whitespace-separated
    field of the first line is the sentence id, the second line is a sequence
    of whitespace-separated pinyin tokens. The last character of every token
    is dropped (tones are dropped, since the lexicon does not consider tones
    either).

    Args:
        path: Path of the UTF-8 encoded labeling file.

    Returns:
        A dict from sentence id to the space-joined toneless tokens.

    Raises:
        ValueError: If, ignoring trailing blank lines, the file does not
            consist of complete line pairs.
    """
    with open(path, encoding="utf-8") as f:
        lines = f.read().splitlines()

    # Trailing blank lines would otherwise be mistaken for a header line.
    while lines and not lines[-1].strip():
        lines.pop()
    if len(lines) % 2:
        raise ValueError(
            f"{path}: expected pairs of (text, pinyin) lines, "
            f"got {len(lines)} lines")

    transcripts = {}
    for header, pinyin in zip(lines[0::2], lines[1::2]):
        sentence_id = header.split()[0]
        transcripts[sentence_id] = " ".join(
            item[:-1] for item in pinyin.split())
    return transcripts


def resample_and_save(source, target, sr=16000):
    """Load ``source`` at sample rate ``sr`` and write it to ``target``.

    The audio is written with subtype ``PCM_16``. Returns ``target``.
    """
    import librosa
    import soundfile as sf

    wav, _ = librosa.load(str(source), sr=sr)
    sf.write(str(target), wav, samplerate=sr, subtype='PCM_16')
    return target


def reorganize_baker(root_dir: Union[str, Path],
                     output_dir: Union[str, Path, None]=None,
                     resample_audio=False):
    """Collect Baker audio and per-utterance ``.lab`` files in one directory.

    Reads ``ProsodyLabeling/000001-010000.txt`` and ``Wave/*.wav`` below
    ``root_dir``. Every wav file is copied (or resampled, see
    ``resample_and_save``) into ``output_dir`` and one ``<sentence_id>.lab``
    file is written there for every transcript.

    Args:
        root_dir: Root directory of the Baker dataset; ``~`` is expanded.
        output_dir: Destination directory, created if missing; ``~`` is
            expanded. Required despite the ``None`` default.
        resample_audio: Resample the audio in a thread pool instead of
            copying the files.

    Raises:
        ValueError: If ``output_dir`` is missing or is the dataset's own
            ``Wave`` directory.
    """
    if output_dir is None:
        raise ValueError("output_dir must be provided")

    root_dir = Path(root_dir).expanduser()
    output_dir = Path(output_dir).expanduser()
    wave_dir = root_dir / "Wave"
    # Resolve first so that different spellings of the same directory are
    # caught; writing there would overwrite the original recordings.
    if wave_dir.resolve() == output_dir.resolve():
        raise ValueError(
            "Don't use the original wav directory as output_dir")

    from tqdm import tqdm

    transcript_path = root_dir / "ProsodyLabeling" / "000001-010000.txt"
    transcriptions = get_transcripts(transcript_path)
    wav_paths = sorted(wave_dir.glob("*.wav"))

    output_dir.mkdir(parents=True, exist_ok=True)

    if resample_audio:
        with ThreadPoolExecutor(os.cpu_count()) as pool:
            with tqdm(total=len(wav_paths), desc="resampling") as pbar:
                futures = []
                for wav_path in wav_paths:
                    future = pool.submit(resample_and_save, wav_path,
                                         output_dir / wav_path.name)
                    future.add_done_callback(lambda _: pbar.update())
                    futures.append(future)

                # Wait for completion and re-raise any worker exception.
                for future in futures:
                    future.result()
    else:
        for wav_path in tqdm(wav_paths, desc="copying"):
            shutil.copyfile(wav_path, output_dir / wav_path.name)

    for sentence_id, transcript in tqdm(
            transcriptions.items(), desc="transcription process"):
        lab_path = output_dir / (sentence_id + ".lab")
        with open(lab_path, 'wt', encoding="utf-8") as f:
            f.write(transcript)
            f.write('\n')
    print("Done!")


if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="Reorganize Baker dataset for MFA")
    parser.add_argument(
        "--root-dir", type=str, required=True, help="path to baker dataset.")
    parser.add_argument(
        "--output-dir",
        type=str,
        required=True,
        help="path to save outputs(audio and transcriptions)")
    parser.add_argument(
        "--resample-audio",
        action="store_true",
        help="To resample audio files or just copy them")
    args = parser.parse_args()

    reorganize_baker(args.root_dir, args.output_dir, args.resample_audio)
#!/bin/bash
# File: examples/use_mfa/run.sh
#
# End-to-end recipe: generate the pinyin lexicon, reorganize the Baker corpus,
# report OOVs, fetch Montreal Forced Aligner 1.0.1 and train/align with it.
# Every stage is skipped when its output already exists.

EXP_DIR=exp

mkdir -p "$EXP_DIR"
LEXICON_NAME='simple'
if [ ! -f "$EXP_DIR/$LEXICON_NAME.lexicon" ]; then
    echo "generating lexicon..."
    python local/generate_lexicon.py "$EXP_DIR/$LEXICON_NAME" --with-r
    echo "lexicon done"
fi

if [ ! -d "$EXP_DIR/baker_corpus" ]; then
    echo "reorganizing baker corpus..."
    # The shell does not expand "~" after "--root-dir="; reorganize_baker.py
    # calls expanduser() on the path itself.
    python local/reorganize_baker.py --root-dir=~/datasets/BZNSYP --output-dir="$EXP_DIR/baker_corpus" --resample-audio
    echo "reorganization done. Check output in $EXP_DIR/baker_corpus."
    echo "audio files are resampled to 16kHz"
    echo "transcription for each audio file is saved with the same namd in $EXP_DIR/baker_corpus "
fi

echo "detecting oov..."
python local/detect_oov.py "$EXP_DIR/baker_corpus" "$EXP_DIR/$LEXICON_NAME.lexicon"
echo "detecting oov done. you may consider regenerate lexicon if there is unexpected OOVs."


MFA_DOWNLOAD_DIR=local/

if [ ! -f "$MFA_DOWNLOAD_DIR/montreal-forced-aligner_linux.tar.gz" ]; then
    echo "downloading mfa..."
    (cd "$MFA_DOWNLOAD_DIR" && wget https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner/releases/download/v1.0.1/montreal-forced-aligner_linux.tar.gz)
    echo "download mfa done!"
fi

if [ ! -d "$MFA_DOWNLOAD_DIR/montreal-forced-aligner" ]; then
    echo "extracting mfa..."
    (cd "$MFA_DOWNLOAD_DIR" && tar xvf "montreal-forced-aligner_linux.tar.gz")
    echo "extraction done!"
fi

# Prepend to PATH instead of replacing it, so that the system tools stay
# reachable for the aligner and for anything run after this point.
export PATH="$MFA_DOWNLOAD_DIR/montreal-forced-aligner/bin:$PATH"
if [ ! -d "$EXP_DIR/baker_alignment" ]; then
    echo "Start MFA training..."
    mfa_train_and_align "$EXP_DIR/baker_corpus" "$EXP_DIR/$LEXICON_NAME.lexicon" "$EXP_DIR/baker_alignment" -o "$EXP_DIR/baker_model" --clean --verbose --temp_directory "$EXP_DIR/.mfa_train_and_align"
    echo "training done!"
    echo "results: $EXP_DIR/baker_alignment"
    echo "model: $EXP_DIR/baker_model"
fi