add use_mfa example
parent d96e2828b8
commit 577c3b4f10
@@ -0,0 +1,48 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import argparse
from collections import OrderedDict
from pathlib import Path
import logging


def detect_oov(corpus_dir, lexicon_path, transcription_pattern="*.lab"):
    corpus_dir = Path(corpus_dir)

    lexicon = OrderedDict()
    with open(lexicon_path, 'rt') as f:
        for line in f:
            syllable, phonemes = line.split(maxsplit=1)
            lexicon[syllable] = phonemes

    for fp in corpus_dir.glob(transcription_pattern):
        syllables = fp.read_text().strip().split()
        for s in syllables:
            if s not in lexicon:
                logging.warning(f"{fp.relative_to(corpus_dir)} has OOV {s}.")


if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="detect OOV syllables in a corpus given a lexicon")
    parser.add_argument(
        "corpus_dir", type=str, help="corpus dir for MFA alignment.")
    parser.add_argument("lexicon_path", type=str, help="dictionary to use.")
    parser.add_argument(
        "--pattern",
        type=str,
        default="*.lab",
        help="glob pattern of transcription files.")
    args = parser.parse_args()
    print(args)

    detect_oov(args.corpus_dir, args.lexicon_path, args.pattern)
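
For reference, detect_oov assumes each lexicon line holds a syllable followed by its space-separated phonemes. A minimal sketch of that format and a direct call; the entries and paths here are hypothetical (the run script below builds a toneless lexicon):

entries = {"ni": "n i", "hao": "h ao"}       # hypothetical toneless entries
with open("exp/simple.lexicon", "wt") as f:  # hypothetical path
    for syllable, phones in entries.items():
        f.write(f"{syllable} {phones}\n")

detect_oov("exp/baker_corpus", "exp/simple.lexicon", "*.lab")
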
@@ -0,0 +1,176 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import re
import argparse
from collections import OrderedDict

INITIALS = [
    'b', 'p', 'm', 'f', 'd', 't', 'n', 'l', 'g', 'k', 'h', 'zh', 'ch', 'sh',
    'r', 'z', 'c', 's', 'j', 'q', 'x'
]

FINALS = [
    'a', 'ai', 'ao', 'an', 'ang', 'e', 'er', 'ei', 'en', 'eng', 'o', 'ou',
    'ong', 'ii', 'iii', 'i', 'ia', 'iao', 'ian', 'iang', 'ie', 'io', 'iou',
    'iong', 'in', 'ing', 'u', 'ua', 'uai', 'uan', 'uang', 'uei', 'uo', 'uen',
    'ueng', 'v', 've', 'van', 'vn'
]

SPECIALS = ['sil', 'sp']


def rule(C, V, R, T):
    # unpronounceable syllables: 'ii' can only follow z, c, s
    if V in ["ii"] and (C not in ['z', 'c', 's']):
        return
    # 'iii' can only follow zh, ch, sh, r
    if V in ['iii'] and (C not in ['zh', 'ch', 'sh', 'r']):
        return

    # i-group (qichihu) and ü-group (cuokouhu) finals cannot follow
    # f, g, k, h, zh, ch, sh, r, z, c, s
    if (V not in ['ii', 'iii']) and V[0] in ['i', 'v'] and (
            C in ['f', 'g', 'k', 'h', 'zh', 'ch', 'sh', 'r', 'z', 'c', 's']):
        return

    # ü-group finals can only follow j, q, x, l, n
    if V.startswith("v"):
        # 'v' and 've' can only follow j, q, x, n, l (or a zero initial)
        if V in ['v', 've']:
            if C not in ['j', 'q', 'x', 'n', 'l', '']:
                return
        # the others can only follow j, q, x (or a zero initial)
        else:
            if C not in ['j', 'q', 'x', '']:
                return

    # j, q, x can only take i-group or ü-group finals
    if (C in ['j', 'q', 'x']) and not (
            (V not in ['ii', 'iii']) and V[0] in ['i', 'v']):
        return

    # b, p, m, f cannot take u-group (hekouhu) finals other than 'u' itself
    # (this includes 'ong'), and cannot take ü-group finals
    if (C in ['b', 'p', 'm', 'f']) and ((V[0] in ['u', 'v'] and V != "u") or
                                        V == 'ong'):
        return

    # ua, uai, uang cannot follow d, t, n, l, r, z, c, s
    if V in ['ua', 'uai', 'uang'
             ] and C in ['d', 't', 'n', 'l', 'r', 'z', 'c', 's']:
        return

    # sh cannot take ong
    if V == 'ong' and C in ['sh']:
        return

    # o cannot follow d, t, n, g, k, h, zh, ch, sh, r, z, c, s
    if V == "o" and C in [
            'd', 't', 'n', 'g', 'k', 'h', 'zh', 'ch', 'sh', 'r', 'z', 'c', 's'
    ]:
        return

    # 'ueng' only occurs as the stand-alone syllable 'weng';
    # after an initial it is always 'ong'
    if V == 'ueng' and C != '':
        return

    # a non-erhua 'er' can only stand alone as a syllable
    if V == 'er' and C != '':
        return

    if C == '':
        if V in ["i", "in", "ing"]:
            C = 'y'
        elif V == 'u':
            C = 'w'
        elif V.startswith('i') and V not in ["ii", "iii"]:
            C = 'y'
            V = V[1:]
        elif V.startswith('u'):
            C = 'w'
            V = V[1:]
        elif V.startswith('v'):
            C = 'yu'
            V = V[1:]
    else:
        if C in ['j', 'q', 'x']:
            if V.startswith('v'):
                V = re.sub('v', 'u', V)
        if V == 'iou':
            V = 'iu'
        elif V == 'uei':
            V = 'ui'
        elif V == 'uen':
            V = 'un'
    result = C + V

    # a syllable already ending in 'r' cannot take erhua again
    if result.endswith('r') and R == 'r':
        return

    # 'ii' and 'iii' are spelled back as 'i'
    result = re.sub(r'i+', 'i', result)

    result = result + R + T
    return result

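A few hand-checked examples of what rule() accepts and rejects (these asserts are illustrative additions, not part of the original file):

assert rule('j', 'iou', '', '1') == 'jiu1'  # iou contracts to iu after an initial
assert rule('', 'uei', '', '3') == 'wei3'   # zero initial: uei is spelled wei
assert rule('z', 'ii', '', '4') == 'zi4'    # ii is spelled back as i
assert rule('j', 'u', '', '') is None       # j cannot take a u-group final
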
def generate_lexicon(with_tone=False, with_r=False):
    # generate a lexicon, optionally with tone and erhua
    syllables = OrderedDict()

    for C in [''] + INITIALS:
        for V in FINALS:
            for R in [''] if not with_r else ['', 'r']:
                for T in [''] if not with_tone else ['1', '2', '3', '4', '5']:
                    result = rule(C, V, R, T)
                    if result:
                        syllables[result] = f'{C} {V}{R}{T}'
    return syllables

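A small sanity check of the generated mapping (illustrative, not part of the original file); by default the lexicon is toneless and erhua-free:

lexicon = generate_lexicon()
assert lexicon['zhong'] == 'zh ong'
assert lexicon['wei'].split() == ['uei']  # zero initial, underlying final kept
assert 'jii' not in lexicon               # ruled out: ii only follows z, c, s
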
def generate_symbols(lexicon):
    symbols = set()
    for p in SPECIALS:
        symbols.add(p)
    for syllable, phonemes in lexicon.items():
        for p in phonemes.split():
            symbols.add(p)
    return sorted(list(symbols))

if __name__ == "__main__":
|
||||||
|
parser = argparse.ArgumentParser(
|
||||||
|
description="Generate lexicon for Chinese pinyin to phoneme for MFA")
|
||||||
|
parser.add_argument("output", type=str, help="Path to save lexicon.")
|
||||||
|
parser.add_argument(
|
||||||
|
"--with-tone", action="store_true", help="whether to consider tone.")
|
||||||
|
parser.add_argument(
|
||||||
|
"--with-r", action="store_true", help="whether to consider erhua.")
|
||||||
|
args = parser.parse_args()
|
||||||
|
|
||||||
|
lexicon = generate_lexicon(args.with_tone, args.with_r)
|
||||||
|
symbols = generate_symbols(lexicon)
|
||||||
|
|
||||||
|
with open(args.output + ".lexicon", 'wt') as f:
|
||||||
|
for k, v in lexicon.items():
|
||||||
|
f.write(f"{k} {v}\n")
|
||||||
|
|
||||||
|
with open(args.output + ".symbols", 'wt') as f:
|
||||||
|
for s in symbols:
|
||||||
|
f.write(s + "\n")
|
||||||
|
|
||||||
|
print("Done!")
|
|
@@ -0,0 +1,102 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from typing import Union
from concurrent.futures import ThreadPoolExecutor
from pathlib import Path
import soundfile as sf
import librosa
from tqdm import tqdm
import os
import shutil
import argparse


def get_transcripts(path: Union[str, Path]):
    transcripts = {}

    with open(path) as f:
        lines = f.readlines()

    for i in range(0, len(lines), 2):
        sentence_id = lines[i].split()[0]
        transcription = lines[i + 1].strip()
        # tones are dropped here,
        # since the lexicon does not consider tones either
        transcription = " ".join([item[:-1] for item in transcription.split()])
        transcripts[sentence_id] = transcription

    return transcripts
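
get_transcripts assumes the Baker prosody-label file interleaves a sentence line (utterance id plus text) with a pinyin line carrying tone digits. A hedged sketch with hypothetical content:

import tempfile

demo = "000001\t(sentence text)\n\tni3 hao3 ma5\n"  # hypothetical two-line entry
with tempfile.NamedTemporaryFile('wt', suffix='.txt', delete=False) as f:
    f.write(demo)
assert get_transcripts(f.name) == {"000001": "ni hao ma"}  # tone digits dropped
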
def resample_and_save(source, target, sr=16000):
    wav, _ = librosa.load(str(source), sr=sr)
    sf.write(str(target), wav, samplerate=sr, subtype='PCM_16')
    return target


def reorganize_baker(root_dir: Union[str, Path],
                     output_dir: Union[str, Path] = None,
                     resample_audio=False):
    root_dir = Path(root_dir).expanduser()
    transcript_path = root_dir / "ProsodyLabeling" / "000001-010000.txt"
    transcriptions = get_transcripts(transcript_path)

    wave_dir = root_dir / "Wave"
    wav_paths = list(wave_dir.glob("*.wav"))
    output_dir = Path(output_dir).expanduser()
    assert wave_dir != output_dir, "Don't use the original wav directory as output_dir"

    output_dir.mkdir(parents=True, exist_ok=True)

    if resample_audio:
        with ThreadPoolExecutor(os.cpu_count()) as pool:
            with tqdm(total=len(wav_paths), desc="resampling") as pbar:
                futures = []
                for wav_path in wav_paths:
                    future = pool.submit(resample_and_save, wav_path,
                                         output_dir / wav_path.name)
                    future.add_done_callback(lambda p: pbar.update())
                    futures.append(future)

                results = []
                for ft in futures:
                    results.append(ft.result())
    else:
        for wav_path in tqdm(wav_paths, desc="copying"):
            shutil.copyfile(wav_path, output_dir / wav_path.name)

    for sentence_id, transcript in tqdm(
            transcriptions.items(), desc="writing transcriptions"):
        with open(output_dir / (sentence_id + ".lab"), 'wt') as f:
            f.write(transcript)
            f.write('\n')
    print("Done!")


if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="Reorganize Baker dataset for MFA")
    parser.add_argument("--root-dir", type=str, help="path to baker dataset.")
    parser.add_argument(
        "--output-dir",
        type=str,
        help="path to save outputs (audio and transcriptions)")
    parser.add_argument(
        "--resample-audio",
        action="store_true",
        help="whether to resample audio files (otherwise they are copied)")
    args = parser.parse_args()

    reorganize_baker(args.root_dir, args.output_dir, args.resample_audio)
@@ -0,0 +1,44 @@
EXP_DIR=exp

LEXICON_NAME='simple'
if [ ! -f "$EXP_DIR/$LEXICON_NAME.lexicon" ]; then
    echo "generating lexicon..."
    python local/generate_lexicon.py "$EXP_DIR/$LEXICON_NAME" --with-r
    echo "lexicon done"
fi

if [ ! -d $EXP_DIR/baker_corpus ]; then
    echo "reorganizing baker corpus..."
    python local/reorganize_baker.py --root-dir=~/datasets/BZNSYP --output-dir=$EXP_DIR/baker_corpus --resample-audio
    echo "reorganization done."
fi

echo "detecting oov..."
python local/detect_oov.py $EXP_DIR/baker_corpus $EXP_DIR/"$LEXICON_NAME.lexicon"
echo "detecting OOV done. You may consider regenerating the lexicon if there are unexpected OOVs."


MFA_DOWNLOAD_DIR=local/

if [ ! -f "$MFA_DOWNLOAD_DIR/montreal-forced-aligner_linux.tar.gz" ]; then
    echo "downloading mfa..."
    (cd $MFA_DOWNLOAD_DIR && wget https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner/releases/download/v1.0.1/montreal-forced-aligner_linux.tar.gz)
    echo "download mfa done!"
fi

if [ ! -d "$MFA_DOWNLOAD_DIR/montreal-forced-aligner" ]; then
    echo "extracting mfa..."
    (cd $MFA_DOWNLOAD_DIR && tar xvf "montreal-forced-aligner_linux.tar.gz")
    echo "extraction done!"
fi

export PATH="$MFA_DOWNLOAD_DIR/montreal-forced-aligner/bin:$PATH"
if [ ! -d "$EXP_DIR/baker_alignment" ]; then
    echo "Start MFA training..."
    mfa_train_and_align $EXP_DIR/baker_corpus "$EXP_DIR/$LEXICON_NAME.lexicon" $EXP_DIR/baker_alignment -o $EXP_DIR/baker_model --clean --verbose --temp_directory exp/.mfa_train_and_align
    echo -e "training done!\nresults: $EXP_DIR/baker_alignment\nmodel: $EXP_DIR/baker_model\n"
fi