refine code, fix typos.

chenfeiyu 2021-07-01 18:51:35 +08:00
parent 96b8e44015
commit 47a9ab3a0b
3 changed files with 64 additions and 26 deletions

View File

@@ -11,6 +11,15 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Generate lexicon and symbols for Mandarin Chinese phonology.
The lexicon is used for the Montreal Forced Aligner.
Note that syllables are used as words in this lexicon, since syllables rather
than words are used in the transcriptions produced by `reorganize_baker.py`.
We make this choice to better leverage existing Chinese text-to-pinyin tools
such as pypinyin, which is the common convention for G2P in Chinese.
"""
import re
import argparse
@@ -32,55 +41,68 @@ SPECIALS = ['sil', 'sp']
def rule(C, V, R, T):
"""Generate a syllable given the initial, the final, erhua indicator, and tone.
Orthographical rules for pinyin are applied. (special case for y, w, ui, un, iu)
Note that in this system, 'ü' is alway written as 'v' when appeared in phoneme, but converted to
'u' in syllables when certain conditions are satisfied.
'i' is distinguished when appeared in phonemes, and separated into 3 categories, 'i', 'ii' and 'iii'.
Erhua is is possibly applied to every finals, except for finals that already ends with 'r'.
When a syllable is impossible or does not have any characters with this pronunciation, return None
to filter it out.
"""
# impossible syllables: 'ii' can only be combined with z, c, s
if V in ["ii"] and (C not in ['z', 'c', 's']):
return
return None
# 'iii' can only be combined with zh, ch, sh, r
if V in ['iii'] and (C not in ['zh', 'ch', 'sh', 'r']):
return
return None
# finals starting with 'i' or 'v' cannot be combined with f, g, k, h, zh, ch, sh, r, z, c, s
if (V not in ['ii', 'iii']) and V[0] in ['i', 'v'] and (
C in ['f', 'g', 'k', 'h', 'zh', 'ch', 'sh', 'r', 'z', 'c', 's']):
return
return None
# finals starting with 'v' can only be combined with j, q, x, l, n
if V.startswith("v"):
# 'v' and 've' can only be combined with j, q, x, n, l
if V in ['v', 've']:
if C not in ['j', 'q', 'x', 'n', 'l', '']:
return
return None
# the other 'v' finals can only be combined with j, q, x
else:
if C not in ['j', 'q', 'x', '']:
return
return None
# j, q, x can only be combined with finals starting with 'i' or 'v'
if (C in ['j', 'q', 'x']) and not (
(V not in ['ii', 'iii']) and V[0] in ['i', 'v']):
return
return None
# b, p, m, f cannot be combined with finals starting with 'u', except for 'u' itself
# b, p, m, f cannot be combined with finals starting with 'v'
if (C in ['b', 'p', 'm', 'f']) and ((V[0] in ['u', 'v'] and V != "u") or
V == 'ong'):
return
return None
# ua, uai, uang cannot be combined with d, t, n, l, r, z, c, s
if V in ['ua', 'uai', 'uang'
] and C in ['d', 't', 'n', 'l', 'r', 'z', 'c', 's']:
return
return None
# sh cannot be combined with ong
if V == 'ong' and C in ['sh']:
return
return None
# o cannot be combined with d, t, n, g, k, h, zh, ch, sh, r, z, c, s
if V == "o" and C in [
'd', 't', 'n', 'g', 'k', 'h', 'zh', 'ch', 'sh', 'r', 'z', 'c', 's'
]:
return
return None
# ueng only occurs in the ad-hoc syllable weng; in all other cases it is written as ong
if V == 'ueng' and C != '':
@@ -88,7 +110,7 @@ def rule(C, V, R, T):
# er without erhua can only stand alone (with no initial)
if V == 'er' and C != '':
return
return None
if C == '':
if V in ["i", "in", "ing"]:
@@ -118,7 +140,7 @@ def rule(C, V, R, T):
# finals that already end with 'r' cannot take erhua again
if result.endswith('r') and R == 'r':
return
return None
# change 'ii' and 'iii' back to 'i'
result = re.sub(r'i+', 'i', result)
@@ -127,13 +149,13 @@ def rule(C, V, R, T):
return result
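For readers of this diff, a minimal sketch (not the actual rule() body, which also applies the y/w spelling rules) of how a syllable is composed and how the placeholder finals 'ii'/'iii' are folded back to the orthographic 'i':

import re

def compose_syllable(C, V, R, T):
    # e.g. C='zh', V='iii', R='', T='4' -> 'zhiii4' -> 'zhi4'
    result = C + V + R + T
    result = re.sub(r'i+', 'i', result)
    return result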
def generate_lexicon(with_tone=False, with_r=False):
# generate lexicon without tone and erhua
def generate_lexicon(with_tone=False, with_erhua=False):
"""Generate lexicon for Mandarin Chinese."""
syllables = OrderedDict()
for C in [''] + INITIALS:
for V in FINALS:
for R in [''] if not with_r else ['', 'r']:
for R in [''] if not with_erhua else ['', 'r']:
for T in [''] if not with_tone else ['1', '2', '3', '4', '5']:
result = rule(C, V, R, T)
if result:
@@ -142,11 +164,12 @@ def generate_lexicon(with_tone=False, with_r=False):
def generate_symbols(lexicon):
"""Generate phoneme list for a lexicon."""
symbols = set()
for p in SPECIALS:
symbols.add(p)
for syllable, phonems in lexicon.items():
phonemes = phonems.split()
for syllable, phonemes in lexicon.items():
phonemes = phonemes.split()
for p in phonemes:
symbols.add(p)
return sorted(list(symbols))
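A usage sketch for the two helpers above (the file's actual command-line entry point is not shown in this diff):

if __name__ == "__main__":
    # build a syllable lexicon with tone numbers but without erhua,
    # then collect the phoneme inventory it uses
    lexicon = generate_lexicon(with_tone=True, with_erhua=False)
    symbols = generate_symbols(lexicon)
    print(len(lexicon), "syllables,", len(symbols), "symbols")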

View File

@@ -11,16 +11,27 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Script to reorganize Baker dataset so as to use Montreal Force
Aligner to align transcription and audio.
Please refer to https://montreal-forced-aligner.readthedocs.io/en/latest/data_prep.html
for more details about Montreal Force Aligner's requirements on cotpus.
For scripts to reorganize other corpus, please refer to
https://github.com/MontrealCorpusTools/MFA-reorganization-scripts
for more details.
"""
from typing import Union
from concurrent.futures import ThreadPoolExecutor
from pathlib import Path
import soundfile as sf
import librosa
from tqdm import tqdm
import os
import shutil
import argparse
from typing import Union
from pathlib import Path
from concurrent.futures import ThreadPoolExecutor
import librosa
import soundfile as sf
from tqdm import tqdm
def get_transcripts(path: Union[str, Path]):
@@ -54,7 +65,7 @@ def reorganize_baker(root_dir: Union[str, Path],
transcriptions = get_transcripts(transcript_path)
wave_dir = root_dir / "Wave"
wav_paths = list(wave_dir.glob("*.wav"))
wav_paths = sorted(list(wave_dir.glob("*.wav")))
output_dir = Path(output_dir).expanduser()
assert wave_dir != output_dir, "Don't use the original wav directory as output_dir"

View File

@@ -11,7 +11,9 @@ fi
if [ ! -d $EXP_DIR/baker_corpus ]; then
echo "reorganizing baker corpus..."
python local/reorganize_baker.py --root-dir=~/datasets/BZNSYP --output-dir=$EXP_DIR/baker_corpus --resample-audio
echo "reorganization done."
echo "reorganization done. Check output in $EXP_DIR/baker_corpus."
echo "audio files are resampled to 16kHz"
echo "transcription for each audio file is saved with the same namd in $EXP_DIR/baker_corpus "
fi
echo "detecting oov..."
@@ -37,7 +39,9 @@ export PATH="$MFA_DOWNLOAD_DIR/montreal-forced-aligner/bin"
if [ ! -d "$EXP_DIR/baker_alignment" ]; then
echo "Start MFA training..."
mfa_train_and_align $EXP_DIR/baker_corpus "$EXP_DIR/$LEXICON_NAME.lexicon" $EXP_DIR/baker_alignment -o $EXP_DIR/baker_model --clean --verbose --temp_directory exp/.mfa_train_and_align
echo "training done! \nresults: $EXP_DIR/baker_alignment \nmodel: $EXP_DIR/baker_model\n"
echo "training done!"
echo "results: $EXP_DIR/baker_alignment"
echo "model: $EXP_DIR/baker_model"
fi