clean code for data processing

2021-04-22 17:20:34 +08:00 · 2021-04-22 17:20:34 +08:00 · e8a9a118bb
parent 56f2552201
commit e8a9a118bb
8 changed files with 226 additions and 104 deletions
--- a/examples/tacotron2_aishell3/aishell3.py
+++ b/examples/tacotron2_aishell3/aishell3.py
@ -1,49 +1,55 @@
+import pickle
 from pathlib import Path
 import numpy as np
-import librosa
 from paddle.io import Dataset
-import pickle
 from parakeet.frontend import Vocab
 from parakeet.data import batch_text_id, batch_spec

 from preprocess_transcription import _phones, _tones
-voc_phones = Vocab(sorted(list(_phones)))
-print(voc_phones)
-voc_tones = Vocab(sorted(list(_tones)))
-print(voc_tones)

-# use yaml to store preprocessed aishell3 dataset
+voc_phones = Vocab(sorted(list(_phones)))
+print("vocab_phones:\n", voc_phones)
+voc_tones = Vocab(sorted(list(_tones)))
+print("vocab+tones:\n", voc_tones)
+
+
 class AiShell3(Dataset):
+    """Processed AiShell3 dataset."""
    def __init__(self, root):
+        super().__init__()
        self.root = Path(root).expanduser()
        self.embed_dir = self.root / "embed"
        self.mel_dir = self.root / "mel"

-        with open (self.root / "metadata.pickle", 'rb') as f:
+        with open(self.root / "metadata.pickle", 'rb') as f:
            self.records = pickle.load(f)
-    
+
    def __getitem__(self, index):
        metadatum = self.records[index]
        sentence_id = metadatum["sentence_id"]
        speaker_id = sentence_id[:7]
        phones = metadatum["phones"]
        tones = metadatum["tones"]
-        phones = np.array([voc_phones.lookup(item) for item in phones], dtype=np.int64)
-        tones = np.array([voc_tones.lookup(item) for item in tones], dtype=np.int64)
+        phones = np.array([voc_phones.lookup(item) for item in phones],
+                          dtype=np.int64)
+        tones = np.array([voc_tones.lookup(item) for item in tones],
+                         dtype=np.int64)
        mel = np.load(str(self.mel_dir / speaker_id / (sentence_id + ".npy")))
-        embed = np.load(str(self.embed_dir / speaker_id / (sentence_id + ".npy")))
+        embed = np.load(
+            str(self.embed_dir / speaker_id / (sentence_id + ".npy")))
        return phones, tones, mel, embed
-    
+
    def __len__(self):
        return len(self.records)
-    
+
+
 def collate_aishell3_examples(examples):
    phones, tones, mel, embed = list(zip(*examples))

    text_lengths = np.array([item.shape[0] for item in phones], dtype=np.int64)
    spec_lengths = np.array([item.shape[1] for item in mel], dtype=np.int64)
-    T_dec = np.max(spec_lengths)
-    stop_tokens = (np.arange(T_dec) >= np.expand_dims(spec_lengths, -1)).astype(np.float32)
+    stop_tokens = (np.arange(T_dec) >= np.expand_dims(spec_lengths,
+                                                      -1)).astype(np.float32)
    phones, _ = batch_text_id(phones)
    tones, _ = batch_text_id(tones)
    mel, _ = batch_spec(mel)
@ -53,13 +59,13 @@ def collate_aishell3_examples(examples):
    # (B, T), (B, T), (B, T, C), (B, C), (B,), (B,), (B, T)
    return phones, tones, mel, embed, text_lengths, spec_lengths, stop_tokens

+
 if __name__ == "__main__":
    dataset = AiShell3("~/datasets/aishell3/train")
    example = dataset[0]

    examples = [dataset[i] for i in range(10)]
    batch = collate_aishell3_examples(examples)
-    
+
    for field in batch:
        print(field.shape, field.dtype)
-        
--- a/examples/tacotron2_aishell3/chinese_g2p.py
+++ b/examples/tacotron2_aishell3/chinese_g2p.py
@ -2,9 +2,9 @@ from chinese_text_to_pinyin import convert_to_pinyin
 from chinese_phonology import convert, split_syllable
 from typing import List, Tuple

+
 def convert_sentence(text: str) -> List[Tuple[str]]:
    syllables = convert_to_pinyin(text)
-    syllables = [item[0] for item in syllables]
    phones = []
    tones = []
    for syllable in syllables:
--- a/examples/tacotron2_aishell3/chinese_text_to_pinyin.py
+++ b/examples/tacotron2_aishell3/chinese_text_to_pinyin.py
@ -1,4 +1,4 @@
-from pypinyin import pinyin, Style
+from pypinyin import lazy_pinyin, Style
 from typing import List


@ -6,6 +6,7 @@ def convert_to_pinyin(text: str) -> List[str]:
    """convert text into list of syllables, other characters that are not chinese, thus
    cannot be converted to pinyin are splited.
    """
-    syllables = pinyin(text, style=Style.TONE3, neutral_tone_with_five=True)
+    syllables = lazy_pinyin(text,
+                            style=Style.TONE3,
+                            neutral_tone_with_five=True)
    return syllables
-    
--- a/examples/tacotron2_aishell3/config.py
+++ b/examples/tacotron2_aishell3/config.py
@ -23,8 +23,8 @@ _C.data = CN(
        n_fft=1024,  # fft frame size
        win_length=1024,  # window size
        hop_length=256,  # hop size between ajacent frame
-        f_max=8000,  # Hz, max frequency when converting to mel
-        f_min=0,  # Hz, min frequency when converting to mel
+        fmax=8000,  # Hz, max frequency when converting to mel
+        fmin=0,  # Hz, min frequency when converting to mel
        d_mels=80,  # mel bands
        padding_idx=0,  # text embedding's padding index
    ))
@ -38,8 +38,10 @@ _C.model = CN(
        encoder_conv_layers=3,  # number of conv layer in tacotron2 encoder
        encoder_kernel_size=5,  # kernel size of conv layers in tacotron2 encoder
        d_prenet=256,  # hidden size of decoder prenet
-        d_attention_rnn=1024,  # hidden size of the first rnn layer in tacotron2 decoder
-        d_decoder_rnn=1024,  # hidden size of the second rnn layer in tacotron2 decoder
+        d_attention_rnn=
+        1024,  # hidden size of the first rnn layer in tacotron2 decoder
+        d_decoder_rnn=
+        1024,  # hidden size of the second rnn layer in tacotron2 decoder
        d_attention=128,  # hidden size of  decoder location linear layer
        attention_filters=32,  # number of filter in decoder location conv layer
        attention_kernel_size=31,  # kernel size of decoder location conv layer
@ -48,8 +50,10 @@ _C.model = CN(
        postnet_conv_layers=5,  # number of conv layer in decoder postnet
        p_encoder_dropout=0.5,  # droput probability in encoder
        p_prenet_dropout=0.5,  # droput probability in decoder prenet
-        p_attention_dropout=0.1,  # droput probability of first rnn layer in decoder
-        p_decoder_dropout=0.1,  # droput probability of second rnn layer in decoder
+        p_attention_dropout=
+        0.1,  # dropout probability of first rnn layer in decoder
+        p_decoder_dropout=
+        0.1,  # dropout probability of second rnn layer in decoder
        p_postnet_dropout=0.5,  # droput probability in decoder postnet
        guided_attention_loss_sigma=0.2,
        d_global_condition=256,
@ -71,5 +75,3 @@ def get_cfg_defaults():
    # Return a clone so that the defaults will not be altered
    # This is for the "local variable" use pattern
    return _C.clone()
-
-
--- a/examples/tacotron2_aishell3/extract_mel.py
+++ b/examples/tacotron2_aishell3/extract_mel.py
@ -1,17 +1,20 @@
+import argparse
 import numpy as np
 from pathlib import Path
 from parakeet.audio import AudioProcessor
-from parakeet.audio.spec_normalizer import LogMagnitude
+from parakeet.audio.spec_normalizer import NormalizerBase, LogMagnitude
 import multiprocessing as mp
 from functools import partial
 import tqdm
-from yacs.config import CfgNode

-def extract_mel(fname:Path, input_dir:Path, output_dir:Path, p, n):
+from config import get_cfg_defaults
+
+
+def extract_mel(fname: Path, input_dir: Path, output_dir: Path,
+                p: AudioProcessor, n: NormalizerBase):
    relative_path = fname.relative_to(input_dir)
    out_path = (output_dir / relative_path).with_suffix(".npy")
    out_path.parent.mkdir(parents=True, exist_ok=True)
-    # TODO: maybe we need to rescale the audio
    wav = p.read_wav(fname)
    mel = p.mel_spectrogram(wav)
    mel = n.transform(mel)
@ -25,33 +28,54 @@ def extract_mel_multispeaker(config, input_dir, output_dir, extension=".wav"):
    output_dir.mkdir(parents=True, exist_ok=True)

    p = AudioProcessor(config.sample_rate, config.n_fft, config.win_length,
-                    config.hop_length, config.n_mels, config.fmin,
-                    config.fmax)
+                       config.hop_length, config.n_mels, config.fmin,
+                       config.fmax)
    n = LogMagnitude(1e-5)

    func = partial(extract_mel,
-                input_dir=input_dir,
-                output_dir=output_dir,
-                p=p,
-                n=n)
+                   input_dir=input_dir,
+                   output_dir=output_dir,
+                   p=p,
+                   n=n)

    with mp.Pool(16) as pool:
        list(
            tqdm.tqdm(pool.imap(func, fnames),
-                    total=len(fnames),
-                    unit="utterance"))
-
+                      total=len(fnames),
+                      unit="utterance"))


 if __name__ == "__main__":
-    audio_config = {
-        "sample_rate": 22050,
-        "n_fft": 1024,
-        "win_length": 1024,
-        "hop_length": 256,
-        "n_mels": 80,
-        "fmin": 0,
-        "fmax": 8000}
-    audio_config = CfgNode(audio_config)
-    extract_mel_multispeaker(audio_config, "~/datasets/aishell3/train/normalized_wav", "~/datasets/aishell3/train/mel")
-    
+    parser = argparse.ArgumentParser(
+        description=
+        "Extract mel spectrogram from processed wav in AiShell3 training dataset."
+    )
+    parser.add_argument(
+        "--config",
+        type=str,
+        help="yaml config file to overwrite the default config")
+    parser.add_argument("--input",
+                        type=str,
+                        default="~/datasets/aishell3/train/normalized_wav",
+                        help="path of the processed wav folder")
+    parser.add_argument("--output",
+                        type=str,
+                        default="~/datasets/aishell3/train/mel",
+                        help="path of the folder to save mel spectrograms")
+    parser.add_argument(
+        "--opts",
+        nargs=argparse.REMAINDER,
+        help=
+        "options to overwrite --config file and the default config, passing in KEY VALUE pairs"
+    )
+    default_config = get_cfg_defaults()
+
+    args = parser.parse_args()
+    if args.config:
+        default_config.merge_from_file(args.config)
+    if args.opts:
+        default_config.merge_from_list(args.opts)
+    default_config.freeze()
+    audio_config = default_config.data
+
+    extract_mel_multispeaker(audio_config, args.input, args.output)
--- a/examples/tacotron2_aishell3/preprocess_transcription.py
+++ b/examples/tacotron2_aishell3/preprocess_transcription.py
@ -1,13 +1,10 @@
-from paddle.io import Dataset
+import argparse
 from pathlib import Path
 import re
 import pickle
+
 import yaml
 import tqdm
-from parakeet.audio import AudioProcessor, LogMagnitude
-import numpy as np
-import multiprocessing as mp
-from functools import partial

 zh_pattern = re.compile("[\u4e00-\u9fa5]")

@ -16,21 +13,71 @@ _tones = {'<pad>', '<s>', '</s>', '0', '1', '2', '3', '4', '5'}
 _pauses = {'%', '$'}

 _initials = {
-    'b', 'p', 'm', 'f',
-    'd', 't', 'n', 'l',
-    'g', 'k', 'h',
-    'j', 'q', 'x',
-    'zh', 'ch', 'sh',
+    'b',
+    'p',
+    'm',
+    'f',
+    'd',
+    't',
+    'n',
+    'l',
+    'g',
+    'k',
+    'h',
+    'j',
+    'q',
+    'x',
+    'zh',
+    'ch',
+    'sh',
    'r',
-    'z', 'c', 's',
+    'z',
+    'c',
+    's',
 }

 _finals = {
-    'ii', 'iii', 'a', 'o', 'e', 'ea', 'ai', 'ei', 'ao', 'ou', 'an', 'en', 'ang', 'eng', 'er',
-    'i', 'ia', 'io', 'ie', 'iai', 'iao', 'iou', 'ian', 'ien', 'iang', 'ieng',
-    'u', 'ua', 'uo', 'uai', 'uei', 'uan', 'uen', 'uang', 'ueng',
-    'v', 've', 'van', 'ven', 'veng',
-}   
+    'ii',
+    'iii',
+    'a',
+    'o',
+    'e',
+    'ea',
+    'ai',
+    'ei',
+    'ao',
+    'ou',
+    'an',
+    'en',
+    'ang',
+    'eng',
+    'er',
+    'i',
+    'ia',
+    'io',
+    'ie',
+    'iai',
+    'iao',
+    'iou',
+    'ian',
+    'ien',
+    'iang',
+    'ieng',
+    'u',
+    'ua',
+    'uo',
+    'uai',
+    'uei',
+    'uan',
+    'uen',
+    'uang',
+    'ueng',
+    'v',
+    've',
+    'van',
+    'ven',
+    'veng',
+}

 _ernized_symbol = {'&r'}

@ -38,6 +85,7 @@ _specials = {'<pad>', '<unk>', '<s>', '</s>'}

 _phones = _initials | _finals | _ernized_symbol | _specials | _pauses

+
 def is_zh(word):
    global zh_pattern
    match = zh_pattern.search(word)
@ -47,6 +95,7 @@ def is_zh(word):
 def ernized(syllable):
    return syllable[:2] != "er" and syllable[-2] == 'r'

+
 def convert(syllable):
    # expansion of o -> uo
    syllable = re.sub(r"([bpmf])o$", r"\1uo", syllable)
@ -56,15 +105,17 @@ def convert(syllable):

    # expansion for ing, in
    syllable = syllable.replace("ing", "ieng").replace("in", "ien")
-    
+
    # expansion for un, ui, iu
-    syllable = syllable.replace("un", "uen").replace("ui", "uei").replace("iu", "iou")
+    syllable = syllable.replace("un",
+                                "uen").replace("ui",
+                                               "uei").replace("iu", "iou")

    # rule for variants of i
    syllable = syllable.replace("zi", "zii").replace("ci", "cii").replace("si", "sii")\
        .replace("zhi", "zhiii").replace("chi", "chiii").replace("shi", "shiii")\
        .replace("ri", "riii")
-    
+
    # rule for y preceding i, u
    syllable = syllable.replace("yi", "i").replace("yu", "v").replace("y", "i")

@ -72,11 +123,21 @@ def convert(syllable):
    syllable = syllable.replace("wu", "u").replace("w", "u")

    # rule for v following j, q, x
-    syllable = syllable.replace("ju", "jv").replace("qu", "qv").replace("xu", "xv")
+    syllable = syllable.replace("ju", "jv").replace("qu",
+                                                    "qv").replace("xu", "xv")

    return syllable

-def split_syllable(syllable:str):
+
+def split_syllable(syllable: str):
+    """Split a syllable in pinyin into a list of phones and a list of tones.
+    Initials have no tone, represented by '0', while finals have tones from
+    '1,2,3,4,5'.
+
+    e.g.
+
+    zhang1 -> ['zh', 'ang'], ['0', '1']
+    """
    if syllable in _pauses:
        # syllable, tone
        return [syllable], ['0']
@ -104,7 +165,7 @@ def split_syllable(syllable:str):
    return phones, tones


-def load_aishell3_transcription(line:str):
+def load_aishell3_transcription(line: str):
    sentence_id, pinyin, text = line.strip().split("|")
    syllables = pinyin.strip().split()

@ -118,7 +179,7 @@ def load_aishell3_transcription(line:str):
        else:
            results.append(syllable[:-2] + syllable[-1])
            results.append('&r5')
-    
+
    phones = []
    tones = []
    for syllable in results:
@ -127,7 +188,13 @@ def load_aishell3_transcription(line:str):
        tones.extend(t)
    for p in phones:
        assert p in _phones, p
-    return {"sentence_id": sentence_id, "text": text, "syllables": results, "phones": phones, "tones": tones}
+    return {
+        "sentence_id": sentence_id,
+        "text": text,
+        "syllables": results,
+        "phones": phones,
+        "tones": tones
+    }


 def process_aishell3(dataset_root, output_dir):
@ -136,26 +203,46 @@ def process_aishell3(dataset_root, output_dir):
    output_dir.mkdir(parents=True, exist_ok=True)

    prosody_label_path = dataset_root / "label_train-set.txt"
-    with open(prosody_label_path, 'rt') as f: 
+    with open(prosody_label_path, 'rt') as f:
        lines = [line.strip() for line in f]
-    
+
    records = lines[5:]
-    
+
    processed_records = []
    for record in tqdm.tqdm(records):
        new_record = load_aishell3_transcription(record)
        processed_records.append(new_record)
        print(new_record)

-    with open(output_dir / "metadata.pickle", 'wb') as f: 
+    with open(output_dir / "metadata.pickle", 'wb') as f:
        pickle.dump(processed_records, f)
-    
-    with open(output_dir / "metadata.yaml", 'wt', encoding="utf-8") as f: 
-        yaml.safe_dump(processed_records, f, default_flow_style=None, allow_unicode=True)
-    
-    print("metadata done!")     

+    with open(output_dir / "metadata.yaml", 'wt', encoding="utf-8") as f:
+        yaml.safe_dump(processed_records,
+                       f,
+                       default_flow_style=None,
+                       allow_unicode=True)
+
+    print("metadata done!")


 if __name__ == "__main__":
-    process_aishell3("~/datasets/aishell3/train", "~/datasets/aishell3/train")
+    parser = argparse.ArgumentParser(
+        description=
+        "Preprocess transcription of AiShell3 and save them in a compact file(yaml and pickle)."
+    )
+    parser.add_argument(
+        "--input",
+        type=str,
+        default="~/datasets/aishell3/train",
+        help="path of the training dataset,(contains a label_train-set.txt).")
+    parser.add_argument(
+        "--output",
+        type=str,
+        help="the directory to save the processed transcription."
+        "If not provided, it would be the same as the input.")
+    args = parser.parse_args()
+    if args.output is None:
+        args.output = args.input
+
+    process_aishell3(args.input, args.output)
--- a/examples/tacotron2_aishell3/process_wav.py
+++ b/examples/tacotron2_aishell3/process_wav.py
@ -1,15 +1,17 @@
-import librosa
-import soundfile as sf
 from pathlib import Path
 from multiprocessing import Pool
-from tqdm import tqdm
 from functools import partial
+
 import numpy as np
+import librosa
+import soundfile as sf
+from tqdm import tqdm
 from praatio import tgio

+
 def get_valid_part(fpath):
    f = tgio.openTextgrid(fpath)
-    
+
    start = 0
    phone_entry_list = f.tierDict['phones'].entryList
    first_entry = phone_entry_list[0]
@ -22,7 +24,7 @@ def get_valid_part(fpath):
    else:
        end = last_entry.end
    return start, end
-        
+

 def process_utterance(fpath, source_dir, target_dir, alignment_dir):
    rel_path = fpath.relative_to(source_dir)
@ -41,16 +43,20 @@ def preprocess_aishell3(source_dir, target_dir, alignment_dir):
    source_dir = Path(source_dir).expanduser()
    target_dir = Path(target_dir).expanduser()
    alignment_dir = Path(alignment_dir).expanduser()
-    
+
    wav_paths = list(source_dir.rglob("*.wav"))
    print(f"there are {len(wav_paths)} audio files in total")
-    fx = partial(process_utterance, source_dir=source_dir, target_dir=target_dir, alignment_dir=alignment_dir)
+    fx = partial(process_utterance,
+                 source_dir=source_dir,
+                 target_dir=target_dir,
+                 alignment_dir=alignment_dir)
    with Pool(16) as p:
-        list(tqdm(p.imap(fx, wav_paths), total=len(wav_paths), unit="utterance"))
+        list(
+            tqdm(p.imap(fx, wav_paths), total=len(wav_paths),
+                 unit="utterance"))


 if __name__ == "__main__":
-    preprocess_aishell3("~/datasets/aishell3/train/wav", "~/datasets/aishell3/train/normalized_wav", "~/datasets/aishell3/train/alignment")
-    
-    
-    
+    preprocess_aishell3("~/datasets/aishell3/train/wav",
+                        "~/datasets/aishell3/train/normalized_wav",
+                        "~/datasets/aishell3/train/alignment")
--- a/parakeet/training/experiment.py
+++ b/parakeet/training/experiment.py
@ -12,19 +12,15 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-import time
 import sys
-from collections import defaultdict
 import logging
 from pathlib import Path
-import numpy as np

 import paddle
 from paddle import distributed as dist
 from paddle.io import DistributedBatchSampler
 from visualdl import LogWriter

-import parakeet
 from parakeet.utils import checkpoint, mp_tools

 __all__ = ["ExperimentBase"]