clean code for data processing

2021-04-22 17:20:34 +08:00 · 2021-04-22 17:20:34 +08:00 · e8a9a118bb
parent 56f2552201
commit e8a9a118bb
8 changed files with 226 additions and 104 deletions
--- a/examples/tacotron2_aishell3/aishell3.py
+++ b/examples/tacotron2_aishell3/aishell3.py
@ -1,49 +1,55 @@
 import pickle
 from pathlib import Path
 import numpy as np
 import librosa
 from paddle.io import Dataset
 import pickle
 from parakeet.frontend import Vocab
 from parakeet.data import batch_text_id, batch_spec
 from preprocess_transcription import _phones, _tones
 voc_phones = Vocab(sorted(list(_phones)))
 print(voc_phones)
 voc_tones = Vocab(sorted(list(_tones)))
 print(voc_tones)
-# use yaml to store preprocessed aishell3 dataset
+voc_phones = Vocab(sorted(list(_phones)))
 print("vocab_phones:\n", voc_phones)
 voc_tones = Vocab(sorted(list(_tones)))
 print("vocab+tones:\n", voc_tones)
 class AiShell3(Dataset):
    """Processed AiShell3 dataset."""
    def __init__(self, root):
        super().__init__()
        self.root = Path(root).expanduser()
        self.embed_dir = self.root / "embed"
        self.mel_dir = self.root / "mel"
-        with open (self.root / "metadata.pickle", 'rb') as f:
+        with open(self.root / "metadata.pickle", 'rb') as f:
            self.records = pickle.load(f)
-    
+
    def __getitem__(self, index):
        metadatum = self.records[index]
        sentence_id = metadatum["sentence_id"]
        speaker_id = sentence_id[:7]
        phones = metadatum["phones"]
        tones = metadatum["tones"]
-        phones = np.array([voc_phones.lookup(item) for item in phones], dtype=np.int64)
+        phones = np.array([voc_phones.lookup(item) for item in phones],
-        tones = np.array([voc_tones.lookup(item) for item in tones], dtype=np.int64)
+                          dtype=np.int64)
        tones = np.array([voc_tones.lookup(item) for item in tones],
                         dtype=np.int64)
        mel = np.load(str(self.mel_dir / speaker_id / (sentence_id + ".npy")))
-        embed = np.load(str(self.embed_dir / speaker_id / (sentence_id + ".npy")))
+        embed = np.load(
            str(self.embed_dir / speaker_id / (sentence_id + ".npy")))
        return phones, tones, mel, embed
-    
+
    def __len__(self):
        return len(self.records)
-    
+
 def collate_aishell3_examples(examples):
    phones, tones, mel, embed = list(zip(*examples))
    text_lengths = np.array([item.shape[0] for item in phones], dtype=np.int64)
    spec_lengths = np.array([item.shape[1] for item in mel], dtype=np.int64)
-    T_dec = np.max(spec_lengths)
+    stop_tokens = (np.arange(T_dec) >= np.expand_dims(spec_lengths,
-    stop_tokens = (np.arange(T_dec) >= np.expand_dims(spec_lengths, -1)).astype(np.float32)
+                                                      -1)).astype(np.float32)
    phones, _ = batch_text_id(phones)
    tones, _ = batch_text_id(tones)
    mel, _ = batch_spec(mel)
@ -53,13 +59,13 @@ def collate_aishell3_examples(examples):
    # (B, T), (B, T), (B, T, C), (B, C), (B,), (B,), (B, T)
    return phones, tones, mel, embed, text_lengths, spec_lengths, stop_tokens
 if __name__ == "__main__":
    dataset = AiShell3("~/datasets/aishell3/train")
    example = dataset[0]
    examples = [dataset[i] for i in range(10)]
    batch = collate_aishell3_examples(examples)
-    
+
    for field in batch:
        print(field.shape, field.dtype)
--- a/examples/tacotron2_aishell3/chinese_g2p.py
+++ b/examples/tacotron2_aishell3/chinese_g2p.py
@ -2,9 +2,9 @@ from chinese_text_to_pinyin import convert_to_pinyin
 from chinese_phonology import convert, split_syllable
 from typing import List, Tuple
 def convert_sentence(text: str) -> List[Tuple[str]]:
    syllables = convert_to_pinyin(text)
    syllables = [item[0] for item in syllables]
    phones = []
    tones = []
    for syllable in syllables:
--- a/examples/tacotron2_aishell3/chinese_text_to_pinyin.py
+++ b/examples/tacotron2_aishell3/chinese_text_to_pinyin.py
@ -1,4 +1,4 @@
-from pypinyin import pinyin, Style
+from pypinyin import lazy_pinyin, Style
 from typing import List
@ -6,6 +6,7 @@ def convert_to_pinyin(text: str) -> List[str]:
    """convert text into list of syllables, other characters that are not chinese, thus
    cannot be converted to pinyin are split.
    """
-    syllables = pinyin(text, style=Style.TONE3, neutral_tone_with_five=True)
+    syllables = lazy_pinyin(text,
                            style=Style.TONE3,
                            neutral_tone_with_five=True)
    return syllables
--- a/examples/tacotron2_aishell3/config.py
+++ b/examples/tacotron2_aishell3/config.py
@ -23,8 +23,8 @@ _C.data = CN(
        n_fft=1024,  # fft frame size
        win_length=1024,  # window size
        hop_length=256,  # hop size between adjacent frames
-        f_max=8000,  # Hz, max frequency when converting to mel
+        fmax=8000,  # Hz, max frequency when converting to mel
-        f_min=0,  # Hz, min frequency when converting to mel
+        fmin=0,  # Hz, min frequency when converting to mel
        d_mels=80,  # mel bands
        padding_idx=0,  # text embedding's padding index
    ))
@ -38,8 +38,10 @@ _C.model = CN(
        encoder_conv_layers=3,  # number of conv layer in tacotron2 encoder
        encoder_kernel_size=5,  # kernel size of conv layers in tacotron2 encoder
        d_prenet=256,  # hidden size of decoder prenet
-        d_attention_rnn=1024,  # hidden size of the first rnn layer in tacotron2 decoder
+        d_attention_rnn=
-        d_decoder_rnn=1024,  # hidden size of the second rnn layer in tacotron2 decoder
+        1024,  # hidden size of the first rnn layer in tacotron2 decoder
        d_decoder_rnn=
        1024,  # hidden size of the second rnn layer in tacotron2 decoder
        d_attention=128,  # hidden size of  decoder location linear layer
        attention_filters=32,  # number of filter in decoder location conv layer
        attention_kernel_size=31,  # kernel size of decoder location conv layer
@ -48,8 +50,10 @@ _C.model = CN(
        postnet_conv_layers=5,  # number of conv layer in decoder postnet
        p_encoder_dropout=0.5,  # dropout probability in encoder
        p_prenet_dropout=0.5,  # dropout probability in decoder prenet
-        p_attention_dropout=0.1,  # dropout probability of first rnn layer in decoder
+        p_attention_dropout=
-        p_decoder_dropout=0.1,  # dropout probability of second rnn layer in decoder
+        0.1,  # dropout probability of first rnn layer in decoder
        p_decoder_dropout=
        0.1,  # dropout probability of second rnn layer in decoder
        p_postnet_dropout=0.5,  # dropout probability in decoder postnet
        guided_attention_loss_sigma=0.2,
        d_global_condition=256,
@ -71,5 +75,3 @@ def get_cfg_defaults():
    # Return a clone so that the defaults will not be altered
    # This is for the "local variable" use pattern
    return _C.clone()
--- a/examples/tacotron2_aishell3/extract_mel.py
+++ b/examples/tacotron2_aishell3/extract_mel.py
@ -1,17 +1,20 @@
 import argparse
 import numpy as np
 from pathlib import Path
 from parakeet.audio import AudioProcessor
-from parakeet.audio.spec_normalizer import LogMagnitude
+from parakeet.audio.spec_normalizer import NormalizerBase, LogMagnitude
 import multiprocessing as mp
 from functools import partial
 import tqdm
 from yacs.config import CfgNode
-def extract_mel(fname:Path, input_dir:Path, output_dir:Path, p, n):
+from config import get_cfg_defaults
 def extract_mel(fname: Path, input_dir: Path, output_dir: Path,
                p: AudioProcessor, n: NormalizerBase):
    relative_path = fname.relative_to(input_dir)
    out_path = (output_dir / relative_path).with_suffix(".npy")
    out_path.parent.mkdir(parents=True, exist_ok=True)
    # TODO: maybe we need to rescale the audio
    wav = p.read_wav(fname)
    mel = p.mel_spectrogram(wav)
    mel = n.transform(mel)
@ -25,33 +28,54 @@ def extract_mel_multispeaker(config, input_dir, output_dir, extension=".wav"):
    output_dir.mkdir(parents=True, exist_ok=True)
    p = AudioProcessor(config.sample_rate, config.n_fft, config.win_length,
-                    config.hop_length, config.n_mels, config.fmin,
+                       config.hop_length, config.n_mels, config.fmin,
-                    config.fmax)
+                       config.fmax)
    n = LogMagnitude(1e-5)
    func = partial(extract_mel,
-                input_dir=input_dir,
+                   input_dir=input_dir,
-                output_dir=output_dir,
+                   output_dir=output_dir,
-                p=p,
+                   p=p,
-                n=n)
+                   n=n)
    with mp.Pool(16) as pool:
        list(
            tqdm.tqdm(pool.imap(func, fnames),
-                    total=len(fnames),
+                      total=len(fnames),
-                    unit="utterance"))
+                      unit="utterance"))
 if __name__ == "__main__":
-    audio_config = {
+    parser = argparse.ArgumentParser(
-        "sample_rate": 22050,
+        description=
-        "n_fft": 1024,
+        "Extract mel spectrogram from processed wav in AiShell3 training dataset."
-        "win_length": 1024,
+    )
-        "hop_length": 256,
+    parser.add_argument(
-        "n_mels": 80,
+        "--config",
-        "fmin": 0,
+        type=str,
-        "fmax": 8000}
+        help="yaml config file to overwrite the default config")
-    audio_config = CfgNode(audio_config)
+    parser.add_argument("--input",
-    extract_mel_multispeaker(audio_config, "~/datasets/aishell3/train/normalized_wav", "~/datasets/aishell3/train/mel")
+                        type=str,
-    
+                        default="~/datasets/aishell3/train/normalized_wav",
                        help="path of the processed wav folder")
    parser.add_argument("--output",
                        type=str,
                        default="~/datasets/aishell3/train/mel",
                        help="path of the folder to save mel spectrograms")
    parser.add_argument(
        "--opts",
        nargs=argparse.REMAINDER,
        help=
        "options to overwrite --config file and the default config, passing in KEY VALUE pairs"
    )
    default_config = get_cfg_defaults()
    args = parser.parse_args()
    if args.config:
        default_config.merge_from_file(args.config)
    if args.opts:
        default_config.merge_from_list(args.opts)
    default_config.freeze()
    audio_config = default_config.data
    extract_mel_multispeaker(audio_config, args.input, args.output)
--- a/examples/tacotron2_aishell3/preprocess_transcription.py
+++ b/examples/tacotron2_aishell3/preprocess_transcription.py
@ -1,13 +1,10 @@
-from paddle.io import Dataset
+import argparse
 from pathlib import Path
 import re
 import pickle
 import yaml
 import tqdm
 from parakeet.audio import AudioProcessor, LogMagnitude
 import numpy as np
 import multiprocessing as mp
 from functools import partial
 zh_pattern = re.compile("[\u4e00-\u9fa5]")
@ -16,21 +13,71 @@ _tones = {'<pad>', '<s>', '</s>', '0', '1', '2', '3', '4', '5'}
 _pauses = {'%', '$'}
 _initials = {
-    'b', 'p', 'm', 'f',
+    'b',
-    'd', 't', 'n', 'l',
+    'p',
-    'g', 'k', 'h',
+    'm',
-    'j', 'q', 'x',
+    'f',
-    'zh', 'ch', 'sh',
+    'd',
    't',
    'n',
    'l',
    'g',
    'k',
    'h',
    'j',
    'q',
    'x',
    'zh',
    'ch',
    'sh',
    'r',
-    'z', 'c', 's',
+    'z',
    'c',
    's',
 }
 _finals = {
-    'ii', 'iii', 'a', 'o', 'e', 'ea', 'ai', 'ei', 'ao', 'ou', 'an', 'en', 'ang', 'eng', 'er',
+    'ii',
-    'i', 'ia', 'io', 'ie', 'iai', 'iao', 'iou', 'ian', 'ien', 'iang', 'ieng',
+    'iii',
-    'u', 'ua', 'uo', 'uai', 'uei', 'uan', 'uen', 'uang', 'ueng',
+    'a',
-    'v', 've', 'van', 'ven', 'veng',
+    'o',
-}   
+    'e',
    'ea',
    'ai',
    'ei',
    'ao',
    'ou',
    'an',
    'en',
    'ang',
    'eng',
    'er',
    'i',
    'ia',
    'io',
    'ie',
    'iai',
    'iao',
    'iou',
    'ian',
    'ien',
    'iang',
    'ieng',
    'u',
    'ua',
    'uo',
    'uai',
    'uei',
    'uan',
    'uen',
    'uang',
    'ueng',
    'v',
    've',
    'van',
    'ven',
    'veng',
 }
 _ernized_symbol = {'&r'}
@ -38,6 +85,7 @@ _specials = {'<pad>', '<unk>', '<s>', '</s>'}
 _phones = _initials | _finals | _ernized_symbol | _specials | _pauses
 def is_zh(word):
    global zh_pattern
    match = zh_pattern.search(word)
@ -47,6 +95,7 @@ def is_zh(word):
 def ernized(syllable):
    return syllable[:2] != "er" and syllable[-2] == 'r'
 def convert(syllable):
    # expansion of o -> uo
    syllable = re.sub(r"([bpmf])o$", r"\1uo", syllable)
@ -56,15 +105,17 @@ def convert(syllable):
    # expansion for ing, in
    syllable = syllable.replace("ing", "ieng").replace("in", "ien")
-    
+
    # expansion for un, ui, iu
-    syllable = syllable.replace("un", "uen").replace("ui", "uei").replace("iu", "iou")
+    syllable = syllable.replace("un",
                                "uen").replace("ui",
                                               "uei").replace("iu", "iou")
    # rule for variants of i
    syllable = syllable.replace("zi", "zii").replace("ci", "cii").replace("si", "sii")\
        .replace("zhi", "zhiii").replace("chi", "chiii").replace("shi", "shiii")\
        .replace("ri", "riii")
-    
+
    # rule for y preceding i, u
    syllable = syllable.replace("yi", "i").replace("yu", "v").replace("y", "i")
@ -72,11 +123,21 @@ def convert(syllable):
    syllable = syllable.replace("wu", "u").replace("w", "u")
    # rule for v following j, q, x
-    syllable = syllable.replace("ju", "jv").replace("qu", "qv").replace("xu", "xv")
+    syllable = syllable.replace("ju", "jv").replace("qu",
                                                    "qv").replace("xu", "xv")
    return syllable
-def split_syllable(syllable:str):
+
 def split_syllable(syllable: str):
    """Split a syllable in pinyin into a list of phones and a list of tones.
    Initials have no tone, represented by '0', while finals have tones from
    '1,2,3,4,5'.
    e.g.
    zhang -> ['zh', 'ang'], ['0', '1']
    """
    if syllable in _pauses:
        # syllable, tone
        return [syllable], ['0']
@ -104,7 +165,7 @@ def split_syllable(syllable:str):
    return phones, tones
-def load_aishell3_transcription(line:str):
+def load_aishell3_transcription(line: str):
    sentence_id, pinyin, text = line.strip().split("|")
    syllables = pinyin.strip().split()
@ -118,7 +179,7 @@ def load_aishell3_transcription(line:str):
        else:
            results.append(syllable[:-2] + syllable[-1])
            results.append('&r5')
-    
+
    phones = []
    tones = []
    for syllable in results:
@ -127,7 +188,13 @@ def load_aishell3_transcription(line:str):
        tones.extend(t)
    for p in phones:
        assert p in _phones, p
-    return {"sentence_id": sentence_id, "text": text, "syllables": results, "phones": phones, "tones": tones}
+    return {
        "sentence_id": sentence_id,
        "text": text,
        "syllables": results,
        "phones": phones,
        "tones": tones
    }
 def process_aishell3(dataset_root, output_dir):
@ -136,26 +203,46 @@ def process_aishell3(dataset_root, output_dir):
    output_dir.mkdir(parents=True, exist_ok=True)
    prosody_label_path = dataset_root / "label_train-set.txt"
-    with open(prosody_label_path, 'rt') as f: 
+    with open(prosody_label_path, 'rt') as f:
        lines = [line.strip() for line in f]
-    
+
    records = lines[5:]
-    
+
    processed_records = []
    for record in tqdm.tqdm(records):
        new_record = load_aishell3_transcription(record)
        processed_records.append(new_record)
        print(new_record)
-    with open(output_dir / "metadata.pickle", 'wb') as f: 
+    with open(output_dir / "metadata.pickle", 'wb') as f:
        pickle.dump(processed_records, f)
    with open(output_dir / "metadata.yaml", 'wt', encoding="utf-8") as f: 
        yaml.safe_dump(processed_records, f, default_flow_style=None, allow_unicode=True)
    print("metadata done!")     
    with open(output_dir / "metadata.yaml", 'wt', encoding="utf-8") as f:
        yaml.safe_dump(processed_records,
                       f,
                       default_flow_style=None,
                       allow_unicode=True)
    print("metadata done!")
 if __name__ == "__main__":
-    process_aishell3("~/datasets/aishell3/train", "~/datasets/aishell3/train")
+    parser = argparse.ArgumentParser(
        description=
        "Preprocess transcription of AiShell3 and save them in a compact file(yaml and pickle)."
    )
    parser.add_argument(
        "--input",
        type=str,
        default="~/datasets/aishell3/train",
        help="path of the training dataset,(contains a label_train-set.txt).")
    parser.add_argument(
        "--output",
        type=str,
        help="the directory to save the processed transcription."
        "If not provided, it would be the same as the input.")
    args = parser.parse_args()
    if args.output is None:
        args.output = args.input
    process_aishell3(args.input, args.output)
--- a/examples/tacotron2_aishell3/process_wav.py
+++ b/examples/tacotron2_aishell3/process_wav.py
@ -1,15 +1,17 @@
 import librosa
 import soundfile as sf
 from pathlib import Path
 from multiprocessing import Pool
 from tqdm import tqdm
 from functools import partial
 import numpy as np
 import librosa
 import soundfile as sf
 from tqdm import tqdm
 from praatio import tgio
 def get_valid_part(fpath):
    f = tgio.openTextgrid(fpath)
-    
+
    start = 0
    phone_entry_list = f.tierDict['phones'].entryList
    first_entry = phone_entry_list[0]
@ -22,7 +24,7 @@ def get_valid_part(fpath):
    else:
        end = last_entry.end
    return start, end
-        
+
 def process_utterance(fpath, source_dir, target_dir, alignment_dir):
    rel_path = fpath.relative_to(source_dir)
@ -41,16 +43,20 @@ def preprocess_aishell3(source_dir, target_dir, alignment_dir):
    source_dir = Path(source_dir).expanduser()
    target_dir = Path(target_dir).expanduser()
    alignment_dir = Path(alignment_dir).expanduser()
-    
+
    wav_paths = list(source_dir.rglob("*.wav"))
    print(f"there are {len(wav_paths)} audio files in total")
-    fx = partial(process_utterance, source_dir=source_dir, target_dir=target_dir, alignment_dir=alignment_dir)
+    fx = partial(process_utterance,
                 source_dir=source_dir,
                 target_dir=target_dir,
                 alignment_dir=alignment_dir)
    with Pool(16) as p:
-        list(tqdm(p.imap(fx, wav_paths), total=len(wav_paths), unit="utterance"))
+        list(
            tqdm(p.imap(fx, wav_paths), total=len(wav_paths),
                 unit="utterance"))
 if __name__ == "__main__":
-    preprocess_aishell3("~/datasets/aishell3/train/wav", "~/datasets/aishell3/train/normalized_wav", "~/datasets/aishell3/train/alignment")
+    preprocess_aishell3("~/datasets/aishell3/train/wav",
-    
+                        "~/datasets/aishell3/train/normalized_wav",
-    
+                        "~/datasets/aishell3/train/alignment")
--- a/parakeet/training/experiment.py
+++ b/parakeet/training/experiment.py
@ -12,19 +12,15 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import time
 import sys
 from collections import defaultdict
 import logging
 from pathlib import Path
 import numpy as np
 import paddle
 from paddle import distributed as dist
 from paddle.io import DistributedBatchSampler
 from visualdl import LogWriter
 import parakeet
 from parakeet.utils import checkpoint, mp_tools
 __all__ = ["ExperimentBase"]