From e8a9a118bba0e2228efe94bad7ebc62721bdf83e Mon Sep 17 00:00:00 2001
From: iclementine
Date: Thu, 22 Apr 2021 17:20:34 +0800
Subject: [PATCH] clean code for data processing

---
 examples/tacotron2_aishell3/aishell3.py            |  42 +++--
 examples/tacotron2_aishell3/chinese_g2p.py         |   2 +-
 .../chinese_text_to_pinyin.py                      |   7 +-
 examples/tacotron2_aishell3/config.py              |  18 +-
 examples/tacotron2_aishell3/extract_mel.py         |  72 +++++---
 .../preprocess_transcription.py                    | 155 ++++++++++++++----
 examples/tacotron2_aishell3/process_wav.py         |  30 ++--
 parakeet/training/experiment.py                    |   4 -
 8 files changed, 226 insertions(+), 104 deletions(-)

diff --git a/examples/tacotron2_aishell3/aishell3.py b/examples/tacotron2_aishell3/aishell3.py
index d017bdc..995f45c 100644
--- a/examples/tacotron2_aishell3/aishell3.py
+++ b/examples/tacotron2_aishell3/aishell3.py
@@ -1,49 +1,55 @@
+import pickle
 from pathlib import Path
 
 import numpy as np
-import librosa
 from paddle.io import Dataset
-import pickle
 
 from parakeet.frontend import Vocab
 from parakeet.data import batch_text_id, batch_spec
 
 from preprocess_transcription import _phones, _tones
 
-voc_phones = Vocab(sorted(list(_phones)))
-print(voc_phones)
-voc_tones = Vocab(sorted(list(_tones)))
-print(voc_tones)
-# use yaml to store preprocessed aishell3 dataset
+voc_phones = Vocab(sorted(list(_phones)))
+print("vocab_phones:\n", voc_phones)
+voc_tones = Vocab(sorted(list(_tones)))
+print("vocab_tones:\n", voc_tones)
+
+
 class AiShell3(Dataset):
+    """Processed AiShell3 dataset."""
     def __init__(self, root):
+        super().__init__()
         self.root = Path(root).expanduser()
         self.embed_dir = self.root / "embed"
         self.mel_dir = self.root / "mel"
-        with open (self.root / "metadata.pickle", 'rb') as f:
+        with open(self.root / "metadata.pickle", 'rb') as f:
             self.records = pickle.load(f)
-
     def __getitem__(self, index):
         metadatum = self.records[index]
         sentence_id = metadatum["sentence_id"]
         speaker_id = sentence_id[:7]
         phones = metadatum["phones"]
         tones = metadatum["tones"]
-        phones = np.array([voc_phones.lookup(item) for item in phones], dtype=np.int64)
-        tones = np.array([voc_tones.lookup(item) for item in tones], dtype=np.int64)
+        phones = np.array([voc_phones.lookup(item) for item in phones],
+                          dtype=np.int64)
+        tones = np.array([voc_tones.lookup(item) for item in tones],
+                         dtype=np.int64)
         mel = np.load(str(self.mel_dir / speaker_id / (sentence_id + ".npy")))
-        embed = np.load(str(self.embed_dir / speaker_id / (sentence_id + ".npy")))
+        embed = np.load(
+            str(self.embed_dir / speaker_id / (sentence_id + ".npy")))
         return phones, tones, mel, embed
-
     def __len__(self):
         return len(self.records)
-
+
+
 def collate_aishell3_examples(examples):
     phones, tones, mel, embed = list(zip(*examples))
 
     text_lengths = np.array([item.shape[0] for item in phones], dtype=np.int64)
     spec_lengths = np.array([item.shape[1] for item in mel], dtype=np.int64)
-
     T_dec = np.max(spec_lengths)
-    stop_tokens = (np.arange(T_dec) >= np.expand_dims(spec_lengths, -1)).astype(np.float32)
+    stop_tokens = (np.arange(T_dec) >= np.expand_dims(spec_lengths,
+                                                      -1)).astype(np.float32)
     phones, _ = batch_text_id(phones)
     tones, _ = batch_text_id(tones)
     mel, _ = batch_spec(mel)
@@ -53,13 +59,13 @@ def collate_aishell3_examples(examples):
     # (B, T), (B, T), (B, T, C), (B, C), (B,), (B,), (B, T)
     return phones, tones, mel, embed, text_lengths, spec_lengths, stop_tokens
 
+
 if __name__ == "__main__":
     dataset = AiShell3("~/datasets/aishell3/train")
     example = dataset[0]
 
     examples = [dataset[i] for i in range(10)]
     batch = collate_aishell3_examples(examples)
-
+
     for field in batch:
         print(field.shape, field.dtype)
-
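A gloss on collate_aishell3_examples above: batch_spec pads every mel to the longest utterance in the batch, and the stop_tokens line marks exactly those padded decoder frames with 1.0. A minimal numpy sketch of that one step, with toy lengths and no parakeet dependency:

    import numpy as np

    spec_lengths = np.array([3, 5], dtype=np.int64)  # mel frames per utterance
    T_dec = np.max(spec_lengths)  # decoder steps = longest utterance in batch
    # row i stays 0.0 for real frames, becomes 1.0 from frame spec_lengths[i] on
    stop_tokens = (np.arange(T_dec) >=
                   np.expand_dims(spec_lengths, -1)).astype(np.float32)
    # [[0. 0. 0. 1. 1.]
    #  [0. 0. 0. 0. 0.]]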
diff --git a/examples/tacotron2_aishell3/chinese_g2p.py b/examples/tacotron2_aishell3/chinese_g2p.py
index a1fccdd..47d84d2 100644
--- a/examples/tacotron2_aishell3/chinese_g2p.py
+++ b/examples/tacotron2_aishell3/chinese_g2p.py
@@ -2,9 +2,9 @@ from chinese_text_to_pinyin import convert_to_pinyin
 from chinese_phonology import convert, split_syllable
 from typing import List, Tuple
 
+
 def convert_sentence(text: str) -> List[Tuple[str]]:
     syllables = convert_to_pinyin(text)
-    syllables = [item[0] for item in syllables]
     phones = []
     tones = []
     for syllable in syllables:
diff --git a/examples/tacotron2_aishell3/chinese_text_to_pinyin.py b/examples/tacotron2_aishell3/chinese_text_to_pinyin.py
index 8a165ef..0574141 100644
--- a/examples/tacotron2_aishell3/chinese_text_to_pinyin.py
+++ b/examples/tacotron2_aishell3/chinese_text_to_pinyin.py
@@ -1,4 +1,4 @@
-from pypinyin import pinyin, Style
+from pypinyin import lazy_pinyin, Style
 from typing import List
 
 
@@ -6,6 +6,7 @@ def convert_to_pinyin(text: str) -> List[str]:
     """convert text into list of syllables, other characters that are not chinese, thus
     cannot be converted to pinyin are splited.
     """
-    syllables = pinyin(text, style=Style.TONE3, neutral_tone_with_five=True)
+    syllables = lazy_pinyin(text,
+                            style=Style.TONE3,
+                            neutral_tone_with_five=True)
     return syllables
-
diff --git a/examples/tacotron2_aishell3/config.py b/examples/tacotron2_aishell3/config.py
index 4580ddb..3148162 100644
--- a/examples/tacotron2_aishell3/config.py
+++ b/examples/tacotron2_aishell3/config.py
@@ -23,8 +23,8 @@ _C.data = CN(
         n_fft=1024,  # fft frame size
         win_length=1024,  # window size
         hop_length=256,  # hop size between ajacent frame
-        f_max=8000,  # Hz, max frequency when converting to mel
-        f_min=0,  # Hz, min frequency when converting to mel
+        fmax=8000,  # Hz, max frequency when converting to mel
+        fmin=0,  # Hz, min frequency when converting to mel
         d_mels=80,  # mel bands
         padding_idx=0,  # text embedding's padding index
     ))
@@ -38,8 +38,10 @@ _C.model = CN(
         encoder_conv_layers=3,  # number of conv layer in tacotron2 encoder
         encoder_kernel_size=5,  # kernel size of conv layers in tacotron2 encoder
         d_prenet=256,  # hidden size of decoder prenet
-        d_attention_rnn=1024,  # hidden size of the first rnn layer in tacotron2 decoder
-        d_decoder_rnn=1024,  # hidden size of the second rnn layer in tacotron2 decoder
+        d_attention_rnn=
+        1024,  # hidden size of the first rnn layer in tacotron2 decoder
+        d_decoder_rnn=
+        1024,  # hidden size of the second rnn layer in tacotron2 decoder
         d_attention=128,  # hidden size of decoder location linear layer
         attention_filters=32,  # number of filter in decoder location conv layer
         attention_kernel_size=31,  # kernel size of decoder location conv layer
@@ -48,8 +50,10 @@ _C.model = CN(
         postnet_conv_layers=5,  # number of conv layer in decoder postnet
         p_encoder_dropout=0.5,  # droput probability in encoder
         p_prenet_dropout=0.5,  # droput probability in decoder prenet
-        p_attention_dropout=0.1,  # droput probability of first rnn layer in decoder
-        p_decoder_dropout=0.1,  # droput probability of second rnn layer in decoder
+        p_attention_dropout=
+        0.1,  # droput probability of first rnn layer in decoder
+        p_decoder_dropout=
+        0.1,  # droput probability of second rnn layer in decoder
         p_postnet_dropout=0.5,  # droput probability in decoder postnet
         guided_attention_loss_sigma=0.2,
         d_global_condition=256,
@@ -71,5 +75,3 @@ def get_cfg_defaults():
     # Return a clone so that the defaults will not be altered
     # This is for the "local variable" use pattern
     return _C.clone()
-
-
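Two notes on the hunks above. The pinyin -> lazy_pinyin switch in chinese_text_to_pinyin.py is what makes it safe for chinese_g2p.py to drop the old `syllables = [item[0] for item in syllables]` line: pinyin returns one list per character, while lazy_pinyin returns a flat list of strings. A quick illustration of pypinyin's documented behavior (exact romanization depends on the dictionary):

    from pypinyin import pinyin, lazy_pinyin, Style

    pinyin("中国", style=Style.TONE3, neutral_tone_with_five=True)
    # [['zhong1'], ['guo2']]  (nested, hence the old item[0] indexing)
    lazy_pinyin("中国", style=Style.TONE3, neutral_tone_with_five=True)
    # ['zhong1', 'guo2']      (flat, so the indexing is no longer needed)

On config.py, the f_max/f_min -> fmax/fmin rename matches the names extract_mel.py reads (config.fmin, config.fmax). The data section still defines d_mels while extract_mel.py reads config.n_mels, though, so one of those two names presumably still needs to follow the other.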
diff --git a/examples/tacotron2_aishell3/extract_mel.py b/examples/tacotron2_aishell3/extract_mel.py
index f701bb2..31909eb 100644
--- a/examples/tacotron2_aishell3/extract_mel.py
+++ b/examples/tacotron2_aishell3/extract_mel.py
@@ -1,17 +1,20 @@
+import argparse
 import numpy as np
 from pathlib import Path
 from parakeet.audio import AudioProcessor
-from parakeet.audio.spec_normalizer import LogMagnitude
+from parakeet.audio.spec_normalizer import NormalizerBase, LogMagnitude
 import multiprocessing as mp
 from functools import partial
 import tqdm
-from yacs.config import CfgNode
 
-def extract_mel(fname:Path, input_dir:Path, output_dir:Path, p, n):
+from config import get_cfg_defaults
+
+
+def extract_mel(fname: Path, input_dir: Path, output_dir: Path,
+                p: AudioProcessor, n: NormalizerBase):
     relative_path = fname.relative_to(input_dir)
     out_path = (output_dir / relative_path).with_suffix(".npy")
     out_path.parent.mkdir(parents=True, exist_ok=True)
-    # TODO: maybe we need to rescale the audio
     wav = p.read_wav(fname)
     mel = p.mel_spectrogram(wav)
     mel = n.transform(mel)
@@ -25,33 +28,54 @@ def extract_mel_multispeaker(config, input_dir, output_dir, extension=".wav"):
     output_dir.mkdir(parents=True, exist_ok=True)
 
     p = AudioProcessor(config.sample_rate, config.n_fft, config.win_length,
-                        config.hop_length, config.n_mels, config.fmin,
-                        config.fmax)
+                       config.hop_length, config.n_mels, config.fmin,
+                       config.fmax)
     n = LogMagnitude(1e-5)
 
     func = partial(extract_mel,
-        input_dir=input_dir,
-        output_dir=output_dir,
-        p=p,
-        n=n)
+                   input_dir=input_dir,
+                   output_dir=output_dir,
+                   p=p,
+                   n=n)
 
     with mp.Pool(16) as pool:
         list(
             tqdm.tqdm(pool.imap(func, fnames),
-                total=len(fnames),
-                unit="utterance"))
-
+                      total=len(fnames),
+                      unit="utterance"))
 
 
 if __name__ == "__main__":
-    audio_config = {
-        "sample_rate": 22050,
-        "n_fft": 1024,
-        "win_length": 1024,
-        "hop_length": 256,
-        "n_mels": 80,
-        "fmin": 0,
-        "fmax": 8000}
-    audio_config = CfgNode(audio_config)
-    extract_mel_multispeaker(audio_config, "~/datasets/aishell3/train/normalized_wav", "~/datasets/aishell3/train/mel")
-
+    parser = argparse.ArgumentParser(
+        description=
+        "Extract mel spectrogram from processed wav in AiShell3 training dataset."
+    )
+    parser.add_argument(
+        "--config",
+        type=str,
+        help="yaml config file to overwrite the default config")
+    parser.add_argument("--input",
+                        type=str,
+                        default="~/datasets/aishell3/train/normalized_wav",
+                        help="path of the processed wav folder")
+    parser.add_argument("--output",
+                        type=str,
+                        default="~/datasets/aishell3/train/mel",
+                        help="path of the folder to save mel spectrograms")
+    parser.add_argument(
+        "--opts",
+        nargs=argparse.REMAINDER,
+        help=
+        "options to overwrite --config file and the default config, passing in KEY VALUE pairs"
+    )
+    default_config = get_cfg_defaults()
+
+    args = parser.parse_args()
+    if args.config:
+        default_config.merge_from_file(args.config)
+    if args.opts:
+        default_config.merge_from_list(args.opts)
+    default_config.freeze()
+    audio_config = default_config.data
+
+    extract_mel_multispeaker(audio_config, args.input, args.output)
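Per utterance, the script above computes a mel spectrogram and log-compresses it with LogMagnitude(1e-5). For readers without parakeet installed, here is a rough stand-alone equivalent using librosa as a stand-in for AudioProcessor (parakeet's exact spectrogram scaling may differ):

    import numpy as np
    import librosa

    def mel_then_log(path, sr=22050, n_fft=1024, win_length=1024,
                     hop_length=256, n_mels=80, fmin=0, fmax=8000, eps=1e-5):
        wav, _ = librosa.load(path, sr=sr)
        mel = librosa.feature.melspectrogram(
            y=wav, sr=sr, n_fft=n_fft, win_length=win_length,
            hop_length=hop_length, n_mels=n_mels, fmin=fmin, fmax=fmax)
        # clip to eps before taking the log, the same idea as LogMagnitude(1e-5)
        return np.log(np.maximum(mel, eps))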
diff --git a/examples/tacotron2_aishell3/preprocess_transcription.py b/examples/tacotron2_aishell3/preprocess_transcription.py
index f9e40e0..f9a53df 100644
--- a/examples/tacotron2_aishell3/preprocess_transcription.py
+++ b/examples/tacotron2_aishell3/preprocess_transcription.py
@@ -1,13 +1,10 @@
-from paddle.io import Dataset
+import argparse
 from pathlib import Path
 import re
 import pickle
+
 import yaml
 import tqdm
-from parakeet.audio import AudioProcessor, LogMagnitude
-import numpy as np
-import multiprocessing as mp
-from functools import partial
 
 zh_pattern = re.compile("[\u4e00-\u9fa5]")
 
@@ -16,21 +13,71 @@
 _tones = {'<pad>', '<s>', '</s>', '0', '1', '2', '3', '4', '5'}
 
 _pauses = {'%', '$'}
 
 _initials = {
-    'b', 'p', 'm', 'f',
-    'd', 't', 'n', 'l',
-    'g', 'k', 'h',
-    'j', 'q', 'x',
-    'zh', 'ch', 'sh',
+    'b',
+    'p',
+    'm',
+    'f',
+    'd',
+    't',
+    'n',
+    'l',
+    'g',
+    'k',
+    'h',
+    'j',
+    'q',
+    'x',
+    'zh',
+    'ch',
+    'sh',
     'r',
-    'z', 'c', 's',
+    'z',
+    'c',
+    's',
 }
 
 _finals = {
-    'ii', 'iii', 'a', 'o', 'e', 'ea', 'ai', 'ei', 'ao', 'ou', 'an', 'en', 'ang', 'eng', 'er',
-    'i', 'ia', 'io', 'ie', 'iai', 'iao', 'iou', 'ian', 'ien', 'iang', 'ieng',
-    'u', 'ua', 'uo', 'uai', 'uei', 'uan', 'uen', 'uang', 'ueng',
-    'v', 've', 'van', 'ven', 'veng',
-}
+    'ii',
+    'iii',
+    'a',
+    'o',
+    'e',
+    'ea',
+    'ai',
+    'ei',
+    'ao',
+    'ou',
+    'an',
+    'en',
+    'ang',
+    'eng',
+    'er',
+    'i',
+    'ia',
+    'io',
+    'ie',
+    'iai',
+    'iao',
+    'iou',
+    'ian',
+    'ien',
+    'iang',
+    'ieng',
+    'u',
+    'ua',
+    'uo',
+    'uai',
+    'uei',
+    'uan',
+    'uen',
+    'uang',
+    'ueng',
+    'v',
+    've',
+    'van',
+    'ven',
+    'veng',
+}
 
 _ernized_symbol = {'&r'}
@@ -38,6 +85,7 @@
 _specials = {'<pad>', '<unk>', '<s>', '</s>'}
 
 _phones = _initials | _finals | _ernized_symbol | _specials | _pauses
 
+
 def is_zh(word):
     global zh_pattern
     match = zh_pattern.search(word)
@@ -47,6 +95,7 @@ def is_zh(word):
     return match is not None
 
 def ernized(syllable):
     return syllable[:2] != "er" and syllable[-2] == 'r'
 
+
 def convert(syllable):
     # expansion of o -> uo
     syllable = re.sub(r"([bpmf])o$", r"\1uo", syllable)
@@ -56,15 +105,17 @@ def convert(syllable):
 
     # expansion for ing, in
     syllable = syllable.replace("ing", "ieng").replace("in", "ien")
-
+
     # expansion for un, ui, iu
-    syllable = syllable.replace("un", "uen").replace("ui", "uei").replace("iu", "iou")
+    syllable = syllable.replace("un",
+                                "uen").replace("ui",
+                                               "uei").replace("iu", "iou")
 
     # rule for variants of i
     syllable = syllable.replace("zi", "zii").replace("ci", "cii").replace("si", "sii")\
         .replace("zhi", "zhiii").replace("chi", "chiii").replace("shi", "shiii")\
         .replace("ri", "riii")
-
+
     # rule for y preceding i, u
     syllable = syllable.replace("yi", "i").replace("yu", "v").replace("y", "i")
@@ -72,11 +123,21 @@ def convert(syllable):
     syllable = syllable.replace("wu", "u").replace("w", "u")
 
     # rule for v following j, q, x
-    syllable = syllable.replace("ju", "jv").replace("qu", "qv").replace("xu", "xv")
+    syllable = syllable.replace("ju", "jv").replace("qu",
+                                                    "qv").replace("xu", "xv")
 
     return syllable
 
-def split_syllable(syllable:str):
+
+def split_syllable(syllable: str):
+    """Split a syllable in pinyin into a list of phones and a list of tones.
+    Initials have no tone, represented by '0', while finals have tones from
+    '1,2,3,4,5'.
+
+    e.g.
+
+    zhang1 -> ['zh', 'ang'], ['0', '1']
+    """
     if syllable in _pauses:
         # syllable, tone
         return [syllable], ['0']
@@ -104,7 +165,7 @@ def split_syllable(syllable:str):
 
     return phones, tones
 
-def load_aishell3_transcription(line:str):
+def load_aishell3_transcription(line: str):
     sentence_id, pinyin, text = line.strip().split("|")
     syllables = pinyin.strip().split()
@@ -118,7 +179,7 @@
             else:
                 results.append(syllable[:-2] + syllable[-1])
                 results.append('&r5')
-
+
     phones = []
     tones = []
     for syllable in results:
@@ -127,7 +188,13 @@
         tones.extend(t)
     for p in phones:
         assert p in _phones, p
-    return {"sentence_id": sentence_id, "text": text, "syllables": results, "phones": phones, "tones": tones}
+    return {
+        "sentence_id": sentence_id,
+        "text": text,
+        "syllables": results,
+        "phones": phones,
+        "tones": tones
+    }
 
 
 def process_aishell3(dataset_root, output_dir):
@@ -136,26 +203,46 @@ def process_aishell3(dataset_root, output_dir):
     output_dir.mkdir(parents=True, exist_ok=True)
 
     prosody_label_path = dataset_root / "label_train-set.txt"
-    with open(prosody_label_path, 'rt') as f:
+    with open(prosody_label_path, 'rt') as f:
         lines = [line.strip() for line in f]
-
+
     records = lines[5:]
-
+
     processed_records = []
     for record in tqdm.tqdm(records):
         new_record = load_aishell3_transcription(record)
         processed_records.append(new_record)
         print(new_record)
 
-    with open(output_dir / "metadata.pickle", 'wb') as f:
+    with open(output_dir / "metadata.pickle", 'wb') as f:
         pickle.dump(processed_records, f)
-
-    with open(output_dir / "metadata.yaml", 'wt', encoding="utf-8") as f:
-        yaml.safe_dump(processed_records, f, default_flow_style=None, allow_unicode=True)
-
-    print("metadata done!")
+    with open(output_dir / "metadata.yaml", 'wt', encoding="utf-8") as f:
+        yaml.safe_dump(processed_records,
+                       f,
+                       default_flow_style=None,
+                       allow_unicode=True)
+
+    print("metadata done!")
 
 
 if __name__ == "__main__":
-    process_aishell3("~/datasets/aishell3/train", "~/datasets/aishell3/train")
\ No newline at end of file
+    parser = argparse.ArgumentParser(
+        description=
+        "Preprocess transcription of AiShell3 and save it in compact files (yaml and pickle)."
+    )
+    parser.add_argument(
+        "--input",
+        type=str,
+        default="~/datasets/aishell3/train",
+        help="path of the training dataset (contains a label_train-set.txt).")
+    parser.add_argument(
+        "--output",
+        type=str,
+        help="the directory to save the processed transcription. "
+        "If not provided, it defaults to the input directory.")
+    args = parser.parse_args()
+    if args.output is None:
+        args.output = args.input
+
+    process_aishell3(args.input, args.output)
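To see the phonology rules end to end, a few examples traced through the replace rules visible in this diff (the elided parts of convert could add further rewrites, and split_syllable expects a trailing tone digit, per the docstring above):

    from preprocess_transcription import convert, split_syllable

    convert("yin1")  # -> 'ien1': "in" -> "ien", then leading "yi" -> "i"
    convert("jun1")  # -> 'jven1': "un" -> "uen", then "ju" -> "jv"
    split_syllable(convert("zhang1"))  # -> (['zh', 'ang'], ['0', '1'])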
+ "If not provided, it would be the same as the input.") + args = parser.parse_args() + if args.output is None: + args.output = args.input + + process_aishell3(args.input, args.output) diff --git a/examples/tacotron2_aishell3/process_wav.py b/examples/tacotron2_aishell3/process_wav.py index 8cf0a95..b6c6a78 100644 --- a/examples/tacotron2_aishell3/process_wav.py +++ b/examples/tacotron2_aishell3/process_wav.py @@ -1,15 +1,17 @@ -import librosa -import soundfile as sf from pathlib import Path from multiprocessing import Pool -from tqdm import tqdm from functools import partial + import numpy as np +import librosa +import soundfile as sf +from tqdm import tqdm from praatio import tgio + def get_valid_part(fpath): f = tgio.openTextgrid(fpath) - + start = 0 phone_entry_list = f.tierDict['phones'].entryList first_entry = phone_entry_list[0] @@ -22,7 +24,7 @@ def get_valid_part(fpath): else: end = last_entry.end return start, end - + def process_utterance(fpath, source_dir, target_dir, alignment_dir): rel_path = fpath.relative_to(source_dir) @@ -41,16 +43,20 @@ def preprocess_aishell3(source_dir, target_dir, alignment_dir): source_dir = Path(source_dir).expanduser() target_dir = Path(target_dir).expanduser() alignment_dir = Path(alignment_dir).expanduser() - + wav_paths = list(source_dir.rglob("*.wav")) print(f"there are {len(wav_paths)} audio files in total") - fx = partial(process_utterance, source_dir=source_dir, target_dir=target_dir, alignment_dir=alignment_dir) + fx = partial(process_utterance, + source_dir=source_dir, + target_dir=target_dir, + alignment_dir=alignment_dir) with Pool(16) as p: - list(tqdm(p.imap(fx, wav_paths), total=len(wav_paths), unit="utterance")) + list( + tqdm(p.imap(fx, wav_paths), total=len(wav_paths), + unit="utterance")) if __name__ == "__main__": - preprocess_aishell3("~/datasets/aishell3/train/wav", "~/datasets/aishell3/train/normalized_wav", "~/datasets/aishell3/train/alignment") - - - + preprocess_aishell3("~/datasets/aishell3/train/wav", + "~/datasets/aishell3/train/normalized_wav", + "~/datasets/aishell3/train/alignment") diff --git a/parakeet/training/experiment.py b/parakeet/training/experiment.py index 433362b..a49c1f9 100644 --- a/parakeet/training/experiment.py +++ b/parakeet/training/experiment.py @@ -12,19 +12,15 @@ # See the License for the specific language governing permissions and # limitations under the License. -import time import sys -from collections import defaultdict import logging from pathlib import Path -import numpy as np import paddle from paddle import distributed as dist from paddle.io import DistributedBatchSampler from visualdl import LogWriter -import parakeet from parakeet.utils import checkpoint, mp_tools __all__ = ["ExperimentBase"]