From 4a039b6407590017f855fcfe9b2c6c4b4e7ea6bb Mon Sep 17 00:00:00 2001
From: iclementine
Date: Wed, 31 Mar 2021 17:34:19 +0800
Subject: [PATCH] add vctk example for refactored tacotron

---
 examples/tacotron2_vctk/config.py |  74 ++++++++++++
 examples/tacotron2_vctk/train.py  | 190 ++++++++++++++++++++++++++++++
 examples/tacotron2_vctk/vctk.py   |  55 +++++++++
 parakeet/models/tacotron2.py      |   2 +-
 4 files changed, 320 insertions(+), 1 deletion(-)
 create mode 100644 examples/tacotron2_vctk/config.py
 create mode 100644 examples/tacotron2_vctk/train.py
 create mode 100644 examples/tacotron2_vctk/vctk.py

diff --git a/examples/tacotron2_vctk/config.py b/examples/tacotron2_vctk/config.py
new file mode 100644
index 0000000..6b5dff6
--- /dev/null
+++ b/examples/tacotron2_vctk/config.py
@@ -0,0 +1,74 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from yacs.config import CfgNode as CN
+
+_C = CN()
+_C.data = CN(
+    dict(
+        batch_size=32,  # batch size
+        valid_size=5,  # the first N examples are reserved for validation
+        sample_rate=22050,  # Hz, sample rate
+        n_fft=1024,  # fft frame size
+        win_length=1024,  # window size
+        hop_length=256,  # hop size between adjacent frames
+        f_max=8000,  # Hz, max frequency when converting to mel
+        f_min=0,  # Hz, min frequency when converting to mel
+        d_mels=80,  # number of mel bands
+        padding_idx=0,  # text embedding's padding index
+    ))
+
+_C.model = CN(
+    dict(
+        vocab_size=37,  # set this according to the frontend's vocab size
+        num_speakers=109,  # set this according to the dataset you use
+        d_speaker=32,  # speaker embedding size
+        reduction_factor=1,  # reduction factor
+        d_encoder=512,  # embedding & encoder's internal size
+        encoder_conv_layers=3,  # number of conv layers in tacotron2 encoder
+        encoder_kernel_size=5,  # kernel size of conv layers in tacotron2 encoder
+        d_prenet=256,  # hidden size of decoder prenet
+        d_attention_rnn=1024,  # hidden size of the first rnn layer in tacotron2 decoder
+        d_decoder_rnn=1024,  # hidden size of the second rnn layer in tacotron2 decoder
+        d_attention=128,  # hidden size of decoder location linear layer
+        attention_filters=32,  # number of filters in decoder location conv layer
+        attention_kernel_size=31,  # kernel size of decoder location conv layer
+        d_postnet=512,  # hidden size of decoder postnet
+        postnet_kernel_size=5,  # kernel size of conv layers in postnet
+        postnet_conv_layers=5,  # number of conv layers in decoder postnet
+        p_encoder_dropout=0.5,  # dropout probability in encoder
+        p_prenet_dropout=0.5,  # dropout probability in decoder prenet
+        p_attention_dropout=0.1,  # dropout probability of first rnn layer in decoder
+        p_decoder_dropout=0.1,  # dropout probability of second rnn layer in decoder
+        p_postnet_dropout=0.5,  # dropout probability in decoder postnet
+        guided_attn_loss_sigma=0.2  # sigma in guided attention loss
+    ))
+
+_C.training = CN(
+    dict(
+        lr=1e-3,  # learning rate
+        weight_decay=1e-6,  # coefficient of weight decay
+        grad_clip_thresh=1.0,  # global norm threshold for gradient clipping
+        plot_interval=1000,  # interval (in steps) to plot attention and spectrogram
+        valid_interval=1000,  # validation interval
+        save_interval=1000,  # checkpointing interval
+        max_iteration=500000,  # max number of iterations to train
+    ))
+
+
+def get_cfg_defaults():
+    """Get a yacs CfgNode object with default values for my_project."""
+    # Return a clone so that the defaults will not be altered
+    # This is for the "local variable" use pattern
+    return _C.clone()
diff --git a/examples/tacotron2_vctk/train.py b/examples/tacotron2_vctk/train.py
new file mode 100644
index 0000000..461becd
--- /dev/null
+++ b/examples/tacotron2_vctk/train.py
@@ -0,0 +1,190 @@
+import time
+from collections import defaultdict
+
+import numpy as np
+import librosa
+
+import paddle
+from paddle import distributed as dist
+from paddle import DataParallel
+from paddle.io import DataLoader, DistributedBatchSampler
+from paddle.optimizer import Adam
+
+import parakeet
+from parakeet.data import dataset
+from parakeet.frontend import EnglishCharacter
+from parakeet.training.cli import default_argument_parser
+from parakeet.training.experiment import ExperimentBase
+from parakeet.utils import display, mp_tools
+from parakeet.models.tacotron2 import Tacotron2, Tacotron2Loss
+
+from config import get_cfg_defaults
+from vctk import VCTK, collate_vctk_examples
+
+
+class TacotronVCTKExperiment(ExperimentBase):
+    def setup_model(self):
+        config = self.config
+        model_config = config.model
+        data_config = config.data
+
+        model = Tacotron2(
+            vocab_size=model_config.vocab_size,
+            num_speakers=model_config.num_speakers,
+            d_speaker=model_config.d_speaker,
+            d_mels=data_config.d_mels,
+            d_encoder=model_config.d_encoder,
+            encoder_conv_layers=model_config.encoder_conv_layers,
+            encoder_kernel_size=model_config.encoder_kernel_size,
+            d_prenet=model_config.d_prenet,
+            d_attention_rnn=model_config.d_attention_rnn,
+            d_decoder_rnn=model_config.d_decoder_rnn,
+            attention_filters=model_config.attention_filters,
+            attention_kernel_size=model_config.attention_kernel_size,
+            d_attention=model_config.d_attention,
+            d_postnet=model_config.d_postnet,
+            postnet_kernel_size=model_config.postnet_kernel_size,
+            postnet_conv_layers=model_config.postnet_conv_layers,
+            reduction_factor=model_config.reduction_factor,
+            p_encoder_dropout=model_config.p_encoder_dropout,
+            p_prenet_dropout=model_config.p_prenet_dropout,
+            p_attention_dropout=model_config.p_attention_dropout,
+            p_decoder_dropout=model_config.p_decoder_dropout,
+            p_postnet_dropout=model_config.p_postnet_dropout)
+        # keep a reference to the unwrapped model for inference in valid()
+        self.model_core = model
+        self.model = DataParallel(model) if self.parallel else model
+
+        grad_clip = paddle.nn.ClipGradByGlobalNorm(
+            config.training.grad_clip_thresh)
+        optimizer = Adam(learning_rate=config.training.lr,
+                         parameters=model.parameters(),
+                         weight_decay=paddle.regularizer.L2Decay(
+                             config.training.weight_decay),
+                         grad_clip=grad_clip)
+        self.optimizer = optimizer
+
+        criterion = Tacotron2Loss(config.model.guided_attn_loss_sigma)
+        self.criterion = criterion
+
+    def setup_dataloader(self):
+        config = self.config
+        args = self.args
+
+        # avoid shadowing the `parakeet.data.dataset` module imported above
+        vctk = VCTK(args.data)
+        valid_dataset, train_dataset = dataset.split(vctk,
+                                                     config.data.valid_size)
+        if self.parallel:
+            sampler = DistributedBatchSampler(
+                train_dataset,
+                batch_size=config.data.batch_size,
+                shuffle=True,
+                drop_last=True)
+            self.train_loader = DataLoader(train_dataset,
+                                           batch_sampler=sampler,
+                                           collate_fn=collate_vctk_examples,
+                                           num_workers=4)
+        else:
+            self.train_loader = DataLoader(train_dataset,
+                                           batch_size=config.data.batch_size,
+                                           collate_fn=collate_vctk_examples,
+                                           num_workers=8,
+                                           shuffle=True,
+                                           drop_last=True)
+        self.valid_loader = DataLoader(valid_dataset,
+                                       batch_size=1,
+                                       collate_fn=collate_vctk_examples,
+                                       num_workers=1,
+                                       shuffle=False,
+                                       drop_last=False)
+
+    def train_batch(self):
+        if self.parallel:
+            dist.barrier()
+
+        start = time.time()
+        batch = self.read_batch()
+        data_loader_time = time.time() - start
+
+        self.optimizer.clear_grad()
+        self.model.train()
+        phonemes, plens, mels, slens, speaker_ids = batch
+
+        outputs = self.model(phonemes, plens, mels, slens, speaker_ids)
+
+        losses = self.criterion(outputs["mel_output"],
+                                outputs["mel_outputs_postnet"],
+                                mels,
+                                outputs["alignments"],
+                                slens,
+                                plens)
+        loss = losses["loss"]
+        loss.backward()
+        self.optimizer.step()
+        iteration_time = time.time() - start
+
+        losses_np = {k: float(v) for k, v in losses.items()}
+        # logging
+        msg = "Rank: {}, ".format(dist.get_rank())
+        msg += "step: {}, ".format(self.iteration)
+        msg += "time: {:>.3f}s/{:>.3f}s, ".format(data_loader_time,
+                                                  iteration_time)
+        msg += ', '.join('{}: {:>.6f}'.format(k, v)
+                         for k, v in losses_np.items())
+        self.logger.info(msg)
+
+        if dist.get_rank() == 0:
+            for k, v in losses_np.items():
+                self.visualizer.add_scalar(f"train_loss/{k}", v,
+                                           self.iteration)
+
+    @mp_tools.rank_zero_only
+    @paddle.no_grad()
+    def valid(self):
+        self.model.eval()
+        model_core = self.model_core
+        for i, batch in enumerate(self.valid_loader):
+            phonemes, plens, mels, slens, speaker_ids = batch
+            outputs = model_core.infer(phonemes, speaker_ids=speaker_ids)
+
+            fig = display.plot_spectrogram(
+                outputs["mel_outputs_postnet"][0].numpy().T)
+            self.visualizer.add_figure(f"sentence_{i}/predicted_mel", fig,
+                                       self.iteration)
+
+            fig = display.plot_spectrogram(mels[0].numpy().T)
+            self.visualizer.add_figure(f"sentence_{i}/ground_truth_mel", fig,
+                                       self.iteration)
+
+            fig = display.plot_alignment(outputs["alignments"][0].numpy())
+            self.visualizer.add_figure(f"sentence_{i}/alignment", fig,
+                                       self.iteration)
+
+            # invert the predicted mel to a waveform with Griffin-Lim for a
+            # quick listening check (parameters match the data config)
+            mel_basis = librosa.filters.mel(22050, n_fft=1024, n_mels=80,
+                                            fmin=0, fmax=8000)
+            inv_mel_basis = np.linalg.pinv(mel_basis)
+            spec = np.matmul(
+                inv_mel_basis,
+                np.exp(outputs["mel_outputs_postnet"][0].numpy().T))
+            wav = librosa.core.griffinlim(spec, hop_length=256, win_length=1024)
+            self.visualizer.add_audio(f"predicted/sentence_{i}", wav,
+                                      self.iteration, sample_rate=22050)
+
+
+def main_sp(config, args):
+    exp = TacotronVCTKExperiment(config, args)
+    exp.setup()
+    exp.run()
+
+
+def main(config, args):
+    if args.nprocs > 1 and args.device == "gpu":
+        dist.spawn(main_sp, args=(config, args), nprocs=args.nprocs)
+    else:
+        main_sp(config, args)
+
+
+if __name__ == "__main__":
+    config = get_cfg_defaults()
+    parser = default_argument_parser()
+    args = parser.parse_args()
+    if args.config:
+        config.merge_from_file(args.config)
+    if args.opts:
+        config.merge_from_list(args.opts)
+    config.freeze()
+    print(config)
+    print(args)
+
+    main(config, args)
diff --git a/examples/tacotron2_vctk/vctk.py b/examples/tacotron2_vctk/vctk.py
new file mode 100644
index 0000000..f24a0d0
--- /dev/null
+++ b/examples/tacotron2_vctk/vctk.py
@@ -0,0 +1,55 @@
+import pickle
+from pathlib import Path
+
+import numpy as np
+import yaml
+import paddle
+from paddle.io import Dataset
+
+from parakeet.frontend.vocab import Vocab
+from parakeet.data import batch_spec, batch_text_id
+
+
+class VCTK(Dataset):
+    def __init__(self, root):
+        self.root = Path(root)
+        record_path = self.root / "metadata.pickle"
+        self.wav_root = self.root / "wav"
+        self.mel_root = self.root / "mel"
+        with open(record_path, 'rb') as f:
+            self.metadata = pickle.load(f)
+        with open(self.root / "vocab" / "phonemes.yaml", 'rt') as f:
+            phonemes = yaml.safe_load(f)
+        self.phoneme_vocab = Vocab(phonemes)
+        with open(self.root / "vocab" / "speakers.yaml", 'rt') as f:
+            speakers = yaml.safe_load(f)
+        self.speaker_vocab = Vocab(speakers,
+                                   padding_symbol=None,
+                                   unk_symbol=None,
+                                   start_symbol=None,
+                                   end_symbol=None)
+
+    def __getitem__(self, idx):
+        metadatum = self.metadata[idx]
+        fileid = metadatum['id']
+        speaker_id = fileid.split('_')[0]
+        s_id = self.speaker_vocab.lookup(speaker_id)
+        phonemes = np.array(
+            [self.phoneme_vocab.lookup(item) for item in metadatum['phonemes']],
+            dtype=np.int64)
+        mel_path = (self.mel_root / speaker_id / fileid).with_suffix(".npy")
+        mel = np.load(mel_path).astype(np.float32)
+
+        example = (phonemes, mel, s_id)
+        return example
+
+    def __len__(self):
+        return len(self.metadata)
+
+
+def collate_vctk_examples(examples):
+    phonemes, mels, speaker_ids = list(zip(*examples))
+    plens = np.array([item.shape[0] for item in phonemes], dtype=np.int64)
+    slens = np.array([item.shape[1] for item in mels], dtype=np.int64)
+    speaker_ids = np.array(speaker_ids, dtype=np.int64)
+
+    phonemes = batch_text_id(phonemes, pad_id=0)
+    # batch_spec pads to [B, C, T]; transpose to [B, T, C] for the model
+    mels = np.transpose(batch_spec(mels, pad_value=0.), [0, 2, 1])
+    return phonemes, plens, mels, slens, speaker_ids
diff --git a/parakeet/models/tacotron2.py b/parakeet/models/tacotron2.py
index 29bc4e7..2b33f36 100644
--- a/parakeet/models/tacotron2.py
+++ b/parakeet/models/tacotron2.py
@@ -597,7 +597,7 @@ class Tacotron2(nn.Layer):
             num_layers=postnet_conv_layers,
             dropout=p_postnet_dropout)
 
-    def forward(self, text_inputs, mels, text_lens, output_lens=None, speaker_ids=None):
+    def forward(self, text_inputs, text_lens, mels, output_lens=None, speaker_ids=None):
         """Calculate forward propagation of tacotron2.
 
         Parameters
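A minimal usage sketch of the config module added above, mirroring what train.py's __main__ block does with --config/--opts; the dotted override keys below are illustrative examples only, and the snippet assumes the working directory is examples/tacotron2_vctk:

    # load the example's default config and apply dotted-key overrides,
    # the same way `--opts` values are merged by yacs in train.py
    from config import get_cfg_defaults

    config = get_cfg_defaults()
    config.merge_from_list(["training.lr", 5e-4, "data.batch_size", 16])
    config.freeze()
    assert config.training.lr == 5e-4

Because get_cfg_defaults() returns a clone, repeated calls always start from the pristine defaults, so per-run overrides never leak between experiments.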