add vctk example for refactored tacotron

This commit is contained in:
iclementine 2021-03-31 17:34:19 +08:00
parent 7cc3e8c340
commit 4a039b6407
4 changed files with 320 additions and 1 deletion


@@ -0,0 +1,74 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from yacs.config import CfgNode as CN

_C = CN()
_C.data = CN(
    dict(
        batch_size=32,  # batch size
        valid_size=5,  # the first N examples are reserved for validation
        sample_rate=22050,  # Hz, sample rate
        n_fft=1024,  # fft frame size
        win_length=1024,  # window size
        hop_length=256,  # hop size between adjacent frames
        f_max=8000,  # Hz, max frequency when converting to mel
        f_min=0,  # Hz, min frequency when converting to mel
        d_mels=80,  # mel bands
        padding_idx=0,  # text embedding's padding index
    ))
_C.model = CN(
    dict(
        vocab_size=37,  # set this according to the frontend's vocab size
        num_speakers=109,  # set this according to the dataset you use
        d_speakers=32,  # speaker embedding size
        reduction_factor=1,  # reduction factor
        d_encoder=512,  # embedding & encoder's internal size
        encoder_conv_layers=3,  # number of conv layers in tacotron2 encoder
        encoder_kernel_size=5,  # kernel size of conv layers in tacotron2 encoder
        d_prenet=256,  # hidden size of decoder prenet
        d_attention_rnn=1024,  # hidden size of the first rnn layer in tacotron2 decoder
        d_decoder_rnn=1024,  # hidden size of the second rnn layer in tacotron2 decoder
        d_attention=128,  # hidden size of decoder location linear layer
        attention_filters=32,  # number of filters in decoder location conv layer
        attention_kernel_size=31,  # kernel size of decoder location conv layer
        d_postnet=512,  # hidden size of decoder postnet
        postnet_kernel_size=5,  # kernel size of conv layers in postnet
        postnet_conv_layers=5,  # number of conv layers in decoder postnet
        p_encoder_dropout=0.5,  # dropout probability in encoder
        p_prenet_dropout=0.5,  # dropout probability in decoder prenet
        p_attention_dropout=0.1,  # dropout probability of first rnn layer in decoder
        p_decoder_dropout=0.1,  # dropout probability of second rnn layer in decoder
        p_postnet_dropout=0.5,  # dropout probability in decoder postnet
        guided_attn_loss_sigma=0.2,  # sigma in guided attention loss
    ))
_C.training = CN(
    dict(
        lr=1e-3,  # learning rate
        weight_decay=1e-6,  # the coeff of weight decay
        grad_clip_thresh=1.0,  # the clip norm of grad clip
        plot_interval=1000,  # plot attention and spectrogram every N iterations
        valid_interval=1000,  # run validation every N iterations
        save_interval=1000,  # save a checkpoint every N iterations
        max_iteration=500000,  # max iteration to train
    ))


def get_cfg_defaults():
    """Get a yacs CfgNode object with default values for this experiment."""
    # Return a clone so that the defaults will not be altered
    # This is for the "local variable" use pattern
    return _C.clone()
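
The defaults above are meant to be overridden rather than edited in place. A minimal usage sketch of the standard yacs override calls that the training script below relies on (the override keys and values here are only illustrative):

from config import get_cfg_defaults

cfg = get_cfg_defaults()
# mirrors what train.py does with --config (merge_from_file) and --opts (merge_from_list)
cfg.merge_from_list(["data.batch_size", 16, "training.max_iteration", 300000])
cfg.freeze()
print(cfg.data.batch_size)  # 16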


@@ -0,0 +1,190 @@
import time
from collections import defaultdict
import numpy as np
import librosa
import paddle
from paddle import distributed as dist
from paddle import DataParallel
from paddle.io import DataLoader, DistributedBatchSampler
from paddle.optimizer import Adam
import parakeet
from parakeet.data import dataset
from parakeet.frontend import EnglishCharacter
from parakeet.training.cli import default_argument_parser
from parakeet.training.experiment import ExperimentBase
from parakeet.utils import display, mp_tools
from parakeet.models.tacotron2 import Tacotron2, Tacotron2Loss
from config import get_cfg_defaults
from vctk import VCTK, collate_vctk_examples


class TacotronVCTKExperiment(ExperimentBase):
    def setup_model(self):
        config = self.config
        model_config = config.model
        data_config = config.data
        model = Tacotron2(
            vocab_size=model_config.vocab_size,
            num_speakers=model_config.num_speakers,
            d_speaker=model_config.d_speakers,
            d_mels=data_config.d_mels,
            d_encoder=model_config.d_encoder,
            encoder_conv_layers=model_config.encoder_conv_layers,
            encoder_kernel_size=model_config.encoder_kernel_size,
            d_prenet=model_config.d_prenet,
            d_attention_rnn=model_config.d_attention_rnn,
            d_decoder_rnn=model_config.d_decoder_rnn,
            attention_filters=model_config.attention_filters,
            attention_kernel_size=model_config.attention_kernel_size,
            d_attention=model_config.d_attention,
            d_postnet=model_config.d_postnet,
            postnet_kernel_size=model_config.postnet_kernel_size,
            postnet_conv_layers=model_config.postnet_conv_layers,
            reduction_factor=model_config.reduction_factor,
            p_encoder_dropout=model_config.p_encoder_dropout,
            p_prenet_dropout=model_config.p_prenet_dropout,
            p_attention_dropout=model_config.p_attention_dropout,
            p_decoder_dropout=model_config.p_decoder_dropout,
            p_postnet_dropout=model_config.p_postnet_dropout)
        self.model_core = model
        self.model = DataParallel(model) if self.parallel else model

        grad_clip = paddle.nn.ClipGradByGlobalNorm(
            config.training.grad_clip_thresh)
        optimizer = Adam(learning_rate=config.training.lr,
                         parameters=model.parameters(),
                         weight_decay=paddle.regularizer.L2Decay(
                             config.training.weight_decay),
                         grad_clip=grad_clip)
        self.optimizer = optimizer

        criterion = Tacotron2Loss(config.model.guided_attn_loss_sigma)
        self.criterion = criterion
    def setup_dataloader(self):
        config = self.config
        args = self.args

        vctk_dataset = VCTK(args.data)
        valid_dataset, train_dataset = dataset.split(vctk_dataset,
                                                     config.data.valid_size)

        if self.parallel:
            sampler = DistributedBatchSampler(
                train_dataset,
                batch_size=config.data.batch_size,
                shuffle=True,
                drop_last=True)
            self.train_loader = DataLoader(train_dataset,
                                           batch_sampler=sampler,
                                           collate_fn=collate_vctk_examples,
                                           num_workers=4)
        else:
            self.train_loader = DataLoader(train_dataset,
                                           batch_size=config.data.batch_size,
                                           collate_fn=collate_vctk_examples,
                                           num_workers=8,
                                           shuffle=True,
                                           drop_last=True)
        self.valid_loader = DataLoader(valid_dataset,
                                       batch_size=1,
                                       collate_fn=collate_vctk_examples,
                                       num_workers=1,
                                       shuffle=False,
                                       drop_last=False)
    def train_batch(self):
        if self.parallel:
            dist.barrier()

        start = time.time()
        batch = self.read_batch()
        data_loader_time = time.time() - start

        self.optimizer.clear_grad()
        self.model.train()
        phonemes, plens, mels, slens, speaker_ids = batch
        outputs = self.model(phonemes, plens, mels, slens, speaker_ids)
        losses = self.criterion(outputs["mel_output"],
                                outputs["mel_outputs_postnet"],
                                mels,
                                outputs["alignments"],
                                slens,
                                plens)
        loss = losses["loss"]
        loss.backward()
        self.optimizer.step()
        iteration_time = time.time() - start

        losses_np = {k: float(v) for k, v in losses.items()}
        # logging
        msg = "Rank: {}, ".format(dist.get_rank())
        msg += "step: {}, ".format(self.iteration)
        msg += "time: {:>.3f}s/{:>.3f}s, ".format(data_loader_time,
                                                  iteration_time)
        msg += ', '.join('{}: {:>.6f}'.format(k, v)
                         for k, v in losses_np.items())
        self.logger.info(msg)

        if dist.get_rank() == 0:
            for k, v in losses_np.items():
                self.visualizer.add_scalar(f"train_loss/{k}", v,
                                           self.iteration)
    @mp_tools.rank_zero_only
    @paddle.no_grad()
    def valid(self):
        # this is evaluation
        self.model.eval()
        model_core = self.model_core
        for i, batch in enumerate(self.valid_loader):
            phonemes, plens, mels, slens, speaker_ids = batch
            outputs = model_core.infer(phonemes, speaker_ids=speaker_ids)

            fig = display.plot_spectrogram(
                outputs["mel_outputs_postnet"][0].numpy().T)
            self.visualizer.add_figure(f"sentence_{i}/predicted_mel", fig,
                                       self.iteration)
            fig = display.plot_spectrogram(mels[0].numpy().T)
            self.visualizer.add_figure(f"sentence_{i}/ground_truth_mel", fig,
                                       self.iteration)
            fig = display.plot_alignment(outputs["alignments"][0].numpy())
            self.visualizer.add_figure(f"sentence_{i}/predicted_alignment", fig,
                                       self.iteration)

            # invert the predicted log-mel spectrogram with Griffin-Lim for a
            # quick listening check
            mel_basis = librosa.filters.mel(
                22050, n_fft=1024, n_mels=80, fmin=0, fmax=8000)
            _inv_mel_basis = np.linalg.pinv(mel_basis)
            spec = np.matmul(_inv_mel_basis,
                             np.exp(outputs["mel_outputs_postnet"][0].numpy().T))
            wav = librosa.core.griffinlim(spec, hop_length=256, win_length=1024)
            self.visualizer.add_audio(f"predicted/sentence_{i}", wav,
                                      self.iteration, sample_rate=22050)


def main_sp(config, args):
    exp = TacotronVCTKExperiment(config, args)
    exp.setup()
    exp.run()


def main(config, args):
    if args.nprocs > 1 and args.device == "gpu":
        dist.spawn(main_sp, args=(config, args), nprocs=args.nprocs)
    else:
        main_sp(config, args)


if __name__ == "__main__":
    config = get_cfg_defaults()
    parser = default_argument_parser()
    args = parser.parse_args()
    if args.config:
        config.merge_from_file(args.config)
    if args.opts:
        config.merge_from_list(args.opts)
    config.freeze()
    print(config)
    print(args)

    main(config, args)
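
For reference, a single-process run of this entry point would typically be launched along these lines (the flag names are inferred from the args fields used above; the data path is a placeholder for a preprocessed VCTK directory):

python train.py --data=path/to/preprocessed_vctk --device=gpu --nprocs=1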


@@ -0,0 +1,55 @@
import numpy as np
import paddle
from paddle.io import Dataset
from pathlib import Path
import yaml
import pickle
from parakeet.frontend.vocab import Vocab
from parakeet.data import batch_spec, batch_text_id


class VCTK(Dataset):
    """Dataset of preprocessed VCTK examples: (phoneme ids, mel spectrogram, speaker id)."""

    def __init__(self, root):
        self.root = Path(root)
        record_path = self.root / "metadata.pickle"
        self.wav_root = self.root / "wav"
        self.mel_root = self.root / "mel"

        with open(record_path, 'rb') as f:
            self.metadata = pickle.load(f)

        with open(self.root / "vocab" / "phonemes.yaml", 'rt') as f:
            phonemes = yaml.safe_load(f)
        self.phoneme_vocab = Vocab(phonemes)

        with open(self.root / "vocab" / "speakers.yaml", 'rt') as f:
            speakers = yaml.safe_load(f)
        self.speaker_vocab = Vocab(speakers,
                                   padding_symbol=None,
                                   unk_symbol=None,
                                   start_symbol=None,
                                   end_symbol=None)

    def __getitem__(self, idx):
        metadatum = self.metadata[idx]
        fileid = metadatum['id']
        speaker_id = fileid.split('_')[0]
        s_id = self.speaker_vocab.lookup(speaker_id)
        phonemes = np.array(
            [self.phoneme_vocab.lookup(item) for item in metadatum['phonemes']],
            dtype=np.int64)
        mel_path = (self.mel_root / speaker_id / fileid).with_suffix(".npy")
        mel = np.load(mel_path).astype(np.float32)
        example = (phonemes, mel, s_id)
        return example

    def __len__(self):
        return len(self.metadata)


def collate_vctk_examples(examples):
    """Pad a list of (phonemes, mel, speaker_id) examples into batched arrays."""
    phonemes, mels, speaker_ids = list(zip(*examples))
    plens = np.array([item.shape[0] for item in phonemes], dtype=np.int64)
    slens = np.array([item.shape[1] for item in mels], dtype=np.int64)
    speaker_ids = np.array(speaker_ids, dtype=np.int64)

    phonemes = batch_text_id(phonemes, pad_id=0)
    mels = np.transpose(batch_spec(mels, pad_value=0.), [0, 2, 1])  # (B, T, n_mels)
    return phonemes, plens, mels, slens, speaker_ids
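
As a quick sanity check of the collate function, a hypothetical two-example batch (random arrays standing in for real features, assuming the parakeet batching utilities return numpy arrays) would produce shapes like:

import numpy as np

examples = [
    (np.array([1, 2, 3], dtype=np.int64), np.random.randn(80, 50).astype(np.float32), 0),
    (np.array([4, 5], dtype=np.int64), np.random.randn(80, 30).astype(np.float32), 1),
]
phonemes, plens, mels, slens, speaker_ids = collate_vctk_examples(examples)
print(phonemes.shape)  # (2, 3): phoneme ids padded to the longest sequence
print(mels.shape)      # (2, 50, 80): batch, padded frames, mel bands
print(plens, slens)    # [3 2] [50 30]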


@@ -597,7 +597,7 @@ class Tacotron2(nn.Layer):
num_layers=postnet_conv_layers,
dropout=p_postnet_dropout)
-    def forward(self, text_inputs, mels, text_lens, output_lens=None, speaker_ids=None):
+    def forward(self, text_inputs, text_lens, mels, output_lens=None, speaker_ids=None):
"""Calculate forward propagation of tacotron2.
Parameters