add vctk example for refactored tacotron
parent 7cc3e8c340, commit 4a039b6407
@@ -0,0 +1,74 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from yacs.config import CfgNode as CN

_C = CN()
_C.data = CN(
    dict(
        batch_size=32,  # batch size
        valid_size=5,  # the first N examples are reserved for validation
        sample_rate=22050,  # Hz, sample rate
        n_fft=1024,  # fft frame size
        win_length=1024,  # window size
        hop_length=256,  # hop size between adjacent frames
        f_max=8000,  # Hz, max frequency when converting to mel
        f_min=0,  # Hz, min frequency when converting to mel
        d_mels=80,  # mel bands
        padding_idx=0,  # text embedding's padding index
    ))

_C.model = CN(
    dict(
        vocab_size=37,  # set this according to the frontend's vocab size
        num_speakers=109,  # set this according to the dataset you use
        d_speaker=32,  # speaker embedding size
        reduction_factor=1,  # reduction factor
        d_encoder=512,  # embedding & encoder's internal size
        encoder_conv_layers=3,  # number of conv layers in tacotron2 encoder
        encoder_kernel_size=5,  # kernel size of conv layers in tacotron2 encoder
        d_prenet=256,  # hidden size of decoder prenet
        d_attention_rnn=1024,  # hidden size of the first rnn layer in tacotron2 decoder
        d_decoder_rnn=1024,  # hidden size of the second rnn layer in tacotron2 decoder
        d_attention=128,  # hidden size of decoder location linear layer
        attention_filters=32,  # number of filters in decoder location conv layer
        attention_kernel_size=31,  # kernel size of decoder location conv layer
        d_postnet=512,  # hidden size of decoder postnet
        postnet_kernel_size=5,  # kernel size of conv layers in postnet
        postnet_conv_layers=5,  # number of conv layers in decoder postnet
        p_encoder_dropout=0.5,  # dropout probability in encoder
        p_prenet_dropout=0.5,  # dropout probability in decoder prenet
        p_attention_dropout=0.1,  # dropout probability of first rnn layer in decoder
        p_decoder_dropout=0.1,  # dropout probability of second rnn layer in decoder
        p_postnet_dropout=0.5,  # dropout probability in decoder postnet
        guided_attn_loss_sigma=0.2,  # sigma in guided attention loss
    ))

_C.training = CN(
    dict(
        lr=1e-3,  # learning rate
        weight_decay=1e-6,  # the coeff of weight decay
        grad_clip_thresh=1.0,  # the clip norm of grad clip
        plot_interval=1000,  # plot attention and spectrogram every N iterations
        valid_interval=1000,  # run validation every N iterations
        save_interval=1000,  # save a checkpoint every N iterations
        max_iteration=500000,  # max iteration to train
    ))


def get_cfg_defaults():
    """Get a yacs CfgNode object with default values for my_project."""
    # Return a clone so that the defaults will not be altered
    # This is for the "local variable" use pattern
    return _C.clone()
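A minimal sketch of how the config module above is typically consumed, mirroring the __main__ block of the training script later in this commit; vctk.yaml is a hypothetical override file, not part of this commit.

from config import get_cfg_defaults

config = get_cfg_defaults()  # a fresh clone of the defaults
config.merge_from_file("vctk.yaml")  # hypothetical YAML override file
config.merge_from_list(["data.batch_size", "16"])  # CLI-style key/value overrides
config.freeze()  # make the config read-only before training
print(config.model.d_encoder)  # nested attribute access: 512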
@@ -0,0 +1,190 @@
import time
from collections import defaultdict
import numpy as np
import librosa

import paddle
from paddle import distributed as dist
from paddle import DataParallel
from paddle.io import DataLoader, DistributedBatchSampler
from paddle.optimizer import Adam

import parakeet
from parakeet.data import dataset
from parakeet.frontend import EnglishCharacter
from parakeet.training.cli import default_argument_parser
from parakeet.training.experiment import ExperimentBase
from parakeet.utils import display, mp_tools
from parakeet.models.tacotron2 import Tacotron2, Tacotron2Loss

from config import get_cfg_defaults
from vctk import VCTK, collate_vctk_examples


class TacotronVCTKExperiment(ExperimentBase):
    def setup_model(self):
        config = self.config
        model_config = config.model
        data_config = config.data

        model = Tacotron2(
            vocab_size=model_config.vocab_size,
            num_speakers=model_config.num_speakers,
            d_speaker=model_config.d_speaker,
            d_mels=data_config.d_mels,
            d_encoder=model_config.d_encoder,
            encoder_conv_layers=model_config.encoder_conv_layers,
            encoder_kernel_size=model_config.encoder_kernel_size,
            d_prenet=model_config.d_prenet,
            d_attention_rnn=model_config.d_attention_rnn,
            d_decoder_rnn=model_config.d_decoder_rnn,
            attention_filters=model_config.attention_filters,
            attention_kernel_size=model_config.attention_kernel_size,
            d_attention=model_config.d_attention,
            d_postnet=model_config.d_postnet,
            postnet_kernel_size=model_config.postnet_kernel_size,
            postnet_conv_layers=model_config.postnet_conv_layers,
            reduction_factor=model_config.reduction_factor,
            p_encoder_dropout=model_config.p_encoder_dropout,
            p_prenet_dropout=model_config.p_prenet_dropout,
            p_attention_dropout=model_config.p_attention_dropout,
            p_decoder_dropout=model_config.p_decoder_dropout,
            p_postnet_dropout=model_config.p_postnet_dropout)
        # keep a reference to the unwrapped model for inference in valid()
        self.model_core = model
        self.model = DataParallel(model) if self.parallel else model

        grad_clip = paddle.nn.ClipGradByGlobalNorm(
            config.training.grad_clip_thresh)
        optimizer = Adam(learning_rate=config.training.lr,
                         parameters=model.parameters(),
                         weight_decay=paddle.regularizer.L2Decay(
                             config.training.weight_decay),
                         grad_clip=grad_clip)
        self.optimizer = optimizer

        criterion = Tacotron2Loss(config.model.guided_attn_loss_sigma)
        self.criterion = criterion

    def setup_dataloader(self):
        config = self.config
        args = self.args

        # do not name this `dataset`: that would shadow the imported module
        vctk_dataset = VCTK(args.data)
        valid_dataset, train_dataset = dataset.split(vctk_dataset,
                                                     config.data.valid_size)
        if self.parallel:
            sampler = DistributedBatchSampler(
                train_dataset,
                batch_size=config.data.batch_size,
                shuffle=True,
                drop_last=True)
            self.train_loader = DataLoader(train_dataset,
                                           batch_sampler=sampler,
                                           collate_fn=collate_vctk_examples,
                                           num_workers=4)
        else:
            self.train_loader = DataLoader(train_dataset,
                                           batch_size=config.data.batch_size,
                                           collate_fn=collate_vctk_examples,
                                           num_workers=8,
                                           shuffle=True,
                                           drop_last=True)
        self.valid_loader = DataLoader(valid_dataset,
                                       batch_size=1,
                                       collate_fn=collate_vctk_examples,
                                       num_workers=1,
                                       shuffle=False,
                                       drop_last=False)

    def train_batch(self):
        if self.parallel:
            dist.barrier()

        start = time.time()
        batch = self.read_batch()
        data_loader_time = time.time() - start

        self.optimizer.clear_grad()
        self.model.train()
        phonemes, plens, mels, slens, speaker_ids = batch

        outputs = self.model(phonemes, plens, mels, slens, speaker_ids)

        losses = self.criterion(outputs["mel_output"],
                                outputs["mel_outputs_postnet"],
                                mels,
                                outputs["alignments"],
                                slens,
                                plens)
        loss = losses["loss"]
        loss.backward()
        self.optimizer.step()
        iteration_time = time.time() - start

        losses_np = {k: float(v) for k, v in losses.items()}
        # logging
        msg = "Rank: {}, ".format(dist.get_rank())
        msg += "step: {}, ".format(self.iteration)
        msg += "time: {:>.3f}s/{:>.3f}s, ".format(data_loader_time,
                                                  iteration_time)
        msg += ', '.join('{}: {:>.6f}'.format(k, v)
                         for k, v in losses_np.items())
        self.logger.info(msg)

        if dist.get_rank() == 0:
            for k, v in losses_np.items():
                self.visualizer.add_scalar(f"train_loss/{k}", v,
                                           self.iteration)

    @mp_tools.rank_zero_only
    @paddle.no_grad()
    def valid(self):
        # run inference on the reserved examples and log the results
        self.model.eval()
        model_core = self.model_core
        for i, batch in enumerate(self.valid_loader):
            phonemes, plens, mels, slens, speaker_ids = batch
            outputs = model_core.infer(phonemes, speaker_ids=speaker_ids)

            fig = display.plot_spectrogram(
                outputs["mel_outputs_postnet"][0].numpy().T)
            self.visualizer.add_figure(f"sentence_{i}/predicted_mel", fig,
                                       self.iteration)

            fig = display.plot_spectrogram(mels[0].numpy().T)
            self.visualizer.add_figure(f"sentence_{i}/ground_truth_mel", fig,
                                       self.iteration)

            fig = display.plot_alignment(outputs["alignments"][0].numpy())
            self.visualizer.add_figure(f"sentence_{i}/alignment", fig,
                                       self.iteration)

            # invert the predicted mel spectrogram to a waveform with
            # Griffin-Lim for a quick audio preview
            mel_basis = librosa.filters.mel(22050, n_fft=1024, n_mels=80,
                                            fmin=0, fmax=8000)
            _inv_mel_basis = np.linalg.pinv(mel_basis)
            spec = np.matmul(
                _inv_mel_basis,
                np.exp(outputs["mel_outputs_postnet"][0].numpy().T))
            wav = librosa.core.griffinlim(spec, hop_length=256, win_length=1024)
            self.visualizer.add_audio(f"predicted/sentence_{i}", wav,
                                      self.iteration, sample_rate=22050)


def main_sp(config, args):
    exp = TacotronVCTKExperiment(config, args)
    exp.setup()
    exp.run()


def main(config, args):
    if args.nprocs > 1 and args.device == "gpu":
        dist.spawn(main_sp, args=(config, args), nprocs=args.nprocs)
    else:
        main_sp(config, args)


if __name__ == "__main__":
    config = get_cfg_defaults()
    parser = default_argument_parser()
    args = parser.parse_args()
    if args.config:
        config.merge_from_file(args.config)
    if args.opts:
        config.merge_from_list(args.opts)
    config.freeze()
    print(config)
    print(args)

    main(config, args)
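For reference, a launch sketch, assuming default_argument_parser provides the --config, --data, --device, --nprocs and --opts flags that main() and the __main__ block read from args; the script name train.py and the data path are hypothetical.

# single-process training on one GPU
python train.py --data=/path/to/preprocessed_vctk --device=gpu

# multi-GPU data-parallel training via paddle.distributed.spawn
python train.py --data=/path/to/preprocessed_vctk --device=gpu --nprocs=2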
@@ -0,0 +1,55 @@
import pickle
from pathlib import Path

import numpy as np
import paddle
import yaml
from paddle.io import Dataset

from parakeet.frontend.vocab import Vocab
from parakeet.data import batch_spec, batch_text_id


class VCTK(Dataset):
    def __init__(self, root):
        # coerce to Path so the `/` joins below work for plain str paths too
        self.root = Path(root)
        record_path = self.root / "metadata.pickle"
        self.wav_root = self.root / "wav"
        self.mel_root = self.root / "mel"
        with open(record_path, 'rb') as f:
            self.metadata = pickle.load(f)
        with open(self.root / "vocab" / "phonemes.yaml", 'rt') as f:
            phonemes = yaml.safe_load(f)
        self.phoneme_vocab = Vocab(phonemes)
        with open(self.root / "vocab" / "speakers.yaml", 'rt') as f:
            speakers = yaml.safe_load(f)
        self.speaker_vocab = Vocab(speakers,
                                   padding_symbol=None,
                                   unk_symbol=None,
                                   start_symbol=None,
                                   end_symbol=None)

    def __getitem__(self, idx):
        metadatum = self.metadata[idx]
        fileid = metadatum['id']
        speaker_id = fileid.split('_')[0]
        s_id = self.speaker_vocab.lookup(speaker_id)
        phonemes = np.array(
            [self.phoneme_vocab.lookup(item) for item in metadatum['phonemes']],
            dtype=np.int64)
        mel_path = (self.mel_root / speaker_id / fileid).with_suffix(".npy")
        mel = np.load(mel_path).astype(np.float32)

        example = (phonemes, mel, s_id)
        return example

    def __len__(self):
        return len(self.metadata)


def collate_vctk_examples(examples):
    phonemes, mels, speaker_ids = list(zip(*examples))
    plens = np.array([item.shape[0] for item in phonemes], dtype=np.int64)
    slens = np.array([item.shape[1] for item in mels], dtype=np.int64)
    speaker_ids = np.array(speaker_ids, dtype=np.int64)

    phonemes = batch_text_id(phonemes, pad_id=0)
    # batch_spec pads to (B, d_mels, T); transpose to (B, T, d_mels) for the model
    mels = np.transpose(batch_spec(mels, pad_value=0.), [0, 2, 1])
    return phonemes, plens, mels, slens, speaker_ids
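A quick smoke-test sketch for the dataset and collate function above, assuming a preprocessed VCTK root laid out the way __init__ expects (metadata.pickle, mel/, and vocab/ with phonemes.yaml and speakers.yaml); the path is hypothetical.

from paddle.io import DataLoader
from vctk import VCTK, collate_vctk_examples

ds = VCTK("/path/to/preprocessed_vctk")
phonemes, mel, s_id = ds[0]  # (T_text,) int64, (d_mels, T_mel) float32, int
loader = DataLoader(ds, batch_size=4, collate_fn=collate_vctk_examples)
phonemes, plens, mels, slens, speaker_ids = next(iter(loader))
# phonemes: (B, T_text), padded with 0; mels: (B, T_mel, d_mels), padded with 0.0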
@@ -597,7 +597,7 @@ class Tacotron2(nn.Layer):
             num_layers=postnet_conv_layers,
             dropout=p_postnet_dropout)
 
-    def forward(self, text_inputs, mels, text_lens, output_lens=None, speaker_ids=None):
+    def forward(self, text_inputs, text_lens, mels, output_lens=None, speaker_ids=None):
         """Calculate forward propagation of tacotron2.
 
         Parameters
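The reordering makes positional calls line up with the collated batch, which is what train_batch above relies on; a sketch, assuming a constructed Tacotron2 model and a batch from collate_vctk_examples:

# (phonemes, plens, mels, slens, speaker_ids) now maps onto
# (text_inputs, text_lens, mels, output_lens, speaker_ids) positionally
outputs = model(phonemes, plens, mels, slens, speaker_ids)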