add FastSpeech

This commit is contained in:
lifuchen 2020-01-03 08:25:17 +00:00 committed by chenfeiyu
parent 9fe6ad11f0
commit 2179d6d5b0
29 changed files with 1457 additions and 442 deletions

View File

View File

@ -0,0 +1,148 @@
from pathlib import Path
import numpy as np
import pandas as pd
import librosa
from paddle import fluid
from parakeet import g2p
from parakeet import audio
from parakeet.data.sampler import *
from parakeet.data.datacargo import DataCargo
from parakeet.data.dataset import Dataset
from parakeet.data.batch import TextIDBatcher, SpecBatcher
class LJSpeechLoader:
    """Wrap the LJSpeech dataset in a Paddle ``DataLoader``.

    The configured loader is exposed as ``self.reader``.
    """

    def __init__(self, config, nranks, rank, is_vocoder=False):
        """Create the dataset, the distributed sampler and the batch reader.

        Args:
            config: Configuration object. ``use_gpu``, ``data_path`` and
                ``batch_size`` are read here; the object is also forwarded
                to ``LJSpeech``.
            nranks: Number of ranks; ``config.batch_size`` must be a
                multiple of it.
            rank: Index of this rank. Also used as the CUDA device index
                when ``config.use_gpu`` is set.
            is_vocoder: Collate with ``batch_examples_vocoder`` instead of
                ``batch_examples`` when true.
        """
        if config.use_gpu:
            place = fluid.CUDAPlace(rank)
        else:
            place = fluid.CPUPlace()

        data_root = Path(config.data_path)
        dataset = LJSpeech(data_root, config)
        sampler = DistributedSampler(len(dataset), nranks, rank)

        # The global batch is split evenly between the ranks.
        assert config.batch_size % nranks == 0
        each_bs = config.batch_size // nranks

        # Both modes share every loader option except the collate function.
        collate = batch_examples_vocoder if is_vocoder else batch_examples
        dataloader = DataCargo(
            dataset,
            sampler=sampler,
            batch_size=each_bs,
            shuffle=True,
            collate_fn=collate,
            drop_last=True)

        self.reader = fluid.io.DataLoader.from_generator(
            capacity=32,
            iterable=True,
            use_double_buffer=True,
            return_list=True)
        self.reader.set_batch_generator(dataloader, place)
class LJSpeech(Dataset):
    """LJSpeech dataset yielding ``(mag, mel, phonemes)`` examples.

    Metadata is read from ``<root>/metadata.csv`` and audio from
    ``<root>/wavs/<fname>.wav``.
    """

    def __init__(self, root, config):
        """Load the metadata table located under ``root``.

        Args:
            root: Dataset directory, given as ``str`` or ``Path``.
            config: Configuration object; only stored on the instance here.
        """
        super(LJSpeech, self).__init__()
        assert isinstance(root, (str, Path)), "root should be a string or Path object"
        if not isinstance(root, Path):
            root = Path(root)
        self.root = root
        self.metadata = self._prepare_metadata()
        self.config = config

    def _prepare_metadata(self):
        """Read the pipe-separated ``metadata.csv`` into a DataFrame."""
        # quoting=3 is csv.QUOTE_NONE: quote characters are kept as text.
        return pd.read_csv(
            self.root.joinpath("metadata.csv"),
            sep="|",
            header=None,
            quoting=3,
            names=["fname", "raw_text", "normalized_text"])

    def _get_example(self, metadatum):
        """Build one example from a metadata row.

        Override this method to use a different preprocessing pipeline.

        Args:
            metadatum: A row unpacking to
                ``(fname, raw_text, normalized_text)``.

        Returns:
            Tuple ``(mag, mel, phonemes)``: two float32 spectrogram arrays
            and an int64 array of phoneme ids.
        """
        fname, _raw_text, normalized_text = metadatum
        wav_file = self.root / "wavs" / (fname + ".wav")

        sample_rate = 22050
        # NOTE(review): the processor is rebuilt for every example and its
        # settings are hard-coded rather than taken from self.config.
        processor = audio.AudioProcessor(
            sample_rate=sample_rate,
            num_mels=80,
            min_level_db=-100,
            ref_level_db=20,
            n_fft=2048,
            win_length=int(sample_rate * 0.05),
            hop_length=int(sample_rate * 0.0125),
            power=1.2,
            preemphasis=0.97,
            signal_norm=True,
            symmetric_norm=False,
            max_norm=1.,
            mel_fmin=0,
            mel_fmax=None,
            clip_norm=True,
            griffin_lim_iters=60,
            do_trim_silence=False,
            sound_norm=False)

        # load -> trim -> preemphasis -> stft -> magnitude -> mel_scale -> logscale -> normalize
        waveform = processor.load_wav(str(wav_file))
        mag = processor.spectrogram(waveform).astype(np.float32)
        mel = processor.melspectrogram(waveform).astype(np.float32)
        phonemes = np.array(
            g2p.en.text_to_sequence(normalized_text), dtype=np.int64)
        # maybe we need to implement it as a map in the future
        return (mag, mel, phonemes)

    def __getitem__(self, index):
        """Return the example built from metadata row ``index``."""
        return self._get_example(self.metadata.iloc[index])

    def __iter__(self):
        """Yield every example in metadata order."""
        for position in range(len(self)):
            yield self[position]

    def __len__(self):
        """Return the number of metadata rows."""
        return len(self.metadata)
def batch_examples(batch):
    """Collate ``(mag, mel, text)`` examples into padded training arrays.

    Examples are ordered by text length, longest first, and then padded to
    the longest sequence in the batch.

    Args:
        batch: Iterable of ``(mag, mel, text)`` tuples; ``mag`` is ignored.
            ``mel`` is indexed as a 2-D array whose second axis is
            presumably time - TODO confirm against the audio processor.

    Returns:
        Tuple ``(texts, mels, mel_inputs, pos_texts, pos_mels, text_lens)``.
    """
    texts, mels, mel_inputs = [], [], []
    text_lens, pos_texts, pos_mels = [], [], []

    for _, mel, text in batch:
        # Decoder input: the target shifted right by one step along the
        # last axis, with a column of zeros in front.
        first_column = np.zeros([mel.shape[0], 1], np.float32)
        mel_inputs.append(
            np.concatenate([first_column, mel[:, :-1]], axis=-1))
        text_lens.append(len(text))
        # Positions are 1-based; 0 is the padding id used below.
        pos_texts.append(np.arange(1, len(text) + 1))
        pos_mels.append(np.arange(1, mel.shape[1] + 1))
        mels.append(mel)
        texts.append(text)

    # Sort by text_len in descending order. The sort is stable, so one
    # index permutation reorders every list consistently.
    order = sorted(
        range(len(text_lens)), key=lambda idx: text_lens[idx], reverse=True)

    def reorder(items):
        return [items[idx] for idx in order]

    texts = reorder(texts)
    mels = reorder(mels)
    mel_inputs = reorder(mel_inputs)
    pos_texts = reorder(pos_texts)
    pos_mels = reorder(pos_mels)
    text_lens = reorder(text_lens)

    # Pad sequence with largest len of the batch
    texts = TextIDBatcher(pad_id=0)(texts)
    pos_texts = TextIDBatcher(pad_id=0)(pos_texts)
    pos_mels = TextIDBatcher(pad_id=0)(pos_mels)
    mels = np.transpose(SpecBatcher(pad_value=0.)(mels), axes=(0, 2, 1))
    mel_inputs = np.transpose(
        SpecBatcher(pad_value=0.)(mel_inputs), axes=(0, 2, 1))
    return (texts, mels, mel_inputs, pos_texts, pos_mels, np.array(text_lens))
def batch_examples_vocoder(batch):
    """Collate ``(mag, mel, text)`` examples for vocoder training.

    Args:
        batch: Iterable of ``(mag, mel, text)`` tuples; ``text`` is ignored.

    Returns:
        Tuple ``(mels, mags)`` of zero-padded arrays whose last two axes
        have been swapped.
    """
    mel_list, mag_list = [], []
    for mag, mel, _ in batch:
        mel_list.append(mel)
        mag_list.append(mag)

    padded_mels = SpecBatcher(pad_value=0.)(mel_list)
    mels = np.transpose(padded_mels, axes=(0, 2, 1))
    padded_mags = SpecBatcher(pad_value=0.)(mag_list)
    mags = np.transpose(padded_mags, axes=(0, 2, 1))
    return (mels, mags)

View File

View File

@ -0,0 +1,41 @@
audio:
num_mels: 80
n_fft: 2048
sr: 22050
preemphasis: 0.97
hop_length: 275
win_length: 1102
power: 1.2
min_level_db: -100
ref_level_db: 20
outputs_per_step: 1
encoder_n_layer: 6
encoder_head: 2
encoder_conv1d_filter_size: 1536
max_sep_len: 2048
encoder_output_size: 384
word_vec_dim: 384
decoder_n_layer: 6
decoder_head: 2
decoder_conv1d_filter_size: 1536
decoder_output_size: 384
d_model: 384
duration_predictor_output_size: 256
duration_predictor_filter_size: 3
fft_conv1d_filter: 3
fft_conv1d_padding: 1
batch_size: 32
epochs: 10000
lr: 0.001
save_step: 500
image_step: 2000
use_gpu: False
use_data_parallel: False
data_path: ../../../dataset/LJSpeech-1.1
transtts_path: ./checkpoint
transformer_step: 70000
log_dir: ./log

View File

@ -0,0 +1,43 @@
audio:
num_mels: 80
n_fft: 2048
sr: 22050
preemphasis: 0.97
hop_length: 275
win_length: 1102
power: 1.2
min_level_db: -100
ref_level_db: 20
outputs_per_step: 1
encoder_n_layer: 6
encoder_head: 2
encoder_conv1d_filter_size: 1536
max_sep_len: 2048
encoder_output_size: 384
embedding_size: 384
decoder_n_layer: 6
decoder_head: 2
decoder_conv1d_filter_size: 1536
decoder_output_size: 384
hidden_size: 384
duration_predictor_output_size: 256
duration_predictor_filter_size: 3
fft_conv1d_filter: 3
fft_conv1d_padding: 1
dropout: 0.1
transformer_head: 4
warm_up_step: 4000
grad_clip_thresh: 0.1
batch_size: 32
epochs: 10000
lr: 0.001
save_step: 500
use_gpu: True
use_data_parallel: False
data_path: ../../../dataset/LJSpeech-1.1
transtts_path: ../transformerTTS/checkpoint
transformer_step: 20
log_dir: ./log

View File

@ -0,0 +1,124 @@
import torch
from torch.nn import functional as F
from torch.utils.data import Dataset, DataLoader
import numpy as np
import math
import os
import hparams
import Audio
from text import text_to_sequence
from utils import process_text, pad_1D, pad_2D
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
class FastSpeechDataset(Dataset):
    """LJSpeech samples: text ids, ground-truth mel and duration array."""

    def __init__(self):
        """Load the training text list from ``data/train.txt``."""
        self.text = process_text(os.path.join("data", "train.txt"))

    def __len__(self):
        """Return the number of loaded text entries."""
        return len(self.text)

    def __getitem__(self, idx):
        """Load one sample from disk.

        Args:
            idx: Zero-based sample index. The mel file name uses
                ``idx + 1`` while the alignment file name uses ``idx``.

        Returns:
            Dict with keys ``"text"``, ``"mel_target"`` and ``"D"``.
        """
        mel_file = os.path.join(
            hparams.mel_ground_truth, "ljspeech-mel-%05d.npy" % (idx + 1))
        mel_gt_target = np.load(mel_file)

        duration_file = os.path.join(hparams.alignment_path, str(idx) + ".npy")
        D = np.load(duration_file)

        # Drop the final character of the entry before converting to ids
        # (presumably a trailing newline - TODO confirm in process_text).
        line = self.text[idx][:-1]
        character = np.array(text_to_sequence(line, hparams.text_cleaners))

        return {"text": character,
                "mel_target": mel_gt_target,
                "D": D}
def reprocess(batch, cut_list):
    """Assemble one padded mini-batch from the selected samples.

    Args:
        batch: Sequence of sample dicts with keys ``"text"``,
            ``"mel_target"`` and ``"D"``.
        cut_list: Indices into ``batch`` selecting the samples to use.

    Returns:
        Dict with padded ``"text"``, ``"mel_target"`` and ``"D"``, the
        1-based position arrays ``"src_pos"`` and ``"mel_pos"`` (0 marks
        padding), and the integer ``"mel_max_len"``.
    """
    def position_rows(lengths, width):
        # One row per sequence: 1..length followed by zeros up to `width`.
        rows = [np.pad(list(range(1, n + 1)), (0, width - n), 'constant')
                for n in lengths]
        return np.array(rows)

    texts = [batch[ind]["text"] for ind in cut_list]
    mel_targets = [batch[ind]["mel_target"] for ind in cut_list]
    Ds = [batch[ind]["D"] for ind in cut_list]

    text_lengths = [int(text.shape[0]) for text in texts]
    max_len = int(max(text_lengths))
    src_pos = position_rows(text_lengths, max_len)

    mel_lengths = [int(mel.shape[0]) for mel in mel_targets]
    max_mel_len = int(max(mel_lengths))
    mel_pos = position_rows(mel_lengths, max_mel_len)

    texts = pad_1D(texts)
    Ds = pad_1D(Ds)
    mel_targets = pad_2D(mel_targets)

    return {"text": texts,
            "mel_target": mel_targets,
            "D": Ds,
            "mel_pos": mel_pos,
            "src_pos": src_pos,
            "mel_max_len": max_mel_len}
def collate_fn(batch):
    """Split a loaded batch into several length-sorted mini-batches.

    The samples are ordered by text length, longest first, and cut into
    ``k`` consecutive groups of ``k`` samples, where
    ``k = int(sqrt(len(batch)))``. Samples beyond ``k * k`` are not used.

    Args:
        batch: List of sample dicts as produced by the dataset.

    Returns:
        List of ``k`` mini-batch dicts built by ``reprocess``.
    """
    text_lengths = np.array([sample["text"].shape[0] for sample in batch])
    by_length_desc = np.argsort(-text_lengths)

    group_size = int(math.sqrt(len(batch)))
    groups = [by_length_desc[start * group_size:(start + 1) * group_size]
              for start in range(group_size)]
    return [reprocess(batch, group) for group in groups]
if __name__ == "__main__":
    # Smoke test: count the mini-batches whose mel length (axis 1) equals
    # the sum of their duration targets.
    dataset = FastSpeechDataset()
    training_loader = DataLoader(dataset,
                                 batch_size=1,
                                 shuffle=False,
                                 collate_fn=collate_fn,
                                 drop_last=True,
                                 num_workers=0)
    total_step = hparams.epochs * len(training_loader) * hparams.batch_size

    cnt = 0
    for batchs in training_loader:
        for data_of_batch in batchs:
            mel_array = data_of_batch["mel_target"]
            mel_target = torch.from_numpy(mel_array).float().to(device)
            D = torch.from_numpy(data_of_batch["D"]).int().to(device)
            print(cnt)
            lengths_match = mel_target.size(1) == D.sum().item()
            if lengths_match:
                cnt += 1

    print(cnt)

View File

@ -0,0 +1,117 @@
import numpy as np
import math
import utils
import paddle.fluid.dygraph as dg
import paddle.fluid.layers as layers
import paddle.fluid as fluid
from parakeet.modules.layers import Conv1D
from parakeet.modules.multihead_attention import MultiheadAttention
from parakeet.modules.feed_forward import PositionwiseFeedForward
class FFTBlock(dg.Layer):
    """FFT block: multi-head self-attention followed by a position-wise
    feed-forward network."""

    def __init__(self, d_model, d_inner, n_head, d_k, d_v, filter_size, padding, dropout=0.2):
        """Create the attention and feed-forward sub-layers.

        Args:
            d_model: Forwarded to both sub-layers as their first argument.
            d_inner: Second argument of the feed-forward sub-layer.
            n_head: Number of attention heads.
            d_k: Second argument of the attention sub-layer.
            d_v: Third argument of the attention sub-layer.
            filter_size: Filter size passed to the feed-forward sub-layer.
            padding: Padding passed to the feed-forward sub-layer.
            dropout: Dropout value passed to both sub-layers.
        """
        super(FFTBlock, self).__init__()
        self.slf_attn = MultiheadAttention(
            d_model, d_k, d_v, num_head=n_head, dropout=dropout)
        self.pos_ffn = PositionwiseFeedForward(
            d_model, d_inner, filter_size=filter_size, padding=padding,
            dropout=dropout)

    def forward(self, enc_input, non_pad_mask=None, slf_attn_mask=None):
        """Run self-attention and the feed-forward network.

        Args:
            enc_input: Input sequence, used as query, key and value.
            non_pad_mask: Optional mask multiplied into the output of each
                sub-layer. Skipped when ``None``.
            slf_attn_mask: Optional mask forwarded to the attention layer.

        Returns:
            Tuple ``(enc_output, enc_slf_attn)``.
        """
        enc_output, enc_slf_attn = self.slf_attn(
            enc_input, enc_input, enc_input, mask=slf_attn_mask)
        # The signature defaults the mask to None, so only apply it when it
        # was actually supplied.
        if non_pad_mask is not None:
            enc_output *= non_pad_mask

        enc_output = self.pos_ffn(enc_output)
        if non_pad_mask is not None:
            enc_output *= non_pad_mask

        return enc_output, enc_slf_attn
class LengthRegulator(dg.Layer):
    """Expand a token-level sequence to frame level using per-token
    durations, and predict those durations."""

    def __init__(self, input_size, out_channels, filter_size, dropout=0.1):
        """Create the duration predictor.

        All arguments are forwarded unchanged to ``DurationPredictor``.
        """
        super(LengthRegulator, self).__init__()
        self.duration_predictor = DurationPredictor(input_size=input_size,
                                                    out_channels=out_channels,
                                                    filter_size=filter_size,
                                                    dropout=dropout)

    def LR(self, x, duration_predictor_output, alpha=1.0):
        """Expand every sequence in the batch and pad to a common length.

        Args:
            x: Batch of sequences; axis 0 is the batch, axis 1 the steps.
            duration_predictor_output: Per-step repeat counts, indexed like
                the first two axes of ``x``.
            alpha: Factor applied to each repeat count before truncation.

        Returns:
            The expanded sequences stacked into one padded variable.
        """
        expanded = [
            self.expand(x[i:i + 1], duration_predictor_output[i:i + 1], alpha)
            for i in range(x.shape[0])
        ]
        return self.pad(expanded)

    def pad(self, input_ele):
        """Zero-pad each element at the end of axis 0 and stack them."""
        max_len = max(element.shape[0] for element in input_ele)
        out_list = []
        for element in input_ele:
            pad_len = max_len - element.shape[0]
            out_list.append(
                layers.pad(element, [0, pad_len, 0, 0], pad_value=0.0))
        return layers.stack(out_list)

    def expand(self, batch, predicted, alpha):
        """Repeat each step of one sequence according to its duration.

        Args:
            batch: A single sequence with a leading axis of size 1.
            predicted: Durations for that sequence, leading axis of size 1.
            alpha: Factor applied to each duration; the product is
                truncated to an integer repeat count.

        Returns:
            The repeated rows concatenated along axis 0.
        """
        time_steps = batch.shape[1]
        fertilities = predicted.numpy()
        batch = layers.squeeze(batch, [0])

        out = []
        for i in range(time_steps):
            # alpha was previously accepted but never applied. With the
            # default alpha=1.0 the repeat count is unchanged.
            repeat = int(fertilities[0, i] * alpha)
            if repeat == 0:
                continue
            out.append(layers.expand(batch[i: i + 1, :], [repeat, 1]))
        return layers.concat(out, axis=0)

    def forward(self, x, alpha=1.0, target=None):
        """Predict durations and expand ``x``.

        In training mode ``x`` is expanded with ``target`` and the
        predicted durations are returned for the loss. Otherwise ``x`` is
        expanded with the rounded predictions.

        Args:
            x: Token-level sequence batch.
            alpha: Duration scaling factor; only used outside training.
            target: Ground-truth durations; used in training mode.

        Returns:
            ``(output, duration_predictor_output)`` in training mode,
            otherwise ``(output, mel_pos)`` where ``mel_pos`` holds the
            1-based positions of the expanded sequence.
        """
        duration_predictor_output = self.duration_predictor(x)
        if fluid.framework._dygraph_tracer()._train_mode:
            output = self.LR(x, target)
            return output, duration_predictor_output

        duration_predictor_output = layers.round(duration_predictor_output)
        output = self.LR(x, duration_predictor_output, alpha)
        # to_variable takes an ndarray, so build the positions with numpy
        # rather than a Python list.
        # NOTE(review): mel_pos is 1-D here; the decoder may expect a
        # leading batch axis - confirm.
        mel_pos = dg.to_variable(np.arange(1, output.shape[1] + 1))
        return output, mel_pos
class DurationPredictor(dg.Layer):
    """Duration predictor: two conv + layer-norm + ReLU + dropout stages
    followed by a linear projection to one value per step."""

    def __init__(self, input_size, out_channels, filter_size, dropout=0.1):
        """Create the convolution, normalisation and projection layers.

        Args:
            input_size: Channel size of the incoming sequence.
            out_channels: Channel size produced by both convolutions.
            filter_size: Filter size of both convolutions.
            dropout: Dropout probability used after each conv stage.
        """
        super(DurationPredictor, self).__init__()
        self.input_size = input_size
        self.out_channels = out_channels
        self.filter_size = filter_size
        self.dropout = dropout

        # NOTE(review): padding is fixed at 1 regardless of filter_size.
        self.conv1 = Conv1D(in_channels=self.input_size,
                            out_channels=self.out_channels,
                            filter_size=self.filter_size,
                            padding=1,
                            data_format='NTC')
        self.conv2 = Conv1D(in_channels=self.out_channels,
                            out_channels=self.out_channels,
                            filter_size=self.filter_size,
                            padding=1,
                            data_format='NTC')
        self.layer_norm1 = dg.LayerNorm(self.out_channels)
        self.layer_norm2 = dg.LayerNorm(self.out_channels)
        self.linear = dg.Linear(self.out_channels, 1)

    def forward(self, encoder_output):
        """Predict one non-negative duration value per input step.

        Args:
            encoder_output: Input sequence, shape (N, T, C).

        Returns:
            The projection output after ReLU, with its last axis squeezed.
        """
        # encoder_output.shape(N, T, C)
        hidden = self.conv1(encoder_output)
        hidden = layers.relu(self.layer_norm1(hidden))
        hidden = layers.dropout(hidden, self.dropout)

        hidden = self.conv2(hidden)
        hidden = layers.relu(self.layer_norm2(hidden))
        hidden = layers.dropout(hidden, self.dropout)

        durations = layers.relu(self.linear(hidden))
        return layers.squeeze(durations, axes=[-1])

View File

@ -0,0 +1,163 @@
from utils import *
from modules import *
import paddle.fluid.dygraph as dg
import paddle.fluid as fluid
from parakeet.g2p.text.symbols import symbols
from parakeet.modules.utils import *
from parakeet.modules.post_convnet import PostConvNet
class Encoder(dg.Layer):
    """Text encoder: token and position embeddings followed by a stack of
    FFT blocks."""

    def __init__(self,
                 n_src_vocab,
                 len_max_seq,
                 d_word_vec,
                 n_layers,
                 n_head,
                 d_k,
                 d_v,
                 d_model,
                 d_inner,
                 fft_conv1d_kernel,
                 fft_conv1d_padding,
                 dropout=0.1):
        """Create the embeddings and the FFT block stack.

        Args:
            n_src_vocab: Size of the token embedding table.
            len_max_seq: Maximum sequence length; the position table has
                ``len_max_seq + 1`` rows because index 0 is padding.
            d_word_vec: Width of both embeddings.
            n_layers: Number of FFT blocks.
            n_head, d_k, d_v, d_model, d_inner: Forwarded to ``FFTBlock``.
            fft_conv1d_kernel: Filter size forwarded to ``FFTBlock``.
            fft_conv1d_padding: Padding forwarded to ``FFTBlock``.
            dropout: Dropout value forwarded to ``FFTBlock``.
        """
        super(Encoder, self).__init__()
        n_position = len_max_seq + 1

        self.src_word_emb = dg.Embedding(
            size=[n_src_vocab, d_word_vec], padding_idx=0)

        # Fixed (non-trainable) sinusoidal position table.
        self.pos_inp = get_sinusoid_encoding_table(
            n_position, d_word_vec, padding_idx=0)
        frozen_table = fluid.ParamAttr(
            initializer=fluid.initializer.NumpyArrayInitializer(self.pos_inp),
            trainable=False)
        self.position_enc = dg.Embedding(size=[n_position, d_word_vec],
                                         padding_idx=0,
                                         param_attr=frozen_table)

        self.layer_stack = [
            FFTBlock(d_model, d_inner, n_head, d_k, d_v,
                     fft_conv1d_kernel, fft_conv1d_padding, dropout=dropout)
            for _ in range(n_layers)
        ]
        # A plain list is used, so each block is registered explicitly.
        for index, block in enumerate(self.layer_stack):
            self.add_sublayer('fft_{}'.format(index), block)

    def forward(self, character, text_pos):
        """Encode a batch of token ids.

        Args:
            character: Token ids, shape (N, T).
            text_pos: Position ids matching ``character``.

        Returns:
            Tuple ``(enc_output, non_pad_mask, enc_slf_attn_list)`` with
            one attention entry per FFT block.
        """
        # -- prepare masks
        # shape character (N, T)
        slf_attn_mask = get_attn_key_pad_mask(seq_k=character, seq_q=character)
        non_pad_mask = get_non_pad_mask(character)

        # -- Forward
        token_embedding = self.src_word_emb(character)
        enc_output = token_embedding + self.position_enc(text_pos)  # (N, T, C)

        enc_slf_attn_list = []
        for block in self.layer_stack:
            enc_output, block_attn = block(
                enc_output,
                non_pad_mask=non_pad_mask,
                slf_attn_mask=slf_attn_mask)
            enc_slf_attn_list.append(block_attn)

        return enc_output, non_pad_mask, enc_slf_attn_list
class Decoder(dg.Layer):
    """Decoder: adds position embeddings to the expanded sequence and
    applies a stack of FFT blocks."""

    def __init__(self,
                 len_max_seq,
                 d_word_vec,
                 n_layers,
                 n_head,
                 d_k,
                 d_v,
                 d_model,
                 d_inner,
                 fft_conv1d_kernel,
                 fft_conv1d_padding,
                 dropout=0.1):
        """Create the position embedding and the FFT block stack.

        Args:
            len_max_seq: Maximum sequence length; the position table has
                ``len_max_seq + 1`` rows because index 0 is padding.
            d_word_vec: Width of the position embedding.
            n_layers: Number of FFT blocks.
            n_head, d_k, d_v, d_model, d_inner: Forwarded to ``FFTBlock``.
            fft_conv1d_kernel: Filter size forwarded to ``FFTBlock``.
            fft_conv1d_padding: Padding forwarded to ``FFTBlock``.
            dropout: Dropout value forwarded to ``FFTBlock``.
        """
        super(Decoder, self).__init__()
        n_position = len_max_seq + 1

        # Fixed (non-trainable) sinusoidal position table.
        self.pos_inp = get_sinusoid_encoding_table(
            n_position, d_word_vec, padding_idx=0)
        frozen_table = fluid.ParamAttr(
            initializer=fluid.initializer.NumpyArrayInitializer(self.pos_inp),
            trainable=False)
        self.position_enc = dg.Embedding(size=[n_position, d_word_vec],
                                         padding_idx=0,
                                         param_attr=frozen_table)

        self.layer_stack = [
            FFTBlock(d_model, d_inner, n_head, d_k, d_v,
                     fft_conv1d_kernel, fft_conv1d_padding, dropout=dropout)
            for _ in range(n_layers)
        ]
        # A plain list is used, so each block is registered explicitly.
        for index, block in enumerate(self.layer_stack):
            self.add_sublayer('fft_{}'.format(index), block)

    def forward(self, enc_seq, enc_pos):
        """Decode the length-regulated sequence.

        Args:
            enc_seq: Expanded encoder output.
            enc_pos: Position ids for ``enc_seq``; also used to build the
                padding masks.

        Returns:
            Tuple ``(dec_output, dec_slf_attn_list)`` with one attention
            entry per FFT block.
        """
        # -- Prepare masks
        slf_attn_mask = get_attn_key_pad_mask(seq_k=enc_pos, seq_q=enc_pos)
        non_pad_mask = get_non_pad_mask(enc_pos)

        # -- Forward
        dec_output = enc_seq + self.position_enc(enc_pos)

        dec_slf_attn_list = []
        for block in self.layer_stack:
            dec_output, block_attn = block(
                dec_output,
                non_pad_mask=non_pad_mask,
                slf_attn_mask=slf_attn_mask)
            dec_slf_attn_list.append(block_attn)

        return dec_output, dec_slf_attn_list
class FastSpeech(dg.Layer):
    """FastSpeech model: encoder, length regulator, decoder, mel
    projection and post-net."""

    def __init__(self, cfg):
        """Build all sub-networks from the configuration.

        Args:
            cfg: Configuration namespace. Reads ``max_sep_len``,
                ``embedding_size``, ``hidden_size``, ``dropout``, the
                ``encoder_*`` / ``decoder_*`` / ``duration_predictor_*`` /
                ``fft_conv1d_*`` options and ``audio.num_mels``.
        """
        super(FastSpeech, self).__init__()

        # One extra vocabulary entry on top of the symbol table.
        self.encoder = Encoder(n_src_vocab=len(symbols) + 1,
                               len_max_seq=cfg.max_sep_len,
                               d_word_vec=cfg.embedding_size,
                               n_layers=cfg.encoder_n_layer,
                               n_head=cfg.encoder_head,
                               d_k=64,
                               d_v=64,
                               d_model=cfg.hidden_size,
                               d_inner=cfg.encoder_conv1d_filter_size,
                               fft_conv1d_kernel=cfg.fft_conv1d_filter,
                               fft_conv1d_padding=cfg.fft_conv1d_padding,
                               dropout=0.1)
        self.length_regulator = LengthRegulator(
            input_size=cfg.hidden_size,
            out_channels=cfg.duration_predictor_output_size,
            filter_size=cfg.duration_predictor_filter_size,
            dropout=cfg.dropout)
        self.decoder = Decoder(len_max_seq=cfg.max_sep_len,
                               d_word_vec=cfg.embedding_size,
                               n_layers=cfg.decoder_n_layer,
                               n_head=cfg.decoder_head,
                               d_k=64,
                               d_v=64,
                               d_model=cfg.hidden_size,
                               d_inner=cfg.decoder_conv1d_filter_size,
                               fft_conv1d_kernel=cfg.fft_conv1d_filter,
                               fft_conv1d_padding=cfg.fft_conv1d_padding,
                               dropout=0.1)
        self.mel_linear = dg.Linear(cfg.decoder_output_size, cfg.audio.num_mels)
        # NOTE(review): n_mels is hard-coded to 80 while mel_linear uses
        # cfg.audio.num_mels - confirm the two are meant to agree.
        self.postnet = PostConvNet(n_mels=80,
                                   num_hidden=512,
                                   filter_size=5,
                                   padding=int(5 / 2),
                                   num_conv=5,
                                   outputs_per_step=1,
                                   use_cudnn=True,
                                   dropout=0.1)

    def forward(self, character, text_pos, mel_pos=None, length_target=None, alpha=1.0):
        """Generate mel spectrograms from token ids.

        Args:
            character: Token ids.
            text_pos: Position ids for ``character``.
            mel_pos: Position ids of the target mel frames (training only).
            length_target: Ground-truth durations (training only).
            alpha: Duration scaling factor forwarded to the length
                regulator.

        Returns:
            Training mode: ``(mel_output, mel_output_postnet,
            duration_predictor_output, enc_slf_attn_list,
            dec_slf_attn_list)``.
            Otherwise: ``(mel_output, mel_output_postnet)``.
        """
        encoder_output, non_pad_mask, enc_slf_attn_list = self.encoder(
            character, text_pos)

        if fluid.framework._dygraph_tracer()._train_mode:
            length_regulator_output, duration_predictor_output = self.length_regulator(
                encoder_output,
                target=length_target,
                alpha=alpha)
            decoder_output, dec_slf_attn_list = self.decoder(
                length_regulator_output, mel_pos)

            mel_output = self.mel_linear(decoder_output)
            mel_output_postnet = self.postnet(mel_output) + mel_output

            return (mel_output, mel_output_postnet, duration_predictor_output,
                    enc_slf_attn_list, dec_slf_attn_list)

        length_regulator_output, decoder_pos = self.length_regulator(
            encoder_output, alpha=alpha)
        # The decoder returns (output, attention list). Unpack it so that
        # only the output reaches the mel projection.
        decoder_output, _ = self.decoder(length_regulator_output, decoder_pos)

        mel_output = self.mel_linear(decoder_output)
        mel_output_postnet = self.postnet(mel_output) + mel_output

        return mel_output, mel_output_postnet

View File

@ -0,0 +1,93 @@
import jsonargparse
def add_config_options_to_parser(parser):
    """Register every FastSpeech training option on ``parser``.

    Args:
        parser: Argument parser exposing ``add_argument``. Options are
            added in a fixed order, followed by ``-c/--config`` which
            loads values from a config file.
    """
    # (flag, type, default, help) - registered in this order.
    options = (
        ('--audio.num_mels', int, 80,
         "the number of mel bands when calculating mel spectrograms."),
        ('--audio.n_fft', int, 2048,
         "the number of fft components."),
        ('--audio.sr', int, 22050,
         "the sampling rate of audio data file."),
        ('--audio.preemphasis', float, 0.97,
         "the preemphasis coefficient."),
        ('--audio.hop_length', float, 128,
         "the number of samples to advance between frames."),
        ('--audio.win_length', float, 1024,
         "the length (width) of the window function."),
        ('--audio.power', float, 1.4,
         "the power to raise before griffin-lim."),
        ('--audio.min_level_db', int, -100,
         "the minimum level db."),
        ('--audio.ref_level_db', int, 20,
         "the reference level db."),
        ('--audio.outputs_per_step', int, 1,
         "the outputs per step."),
        ('--embedding_size', int, 256,
         "the dim size of embedding."),
        ('--encoder_n_layer', int, 6,
         "the number of FFT Block in encoder."),
        ('--encoder_head', int, 2,
         "the attention head number in encoder."),
        ('--encoder_conv1d_filter_size', int, 1024,
         "the filter size of conv1d in encoder."),
        ('--max_sep_len', int, 2048,
         "the max length of sequence."),
        ('--encoder_output_size', int, 256,
         "the output channel size of encoder."),
        ('--decoder_n_layer', int, 6,
         "the number of FFT Block in decoder."),
        ('--decoder_head', int, 2,
         "the attention head number in decoder."),
        ('--decoder_conv1d_filter_size', int, 1024,
         "the filter size of conv1d in decoder."),
        ('--decoder_output_size', int, 256,
         "the output channel size of decoder."),
        ('--hidden_size', int, 256,
         "the hidden size in model."),
        ('--duration_predictor_output_size', int, 256,
         "the output size of duration predictior."),
        ('--duration_predictor_filter_size', int, 3,
         "the filter size of conv1d in duration prediction."),
        ('--fft_conv1d_filter', int, 3,
         "the filter size of conv1d in fft."),
        ('--fft_conv1d_padding', int, 1,
         "the padding size of conv1d in fft."),
        ('--dropout', float, 0.1,
         "the dropout in network."),
        ('--transformer_head', int, 4,
         "the attention head num of transformerTTS."),
        ('--warm_up_step', int, 4000,
         "the warm up step of learning rate."),
        ('--grad_clip_thresh', float, 1.0,
         "the threshold of grad clip."),
        ('--batch_size', int, 32,
         "batch size for training."),
        ('--epochs', int, 10000,
         "the number of epoch for training."),
        ('--lr', float, 0.001,
         "the learning rate for training."),
        ('--save_step', int, 500,
         "checkpointing interval during training."),
        ('--use_gpu', bool, True,
         "use gpu or not during training."),
        ('--use_data_parallel', bool, False,
         "use data parallel or not during training."),
        ('--data_path', str, './dataset/LJSpeech-1.1',
         "the path of dataset."),
        ('--checkpoint_path', str, None,
         "the path to load checkpoint or pretrain model."),
        ('--save_path', str, './checkpoint',
         "the path to save checkpoint."),
        ('--log_dir', str, './log',
         "the directory to save tensorboard log."),
        ('--sample_path', str, './sample',
         "the directory to save audio sample in synthesis."),
        ('--transtts_path', str, './log',
         "the directory to load pretrain transformerTTS model."),
        ('--transformer_step', int, 70000,
         "the step to load transformerTTS model."),
    )
    for flag, value_type, default, help_text in options:
        parser.add_argument(flag, type=value_type, default=default,
                            help=help_text)

    parser.add_argument('-c', '--config', action=jsonargparse.ActionConfigFile)

View File

@ -0,0 +1,139 @@
import numpy as np
import argparse
import os
import time
import math
import jsonargparse
from pathlib import Path
from tqdm import tqdm
from tensorboardX import SummaryWriter
import paddle.fluid.dygraph as dg
import paddle.fluid.layers as layers
import paddle.fluid as fluid
from parse import add_config_options_to_parser
from pprint import pprint
from network import FastSpeech
from utils import get_alignment
from parakeet.models.dataloader.jlspeech import LJSpeechLoader
from parakeet.models.transformerTTS.network import TransformerTTS
class MyDataParallel(dg.parallel.DataParallel):
    """
    A data parallel proxy for model.

    Attribute lookups that fail on the proxy itself are forwarded to the
    wrapped model, so the proxy can be used in place of the model.
    """

    def __init__(self, layers, strategy):
        """Wrap ``layers`` for data-parallel training with ``strategy``."""
        super(MyDataParallel, self).__init__(layers, strategy)

    def __getattr__(self, key):
        """Resolve ``key`` on the proxy first, then on the wrapped model.

        Only invoked when normal attribute lookup has already failed.
        """
        if key in self.__dict__:
            return object.__getattribute__(self, key)
        # Compare by value: identity against a string literal depends on
        # interning and raises a SyntaxWarning on newer interpreters.
        elif key == "_layers":
            return object.__getattribute__(self, "_sub_layers")["_layers"]
        else:
            return getattr(
                object.__getattribute__(self, "_sub_layers")["_layers"], key)
def main(cfg):
    """Train FastSpeech with durations taken from a TransformerTTS teacher.

    A TransformerTTS checkpoint is loaded and its attention maps are
    converted into duration targets for every batch. Losses are logged and
    checkpoints saved on rank 0.

    Args:
        cfg: Parsed configuration namespace (see
            ``add_config_options_to_parser``).
    """
    local_rank = dg.parallel.Env().local_rank if cfg.use_data_parallel else 0
    nranks = dg.parallel.Env().nranks if cfg.use_data_parallel else 1

    if local_rank == 0:
        # Print the whole config setting.
        pprint(jsonargparse.namespace_to_dict(cfg))

    global_step = 0
    if cfg.use_data_parallel:
        place = fluid.CUDAPlace(dg.parallel.Env().dev_id)
    elif cfg.use_gpu:
        place = fluid.CUDAPlace(0)
    else:
        place = fluid.CPUPlace()

    # makedirs(exist_ok=True) is safe when several ranks start together
    # and also creates missing parent directories.
    os.makedirs(cfg.log_dir, exist_ok=True)
    path = os.path.join(cfg.log_dir, 'fastspeech')

    writer = SummaryWriter(path) if local_rank == 0 else None

    with dg.guard(place):
        # Teacher model supplying the attention used for durations.
        transformerTTS = TransformerTTS(cfg)
        model_path = os.path.join(cfg.transtts_path, "transformer")
        model_dict, _ = fluid.dygraph.load_dygraph(
            os.path.join(model_path, str(cfg.transformer_step)))
        transformerTTS.set_dict(model_dict)
        transformerTTS.eval()

        model = FastSpeech(cfg)
        model.train()
        optimizer = fluid.optimizer.AdamOptimizer(
            learning_rate=dg.NoamDecay(
                1 / (cfg.warm_up_step * (cfg.lr ** 2)), cfg.warm_up_step))
        reader = LJSpeechLoader(cfg, nranks, local_rank).reader()

        if cfg.checkpoint_path is not None:
            model_dict, opti_dict = fluid.dygraph.load_dygraph(cfg.checkpoint_path)
            model.set_dict(model_dict)
            optimizer.set_dict(opti_dict)
            print("load checkpoint!!!")

        if cfg.use_data_parallel:
            strategy = dg.parallel.prepare_context()
            model = MyDataParallel(model, strategy)

        for epoch in range(cfg.epochs):
            pbar = tqdm(reader)
            for i, data in enumerate(pbar):
                pbar.set_description('Processing at epoch %d' % epoch)
                character, mel, mel_input, pos_text, pos_mel, text_length = data

                # Duration targets come from the teacher's attention.
                _, _, attn_probs, _, _, _ = transformerTTS(
                    character, mel_input, pos_text, pos_mel)
                alignment = dg.to_variable(
                    get_alignment(attn_probs, cfg.transformer_head)
                ).astype(np.float32)
                global_step += 1

                # Forward
                result = model(character,
                               pos_text,
                               mel_pos=pos_mel,
                               length_target=alignment)
                mel_output, mel_output_postnet, duration_predictor_output, _, _ = result
                mel_loss = layers.mse_loss(mel_output, mel)
                mel_postnet_loss = layers.mse_loss(mel_output_postnet, mel)
                duration_loss = layers.mean(layers.abs(
                    layers.elementwise_sub(duration_predictor_output, alignment)))
                total_loss = mel_loss + mel_postnet_loss + duration_loss

                if local_rank == 0:
                    print('epoch:{}, step:{}, mel_loss:{}, mel_postnet_loss:{}, duration_loss:{}'.format(epoch, global_step, mel_loss.numpy(), mel_postnet_loss.numpy(), duration_loss.numpy()))

                    writer.add_scalar('mel_loss', mel_loss.numpy(), global_step)
                    writer.add_scalar('post_mel_loss', mel_postnet_loss.numpy(), global_step)
                    writer.add_scalar('duration_loss', duration_loss.numpy(), global_step)
                    writer.add_scalar('learning_rate', optimizer._learning_rate.step().numpy(), global_step)

                if cfg.use_data_parallel:
                    total_loss = model.scale_loss(total_loss)
                    total_loss.backward()
                    model.apply_collective_grads()
                else:
                    total_loss.backward()
                optimizer.minimize(
                    total_loss,
                    grad_clip=fluid.dygraph_grad_clip.GradClipByGlobalNorm(
                        cfg.grad_clip_thresh))
                model.clear_gradients()

                # save checkpoint
                if local_rank == 0 and global_step % cfg.save_step == 0:
                    os.makedirs(cfg.save_path, exist_ok=True)
                    save_path = os.path.join(
                        cfg.save_path, 'fastspeech/%d' % global_step)
                    dg.save_dygraph(model.state_dict(), save_path)
                    dg.save_dygraph(optimizer.state_dict(), save_path)

        if local_rank == 0:
            writer.close()
if __name__ == '__main__':
    # Build the parser, register all options, then read their values from
    # the bundled YAML config before starting training.
    parser = jsonargparse.ArgumentParser(
        description="Train Fastspeech model",
        formatter_class='default_argparse')
    add_config_options_to_parser(parser)
    cfg = parser.parse_args(['-c', 'config/fastspeech.yaml'])
    main(cfg)

View File

@ -0,0 +1,32 @@
import numpy as np
def get_alignment(attn_probs, n_head):
    """Derive duration targets from the most focused attention head.

    Every head of every layer is scored with ``score_F``. The attention of
    the highest-scoring head is converted to durations with
    ``compute_duration``; on ties the earliest head wins.

    Args:
        attn_probs: Sequence of per-layer attention variables exposing
            ``.shape`` and ``.numpy()``. Axis 0 stacks the heads as
            ``n_head`` consecutive chunks of ``batch_size`` rows.
        n_head: Number of attention heads; must divide axis 0.

    Returns:
        The array returned by ``compute_duration`` for the selected head.

    Raises:
        ValueError: If no head produced a comparable score (for example
            when ``attn_probs`` has no usable entries or all scores are
            NaN).
    """
    assert attn_probs[0].shape[0] % n_head == 0
    batch_size = int(attn_probs[0].shape[0] // n_head)

    # Start below any finite score so that an all-zero attention map still
    # selects a head instead of leaving the result unbound.
    max_F = float("-inf")
    max_attn = None
    for layer_attn in attn_probs:
        multi_attn = layer_attn.numpy()
        for j in range(n_head):
            attn = multi_attn[j * batch_size:(j + 1) * batch_size]
            F = score_F(attn)
            if max_F < F:
                max_F = F
                max_attn = attn

    if max_attn is None:
        raise ValueError("no attention head could be selected for alignment")
    return compute_duration(max_attn)
def score_F(attn):
    """Return the focus score of an attention array.

    The score is the mean, over all rows, of each row's maximum along the
    last axis.
    """
    row_peaks = np.max(attn, axis=-1)
    return np.mean(row_peaks)
def compute_duration(attn):
    """Count, per batch item, how many rows attend most to each column.

    For every ``attn[i, j]`` the index of its maximum (first one on ties)
    is found and the matching counter in row ``i`` of the result is
    incremented.

    Args:
        attn: 3-D array indexed as ``[batch, row, column]``.

    Returns:
        Float64 array of shape ``(attn.shape[0], attn.shape[2])`` holding
        the counts.
    """
    batch_size, n_rows, n_cols = attn.shape[0], attn.shape[1], attn.shape[2]
    alignment = np.zeros([batch_size, n_cols])
    if n_rows == 0:
        # Nothing to count, and argmax over an empty axis would fail.
        return alignment

    # argmax returns the first maximal index, the same choice as
    # list.index(max) in a per-row Python loop.
    winners = np.argmax(attn, axis=-1)
    batch_index = np.arange(batch_size)[:, None]
    # add.at accumulates repeated indices, unlike fancy-index "+=".
    np.add.at(alignment, (batch_index, winners), 1)
    return alignment

View File

@ -10,9 +10,8 @@ audio:
ref_level_db: 20 ref_level_db: 20
outputs_per_step: 1 outputs_per_step: 1
network: hidden_size: 256
hidden_size: 256 embedding_size: 512
embedding_size: 512
batch_size: 32 batch_size: 32

View File

@ -10,15 +10,15 @@ audio:
ref_level_db: 20 ref_level_db: 20
outputs_per_step: 1 outputs_per_step: 1
network:
hidden_size: 256 hidden_size: 384 #256
embedding_size: 512 embedding_size: 384 #512
batch_size: 32 batch_size: 32
epochs: 10000 epochs: 10000
lr: 0.001 lr: 0.001
save_step: 500 save_step: 10
image_step: 2000 image_step: 2000
use_gpu: True use_gpu: True
use_data_parallel: True use_data_parallel: True

View File

@ -3,10 +3,10 @@ import numpy as np
from paddle import fluid from paddle import fluid
from parakeet.data.sampler import DistributedSampler from parakeet.data.sampler import DistributedSampler
from parakeet.data.datacargo import DataCargo from parakeet.data.datacargo import DataCargo
from preprocess import batch_examples, LJSpeech, batch_examples_postnet from preprocess import batch_examples, LJSpeech, batch_examples_vocoder
class LJSpeechLoader: class LJSpeechLoader:
def __init__(self, config, nranks, rank, is_postnet=False): def __init__(self, config, nranks, rank, is_vocoder=False):
place = fluid.CUDAPlace(rank) if config.use_gpu else fluid.CPUPlace() place = fluid.CUDAPlace(rank) if config.use_gpu else fluid.CPUPlace()
LJSPEECH_ROOT = Path(config.data_path) LJSPEECH_ROOT = Path(config.data_path)
@ -15,8 +15,8 @@ class LJSpeechLoader:
assert config.batch_size % nranks == 0 assert config.batch_size % nranks == 0
each_bs = config.batch_size // nranks each_bs = config.batch_size // nranks
if is_postnet: if is_vocoder:
dataloader = DataCargo(dataset, sampler=sampler, batch_size=each_bs, shuffle=True, collate_fn=batch_examples_postnet, drop_last=True) dataloader = DataCargo(dataset, sampler=sampler, batch_size=each_bs, shuffle=True, collate_fn=batch_examples_vocoder, drop_last=True)
else: else:
dataloader = DataCargo(dataset, sampler=sampler, batch_size=each_bs, shuffle=True, collate_fn=batch_examples, drop_last=True) dataloader = DataCargo(dataset, sampler=sampler, batch_size=each_bs, shuffle=True, collate_fn=batch_examples, drop_last=True)

View File

@ -14,7 +14,6 @@ class Conv1D(dg.Layer):
""" """
def __init__(self, def __init__(self,
name_scope,
in_channels, in_channels,
num_filters, num_filters,
filter_size=3, filter_size=3,
@ -28,7 +27,7 @@ class Conv1D(dg.Layer):
act=None, act=None,
data_format='NCT', data_format='NCT',
dtype="float32"): dtype="float32"):
super(Conv1D, self).__init__(name_scope, dtype=dtype) super(Conv1D, self).__init__(dtype=dtype)
self.padding = padding self.padding = padding
self.in_channels = in_channels self.in_channels = in_channels
@ -41,7 +40,7 @@ class Conv1D(dg.Layer):
self.data_format = data_format self.data_format = data_format
self.conv = dg.Conv2D( self.conv = dg.Conv2D(
self.full_name(), in_channels=in_channels,
num_filters=num_filters, num_filters=num_filters,
filter_size=(1, filter_size), filter_size=(1, filter_size),
stride=(1, stride), stride=(1, stride),
@ -77,7 +76,6 @@ class Pool1D(dg.Layer):
A Pool 1D block implemented with Pool2D. A Pool 1D block implemented with Pool2D.
""" """
def __init__(self, def __init__(self,
name_scope,
pool_size=-1, pool_size=-1,
pool_type='max', pool_type='max',
pool_stride=1, pool_stride=1,
@ -88,7 +86,7 @@ class Pool1D(dg.Layer):
exclusive=True, exclusive=True,
data_format='NCT', data_format='NCT',
dtype='float32'): dtype='float32'):
super(Pool1D, self).__init__(name_scope, dtype=dtype) super(Pool1D, self).__init__(dtype=dtype)
self.pool_size = pool_size self.pool_size = pool_size
self.pool_type = pool_type self.pool_type = pool_type
self.pool_stride = pool_stride self.pool_stride = pool_stride
@ -101,7 +99,7 @@ class Pool1D(dg.Layer):
self.dtype = dtype self.dtype = dtype
self.pool2d = dg.Pool2D(self.full_name(), [1,pool_size], pool_type = pool_type, self.pool2d = dg.Pool2D([1,pool_size], pool_type = pool_type,
pool_stride = [1,pool_stride], pool_padding = [0, pool_padding], pool_stride = [1,pool_stride], pool_padding = [0, pool_padding],
global_pooling = global_pooling, use_cudnn = use_cudnn, global_pooling = global_pooling, use_cudnn = use_cudnn,
ceil_mode = ceil_mode, exclusive = exclusive, dtype = dtype) ceil_mode = ceil_mode, exclusive = exclusive, dtype = dtype)
@ -127,7 +125,6 @@ class Pool1D(dg.Layer):
class DynamicGRU(dg.Layer): class DynamicGRU(dg.Layer):
def __init__(self, def __init__(self,
scope_name,
size, size,
param_attr=None, param_attr=None,
bias_attr=None, bias_attr=None,
@ -137,9 +134,8 @@ class DynamicGRU(dg.Layer):
h_0=None, h_0=None,
origin_mode=False, origin_mode=False,
init_size=None): init_size=None):
super(DynamicGRU, self).__init__(scope_name) super(DynamicGRU, self).__init__()
self.gru_unit = dg.GRUUnit( self.gru_unit = dg.GRUUnit(
self.full_name(),
size * 3, size * 3,
param_attr=param_attr, param_attr=param_attr,
bias_attr=bias_attr, bias_attr=bias_attr,

View File

@ -3,339 +3,63 @@ from parakeet.g2p.text.symbols import symbols
import paddle.fluid.dygraph as dg import paddle.fluid.dygraph as dg
import paddle.fluid as fluid import paddle.fluid as fluid
import paddle.fluid.layers as layers import paddle.fluid.layers as layers
from layers import Conv1D, Pool1D, DynamicGRU from parakeet.modules.layers import Conv1D, Pool1D
from parakeet.modules.dynamicGRU import DynamicGRU
import numpy as np import numpy as np
class FC(dg.Layer):
def __init__(self, name_scope, in_features, out_features, is_bias=True, dtype="float32", gain=1):
super(FC, self).__init__(name_scope)
self.in_features = in_features
self.out_features = out_features
self.is_bias = is_bias
self.dtype = dtype
self.gain = gain
self.weight = self.create_parameter(fluid.ParamAttr(name='weight'), shape=(in_features, out_features),
dtype=dtype,
default_initializer = fluid.initializer.XavierInitializer())
#self.weight = gain * self.weight
# mind the implicit conversion to ParamAttr for many cases
if is_bias is not False:
k = math.sqrt(1 / in_features)
self.bias = self.create_parameter(fluid.ParamAttr(name='bias'), shape=(out_features, ),
is_bias=True,
dtype=dtype,
default_initializer = fluid.initializer.Uniform(low=-k, high=k))
# By default the weight is initialized with the Xavier method and the bias with a uniform distribution over (-sqrt(k), sqrt(k)), where k = 1 / in_features
def forward(self, x):
x = fluid.layers.matmul(x, self.weight)
if hasattr(self, "bias"):
x = fluid.layers.elementwise_add(x, self.bias)
return x
class Conv(dg.Layer):
def __init__(self, name_scope, in_channels, out_channels, filter_size=1,
padding=0, dilation=1, stride=1, use_cudnn=True,
data_format="NCT", is_bias=True, gain=1):
super(Conv, self).__init__(name_scope)
self.in_channels = in_channels
self.out_channels = out_channels
self.filter_size = filter_size
self.padding = padding
self.dilation = dilation
self.stride = stride
self.use_cudnn = use_cudnn
self.data_format = data_format
self.is_bias = is_bias
self.gain = gain
self.weight_attr = fluid.ParamAttr(name='weight', initializer=fluid.initializer.XavierInitializer())
self.bias_attr = None
if is_bias is not False:
k = math.sqrt(1 / in_channels)
self.bias_attr = fluid.ParamAttr(name='bias', initializer=fluid.initializer.Uniform(low=-k, high=k))
self.conv = Conv1D( self.full_name(),
in_channels = in_channels,
num_filters = out_channels,
filter_size = filter_size,
padding = padding,
dilation = dilation,
stride = stride,
param_attr = self.weight_attr,
bias_attr = self.bias_attr,
use_cudnn = use_cudnn,
data_format = data_format)
def forward(self, x):
x = self.conv(x)
return x
class EncoderPrenet(dg.Layer): class EncoderPrenet(dg.Layer):
def __init__(self, name_scope, embedding_size, num_hidden, use_cudnn=True): def __init__(self, embedding_size, num_hidden, use_cudnn=True):
super(EncoderPrenet, self).__init__(name_scope) super(EncoderPrenet, self).__init__()
self.embedding_size = embedding_size self.embedding_size = embedding_size
self.num_hidden = num_hidden self.num_hidden = num_hidden
self.use_cudnn = use_cudnn self.use_cudnn = use_cudnn
self.embedding = dg.Embedding(self.full_name(), self.embedding = dg.Embedding( size = [len(symbols), embedding_size],
size = [len(symbols), embedding_size],
param_attr = fluid.ParamAttr(name='weight'), param_attr = fluid.ParamAttr(name='weight'),
padding_idx = None) padding_idx = None)
self.conv1 = Conv(self.full_name(), self.conv_list = []
in_channels = embedding_size, self.conv_list.append(Conv1D(in_channels = embedding_size,
out_channels = num_hidden, out_channels = num_hidden,
filter_size = 5, filter_size = 5,
padding = int(np.floor(5/2)), padding = int(np.floor(5/2)),
use_cudnn = use_cudnn, use_cudnn = use_cudnn,
data_format = "NCT", data_format = "NCT"))
gain = math.sqrt(2)) for _ in range(2):
self.conv2 = Conv(self.full_name(), self.conv_list = Conv1D(in_channels = num_hidden,
in_channels = num_hidden, out_channels = num_hidden,
out_channels = num_hidden, filter_size = 5,
filter_size = 5, padding = int(np.floor(5/2)),
padding = int(np.floor(5/2)), use_cudnn = use_cudnn,
use_cudnn = use_cudnn, data_format = "NCT")
data_format = "NCT",
gain = math.sqrt(2))
self.conv3 = Conv(self.full_name(),
in_channels = num_hidden,
out_channels = num_hidden,
filter_size = 5,
padding = int(np.floor(5/2)),
use_cudnn = use_cudnn,
data_format = "NCT",
gain = math.sqrt(2))
self.batch_norm1 = dg.BatchNorm(self.full_name(), num_hidden,
param_attr = fluid.ParamAttr(name='weight'),
bias_attr = fluid.ParamAttr(name='bias'),
moving_mean_name = 'moving_mean',
moving_variance_name = 'moving_var',
data_layout='NCHW')
self.batch_norm2 = dg.BatchNorm(self.full_name(), num_hidden,
param_attr = fluid.ParamAttr(name='weight'),
bias_attr = fluid.ParamAttr(name='bias'),
moving_mean_name = 'moving_mean',
moving_variance_name = 'moving_var',
data_layout='NCHW')
self.batch_norm3 = dg.BatchNorm(self.full_name(), num_hidden,
param_attr = fluid.ParamAttr(name='weight'),
bias_attr = fluid.ParamAttr(name='bias'),
moving_mean_name = 'moving_mean',
moving_variance_name = 'moving_var',
data_layout='NCHW')
self.projection = FC(self.full_name(), num_hidden, num_hidden)
def forward(self, x):
x = self.embedding(x) #(batch_size, seq_len, embending_size)
x = layers.transpose(x,[0,2,1])
x = layers.dropout(layers.relu(self.batch_norm1(self.conv1(x))), 0.2)
x = layers.dropout(layers.relu(self.batch_norm2(self.conv2(x))), 0.2)
x = layers.dropout(layers.relu(self.batch_norm3(self.conv3(x))), 0.2)
x = layers.transpose(x,[0,2,1]) #(N,T,C)
x = self.projection(x)
return x
class FFN(dg.Layer):
def __init__(self, name_scope, num_hidden, use_cudnn=True):
super(FFN, self).__init__(name_scope)
self.num_hidden = num_hidden
self.use_cudnn = use_cudnn
self.w_1 = Conv(self.full_name(),
in_channels = num_hidden,
out_channels = num_hidden * 4,
filter_size = 1,
use_cudnn = use_cudnn,
data_format = "NCT",
gain = math.sqrt(2))
self.w_2 = Conv(self.full_name(),
in_channels = num_hidden * 4,
out_channels = num_hidden,
filter_size = 1,
use_cudnn = use_cudnn,
data_format = "NCT",
gain = math.sqrt(2))
self.layer_norm = dg.LayerNorm(self.full_name(), begin_norm_axis=2)
def forward(self, input):
# FFN network
x = layers.transpose(input, [0,2,1])
x = self.w_2(layers.relu(self.w_1(x)))
x = layers.transpose(x,[0,2,1])
# dropout
# x = layers.dropout(x, 0.1)
# not sure where dropout should be placed; in the paper it comes before the residual connection,
# but the diagonal alignment did not appear correctly in the attention plot.
# residual connection
x = x + input
#layer normalization
x = self.layer_norm(x)
return x
class DecoderPrenet(dg.Layer):
def __init__(self, name_scope, input_size, hidden_size, output_size, dropout_rate=0.5):
super(DecoderPrenet, self).__init__(name_scope)
self.input_size = input_size
self.hidden_size = hidden_size
self.output_size = output_size
self.dropout_rate = dropout_rate
self.fc1 = FC(self.full_name(), input_size, hidden_size) #in pytorch this gian=1
self.fc2 = FC(self.full_name(), hidden_size, output_size)
def forward(self, x):
x = layers.dropout(layers.relu(self.fc1(x)), self.dropout_rate)
x = layers.dropout(layers.relu(self.fc2(x)), self.dropout_rate)
return x
class ScaledDotProductAttention(dg.Layer):
def __init__(self, name_scope, d_key):
super(ScaledDotProductAttention, self).__init__(name_scope)
self.d_key = d_key
# please note: this mask is different from the PyTorch one
def forward(self, key, value, query, mask=None, query_mask=None):
# Compute attention score
attention = layers.matmul(query, key, transpose_y=True) #transpose the last dim in y
attention = attention / math.sqrt(self.d_key)
# Mask key to ignore padding
if mask is not None:
attention = attention * mask
mask = (mask == 0).astype(np.float32) * (-2 ** 32 + 1)
attention = attention + mask
attention = layers.softmax(attention)
# Mask query to ignore padding
# Not sure how this works
if query_mask is not None:
attention = attention * query_mask
result = layers.matmul(attention, value)
return result, attention
class MultiheadAttention(dg.Layer):
def __init__(self, name_scope, num_hidden, num_head=4):
super(MultiheadAttention, self).__init__(name_scope)
self.num_hidden = num_hidden
self.num_hidden_per_attn = num_hidden // num_head
self.num_head = num_head
self.key = FC(self.full_name(), num_hidden, num_hidden, is_bias=False)
self.value = FC(self.full_name(), num_hidden, num_hidden, is_bias=False)
self.query = FC(self.full_name(), num_hidden, num_hidden, is_bias=False)
self.scal_attn = ScaledDotProductAttention(self.full_name(), self.num_hidden_per_attn)
self.fc = FC(self.full_name(), num_hidden * 2, num_hidden)
self.layer_norm = dg.LayerNorm(self.full_name(), begin_norm_axis=2)
def forward(self, key, value, query_input, mask=None, query_mask=None):
batch_size = key.shape[0]
seq_len_key = key.shape[1]
seq_len_query = query_input.shape[1]
# repeat masks h times
if query_mask is not None:
query_mask = layers.unsqueeze(query_mask, axes=[-1])
query_mask = layers.expand(query_mask, [self.num_head, 1, seq_len_key])
if mask is not None:
mask = layers.expand(mask, (self.num_head, 1, 1))
# Make multihead attention
# key & value.shape = (batch_size, seq_len, feature)(feature = num_head * num_hidden_per_attn)
key = layers.reshape(self.key(key), [batch_size, seq_len_key, self.num_head, self.num_hidden_per_attn])
value = layers.reshape(self.value(value), [batch_size, seq_len_key, self.num_head, self.num_hidden_per_attn])
query = layers.reshape(self.query(query_input), [batch_size, seq_len_query, self.num_head, self.num_hidden_per_attn])
key = layers.reshape(layers.transpose(key, [2, 0, 1, 3]), [-1, seq_len_key, self.num_hidden_per_attn])
value = layers.reshape(layers.transpose(value, [2, 0, 1, 3]), [-1, seq_len_key, self.num_hidden_per_attn])
query = layers.reshape(layers.transpose(query, [2, 0, 1, 3]), [-1, seq_len_query, self.num_hidden_per_attn])
result, attention = self.scal_attn(key, value, query, mask=mask, query_mask=query_mask)
# concat all multihead result
result = layers.reshape(result, [self.num_head, batch_size, seq_len_query, self.num_hidden_per_attn])
result = layers.reshape(layers.transpose(result, [1,2,0,3]),[batch_size, seq_len_query, -1])
#print(result.().shape)
# concat result with input
result = layers.concat([query_input, result], axis=-1)
result = self.fc(result)
result = result + query_input
result = self.layer_norm(result)
return result, attention
class PostConvNet(dg.Layer):
def __init__(self, name_scope, config):
super(PostConvNet, self).__init__(name_scope)
num_hidden = config.network.hidden_size
self.num_hidden = num_hidden
self.conv1 = Conv(self.full_name(),
in_channels = config.audio.num_mels * config.audio.outputs_per_step,
out_channels = num_hidden,
filter_size = 5,
padding = 4,
use_cudnn = config.use_gpu,
data_format = "NCT",
gain = 5 / 3)
self.conv_list = [Conv(self.full_name(),
in_channels = num_hidden,
out_channels = num_hidden,
filter_size = 5,
padding = 4,
use_cudnn = config.use_gpu,
data_format = "NCT",
gain = 5 / 3) for _ in range(3)]
for i, layer in enumerate(self.conv_list): for i, layer in enumerate(self.conv_list):
self.add_sublayer("conv_list_{}".format(i), layer) self.add_sublayer("conv_list_{}".format(i), layer)
self.conv5 = Conv(self.full_name(),
in_channels = num_hidden,
out_channels = config.audio.num_mels * config.audio.outputs_per_step,
filter_size = 5,
padding = 4,
use_cudnn = config.use_gpu,
data_format = "NCT")
self.batch_norm_list = [dg.BatchNorm(self.full_name(), num_hidden, self.batch_norm_list = [dg.BatchNorm(num_hidden,
param_attr = fluid.ParamAttr(name='weight'), param_attr = fluid.ParamAttr(name='weight'),
bias_attr = fluid.ParamAttr(name='bias'), bias_attr = fluid.ParamAttr(name='bias'),
moving_mean_name = 'moving_mean', moving_mean_name = 'moving_mean',
moving_variance_name = 'moving_var', moving_variance_name = 'moving_var',
data_layout='NCHW') for _ in range(3)] data_layout='NCHW') for _ in range(3)]
for i, layer in enumerate(self.batch_norm_list): for i, layer in enumerate(self.batch_norm_list):
self.add_sublayer("batch_norm_list_{}".format(i), layer) self.add_sublayer("batch_norm_list_{}".format(i), layer)
self.batch_norm1 = dg.BatchNorm(self.full_name(), num_hidden,
param_attr = fluid.ParamAttr(name='weight'),
bias_attr = fluid.ParamAttr(name='bias'),
moving_mean_name = 'moving_mean',
moving_variance_name = 'moving_var',
data_layout='NCHW')
def forward(self, input): self.projection = dg.Linear(num_hidden, num_hidden)
input = layers.dropout(layers.tanh(self.batch_norm1(self.conv1(input)[:, :, :-4])),0.1)
def forward(self, x):
x = self.embedding(x) #(batch_size, seq_len, embending_size)
x = layers.transpose(x,[0,2,1])
for batch_norm, conv in zip(self.batch_norm_list, self.conv_list): for batch_norm, conv in zip(self.batch_norm_list, self.conv_list):
input = layers.dropout(layers.tanh(batch_norm(conv(input)[:, :, :-4])),0.1) x = layers.dropout(layers.relu(batch_norm(conv(x))), 0.2)
input = self.conv5(input)[:, :, :-4] x = layers.transpose(x,[0,2,1]) #(N,T,C)
return input x = self.projection(x)
return x
class CBHG(dg.Layer): class CBHG(dg.Layer):
def __init__(self, name_scope, config, K=16, projection_size = 256, num_gru_layers=2, def __init__(self, hidden_size, batch_size, K=16, projection_size = 256, num_gru_layers=2,
max_pool_kernel_size=2, is_post=False): max_pool_kernel_size=2, is_post=False):
super(CBHG, self).__init__(name_scope) super(CBHG, self).__init__()
""" """
:param hidden_size: dimension of hidden unit :param hidden_size: dimension of hidden unit
:param K: # of convolution banks :param K: # of convolution banks
@ -344,19 +68,16 @@ class CBHG(dg.Layer):
:param max_pool_kernel_size: max pooling kernel size :param max_pool_kernel_size: max pooling kernel size
:param is_post: whether post processing or not :param is_post: whether post processing or not
""" """
hidden_size = config.network.hidden_size
self.hidden_size = hidden_size self.hidden_size = hidden_size
self.projection_size = projection_size self.projection_size = projection_size
self.conv_list = [] self.conv_list = []
self.conv_list.append(Conv(self.full_name(), self.conv_list.append(Conv1D(in_channels = projection_size,
in_channels = projection_size,
out_channels = hidden_size, out_channels = hidden_size,
filter_size = 1, filter_size = 1,
padding = int(np.floor(1/2)), padding = int(np.floor(1/2)),
data_format = "NCT")) data_format = "NCT"))
for i in range(2,K+1): for i in range(2,K+1):
self.conv_list.append(Conv(self.full_name(), self.conv_list.append(Conv1D(in_channels = hidden_size,
in_channels = hidden_size,
out_channels = hidden_size, out_channels = hidden_size,
filter_size = i, filter_size = i,
padding = int(np.floor(i/2)), padding = int(np.floor(i/2)),
@ -367,7 +88,7 @@ class CBHG(dg.Layer):
self.batchnorm_list = [] self.batchnorm_list = []
for i in range(K): for i in range(K):
self.batchnorm_list.append(dg.BatchNorm(self.full_name(), hidden_size, self.batchnorm_list.append(dg.BatchNorm(hidden_size,
param_attr = fluid.ParamAttr(name='weight'), param_attr = fluid.ParamAttr(name='weight'),
bias_attr = fluid.ParamAttr(name='bias'), bias_attr = fluid.ParamAttr(name='bias'),
moving_mean_name = 'moving_mean', moving_mean_name = 'moving_mean',
@ -379,69 +100,63 @@ class CBHG(dg.Layer):
conv_outdim = hidden_size * K conv_outdim = hidden_size * K
self.conv_projection_1 = Conv(self.full_name(), self.conv_projection_1 = Conv1D(in_channels = conv_outdim,
in_channels = conv_outdim,
out_channels = hidden_size, out_channels = hidden_size,
filter_size = 3, filter_size = 3,
padding = int(np.floor(3/2)), padding = int(np.floor(3/2)),
data_format = "NCT") data_format = "NCT")
self.conv_projection_2 = Conv(self.full_name(), self.conv_projection_2 = Conv1D(in_channels = hidden_size,
in_channels = hidden_size,
out_channels = projection_size, out_channels = projection_size,
filter_size = 3, filter_size = 3,
padding = int(np.floor(3/2)), padding = int(np.floor(3/2)),
data_format = "NCT") data_format = "NCT")
self.batchnorm_proj_1 = dg.BatchNorm(self.full_name(), hidden_size, self.batchnorm_proj_1 = dg.BatchNorm(hidden_size,
param_attr = fluid.ParamAttr(name='weight'), param_attr = fluid.ParamAttr(name='weight'),
bias_attr = fluid.ParamAttr(name='bias'), bias_attr = fluid.ParamAttr(name='bias'),
moving_mean_name = 'moving_mean', moving_mean_name = 'moving_mean',
moving_variance_name = 'moving_var', moving_variance_name = 'moving_var',
data_layout='NCHW') data_layout='NCHW')
self.batchnorm_proj_2 = dg.BatchNorm(self.full_name(), projection_size, self.batchnorm_proj_2 = dg.BatchNorm(projection_size,
param_attr = fluid.ParamAttr(name='weight'), param_attr = fluid.ParamAttr(name='weight'),
bias_attr = fluid.ParamAttr(name='bias'), bias_attr = fluid.ParamAttr(name='bias'),
moving_mean_name = 'moving_mean', moving_mean_name = 'moving_mean',
moving_variance_name = 'moving_var', moving_variance_name = 'moving_var',
data_layout='NCHW') data_layout='NCHW')
self.max_pool = Pool1D(self.full_name(), pool_size = max_pool_kernel_size, self.max_pool = Pool1D(pool_size = max_pool_kernel_size,
pool_type='max', pool_type='max',
pool_stride=1, pool_stride=1,
pool_padding=1, pool_padding=1,
data_format = "NCT") data_format = "NCT")
self.highway = Highwaynet(self.full_name(), self.projection_size) self.highway = Highwaynet(self.projection_size)
h_0 = np.zeros((config.batch_size, hidden_size // 2), dtype="float32") h_0 = np.zeros((batch_size, hidden_size // 2), dtype="float32")
h_0 = dg.to_variable(h_0) h_0 = dg.to_variable(h_0)
self.fc_forward1 = FC(self.full_name(), hidden_size, hidden_size // 2 * 3) self.fc_forward1 = dg.Linear(hidden_size, hidden_size // 2 * 3)
self.fc_reverse1 = FC(self.full_name(), hidden_size, hidden_size // 2 * 3) self.fc_reverse1 = dg.Linear(hidden_size, hidden_size // 2 * 3)
self.gru_forward1 = DynamicGRU(self.full_name(), self.gru_forward1 = DynamicGRU(size = self.hidden_size // 2,
size = self.hidden_size // 2,
param_attr = fluid.ParamAttr(name='weight'), param_attr = fluid.ParamAttr(name='weight'),
bias_attr = fluid.ParamAttr(name='bias'), bias_attr = fluid.ParamAttr(name='bias'),
is_reverse = False, is_reverse = False,
origin_mode = True, origin_mode = True,
h_0 = h_0) h_0 = h_0)
self.gru_reverse1 = DynamicGRU(self.full_name(), self.gru_reverse1 = DynamicGRU(size = self.hidden_size // 2,
size = self.hidden_size // 2,
param_attr = fluid.ParamAttr(name='weight'), param_attr = fluid.ParamAttr(name='weight'),
bias_attr = fluid.ParamAttr(name='bias'), bias_attr = fluid.ParamAttr(name='bias'),
is_reverse=True, is_reverse=True,
origin_mode=True, origin_mode=True,
h_0 = h_0) h_0 = h_0)
self.fc_forward2 = FC(self.full_name(), hidden_size, hidden_size // 2 * 3) self.fc_forward2 = dg.Linear(hidden_size, hidden_size // 2 * 3)
self.fc_reverse2 = FC(self.full_name(), hidden_size, hidden_size // 2 * 3) self.fc_reverse2 = dg.Linear(hidden_size, hidden_size // 2 * 3)
self.gru_forward2 = DynamicGRU(self.full_name(), self.gru_forward2 = DynamicGRU(size = self.hidden_size // 2,
size = self.hidden_size // 2,
param_attr = fluid.ParamAttr(name='weight'), param_attr = fluid.ParamAttr(name='weight'),
bias_attr = fluid.ParamAttr(name='bias'), bias_attr = fluid.ParamAttr(name='bias'),
is_reverse = False, is_reverse = False,
origin_mode = True, origin_mode = True,
h_0 = h_0) h_0 = h_0)
self.gru_reverse2 = DynamicGRU(self.full_name(), self.gru_reverse2 = DynamicGRU(size = self.hidden_size // 2,
size = self.hidden_size // 2,
param_attr = fluid.ParamAttr(name='weight'), param_attr = fluid.ParamAttr(name='weight'),
bias_attr = fluid.ParamAttr(name='bias'), bias_attr = fluid.ParamAttr(name='bias'),
is_reverse=True, is_reverse=True,
@ -491,8 +206,8 @@ class CBHG(dg.Layer):
return out return out
class Highwaynet(dg.Layer): class Highwaynet(dg.Layer):
def __init__(self, name_scope, num_units, num_layers=4): def __init__(self, num_units, num_layers=4):
super(Highwaynet, self).__init__(name_scope) super(Highwaynet, self).__init__()
self.num_units = num_units self.num_units = num_units
self.num_layers = num_layers self.num_layers = num_layers
@ -500,8 +215,8 @@ class Highwaynet(dg.Layer):
self.linears = [] self.linears = []
for i in range(num_layers): for i in range(num_layers):
self.linears.append(FC(self.full_name(), num_units, num_units)) self.linears.append(dg.Linear(num_units, num_units))
self.gates.append(FC(self.full_name(), num_units, num_units)) self.gates.append(dg.Linear(num_units, num_units))
for i, (linear, gate) in enumerate(zip(self.linears,self.gates)): for i, (linear, gate) in enumerate(zip(self.linears,self.gates)):
self.add_sublayer("linears_{}".format(i), linear) self.add_sublayer("linears_{}".format(i), linear)

View File

@ -1,39 +1,42 @@
from module import * from parakeet.models.transformerTTS.module import *
from utils import get_positional_table, get_sinusoid_encoding_table
import paddle.fluid.dygraph as dg import paddle.fluid.dygraph as dg
import paddle.fluid as fluid import paddle.fluid as fluid
from parakeet.modules.layers import Conv1D
from parakeet.modules.utils import *
from parakeet.modules.multihead_attention import MultiheadAttention
from parakeet.modules.feed_forward import PositionwiseFeedForward
from parakeet.modules.prenet import PreNet
from parakeet.modules.post_convnet import PostConvNet
class Encoder(dg.Layer): class Encoder(dg.Layer):
def __init__(self, name_scope, embedding_size, num_hidden, config): def __init__(self, embedding_size, num_hidden, config):
super(Encoder, self).__init__(name_scope) super(Encoder, self).__init__()
self.num_hidden = num_hidden self.num_hidden = num_hidden
param = fluid.ParamAttr(name='alpha', param = fluid.ParamAttr(name='alpha',
initializer=fluid.initializer.Constant(value=1.0)) initializer=fluid.initializer.Constant(value=1.0))
self.alpha = self.create_parameter(param, shape=(1, ), dtype='float32') self.alpha = self.create_parameter(shape=(1, ), attr=param, dtype='float32')
self.pos_inp = get_sinusoid_encoding_table(1024, self.num_hidden, padding_idx=0) self.pos_inp = get_sinusoid_encoding_table(1024, self.num_hidden, padding_idx=0)
self.pos_emb = dg.Embedding(name_scope=self.full_name(), self.pos_emb = dg.Embedding(size=[1024, num_hidden],
size=[1024, num_hidden],
padding_idx=0, padding_idx=0,
param_attr=fluid.ParamAttr( param_attr=fluid.ParamAttr(
name='weight', name='weight',
initializer=fluid.initializer.NumpyArrayInitializer(self.pos_inp), initializer=fluid.initializer.NumpyArrayInitializer(self.pos_inp),
trainable=False)) trainable=False))
self.encoder_prenet = EncoderPrenet(name_scope = self.full_name(), self.encoder_prenet = EncoderPrenet(embedding_size = embedding_size,
embedding_size = embedding_size,
num_hidden = num_hidden, num_hidden = num_hidden,
use_cudnn=config.use_gpu) use_cudnn=config.use_gpu)
self.layers = [MultiheadAttention(self.full_name(), num_hidden) for _ in range(3)] self.layers = [MultiheadAttention(num_hidden, num_hidden, num_hidden) for _ in range(3)]
for i, layer in enumerate(self.layers): for i, layer in enumerate(self.layers):
self.add_sublayer("self_attn_{}".format(i), layer) self.add_sublayer("self_attn_{}".format(i), layer)
self.ffns = [FFN(self.full_name(), num_hidden, use_cudnn = config.use_gpu) for _ in range(3)] self.ffns = [PositionwiseFeedForward(num_hidden, num_hidden*4, filter_size=1, use_cudnn = config.use_gpu) for _ in range(3)]
for i, layer in enumerate(self.ffns): for i, layer in enumerate(self.ffns):
self.add_sublayer("ffns_{}".format(i), layer) self.add_sublayer("ffns_{}".format(i), layer)
def forward(self, x, positional): def forward(self, x, positional):
if fluid.framework._dygraph_tracer()._train_mode: if fluid.framework._dygraph_tracer()._train_mode:
query_mask = (positional != 0).astype(np.float32) query_mask = get_non_pad_mask(positional)
mask = (positional != 0).astype(np.float32) mask = get_attn_key_pad_mask(positional, x)
mask = fluid.layers.expand(fluid.layers.unsqueeze(mask,[1]), [1,x.shape[1], 1])
else: else:
query_mask, mask = None, None query_mask, mask = None, None
@ -59,65 +62,60 @@ class Encoder(dg.Layer):
return x, query_mask, attentions return x, query_mask, attentions
class Decoder(dg.Layer): class Decoder(dg.Layer):
def __init__(self, name_scope, num_hidden, config): def __init__(self, num_hidden, config):
super(Decoder, self).__init__(name_scope) super(Decoder, self).__init__()
self.num_hidden = num_hidden self.num_hidden = num_hidden
param = fluid.ParamAttr(name='alpha') param = fluid.ParamAttr(name='alpha')
self.alpha = self.create_parameter(param, shape=(1,), dtype='float32', self.alpha = self.create_parameter(shape=(1,), attr=param, dtype='float32',
default_initializer = fluid.initializer.ConstantInitializer(value=1.0)) default_initializer = fluid.initializer.ConstantInitializer(value=1.0))
self.pos_inp = get_sinusoid_encoding_table(1024, self.num_hidden, padding_idx=0) self.pos_inp = get_sinusoid_encoding_table(1024, self.num_hidden, padding_idx=0)
self.pos_emb = dg.Embedding(name_scope=self.full_name(), self.pos_emb = dg.Embedding(size=[1024, num_hidden],
size=[1024, num_hidden],
padding_idx=0, padding_idx=0,
param_attr=fluid.ParamAttr( param_attr=fluid.ParamAttr(
name='weight', name='weight',
initializer=fluid.initializer.NumpyArrayInitializer(self.pos_inp), initializer=fluid.initializer.NumpyArrayInitializer(self.pos_inp),
trainable=False)) trainable=False))
self.decoder_prenet = DecoderPrenet(self.full_name(), self.decoder_prenet = PreNet(input_size = config.audio.num_mels,
input_size = config.audio.num_mels,
hidden_size = num_hidden * 2, hidden_size = num_hidden * 2,
output_size = num_hidden, output_size = num_hidden,
dropout_rate=0.2) dropout_rate=0.2)
self.linear = FC(self.full_name(), num_hidden, num_hidden) self.linear = dg.Linear(num_hidden, num_hidden)
self.selfattn_layers = [MultiheadAttention(self.full_name(), num_hidden) for _ in range(3)] self.selfattn_layers = [MultiheadAttention(num_hidden, num_hidden, num_hidden) for _ in range(3)]
for i, layer in enumerate(self.selfattn_layers): for i, layer in enumerate(self.selfattn_layers):
self.add_sublayer("self_attn_{}".format(i), layer) self.add_sublayer("self_attn_{}".format(i), layer)
self.attn_layers = [MultiheadAttention(self.full_name(), num_hidden) for _ in range(3)] self.attn_layers = [MultiheadAttention(num_hidden, num_hidden, num_hidden) for _ in range(3)]
for i, layer in enumerate(self.attn_layers): for i, layer in enumerate(self.attn_layers):
self.add_sublayer("attn_{}".format(i), layer) self.add_sublayer("attn_{}".format(i), layer)
self.ffns = [FFN(self.full_name(), num_hidden) for _ in range(3)] self.ffns = [PositionwiseFeedForward(num_hidden, num_hidden*4, filter_size=1) for _ in range(3)]
for i, layer in enumerate(self.ffns): for i, layer in enumerate(self.ffns):
self.add_sublayer("ffns_{}".format(i), layer) self.add_sublayer("ffns_{}".format(i), layer)
self.mel_linear = FC(self.full_name(), num_hidden, config.audio.num_mels * config.audio.outputs_per_step) self.mel_linear = dg.Linear(num_hidden, config.audio.num_mels * config.audio.outputs_per_step)
self.stop_linear = FC(self.full_name(), num_hidden, 1, gain = 1) self.stop_linear = dg.Linear(num_hidden, 1)
self.postconvnet = PostConvNet(self.full_name(), config) self.postconvnet = PostConvNet(config.audio.num_mels, config.hidden_size,
filter_size = 5, padding = 4, num_conv=5,
outputs_per_step=config.audio.outputs_per_step,
use_cudnn = config.use_gpu)
def forward(self, key, value, query, c_mask, positional): def forward(self, key, value, query, c_mask, positional):
batch_size = key.shape[0]
decoder_len = query.shape[1]
# get decoder mask with triangular matrix # get decoder mask with triangular matrix
if fluid.framework._dygraph_tracer()._train_mode: if fluid.framework._dygraph_tracer()._train_mode:
#zeros = np.zeros(positional.shape, dtype=np.float32) m_mask = get_non_pad_mask(positional)
m_mask = (positional != 0).astype(np.float32) mask = get_attn_key_pad_mask(positional, query)
mask = np.repeat(np.expand_dims(m_mask.numpy() == 0, axis=1), decoder_len, axis=1) triu_tensor = dg.to_variable(get_triu_tensor(query.numpy(), query.numpy())).astype(np.float32)
mask = mask + np.repeat(np.expand_dims(np.triu(np.ones([decoder_len, decoder_len]), 1), axis=0) ,batch_size, axis=0) mask = mask + triu_tensor
mask = fluid.layers.cast(dg.to_variable(mask == 0), np.float32) mask = fluid.layers.cast(mask != 0, np.float32)
# (batch_size, decoder_len, decoder_len) # (batch_size, decoder_len, encoder_len)
zero_mask = fluid.layers.expand(fluid.layers.unsqueeze((c_mask != 0).astype(np.float32), axes=2), [1,1,decoder_len]) zero_mask = get_attn_key_pad_mask(layers.squeeze(c_mask,[-1]), query)
# (batch_size, decoder_len, seq_len)
zero_mask = fluid.layers.transpose(zero_mask, [0,2,1])
else: else:
mask = np.repeat(np.expand_dims(np.triu(np.ones([decoder_len, decoder_len]), 1), axis=0) ,batch_size, axis=0) mask = get_triu_tensor(query.numpy(), query.numpy()).astype(np.float32)
mask = fluid.layers.cast(dg.to_variable(mask == 0), np.float32) mask = fluid.layers.cast(dg.to_variable(mask != 0), np.float32)
m_mask, zero_mask = None, None m_mask, zero_mask = None, None
#import pdb; pdb.set_trace()
# Decoder pre-network # Decoder pre-network
query = self.decoder_prenet(query) query = self.decoder_prenet(query)
@ -145,21 +143,21 @@ class Decoder(dg.Layer):
# Mel linear projection # Mel linear projection
mel_out = self.mel_linear(query) mel_out = self.mel_linear(query)
# Post Mel Network # Post Mel Network
postnet_input = layers.transpose(mel_out, [0,2,1]) out = self.postconvnet(mel_out)
out = self.postconvnet(postnet_input) out = mel_out + out
out = postnet_input + out
out = layers.transpose(out, [0,2,1])
# Stop tokens # Stop tokens
stop_tokens = self.stop_linear(query) stop_tokens = self.stop_linear(query)
stop_tokens = layers.squeeze(stop_tokens, [-1])
stop_tokens = layers.sigmoid(stop_tokens)
return mel_out, out, attn_list, stop_tokens, selfattn_list return mel_out, out, attn_list, stop_tokens, selfattn_list
class Model(dg.Layer): class TransformerTTS(dg.Layer):
def __init__(self, name_scope, config): def __init__(self, config):
super(Model, self).__init__(name_scope) super(TransformerTTS, self).__init__()
self.encoder = Encoder(self.full_name(), config.network.embedding_size, config.network.hidden_size, config) self.encoder = Encoder(config.embedding_size, config.hidden_size, config)
self.decoder = Decoder(self.full_name(), config.network.hidden_size, config) self.decoder = Decoder(config.hidden_size, config)
self.config = config self.config = config
def forward(self, characters, mel_input, pos_text, pos_mel): def forward(self, characters, mel_input, pos_text, pos_mel):
@ -180,16 +178,16 @@ class ModelPostNet(dg.Layer):
""" """
CBHG Network (mel -> linear) CBHG Network (mel -> linear)
""" """
def __init__(self, name_scope, config): def __init__(self, config):
super(ModelPostNet, self).__init__(name_scope) super(ModelPostNet, self).__init__()
self.pre_proj = Conv(self.full_name(), self.pre_proj = Conv1D(in_channels = config.audio.num_mels,
in_channels = config.audio.num_mels, out_channels = config.hidden_size,
out_channels = config.network.hidden_size, filter_size=1,
data_format = "NCT") data_format = "NCT")
self.cbhg = CBHG(self.full_name(), config) self.cbhg = CBHG(config.hidden_size, config.batch_size)
self.post_proj = Conv(self.full_name(), self.post_proj = Conv1D(in_channels = config.hidden_size,
in_channels = config.audio.num_mels,
out_channels = (config.audio.n_fft // 2) + 1, out_channels = (config.audio.n_fft // 2) + 1,
filter_size=1,
data_format = "NCT") data_format = "NCT")
def forward(self, mel): def forward(self, mel):

View File

@ -22,9 +22,9 @@ def add_config_options_to_parser(parser):
parser.add_argument('--audio.outputs_per_step', type=int, default=1, parser.add_argument('--audio.outputs_per_step', type=int, default=1,
help="the outputs per step.") help="the outputs per step.")
parser.add_argument('--network.hidden_size', type=int, default=256, parser.add_argument('--hidden_size', type=int, default=256,
help="the hidden size in network.") help="the hidden size in network.")
parser.add_argument('--network.embedding_size', type=int, default=512, parser.add_argument('--embedding_size', type=int, default=512,
help="the embedding vector size.") help="the embedding vector size.")
parser.add_argument('--batch_size', type=int, default=32, parser.add_argument('--batch_size', type=int, default=32,

View File

@ -62,20 +62,6 @@ class LJSpeech(Dataset):
phonemes = np.array(g2p.en.text_to_sequence(normalized_text), dtype=np.int64) phonemes = np.array(g2p.en.text_to_sequence(normalized_text), dtype=np.int64)
return (mag, mel, phonemes) # maybe we need to implement it as a map in the future return (mag, mel, phonemes) # maybe we need to implement it as a map in the future
def _batch_examples(self, minibatch):
mag_batch = []
mel_batch = []
phoneme_batch = []
for example in minibatch:
mag, mel, phoneme = example
mag_batch.append(mag)
mel_batch.append(mel)
phoneme_batch.append(phoneme)
mag_batch = SpecBatcher(pad_value=0.)(mag_batch)
mel_batch = SpecBatcher(pad_value=0.)(mel_batch)
phoneme_batch = TextIDBatcher(pad_id=0)(phoneme_batch)
return (mag_batch, mel_batch, phoneme_batch)
def __getitem__(self, index): def __getitem__(self, index):
metadatum = self.metadata.iloc[index] metadatum = self.metadata.iloc[index]
example = self._get_example(metadatum) example = self._get_example(metadatum)
@ -121,7 +107,7 @@ def batch_examples(batch):
mel_inputs = np.transpose(SpecBatcher(pad_value=0.)(mel_inputs), axes=(0,2,1)) mel_inputs = np.transpose(SpecBatcher(pad_value=0.)(mel_inputs), axes=(0,2,1))
return (texts, mels, mel_inputs, pos_texts, pos_mels, np.array(text_lens)) return (texts, mels, mel_inputs, pos_texts, pos_mels, np.array(text_lens))
def batch_examples_postnet(batch): def batch_examples_vocoder(batch):
mels=[] mels=[]
mags=[] mags=[]
for data in batch: for data in batch:

View File

@ -28,8 +28,8 @@ def synthesis(text_input, cfg):
writer = SummaryWriter(path) writer = SummaryWriter(path)
with dg.guard(place): with dg.guard(place):
model = Model('transtts', cfg) model = Model(cfg)
model_postnet = ModelPostNet('postnet', cfg) model_postnet = ModelPostNet(cfg)
model.set_dict(load_checkpoint(str(cfg.transformer_step), os.path.join(cfg.checkpoint_path, "transformer"))) model.set_dict(load_checkpoint(str(cfg.transformer_step), os.path.join(cfg.checkpoint_path, "transformer")))
model_postnet.set_dict(load_checkpoint(str(cfg.postnet_step), os.path.join(cfg.checkpoint_path, "postnet"))) model_postnet.set_dict(load_checkpoint(str(cfg.postnet_step), os.path.join(cfg.checkpoint_path, "postnet")))

View File

@ -47,7 +47,7 @@ def main(cfg):
writer = SummaryWriter(path) if local_rank == 0 else None writer = SummaryWriter(path) if local_rank == 0 else None
with dg.guard(place): with dg.guard(place):
model = ModelPostNet('postnet', cfg) model = ModelPostNet(cfg)
model.train() model.train()
optimizer = fluid.optimizer.AdamOptimizer(learning_rate=dg.NoamDecay(1/(4000 *( cfg.lr ** 2)), 4000)) optimizer = fluid.optimizer.AdamOptimizer(learning_rate=dg.NoamDecay(1/(4000 *( cfg.lr ** 2)), 4000))
@ -62,7 +62,7 @@ def main(cfg):
strategy = dg.parallel.prepare_context() strategy = dg.parallel.prepare_context()
model = MyDataParallel(model, strategy) model = MyDataParallel(model, strategy)
reader = LJSpeechLoader(cfg, nranks, local_rank, is_postnet=True).reader() reader = LJSpeechLoader(cfg, nranks, local_rank, is_vocoder=True).reader()
for epoch in range(cfg.epochs): for epoch in range(cfg.epochs):
pbar = tqdm(reader) pbar = tqdm(reader)
@ -74,7 +74,6 @@ def main(cfg):
global_step += 1 global_step += 1
mag_pred = model(mel) mag_pred = model(mel)
loss = layers.mean(layers.abs(layers.elementwise_sub(mag_pred, mag))) loss = layers.mean(layers.abs(layers.elementwise_sub(mag_pred, mag)))
if cfg.use_data_parallel: if cfg.use_data_parallel:
loss = model.scale_loss(loss) loss = model.scale_loss(loss)

View File

@ -9,7 +9,8 @@ import jsonargparse
from parse import add_config_options_to_parser from parse import add_config_options_to_parser
from pprint import pprint from pprint import pprint
from matplotlib import cm from matplotlib import cm
from data import LJSpeechLoader from parakeet.modules.utils import cross_entropy
from parakeet.models.dataloader.jlspeech import LJSpeechLoader
class MyDataParallel(dg.parallel.DataParallel): class MyDataParallel(dg.parallel.DataParallel):
""" """
@ -49,7 +50,7 @@ def main(cfg):
writer = SummaryWriter(path) if local_rank == 0 else None writer = SummaryWriter(path) if local_rank == 0 else None
with dg.guard(place): with dg.guard(place):
model = Model('transtts', cfg) model = TransformerTTS(cfg)
model.train() model.train()
optimizer = fluid.optimizer.AdamOptimizer(learning_rate=dg.NoamDecay(1/(4000 *( cfg.lr ** 2)), 4000)) optimizer = fluid.optimizer.AdamOptimizer(learning_rate=dg.NoamDecay(1/(4000 *( cfg.lr ** 2)), 4000))
@ -76,14 +77,21 @@ def main(cfg):
mel_pred, postnet_pred, attn_probs, stop_preds, attn_enc, attn_dec = model(character, mel_input, pos_text, pos_mel) mel_pred, postnet_pred, attn_probs, stop_preds, attn_enc, attn_dec = model(character, mel_input, pos_text, pos_mel)
label = np.zeros(stop_preds.shape).astype(np.float32)
text_length = text_length.numpy()
for i in range(label.shape[0]):
label[i][text_length[i] - 1] = 1
mel_loss = layers.mean(layers.abs(layers.elementwise_sub(mel_pred, mel))) mel_loss = layers.mean(layers.abs(layers.elementwise_sub(mel_pred, mel)))
post_mel_loss = layers.mean(layers.abs(layers.elementwise_sub(postnet_pred, mel))) post_mel_loss = layers.mean(layers.abs(layers.elementwise_sub(postnet_pred, mel)))
loss = mel_loss + post_mel_loss stop_loss = cross_entropy(stop_preds, dg.to_variable(label))
loss = mel_loss + post_mel_loss + stop_loss
if local_rank==0: if local_rank==0:
writer.add_scalars('training_loss', { writer.add_scalars('training_loss', {
'mel_loss':mel_loss.numpy(), 'mel_loss':mel_loss.numpy(),
'post_mel_loss':post_mel_loss.numpy(), 'post_mel_loss':post_mel_loss.numpy(),
'stop_loss':stop_loss.numpy()
}, global_step) }, global_step)
writer.add_scalars('alphas', { writer.add_scalars('alphas', {
@ -97,7 +105,7 @@ def main(cfg):
for i, prob in enumerate(attn_probs): for i, prob in enumerate(attn_probs):
for j in range(4): for j in range(4):
x = np.uint8(cm.viridis(prob.numpy()[j*16]) * 255) x = np.uint8(cm.viridis(prob.numpy()[j*16]) * 255)
writer.add_image('Attention_enc_%d_0'%global_step, x, i*4+j, dataformats="HWC") writer.add_image('Attention_%d_0'%global_step, x, i*4+j, dataformats="HWC")
for i, prob in enumerate(attn_enc): for i, prob in enumerate(attn_enc):
for j in range(4): for j in range(4):

View File

@ -0,0 +1,44 @@
import paddle.fluid.dygraph as dg
import paddle.fluid.layers as layers
class DynamicGRU(dg.Layer):
    """Unrolls a single ``dg.GRUUnit`` over the second axis of a 3-D input.

    The wrapped unit is built with ``size * 3`` as its size argument. The
    input is indexed as ``inputs[:, t, :]``, i.e. axis 1 is treated as time.

    Args:
        size: stored on the instance; ``size * 3`` is passed to ``GRUUnit``.
        param_attr: forwarded to ``GRUUnit``.
        bias_attr: forwarded to ``GRUUnit``.
        is_reverse: when true, steps are consumed from last to first and the
            collected outputs are flipped back before concatenation.
        gate_activation: forwarded to ``GRUUnit`` as ``gate_activation``.
        candidate_activation: forwarded to ``GRUUnit`` as ``activation``.
        h_0: hidden state fed to the first step.
            NOTE(review): presumably must be a real tensor, not ``None`` --
            confirm against ``GRUUnit``.
        origin_mode: forwarded to ``GRUUnit``.
        init_size: accepted for interface compatibility; not used here.
    """

    def __init__(self,
                 size,
                 param_attr=None,
                 bias_attr=None,
                 is_reverse=False,
                 gate_activation='sigmoid',
                 candidate_activation='tanh',
                 h_0=None,
                 origin_mode=False,
                 init_size=None):
        super(DynamicGRU, self).__init__()
        self.gru_unit = dg.GRUUnit(
            size * 3,
            param_attr=param_attr,
            bias_attr=bias_attr,
            activation=candidate_activation,
            gate_activation=gate_activation,
            origin_mode=origin_mode)
        self.size = size
        self.h_0 = h_0
        self.is_reverse = is_reverse

    def forward(self, inputs):
        """Run the GRU unit step by step and stack the hidden states.

        Args:
            inputs: 3-D tensor; axis 1 is iterated over, axis 2 is the
                per-step feature axis.

        Returns:
            The per-step hidden states concatenated along axis 1, in the
            same step order as ``inputs`` regardless of ``is_reverse``.
        """
        num_steps = inputs.shape[1]
        if self.is_reverse:
            step_order = range(num_steps - 1, -1, -1)
        else:
            step_order = range(num_steps)

        state = self.h_0
        outputs = []
        for t in step_order:
            # Slice keeps the time axis (length 1); flatten it away for the unit.
            frame = inputs[:, t:t + 1, :]
            frame = layers.reshape(
                frame, [-1, frame.shape[2]], inplace=False)
            # Only the new hidden state is needed; the other two outputs are dropped.
            state, _, _ = self.gru_unit(frame, state)
            outputs.append(
                layers.reshape(
                    state, [-1, 1, state.shape[1]], inplace=False))

        if self.is_reverse:
            # Outputs were produced last-to-first; restore the input order.
            outputs = outputs[::-1]
        return layers.concat(outputs, axis=1)

View File

@ -0,0 +1,40 @@
import paddle.fluid.dygraph as dg
import paddle.fluid.layers as layers
from parakeet.modules.layers import Conv1D
class PositionwiseFeedForward(dg.Layer):
    """Two stacked 1-D convolutions with a residual connection and layer norm.

    Args:
        d_in: channel count of the input, and of the output of the second
            convolution; also the size given to ``LayerNorm``.
        num_hidden: channel count between the two convolutions.
        filter_size: kernel width used by both convolutions.
        padding: padding passed to both convolutions.
        use_cudnn: forwarded to both convolutions.
        dropout: dropout probability applied after the second convolution.
    """

    def __init__(self, d_in, num_hidden, filter_size, padding=0, use_cudnn=True, dropout=0.1):
        super(PositionwiseFeedForward, self).__init__()
        self.num_hidden = num_hidden
        self.use_cudnn = use_cudnn
        self.dropout = dropout

        # Both convolutions share everything except their channel counts.
        shared_conv_args = dict(
            filter_size=filter_size,
            padding=padding,
            use_cudnn=use_cudnn,
            data_format="NTC")
        self.w_1 = Conv1D(in_channels=d_in,
                          out_channels=num_hidden,
                          **shared_conv_args)
        self.w_2 = Conv1D(in_channels=num_hidden,
                          out_channels=d_in,
                          **shared_conv_args)
        self.layer_norm = dg.LayerNorm(d_in)

    def forward(self, input):
        """Apply conv -> ReLU -> conv -> dropout, add the input, then layer-norm.

        Args:
            input: tensor fed to the first convolution (built with
                ``data_format="NTC"``).

        Returns:
            The layer-normalised sum of the feed-forward output and ``input``.
        """
        # Feed-forward network.
        hidden = layers.relu(self.w_1(input))
        projected = self.w_2(hidden)
        # Dropout on the feed-forward output only (before the residual add).
        projected = layers.dropout(projected, self.dropout)
        # Residual connection followed by layer normalisation.
        return self.layer_norm(projected + input)

122
parakeet/modules/layers.py Normal file
View File

@ -0,0 +1,122 @@
import math
import numpy as np
import paddle
from paddle import fluid
import paddle.fluid.dygraph as dg
class Conv1D(dg.Layer):
    """A 1-D convolution implemented on top of ``dg.Conv2D``.

    The input gets a dummy axis of size 1 inserted at position 2, is
    convolved with a ``(1, filter_size)`` kernel, and the dummy axis is
    removed again. ``stride``, ``dilation`` and ``padding`` are applied along
    the last axis only; ``stride`` is forwarded as given (no check is made
    that it equals 1).

    Args:
        in_channels: number of input channels.
        out_channels: number of filters; also stored as ``num_filters``.
        filter_size: kernel width.
        padding: padding applied on the convolved axis.
        dilation: dilation along the convolved axis.
        stride: stride along the convolved axis.
        groups: forwarded to ``Conv2D``.
        param_attr: forwarded to ``Conv2D``.
        bias_attr: forwarded to ``Conv2D``.
        use_cudnn: forwarded to ``Conv2D``.
        act: activation name forwarded to ``Conv2D``.
        data_format: ``'NCT'`` feeds the input as is; ``'NTC'`` swaps the
            last two axes before and after the convolution.
        dtype: data type forwarded to the base layer and to ``Conv2D``.
    """

    def __init__(self,
                 in_channels,
                 out_channels,
                 filter_size=3,
                 padding=0,
                 dilation=1,
                 stride=1,
                 groups=None,
                 param_attr=None,
                 bias_attr=None,
                 use_cudnn=True,
                 act=None,
                 data_format='NCT',
                 dtype="float32"):
        super(Conv1D, self).__init__(dtype=dtype)

        # Keep the configuration around for inspection.
        self.in_channels = in_channels
        self.num_filters = out_channels
        self.filter_size = filter_size
        self.stride = stride
        self.dilation = dilation
        self.padding = padding
        self.act = act
        self.data_format = data_format

        # The leading 1 / 0 in each pair targets the dummy axis.
        self.conv = dg.Conv2D(
            num_channels=in_channels,
            num_filters=out_channels,
            filter_size=(1, filter_size),
            stride=(1, stride),
            dilation=(1, dilation),
            padding=(0, padding),
            groups=groups,
            param_attr=param_attr,
            bias_attr=bias_attr,
            use_cudnn=use_cudnn,
            act=act,
            dtype=dtype)

    def forward(self, x):
        """Convolve a 3-D input along its time axis.

        Args:
            x (Variable): 3-D input; laid out channels-first when
                ``data_format`` is ``'NCT'`` and channels-last when it is
                ``'NTC'``.

        Returns:
            x (Variable): 3-D output in the same layout as the input, with
                the channel axis produced by the convolution.
        """
        channels_last = self.data_format == 'NTC'
        if channels_last:
            x = fluid.layers.transpose(x, [0, 2, 1])

        expanded = fluid.layers.unsqueeze(x, [2])
        out = fluid.layers.squeeze(self.conv(expanded), [2])

        if channels_last:
            out = fluid.layers.transpose(out, [0, 2, 1])
        return out
class Pool1D(dg.Layer):
    """A 1-D pooling layer implemented on top of ``dg.Pool2D``.

    The input gets a dummy axis of size 1 inserted at position 2 and is
    pooled with a ``[1, pool_size]`` window; the dummy axis is then removed.

    Args:
        pool_size: window width along the pooled axis.
        pool_type: pooling type forwarded to ``Pool2D``.
        pool_stride: stride along the pooled axis.
        pool_padding: padding along the pooled axis.
        global_pooling: forwarded to ``Pool2D``.
        use_cudnn: forwarded to ``Pool2D``.
        ceil_mode: forwarded to ``Pool2D``.
        exclusive: forwarded to ``Pool2D``.
        data_format: ``'NCT'`` feeds the input as is; ``'NTC'`` swaps the
            last two axes before and after pooling.
    """

    def __init__(self,
                 pool_size=-1,
                 pool_type='max',
                 pool_stride=1,
                 pool_padding=0,
                 global_pooling=False,
                 use_cudnn=True,
                 ceil_mode=False,
                 exclusive=True,
                 data_format='NCT'):
        super(Pool1D, self).__init__()

        # Keep the configuration around for inspection.
        self.pool_size = pool_size
        self.pool_type = pool_type
        self.pool_stride = pool_stride
        self.pool_padding = pool_padding
        self.global_pooling = global_pooling
        self.use_cudnn = use_cudnn
        self.ceil_mode = ceil_mode
        self.exclusive = exclusive
        self.data_format = data_format

        # The leading 1 / 0 in each pair targets the dummy axis.
        self.pool2d = dg.Pool2D([1, pool_size],
                                pool_type=pool_type,
                                pool_stride=[1, pool_stride],
                                pool_padding=[0, pool_padding],
                                global_pooling=global_pooling,
                                use_cudnn=use_cudnn,
                                ceil_mode=ceil_mode,
                                exclusive=exclusive)

    def forward(self, x):
        """Pool a 3-D input along its time axis.

        Args:
            x (Variable): 3-D input; laid out channels-first when
                ``data_format`` is ``'NCT'`` and channels-last when it is
                ``'NTC'``.

        Returns:
            x (Variable): 3-D pooled output in the same layout as the input.
        """
        channels_last = self.data_format == 'NTC'
        if channels_last:
            x = fluid.layers.transpose(x, [0, 2, 1])

        expanded = fluid.layers.unsqueeze(x, [2])
        out = fluid.layers.squeeze(self.pool2d(expanded), [2])

        if channels_last:
            out = fluid.layers.transpose(out, [0, 2, 1])
        return out

View File

@ -0,0 +1,84 @@
import math
import numpy as np
import paddle.fluid.dygraph as dg
import paddle.fluid.layers as layers
class ScaledDotProductAttention(dg.Layer):
    """Scaled dot-product attention with optional key and query masking.

    Args:
        d_key: key dimensionality; scores are divided by ``sqrt(d_key)``.
    """

    def __init__(self, d_key):
        super(ScaledDotProductAttention, self).__init__()
        self.d_key = d_key

    # NOTE: per the original author, the mask convention used here differs
    # from the PyTorch reference implementation.
    def forward(self, key, value, query, mask=None, query_mask=None):
        """Compute attention weights and the attended values.

        Args:
            key: key tensor; multiplied with ``query`` on its transposed
                last two axes.
            value: value tensor, multiplied by the attention weights.
            query: query tensor.
            mask: optional tensor combined with the scores before softmax.
                Entries equal to 0 leave the score untouched; other entries
                zero the score and add ``mask * (-2 ** 32 + 1)`` to it.
                NOTE(review): presumably a 0/1 tensor where 1 marks
                positions to suppress -- confirm against callers.
            query_mask: optional tensor multiplied into the weights after
                softmax.

        Returns:
            A ``(result, attention)`` pair: the weighted values and the
            attention weights (after the optional ``query_mask``).
        """
        # Large negative fill so masked scores get ~zero weight after softmax.
        mask_fill = -2 ** 32 + 1

        # Attention scores, scaled by sqrt(d_key).
        scores = layers.matmul(query, key, transpose_y=True)
        scores = scores / math.sqrt(self.d_key)

        if mask is not None:
            keep = (mask == 0).astype(np.float32)
            scores = scores * keep
            scores = scores + mask * mask_fill

        weights = layers.softmax(scores)
        # Dropout probability is fixed at 0.0 here.
        weights = layers.dropout(weights, 0.0)

        if query_mask is not None:
            weights = weights * query_mask

        return layers.matmul(weights, value), weights
class MultiheadAttention(dg.Layer):
    """Multi-head attention with output projection, residual add and layer norm.

    Args:
        num_hidden: feature size of the inputs and of the output.
        d_k: per-head size of the key and value projections.
        d_q: per-head size of the query projection; also used when the
            per-head results are reassembled.
            NOTE(review): values are projected with ``d_k`` but the result
            is reshaped with ``d_q``, so this looks like it requires
            ``d_k == d_q`` -- confirm.
        num_head: number of attention heads.
        dropout: dropout probability applied to the output projection.
    """

    def __init__(self, num_hidden, d_k, d_q, num_head=4, dropout=0.1):
        super(MultiheadAttention, self).__init__()
        self.num_hidden = num_hidden
        self.num_head = num_head
        self.d_k = d_k
        self.d_q = d_q
        self.dropout = dropout

        self.key = dg.Linear(num_hidden, num_head * d_k)
        self.value = dg.Linear(num_hidden, num_head * d_k)
        self.query = dg.Linear(num_hidden, num_head * d_q)

        self.scal_attn = ScaledDotProductAttention(d_k)

        self.fc = dg.Linear(num_head * d_q, num_hidden)

        self.layer_norm = dg.LayerNorm(num_hidden)

    def _split_heads(self, projected, batch_size, seq_len, depth):
        """Reshape ``(batch, seq, head * depth)`` to ``(head * batch, seq, depth)``."""
        per_head = layers.reshape(
            projected, [batch_size, seq_len, self.num_head, depth])
        heads_first = layers.transpose(per_head, [2, 0, 1, 3])
        return layers.reshape(heads_first, [-1, seq_len, depth])

    def forward(self, key, value, query_input, mask=None, query_mask=None):
        """Attend from ``query_input`` over ``key`` / ``value``.

        Args:
            key: tensor whose axes 0 and 1 are read as batch size and key
                length.
            value: tensor projected and reshaped with the key length.
            query_input: tensor whose axis 1 is read as the query length;
                also added back as the residual.
            mask: optional mask, tiled ``num_head`` times along axis 0 and
                passed to the scaled dot-product attention.
            query_mask: optional mask, tiled ``num_head`` times along axis 0
                and ``seq_len_key`` times along the last axis.

        Returns:
            A ``(result, attention)`` pair: the layer-normalised output and
            the attention weights returned by the inner attention module.
        """
        batch_size = key.shape[0]
        seq_len_key = key.shape[1]
        seq_len_query = query_input.shape[1]

        # Repeat the masks once per head.
        if query_mask is not None:
            query_mask = layers.expand(query_mask, [self.num_head, 1, seq_len_key])
        if mask is not None:
            mask = layers.expand(mask, (self.num_head, 1, 1))

        # Project, then fold the head axis into the batch axis.
        key = self._split_heads(self.key(key), batch_size, seq_len_key, self.d_k)
        value = self._split_heads(self.value(value), batch_size, seq_len_key, self.d_k)
        query = self._split_heads(self.query(query_input), batch_size, seq_len_query, self.d_q)

        result, attention = self.scal_attn(key, value, query, mask=mask, query_mask=query_mask)

        # Undo the head folding and concatenate the heads on the feature axis.
        result = layers.reshape(result, [self.num_head, batch_size, seq_len_query, self.d_q])
        result = layers.transpose(result, [1, 2, 0, 3])
        result = layers.reshape(result, [batch_size, seq_len_query, -1])

        # Output projection with dropout, residual connection, layer norm.
        result = layers.dropout(self.fc(result), self.dropout)
        result = self.layer_norm(result + query_input)
        return result, attention

View File

@ -0,0 +1,67 @@
import paddle.fluid.dygraph as dg
import paddle.fluid as fluid
import paddle.fluid.layers as layers
from parakeet.modules.layers import Conv1D
class PostConvNet(dg.Layer):
    """Stack of 1-D convolutions, each followed by batch norm, tanh and dropout.

    The first convolution maps ``n_mels * outputs_per_step`` channels to
    ``num_hidden``, the middle ones keep ``num_hidden``, and the last maps
    back to ``n_mels * outputs_per_step``.

    Args:
        n_mels: number of mel channels.
        num_hidden: channel count of the intermediate convolutions.
        filter_size: kernel width of every convolution.
        padding: padding of every convolution.
        num_conv: total number of convolutions.
        outputs_per_step: multiplier applied to ``n_mels`` for the input and
            output channel counts.
        use_cudnn: forwarded to every convolution.
        dropout: dropout probability applied after every tanh.
    """

    def __init__(self,
                 n_mels=80,
                 num_hidden=512,
                 filter_size=5,
                 padding=0,
                 num_conv=5,
                 outputs_per_step=1,
                 use_cudnn=True,
                 dropout=0.1):
        super(PostConvNet, self).__init__()

        self.dropout = dropout
        mel_channels = n_mels * outputs_per_step

        # (in, out) channel pairs: first layer, num_conv - 2 middle layers, last layer.
        channel_plan = [(mel_channels, num_hidden)]
        channel_plan += [(num_hidden, num_hidden)] * (num_conv - 2)
        channel_plan += [(num_hidden, mel_channels)]

        self.conv_list = [
            Conv1D(in_channels=c_in,
                   out_channels=c_out,
                   filter_size=filter_size,
                   padding=padding,
                   use_cudnn=use_cudnn,
                   data_format="NCT")
            for c_in, c_out in channel_plan
        ]
        # Plain Python lists are not tracked by the layer; register explicitly.
        for idx, conv in enumerate(self.conv_list):
            self.add_sublayer("conv_list_{}".format(idx), conv)

        def make_batch_norm(channels):
            return dg.BatchNorm(channels,
                                param_attr=fluid.ParamAttr(name='weight'),
                                bias_attr=fluid.ParamAttr(name='bias'),
                                moving_mean_name='moving_mean',
                                moving_variance_name='moving_var',
                                data_layout='NCHW')

        self.batch_norm_list = [make_batch_norm(num_hidden)
                                for _ in range(num_conv - 1)]
        self.batch_norm_list.append(make_batch_norm(mel_channels))
        for idx, norm in enumerate(self.batch_norm_list):
            self.add_sublayer("batch_norm_list_{}".format(idx), norm)

    def forward(self, input):
        """Run the convolution stack on a channels-last input.

        Args:
            input: 3-D tensor; its last two axes are swapped before the
                first convolution and swapped back at the end.

        Returns:
            The output of the last conv / batch-norm / tanh / dropout stage,
            with the last two axes swapped back to the input layout.
        """
        x = layers.transpose(input, [0, 2, 1])
        num_frames = x.shape[-1]
        for batch_norm, conv in zip(self.batch_norm_list, self.conv_list):
            # Trim the convolution output back to the original frame count.
            x = conv(x)[:, :, :num_frames]
            x = layers.dropout(layers.tanh(batch_norm(x)), self.dropout)
        return layers.transpose(x, [0, 2, 1])

View File

@ -0,0 +1,26 @@
import paddle.fluid.dygraph as dg
import paddle.fluid.layers as layers
class PreNet(dg.Layer):
    """
    Two linear layers, each followed by ReLU and dropout.
    """

    def __init__(self, input_size, hidden_size, output_size, dropout_rate=0.2):
        """
        :param input_size: dimension of input
        :param hidden_size: dimension of hidden unit
        :param output_size: dimension of output
        :param dropout_rate: dropout probability applied after each ReLU
        """
        super(PreNet, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.dropout_rate = dropout_rate

        self.linear1 = dg.Linear(input_size, hidden_size)
        self.linear2 = dg.Linear(hidden_size, output_size)

    def forward(self, x):
        """
        Apply linear -> ReLU -> dropout twice.

        :param x: input tensor fed to the first linear layer
        :return: output of the second linear / ReLU / dropout stage
        """
        for linear in (self.linear1, self.linear2):
            activated = layers.relu(linear(x))
            x = layers.dropout(activated, self.dropout_rate)
        return x

View File

@ -2,6 +2,7 @@ import numpy as np
import librosa import librosa
import os, copy import os, copy
from scipy import signal from scipy import signal
import paddle.fluid.layers as layers
def get_positional_table(d_pos_vec, n_position=1024): def get_positional_table(d_pos_vec, n_position=1024):
@ -33,6 +34,28 @@ def get_sinusoid_encoding_table(n_position, d_hid, padding_idx=None):
return sinusoid_table return sinusoid_table
def get_non_pad_mask(seq):
    ''' Return a float32 mask of the non-zero entries of seq, with a trailing axis of size 1 appended. '''
    non_pad = (seq != 0).astype(np.float32)
    return layers.unsqueeze(non_pad, [-1])
def get_attn_key_pad_mask(seq_k, seq_q):
    ''' Build a key mask tiled over the query length.

    The result is float32, 1.0 where seq_k is non-zero and 0.0 elsewhere,
    with a new axis inserted at position 1 and tiled seq_q.shape[1] times.
    NOTE(review): presumably zero is the padding id -- confirm with the
    batcher that produces seq_k.
    '''
    query_len = seq_q.shape[1]
    key_mask = (seq_k != 0).astype(np.float32)
    # Insert the query axis, then repeat the key mask for every query position.
    key_mask = layers.unsqueeze(key_mask, [1])
    return layers.expand(key_mask, [1, query_len, 1])
def get_triu_tensor(seq_k, seq_q):
    ''' Build a batch of strictly upper-triangular matrices of ones.

    Returns a numpy array of shape (seq_k.shape[0], seq_k.shape[1],
    seq_q.shape[1]); each slice is 1 strictly above the main diagonal and
    0 on and below it.
    '''
    batch_size, len_k = seq_k.shape[0], seq_k.shape[1]
    len_q = seq_q.shape[1]
    # k=1 excludes the main diagonal.
    upper = np.triu(np.ones([len_k, len_q]), 1)
    return np.repeat(upper[np.newaxis], batch_size, axis=0)
def guided_attention(N, T, g=0.2): def guided_attention(N, T, g=0.2):
'''Guided attention. Refer to page 3 on the paper.''' '''Guided attention. Refer to page 3 on the paper.'''
W = np.zeros((N, T), dtype=np.float32) W = np.zeros((N, T), dtype=np.float32)
@ -40,3 +63,11 @@ def guided_attention(N, T, g=0.2):
for t_pos in range(W.shape[1]): for t_pos in range(W.shape[1]):
W[n_pos, t_pos] = 1 - np.exp(-(t_pos / float(T) - n_pos / float(N)) ** 2 / (2 * g * g)) W[n_pos, t_pos] = 1 - np.exp(-(t_pos / float(T) - n_pos / float(N)) ** 2 / (2 * g * g))
return W return W
def cross_entropy(input, label, position_weight=5.0, epsilon=0.0001):
    ''' Weighted binary cross entropy, summed over axes 0 and 1.

    input is used inside log(input + epsilon) and log(1 - input + epsilon),
    so it is presumably a probability in [0, 1] -- confirm with the caller.
    Each element's loss is scaled by label * (position_weight - 1) + 1,
    i.e. by position_weight where label is 1 and by 1 where label is 0.
    epsilon keeps the logarithms away from log(0).
    '''
    positive_term = -1 * label * layers.log(input + epsilon)
    negative_term = (1 - label) * layers.log(1 - input + epsilon)
    element_loss = positive_term - negative_term
    weights = label * (position_weight - 1) + 1
    return layers.reduce_sum(element_loss * weights, dim=[0, 1])