TransformerTTS precision alignment

This commit is contained in:
lifuchen 2020-01-13 12:37:49 +00:00 committed by chenfeiyu
parent ae88be3419
commit ab0fe8f304
14 changed files with 93 additions and 104 deletions

View File

@ -89,7 +89,7 @@ def transliteration_cleaners(text):
def english_cleaners(text):
'''Pipeline for English text, including number and abbreviation expansion.'''
text = convert_to_ascii(text)
text = add_punctuation(text)
#text = add_punctuation(text)
text = lowercase(text)
text = expand_numbers(text)
text = expand_abbreviations(text)

View File

@ -14,13 +14,11 @@ encoder_n_layer: 6
encoder_head: 2
encoder_conv1d_filter_size: 1536
max_sep_len: 2048
encoder_output_size: 384
embedding_size: 384
fs_embedding_size: 384
decoder_n_layer: 6
decoder_head: 2
decoder_conv1d_filter_size: 1536
decoder_output_size: 384
hidden_size: 384
fs_hidden_size: 384
duration_predictor_output_size: 256
duration_predictor_filter_size: 3
fft_conv1d_filter: 3
@ -28,6 +26,9 @@ fft_conv1d_padding: 1
dropout: 0.1
transformer_head: 4
embedding_size: 512
hidden_size: 256
warm_up_step: 4000
grad_clip_thresh: 0.1
batch_size: 32
@ -39,5 +40,5 @@ use_data_parallel: False
data_path: ../../../dataset/LJSpeech-1.1
transtts_path: ../transformerTTS/checkpoint
transformer_step: 20
transformer_step: 1
log_dir: ./log
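
For readers tracking the rename in this hunk: the FastSpeech-specific sizes now carry an fs_ prefix (fs_embedding_size, fs_hidden_size), while the plain embedding_size/hidden_size keys further down belong to the teacher TransformerTTS. A minimal sketch, using the default values from this file, of why the prefix keeps the two models from clashing in one flat config (illustration only, not repository code):

from types import SimpleNamespace

# Both models' sizes can now coexist in one namespace without key collisions.
cfg = SimpleNamespace(
    fs_embedding_size=384, fs_hidden_size=384,   # FastSpeech (student)
    embedding_size=512, hidden_size=256,         # TransformerTTS (teacher)
)
assert cfg.fs_hidden_size != cfg.hidden_size     # distinct knobs, no clash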

View File

@ -8,8 +8,6 @@ from parakeet.modules.layers import Conv1D
from parakeet.modules.multihead_attention import MultiheadAttention
from parakeet.modules.feed_forward import PositionwiseFeedForward
class FFTBlock(dg.Layer):
def __init__(self, d_model, d_inner, n_head, d_k, d_v, filter_size, padding, dropout=0.2):
super(FFTBlock, self).__init__()

View File

@ -1,5 +1,5 @@
from utils import *
from modules import *
from modules import FFTBlock, LengthRegulator
import paddle.fluid.dygraph as dg
import paddle.fluid as fluid
from parakeet.g2p.text.symbols import symbols
@ -131,38 +131,38 @@ class FastSpeech(dg.Layer):
self.encoder = Encoder(n_src_vocab=len(symbols)+1,
len_max_seq=cfg.max_sep_len,
d_word_vec=cfg.embedding_size,
d_word_vec=cfg.fs_embedding_size,
n_layers=cfg.encoder_n_layer,
n_head=cfg.encoder_head,
d_k=64,
d_v=64,
d_model=cfg.hidden_size,
d_model=cfg.fs_hidden_size,
d_inner=cfg.encoder_conv1d_filter_size,
fft_conv1d_kernel=cfg.fft_conv1d_filter,
fft_conv1d_padding=cfg.fft_conv1d_padding,
dropout=0.1)
self.length_regulator = LengthRegulator(input_size=cfg.hidden_size,
self.length_regulator = LengthRegulator(input_size=cfg.fs_hidden_size,
out_channels=cfg.duration_predictor_output_size,
filter_size=cfg.duration_predictor_filter_size,
dropout=cfg.dropout)
self.decoder = Decoder(len_max_seq=cfg.max_sep_len,
d_word_vec=cfg.embedding_size,
d_word_vec=cfg.fs_embedding_size,
n_layers=cfg.decoder_n_layer,
n_head=cfg.decoder_head,
d_k=64,
d_v=64,
d_model=cfg.hidden_size,
d_model=cfg.fs_hidden_size,
d_inner=cfg.decoder_conv1d_filter_size,
fft_conv1d_kernel=cfg.fft_conv1d_filter,
fft_conv1d_padding=cfg.fft_conv1d_padding,
dropout=0.1)
self.mel_linear = dg.Linear(cfg.decoder_output_size, cfg.audio.num_mels)
self.postnet = PostConvNet(n_mels=80,
self.mel_linear = dg.Linear(cfg.fs_hidden_size, cfg.audio.num_mels * cfg.audio.outputs_per_step)
self.postnet = PostConvNet(n_mels=cfg.audio.num_mels,
num_hidden=512,
filter_size=5,
padding=int(5 / 2),
num_conv=5,
outputs_per_step=1,
outputs_per_step=cfg.audio.outputs_per_step,
use_cudnn=True,
dropout=0.1)
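
With this change the FastSpeech output head is sized entirely from the config: mel_linear maps fs_hidden_size to num_mels * outputs_per_step per decoder step, and the postnet reads num_mels and outputs_per_step from cfg.audio instead of the hard-coded 80 and 1. A minimal shape sketch, assuming the defaults above (fs_hidden_size=384, num_mels=80, outputs_per_step=1); an illustration, not code from the repository:

import numpy as np
import paddle.fluid.dygraph as dg

fs_hidden_size, num_mels, outputs_per_step = 384, 80, 1  # assumed defaults

with dg.guard():
    mel_linear = dg.Linear(fs_hidden_size, num_mels * outputs_per_step)
    decoder_out = dg.to_variable(np.zeros((2, 100, fs_hidden_size), dtype="float32"))  # (B, T, C)
    mel_out = mel_linear(decoder_out)
    print(mel_out.shape)  # [2, 100, 80]: one group of outputs_per_step mel frames per step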

View File

@ -22,8 +22,8 @@ def add_config_options_to_parser(parser):
parser.add_argument('--audio.outputs_per_step', type=int, default=1,
help="the outputs per step.")
parser.add_argument('--embedding_size', type=int, default=256,
help="the dim size of embedding.")
parser.add_argument('--fs_embedding_size', type=int, default=256,
help="the dim size of embedding of fastspeech.")
parser.add_argument('--encoder_n_layer', type=int, default=6,
help="the number of FFT Block in encoder.")
parser.add_argument('--encoder_head', type=int, default=2,
@ -32,18 +32,14 @@ def add_config_options_to_parser(parser):
help="the filter size of conv1d in encoder.")
parser.add_argument('--max_sep_len', type=int, default=2048,
help="the max length of sequence.")
parser.add_argument('--encoder_output_size', type=int, default=256,
help="the output channel size of encoder.")
parser.add_argument('--decoder_n_layer', type=int, default=6,
help="the number of FFT Block in decoder.")
parser.add_argument('--decoder_head', type=int, default=2,
help="the attention head number in decoder.")
parser.add_argument('--decoder_conv1d_filter_size', type=int, default=1024,
help="the filter size of conv1d in decoder.")
parser.add_argument('--decoder_output_size', type=int, default=256,
help="the output channel size of decoder.")
parser.add_argument('--hidden_size', type=int, default=256,
help="the hidden size in model.")
parser.add_argument('--fs_hidden_size', type=int, default=256,
help="the hidden size in model of fastspeech.")
parser.add_argument('--duration_predictor_output_size', type=int, default=256,
help="the output size of duration predictior.")
parser.add_argument('--duration_predictor_filter_size', type=int, default=3,
@ -57,6 +53,11 @@ def add_config_options_to_parser(parser):
parser.add_argument('--transformer_head', type=int, default=4,
help="the attention head num of transformerTTS.")
parser.add_argument('--hidden_size', type=int, default=256,
help="the hidden size in model of transformerTTS.")
parser.add_argument('--embedding_size', type=int, default=256,
help="the dim size of embedding of transformerTTS.")
parser.add_argument('--warm_up_step', type=int, default=4000,
help="the warm up step of learning rate.")
parser.add_argument('--grad_clip_thresh', type=float, default=1.0,

View File

@ -3,20 +3,18 @@ from parakeet.g2p.text.symbols import symbols
import paddle.fluid.dygraph as dg
import paddle.fluid as fluid
import paddle.fluid.layers as layers
from parakeet.modules.layers import Conv, Pool1D
from parakeet.modules.layers import Conv, Pool1D, Linear
from parakeet.modules.dynamicGRU import DynamicGRU
import numpy as np
class EncoderPrenet(dg.Layer):
def __init__(self, embedding_size, num_hidden, use_cudnn=True):
super(EncoderPrenet, self).__init__()
self.embedding_size = embedding_size
self.num_hidden = num_hidden
self.use_cudnn = use_cudnn
self.embedding = dg.Embedding( size = [len(symbols), embedding_size],
param_attr = fluid.ParamAttr(name='weight'),
self.embedding = dg.Embedding( size = [len(symbols), embedding_size],
padding_idx = None)
self.conv_list = []
self.conv_list.append(Conv(in_channels = embedding_size,
@ -37,16 +35,12 @@ class EncoderPrenet(dg.Layer):
self.add_sublayer("conv_list_{}".format(i), layer)
self.batch_norm_list = [dg.BatchNorm(num_hidden,
param_attr = fluid.ParamAttr(name='weight'),
bias_attr = fluid.ParamAttr(name='bias'),
moving_mean_name = 'moving_mean',
moving_variance_name = 'moving_var',
data_layout='NCHW') for _ in range(3)]
data_layout='NCHW', epsilon=1e-30) for _ in range(3)]
for i, layer in enumerate(self.batch_norm_list):
self.add_sublayer("batch_norm_list_{}".format(i), layer)
self.projection = dg.Linear(num_hidden, num_hidden)
self.projection = Linear(num_hidden, num_hidden)
def forward(self, x):
x = self.embedding(x) #(batch_size, seq_len, embedding_size)
@ -90,10 +84,6 @@ class CBHG(dg.Layer):
self.batchnorm_list = []
for i in range(K):
self.batchnorm_list.append(dg.BatchNorm(hidden_size,
param_attr = fluid.ParamAttr(name='weight'),
bias_attr = fluid.ParamAttr(name='bias'),
moving_mean_name = 'moving_mean',
moving_variance_name = 'moving_var',
data_layout='NCHW'))
for i, layer in enumerate(self.batchnorm_list):
@ -114,16 +104,8 @@ class CBHG(dg.Layer):
data_format = "NCT")
self.batchnorm_proj_1 = dg.BatchNorm(hidden_size,
param_attr = fluid.ParamAttr(name='weight'),
bias_attr = fluid.ParamAttr(name='bias'),
moving_mean_name = 'moving_mean',
moving_variance_name = 'moving_var',
data_layout='NCHW')
self.batchnorm_proj_2 = dg.BatchNorm(projection_size,
param_attr = fluid.ParamAttr(name='weight'),
bias_attr = fluid.ParamAttr(name='bias'),
moving_mean_name = 'moving_mean',
moving_variance_name = 'moving_var',
data_layout='NCHW')
self.max_pool = Pool1D(pool_size = max_pool_kernel_size,
pool_type='max',
@ -134,32 +116,24 @@ class CBHG(dg.Layer):
h_0 = np.zeros((batch_size, hidden_size // 2), dtype="float32")
h_0 = dg.to_variable(h_0)
self.fc_forward1 = dg.Linear(hidden_size, hidden_size // 2 * 3)
self.fc_reverse1 = dg.Linear(hidden_size, hidden_size // 2 * 3)
self.fc_forward1 = Linear(hidden_size, hidden_size // 2 * 3)
self.fc_reverse1 = Linear(hidden_size, hidden_size // 2 * 3)
self.gru_forward1 = DynamicGRU(size = self.hidden_size // 2,
param_attr = fluid.ParamAttr(name='weight'),
bias_attr = fluid.ParamAttr(name='bias'),
is_reverse = False,
origin_mode = True,
h_0 = h_0)
self.gru_reverse1 = DynamicGRU(size = self.hidden_size // 2,
param_attr = fluid.ParamAttr(name='weight'),
bias_attr = fluid.ParamAttr(name='bias'),
is_reverse=True,
origin_mode=True,
h_0 = h_0)
self.fc_forward2 = dg.Linear(hidden_size, hidden_size // 2 * 3)
self.fc_reverse2 = dg.Linear(hidden_size, hidden_size // 2 * 3)
self.fc_forward2 = Linear(hidden_size, hidden_size // 2 * 3)
self.fc_reverse2 = Linear(hidden_size, hidden_size // 2 * 3)
self.gru_forward2 = DynamicGRU(size = self.hidden_size // 2,
param_attr = fluid.ParamAttr(name='weight'),
bias_attr = fluid.ParamAttr(name='bias'),
is_reverse = False,
origin_mode = True,
h_0 = h_0)
self.gru_reverse2 = DynamicGRU(size = self.hidden_size // 2,
param_attr = fluid.ParamAttr(name='weight'),
bias_attr = fluid.ParamAttr(name='bias'),
is_reverse=True,
origin_mode=True,
h_0 = h_0)
@ -216,8 +190,8 @@ class Highwaynet(dg.Layer):
self.linears = []
for i in range(num_layers):
self.linears.append(dg.Linear(num_units, num_units))
self.gates.append(dg.Linear(num_units, num_units))
self.linears.append(Linear(num_units, num_units))
self.gates.append(Linear(num_units, num_units))
for i, (linear, gate) in enumerate(zip(self.linears,self.gates)):
self.add_sublayer("linears_{}".format(i), linear)

View File

@ -1,7 +1,7 @@
from parakeet.models.transformerTTS.module import *
import paddle.fluid.dygraph as dg
import paddle.fluid as fluid
from parakeet.modules.layers import Conv1D
from parakeet.modules.layers import Conv1D, Linear
from parakeet.modules.utils import *
from parakeet.modules.multihead_attention import MultiheadAttention
from parakeet.modules.feed_forward import PositionwiseFeedForward
@ -13,8 +13,7 @@ class Encoder(dg.Layer):
def __init__(self, embedding_size, num_hidden, config):
super(Encoder, self).__init__()
self.num_hidden = num_hidden
param = fluid.ParamAttr(name='alpha',
initializer=fluid.initializer.Constant(value=1.0))
param = fluid.ParamAttr(initializer=fluid.initializer.Constant(value=1.0))
self.alpha = self.create_parameter(shape=(1, ), attr=param, dtype='float32')
self.pos_inp = get_sinusoid_encoding_table(1024, self.num_hidden, padding_idx=0)
self.pos_emb = dg.Embedding(size=[1024, num_hidden],
@ -39,13 +38,13 @@ class Encoder(dg.Layer):
else:
query_mask, mask = None, None
# Encoder pre_network
x = self.encoder_prenet(x) #(N,T,C)
# Get positional encoding
positional = self.pos_emb(positional)
x = positional * self.alpha + x #(N, T, C)
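
The line above adds the sinusoid positional table, scaled by the learnable alpha created earlier (initialized to 1.0), onto the prenet output. A plain-numpy sketch of that step; the real table comes from get_sinusoid_encoding_table in parakeet.modules.utils, whose implementation is not shown in this diff:

import numpy as np

def sinusoid_table(n_position, d_hid):
    pos = np.arange(n_position)[:, None]
    i = np.arange(d_hid)[None, :]
    angle = pos / np.power(10000.0, 2 * (i // 2) / d_hid)
    table = np.zeros((n_position, d_hid), dtype="float32")
    table[:, 0::2] = np.sin(angle[:, 0::2])
    table[:, 1::2] = np.cos(angle[:, 1::2])
    return table

alpha = 1.0                                           # learnable scalar, starts at 1.0
x = np.zeros((2, 10, 384), dtype="float32")           # (N, T, C) prenet output
positional = sinusoid_table(1024, 384)[None, :10, :]  # rows for positions 0..9
x = positional * alpha + x
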
@ -65,21 +64,20 @@ class Decoder(dg.Layer):
def __init__(self, num_hidden, config):
super(Decoder, self).__init__()
self.num_hidden = num_hidden
param = fluid.ParamAttr(name='alpha')
param = fluid.ParamAttr()
self.alpha = self.create_parameter(shape=(1,), attr=param, dtype='float32',
default_initializer = fluid.initializer.ConstantInitializer(value=1.0))
self.pos_inp = get_sinusoid_encoding_table(1024, self.num_hidden, padding_idx=0)
self.pos_emb = dg.Embedding(size=[1024, num_hidden],
padding_idx=0,
param_attr=fluid.ParamAttr(
name='weight',
initializer=fluid.initializer.NumpyArrayInitializer(self.pos_inp),
trainable=False))
self.decoder_prenet = PreNet(input_size = config.audio.num_mels,
hidden_size = num_hidden * 2,
output_size = num_hidden,
dropout_rate=0.2)
self.linear = dg.Linear(num_hidden, num_hidden)
self.linear = Linear(num_hidden, num_hidden)
self.selfattn_layers = [MultiheadAttention(num_hidden, num_hidden//4, num_hidden//4) for _ in range(3)]
for i, layer in enumerate(self.selfattn_layers):
@ -90,8 +88,8 @@ class Decoder(dg.Layer):
self.ffns = [PositionwiseFeedForward(num_hidden, num_hidden*4, filter_size=1) for _ in range(3)]
for i, layer in enumerate(self.ffns):
self.add_sublayer("ffns_{}".format(i), layer)
self.mel_linear = dg.Linear(num_hidden, config.audio.num_mels * config.audio.outputs_per_step)
self.stop_linear = dg.Linear(num_hidden, 1)
self.mel_linear = Linear(num_hidden, config.audio.num_mels * config.audio.outputs_per_step)
self.stop_linear = Linear(num_hidden, 1)
self.postconvnet = PostConvNet(config.audio.num_mels, config.hidden_size,
filter_size = 5, padding = 4, num_conv=5,
@ -115,10 +113,10 @@ class Decoder(dg.Layer):
mask = get_triu_tensor(query.numpy(), query.numpy()).astype(np.float32)
mask = fluid.layers.cast(dg.to_variable(mask == 0), np.float32)
m_mask, zero_mask = None, None
# Decoder pre-network
query = self.decoder_prenet(query)
# Centered position
query = self.linear(query)
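
The mask built at the top of this hunk keeps each decoder step from attending to future frames: get_triu_tensor supplies a triangular matrix and the mask == 0 cast turns it into ones on the allowed (non-future) positions. A plain-numpy sketch of the same idea, assuming the helper returns the strictly upper-triangular part (its implementation is not shown in this diff):

import numpy as np

T = 4
triu = np.triu(np.ones((T, T), dtype=np.float32), k=1)  # ones strictly above the diagonal
mask = (triu == 0).astype(np.float32)                   # 1 = this position may be attended
print(mask)
# [[1. 0. 0. 0.]
#  [1. 1. 0. 0.]
#  [1. 1. 1. 0.]
#  [1. 1. 1. 1.]]
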
@ -132,14 +130,13 @@ class Decoder(dg.Layer):
# Attention decoder-decoder, encoder-decoder
selfattn_list = list()
attn_list = list()
for selfattn, attn, ffn in zip(self.selfattn_layers, self.attn_layers, self.ffns):
query, attn_dec = selfattn(query, query, query, mask = mask, query_mask = m_mask)
query, attn_dot = attn(key, value, query, mask = zero_mask, query_mask = m_mask)
query = ffn(query)
selfattn_list.append(attn_dec)
attn_list.append(attn_dot)
# Mel linear projection
mel_out = self.mel_linear(query)
# Post Mel Network
@ -164,7 +161,7 @@ class TransformerTTS(dg.Layer):
# key (batch_size, seq_len, channel)
# c_mask (batch_size, seq_len)
# attns_enc (channel / 2, seq_len, seq_len)
key, c_mask, attns_enc = self.encoder(characters, pos_text)
# mel_output/postnet_output (batch_size, mel_len, n_mel)

View File

@ -2,7 +2,7 @@ import os
from scipy.io.wavfile import write
from parakeet.g2p.en import text_to_sequence
import numpy as np
from network import Model, ModelPostNet
from network import TransformerTTS, ModelPostNet
from tqdm import tqdm
from tensorboardX import SummaryWriter
import paddle.fluid as fluid
@ -28,7 +28,7 @@ def synthesis(text_input, cfg):
writer = SummaryWriter(path)
with dg.guard(place):
model = Model(cfg)
model = TransformerTTS(cfg)
model_postnet = ModelPostNet(cfg)
model.set_dict(load_checkpoint(str(cfg.transformer_step), os.path.join(cfg.checkpoint_path, "transformer")))

View File

@ -89,8 +89,6 @@ def main(cfg):
else:
loss.backward()
optimizer.minimize(loss, grad_clip = fluid.dygraph_grad_clip.GradClipByGlobalNorm(cfg.grad_clip_thresh))
print("===============",model.pre_proj.conv.weight.numpy())
print("===============",model.pre_proj.conv.weight.gradient())
model.clear_gradients()
if local_rank==0:

View File

@ -63,7 +63,7 @@ def main(cfg):
optimizer = fluid.optimizer.AdamOptimizer(learning_rate=dg.NoamDecay(1/(cfg.warm_up_step *( cfg.lr ** 2)), cfg.warm_up_step),
parameter_list=model.parameters())
reader = LJSpeechLoader(cfg, nranks, local_rank).reader()
reader = LJSpeechLoader(cfg, nranks, local_rank, shuffle=True).reader()
if cfg.checkpoint_path is not None:
model_dict, opti_dict = load_checkpoint(str(cfg.transformer_step), os.path.join(cfg.checkpoint_path, "transformer"))
@ -78,26 +78,25 @@ def main(cfg):
for epoch in range(cfg.epochs):
pbar = tqdm(reader)
for i, data in enumerate(pbar):
pbar.set_description('Processing at epoch %d'%epoch)
character, mel, mel_input, pos_text, pos_mel, text_length = data
global_step += 1
mel_pred, postnet_pred, attn_probs, stop_preds, attn_enc, attn_dec = model(character, mel_input, pos_text, pos_mel)
label = np.zeros(stop_preds.shape).astype(np.float32)
text_length = text_length.numpy()
for i in range(label.shape[0]):
label[i][text_length[i] - 1] = 1
mel_loss = layers.mean(layers.abs(layers.elementwise_sub(mel_pred, mel)))
post_mel_loss = layers.mean(layers.abs(layers.elementwise_sub(postnet_pred, mel)))
stop_loss = cross_entropy(stop_preds, dg.to_variable(label))
loss = mel_loss + post_mel_loss + stop_loss
if local_rank==0:
writer.add_scalars('training_loss', {
'mel_loss':mel_loss.numpy(),
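
For the stop-token target built earlier in this loop, a plain-numpy sketch (shapes assumed for illustration): each row of the label is zero except for a single 1 at index text_length[i] - 1, and the total loss sums the two L1 mel losses with the stop cross-entropy.

import numpy as np

stop_preds_shape = (2, 8, 1)          # (batch, steps, 1), assumed for illustration
text_length = np.array([5, 8])
label = np.zeros(stop_preds_shape, dtype="float32")
for i in range(label.shape[0]):
    label[i][text_length[i] - 1] = 1  # mark the final step of each sequence
print(label[:, :, 0])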

View File

@ -5,6 +5,25 @@ import paddle
from paddle import fluid
import paddle.fluid.dygraph as dg
class Linear(dg.Layer):
def __init__(self, in_features, out_features, is_bias=True, dtype="float32"):
super(Linear, self).__init__()
self.in_features = in_features
self.out_features = out_features
self.dtype = dtype
self.weight = fluid.ParamAttr(initializer = fluid.initializer.XavierInitializer())
self.bias = is_bias
if is_bias is not False:
k = math.sqrt(1 / in_features)
self.bias = fluid.ParamAttr(initializer = fluid.initializer.Uniform(low=-k, high=k))
self.linear = dg.Linear(in_features, out_features, param_attr = self.weight,
bias_attr = self.bias,)
def forward(self, x):
x = self.linear(x)
return x
class Conv(dg.Layer):
def __init__(self, in_channels, out_channels, filter_size=1,
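
A short usage sketch of the wrapper added above (illustration only; it assumes the class is exported from parakeet.modules.layers and that math is imported at the top of that file): the weight uses a Xavier initializer and, unless is_bias is False, the bias is drawn uniformly from [-k, k] with k = sqrt(1 / in_features).

import numpy as np
import paddle.fluid.dygraph as dg
from parakeet.modules.layers import Linear

with dg.guard():
    proj = Linear(256, 256)                    # Xavier weight, uniform(-k, k) bias
    key = Linear(256, 4 * 64, is_bias=False)   # bias disabled, as in MultiheadAttention
    x = dg.to_variable(np.zeros((2, 10, 256), dtype="float32"))
    print(proj(x).shape, key(x).shape)         # [2, 10, 256] [2, 10, 256]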

View File

@ -2,6 +2,7 @@ import math
import numpy as np
import paddle.fluid.dygraph as dg
import paddle.fluid.layers as layers
from parakeet.modules.layers import Linear
class ScaledDotProductAttention(dg.Layer):
def __init__(self, d_key):
@ -34,10 +35,10 @@ class ScaledDotProductAttention(dg.Layer):
attention = attention * mask
mask = (mask == 0).astype(np.float32) * (-2 ** 32 + 1)
attention = attention + mask
attention = layers.softmax(attention)
attention = layers.dropout(attention, dropout)
# Mask query to ignore padding
if query_mask is not None:
attention = attention * query_mask
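
The masking shown here is additive: positions where the mask is 0 receive a very large negative bias, so the following softmax drives their weights to effectively zero. A plain-numpy sketch of that trick (not the repository's code):

import numpy as np

def softmax(x, axis=-1):
    e = np.exp(x - x.max(axis=axis, keepdims=True))
    return e / e.sum(axis=axis, keepdims=True)

scores = np.array([[1.0, 2.0, 3.0]], dtype=np.float32)
mask = np.array([[1.0, 1.0, 0.0]], dtype=np.float32)  # last key is padding
scores = scores * mask
scores = scores + (mask == 0).astype(np.float32) * (-2 ** 32 + 1)
print(softmax(scores))  # third weight is effectively zero
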
@ -54,13 +55,13 @@ class MultiheadAttention(dg.Layer):
self.d_q = d_q
self.dropout = dropout
self.key = dg.Linear(num_hidden, num_head * d_k)
self.value = dg.Linear(num_hidden, num_head * d_k)
self.query = dg.Linear(num_hidden, num_head * d_q)
self.key = Linear(num_hidden, num_head * d_k, is_bias=False)
self.value = Linear(num_hidden, num_head * d_k, is_bias=False)
self.query = Linear(num_hidden, num_head * d_q, is_bias=False)
self.scal_attn = ScaledDotProductAttention(d_k)
self.fc = dg.Linear(num_head * d_q, num_hidden)
self.fc = Linear(num_head * d_q * 2, num_hidden)
self.layer_norm = dg.LayerNorm(num_hidden)
@ -105,6 +106,7 @@ class MultiheadAttention(dg.Layer):
result = layers.reshape(result, [self.num_head, batch_size, seq_len_query, self.d_q])
result = layers.reshape(layers.transpose(result, [1,2,0,3]),[batch_size, seq_len_query, -1])
result = layers.concat([query_input,result], axis=-1)
result = layers.dropout(self.fc(result), self.dropout)
result = result + query_input
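
The widened fc above follows from the concatenation in this hunk: the reshaped multi-head output is concatenated with the original query along the channel axis before the final projection, so fc must accept num_head * d_q * 2 channels. A small shape check with plain numpy:

import numpy as np

batch, seq_len, num_head, d_q = 2, 10, 4, 64
num_hidden = num_head * d_q                            # 256
query_input = np.zeros((batch, seq_len, num_hidden), dtype="float32")
result = np.zeros((batch, seq_len, num_head * d_q), dtype="float32")
concat = np.concatenate([query_input, result], axis=-1)
print(concat.shape)  # (2, 10, 512) -> hence Linear(num_head * d_q * 2, num_hidden)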

View File

@ -16,6 +16,7 @@ class PostConvNet(dg.Layer):
super(PostConvNet, self).__init__()
self.dropout = dropout
self.num_conv = num_conv
self.conv_list = []
self.conv_list.append(Conv(in_channels = n_mels * outputs_per_step,
out_channels = num_hidden,
@ -43,17 +44,9 @@ class PostConvNet(dg.Layer):
self.add_sublayer("conv_list_{}".format(i), layer)
self.batch_norm_list = [dg.BatchNorm(num_hidden,
param_attr = fluid.ParamAttr(name='weight'),
bias_attr = fluid.ParamAttr(name='bias'),
moving_mean_name = 'moving_mean',
moving_variance_name = 'moving_var',
data_layout='NCHW') for _ in range(num_conv-1)]
self.batch_norm_list.append(dg.BatchNorm(n_mels * outputs_per_step,
param_attr = fluid.ParamAttr(name='weight'),
bias_attr = fluid.ParamAttr(name='bias'),
moving_mean_name = 'moving_mean',
moving_variance_name = 'moving_var',
data_layout='NCHW'))
#self.batch_norm_list.append(dg.BatchNorm(n_mels * outputs_per_step,
# data_layout='NCHW'))
for i, layer in enumerate(self.batch_norm_list):
self.add_sublayer("batch_norm_list_{}".format(i), layer)
@ -67,9 +60,15 @@ class PostConvNet(dg.Layer):
Returns:
output (Variable), Shape(B, T, C), the result after postconvnet.
"""
input = layers.transpose(input, [0,2,1])
len = input.shape[-1]
for batch_norm, conv in zip(self.batch_norm_list, self.conv_list):
for i in range(self.num_conv-1):
batch_norm = self.batch_norm_list[i]
conv = self.conv_list[i]
input = layers.dropout(layers.tanh(batch_norm(conv(input)[:,:,:len])), self.dropout)
conv = self.conv_list[self.num_conv-1]
input = conv(input)[:,:,:len]
output = layers.transpose(input, [0,2,1])
return output
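
The reworked forward runs the first num_conv - 1 convolutions through batch norm, tanh and dropout, leaves the final convolution linear (matching the batch-norm list that now stops one entry short), and trims every output back to the original time length. A structural sketch in plain Python, with dropout omitted and identity stand-ins for the sublayers:

import numpy as np

def postconvnet_forward(x, conv_list, batch_norm_list, num_conv):
    t = x.shape[-1]                              # original time length; x is (B, C, T)
    for i in range(num_conv - 1):
        x = np.tanh(batch_norm_list[i](conv_list[i](x)[:, :, :t]))
    return conv_list[num_conv - 1](x)[:, :, :t]  # final conv: no batch norm, no tanh

identity = lambda v: v                           # toy stand-ins for Conv / BatchNorm
out = postconvnet_forward(np.zeros((2, 80, 50), dtype="float32"),
                          [identity] * 5, [identity] * 4, num_conv=5)
print(out.shape)  # (2, 80, 50)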

View File

@ -1,5 +1,6 @@
import paddle.fluid.dygraph as dg
import paddle.fluid.layers as layers
from parakeet.modules.layers import Linear
class PreNet(dg.Layer):
def __init__(self, input_size, hidden_size, output_size, dropout_rate=0.2):
@ -14,8 +15,8 @@ class PreNet(dg.Layer):
self.output_size = output_size
self.dropout_rate = dropout_rate
self.linear1 = dg.Linear(input_size, hidden_size)
self.linear2 = dg.Linear(hidden_size, output_size)
self.linear1 = Linear(input_size, hidden_size)
self.linear2 = Linear(hidden_size, output_size)
def forward(self, x):
"""