update fastspeech

lifuchen 2020-01-15 06:10:27 +00:00 committed by chenfeiyu
parent ab0fe8f304
commit f009411b57
13 changed files with 58 additions and 66 deletions

View File

@@ -14,7 +14,6 @@ encoder_n_layer: 6
 encoder_head: 2
 encoder_conv1d_filter_size: 1536
 max_sep_len: 2048
-fs_embedding_size: 384
 decoder_n_layer: 6
 decoder_head: 2
 decoder_conv1d_filter_size: 1536
@@ -39,6 +38,6 @@ use_gpu: True
 use_data_parallel: False
 data_path: ../../../dataset/LJSpeech-1.1
-transtts_path: ../transformerTTS/checkpoint
-transformer_step: 1
+transtts_path: ../transformerTTS/checkpoint/
+transformer_step: 10
 log_dir: ./log
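
Note: these two keys feed the checkpoint lookup in train.py (see the hunk further down). A minimal sketch of how they combine into a load prefix, assuming the directory layout the training script uses:

    import os

    transtts_path = "../transformerTTS/checkpoint/"   # from this config
    transformer_step = 10                             # from this config

    # train.py joins the fixed "transformer" subdirectory, then the step number;
    # fluid.dygraph.load_dygraph takes this prefix and appends its own suffixes.
    prefix = os.path.join(transtts_path, "transformer", str(transformer_step))
    print(prefix)  # ../transformerTTS/checkpoint/transformer/10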

View File

@@ -4,7 +4,7 @@ import utils
 import paddle.fluid.dygraph as dg
 import paddle.fluid.layers as layers
 import paddle.fluid as fluid
-from parakeet.modules.layers import Conv1D
+from parakeet.modules.layers import Conv, Linear
 from parakeet.modules.multihead_attention import MultiheadAttention
 from parakeet.modules.feed_forward import PositionwiseFeedForward
@@ -113,12 +113,12 @@ class DurationPredictor(dg.Layer):
         self.filter_size = filter_size
         self.dropout = dropout
-        self.conv1 = Conv1D(in_channels = self.input_size,
+        self.conv1 = Conv(in_channels = self.input_size,
                           out_channels = self.out_channels,
                           filter_size = self.filter_size,
                           padding=1,
                           data_format='NTC')
-        self.conv2 = Conv1D(in_channels = self.out_channels,
+        self.conv2 = Conv(in_channels = self.out_channels,
                           out_channels = self.out_channels,
                           filter_size = self.filter_size,
                           padding=1,
@@ -126,7 +126,7 @@ class DurationPredictor(dg.Layer):
         self.layer_norm1 = dg.LayerNorm(self.out_channels)
         self.layer_norm2 = dg.LayerNorm(self.out_channels)
-        self.linear = dg.Linear(self.out_channels, 1)
+        self.linear = Linear(self.out_channels, 1)

     def forward(self, encoder_output):
         """

View File

@@ -5,12 +5,12 @@ import paddle.fluid as fluid
 from parakeet.g2p.text.symbols import symbols
 from parakeet.modules.utils import *
 from parakeet.modules.post_convnet import PostConvNet
+from parakeet.modules.layers import Linear

 class Encoder(dg.Layer):
     def __init__(self,
                  n_src_vocab,
                  len_max_seq,
-                 d_word_vec,
                  n_layers,
                  n_head,
                  d_k,
@@ -23,9 +23,9 @@ class Encoder(dg.Layer):
         super(Encoder, self).__init__()
         n_position = len_max_seq + 1

-        self.src_word_emb = dg.Embedding(size=[n_src_vocab, d_word_vec], padding_idx=0)
-        self.pos_inp = get_sinusoid_encoding_table(n_position, d_word_vec, padding_idx=0)
-        self.position_enc = dg.Embedding(size=[n_position, d_word_vec],
+        self.src_word_emb = dg.Embedding(size=[n_src_vocab, d_model], padding_idx=0)
+        self.pos_inp = get_sinusoid_encoding_table(n_position, d_model, padding_idx=0)
+        self.position_enc = dg.Embedding(size=[n_position, d_model],
                                          padding_idx=0,
                                          param_attr=fluid.ParamAttr(
                                              initializer=fluid.initializer.NumpyArrayInitializer(self.pos_inp),
@@ -70,7 +70,6 @@ class Encoder(dg.Layer):
 class Decoder(dg.Layer):
     def __init__(self,
                  len_max_seq,
-                 d_word_vec,
                  n_layers,
                  n_head,
                  d_k,
@@ -83,8 +82,8 @@ class Decoder(dg.Layer):
         super(Decoder, self).__init__()
         n_position = len_max_seq + 1

-        self.pos_inp = get_sinusoid_encoding_table(n_position, d_word_vec, padding_idx=0)
-        self.position_enc = dg.Embedding(size=[n_position, d_word_vec],
+        self.pos_inp = get_sinusoid_encoding_table(n_position, d_model, padding_idx=0)
+        self.position_enc = dg.Embedding(size=[n_position, d_model],
                                          padding_idx=0,
                                          param_attr=fluid.ParamAttr(
                                              initializer=fluid.initializer.NumpyArrayInitializer(self.pos_inp),
@@ -131,11 +130,10 @@ class FastSpeech(dg.Layer):
         self.encoder = Encoder(n_src_vocab=len(symbols)+1,
                                len_max_seq=cfg.max_sep_len,
-                               d_word_vec=cfg.fs_embedding_size,
                                n_layers=cfg.encoder_n_layer,
                                n_head=cfg.encoder_head,
-                               d_k=64,
-                               d_v=64,
+                               d_k=cfg.fs_hidden_size // cfg.encoder_head,
+                               d_v=cfg.fs_hidden_size // cfg.encoder_head,
                                d_model=cfg.fs_hidden_size,
                                d_inner=cfg.encoder_conv1d_filter_size,
                                fft_conv1d_kernel=cfg.fft_conv1d_filter,
@@ -146,17 +144,16 @@ class FastSpeech(dg.Layer):
                                                    filter_size=cfg.duration_predictor_filter_size,
                                                    dropout=cfg.dropout)
         self.decoder = Decoder(len_max_seq=cfg.max_sep_len,
-                               d_word_vec=cfg.fs_embedding_size,
                                n_layers=cfg.decoder_n_layer,
                                n_head=cfg.decoder_head,
-                               d_k=64,
-                               d_v=64,
+                               d_k=cfg.fs_hidden_size // cfg.decoder_head,
+                               d_v=cfg.fs_hidden_size // cfg.decoder_head,
                                d_model=cfg.fs_hidden_size,
                                d_inner=cfg.decoder_conv1d_filter_size,
                                fft_conv1d_kernel=cfg.fft_conv1d_filter,
                                fft_conv1d_padding=cfg.fft_conv1d_padding,
                                dropout=0.1)
-        self.mel_linear = dg.Linear(cfg.fs_hidden_size, cfg.audio.num_mels * cfg.audio.outputs_per_step)
+        self.mel_linear = Linear(cfg.fs_hidden_size, cfg.audio.num_mels * cfg.audio.outputs_per_step)
         self.postnet = PostConvNet(n_mels=cfg.audio.num_mels,
                                    num_hidden=512,
                                    filter_size=5,
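
Note: with d_word_vec gone, token embeddings and positional encodings share d_model = cfg.fs_hidden_size, and the per-head key/value widths are derived from it instead of being pinned at 64. Assuming fs_hidden_size is 384 (the value of the removed fs_embedding_size) and encoder_head is 2 as in the config above:

    fs_hidden_size = 384   # assumption: matches the removed fs_embedding_size
    encoder_head = 2       # from the config above

    d_k = fs_hidden_size // encoder_head  # 192 per head, was hardcoded 64
    d_v = fs_hidden_size // encoder_head  # 192 per head, was hardcoded 64
    assert d_k * encoder_head == fs_hidden_size  # heads tile the model width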

View File

@@ -22,8 +22,6 @@ def add_config_options_to_parser(parser):
     parser.add_argument('--audio.outputs_per_step', type=int, default=1,
         help="the outputs per step.")
-    parser.add_argument('--fs_embedding_size', type=int, default=256,
-        help="the dim size of embedding of fastspeech.")
     parser.add_argument('--encoder_n_layer', type=int, default=6,
         help="the number of FFT Block in encoder.")
     parser.add_argument('--encoder_head', type=int, default=2,

View File

@@ -55,14 +55,13 @@ def main(cfg):
     writer = SummaryWriter(path) if local_rank == 0 else None

     with dg.guard(place):
-        transformerTTS = TransformerTTS(cfg)
-        model_path = os.path.join(cfg.transtts_path, "transformer")
-        model_dict, _ = fluid.dygraph.load_dygraph(os.path.join(model_path, str(cfg.transformer_step)))
-        #for param in transformerTTS.state_dict():
-        #    print(param)
-        transformerTTS.set_dict(model_dict)
-        transformerTTS.eval()
+        with fluid.unique_name.guard():
+            transformerTTS = TransformerTTS(cfg)
+            model_path = os.path.join(cfg.transtts_path, "transformer")
+            model_dict, _ = fluid.dygraph.load_dygraph(os.path.join(model_path, str(cfg.transformer_step)))
+            transformerTTS.set_dict(model_dict)
+            transformerTTS.eval()

         model = FastSpeech(cfg)
         model.train()
@@ -89,7 +88,6 @@ def main(cfg):
             _, _, attn_probs, _, _, _ = transformerTTS(character, mel_input, pos_text, pos_mel)
             alignment = dg.to_variable(get_alignment(attn_probs, cfg.transformer_head)).astype(np.float32)
-
             global_step += 1

             #Forward
@@ -104,8 +102,7 @@ def main(cfg):
             total_loss = mel_loss + mel_postnet_loss + duration_loss
-
             if local_rank==0:
-                print('epoch:{}, step:{}, mel_loss:{}, mel_postnet_loss:{}, duration_loss:{}'.format(epoch, global_step, mel_loss.numpy(), mel_postnet_loss.numpy(), duration_loss.numpy()))
+                #print('epoch:{}, step:{}, mel_loss:{}, mel_postnet_loss:{}, duration_loss:{}'.format(epoch, global_step, mel_loss.numpy(), mel_postnet_loss.numpy(), duration_loss.numpy()))
                 writer.add_scalar('mel_loss', mel_loss.numpy(), global_step)
                 writer.add_scalar('post_mel_loss', mel_postnet_loss.numpy(), global_step)
                 writer.add_scalar('duration_loss', duration_loss.numpy(), global_step)
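
Note on the new fluid.unique_name.guard() wrapper: dygraph names parameters from a global counter (linear_0.w_0, ...), and set_dict matches checkpoint entries by those names, so building a second network in the same program would shift its auto-generated names away from the ones in a checkpoint saved standalone. A minimal illustration of the pattern, with toy layers rather than the real models:

    import paddle.fluid as fluid
    import paddle.fluid.dygraph as dg

    with dg.guard(fluid.CPUPlace()):
        with fluid.unique_name.guard():  # naming counter restarts here
            first = dg.Linear(4, 4)      # gets linear_0.* parameter names
        with fluid.unique_name.guard():  # and restarts again here
            second = dg.Linear(4, 4)     # also linear_0.*, so it matches a
                                         # checkpoint saved from a fresh run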

View File

@@ -11,8 +11,8 @@ audio:
   outputs_per_step: 1

 max_len: 50
-transformer_step: 1
-postnet_step: 1
+transformer_step: 10
+postnet_step: 10
 use_gpu: True
 checkpoint_path: ./checkpoint

View File

@@ -18,9 +18,9 @@ grad_clip_thresh: 1.0
 batch_size: 32
 epochs: 10000
 lr: 0.001
-save_step: 500
+save_step: 10
 use_gpu: True
-use_data_parallel: True
+use_data_parallel: False
 data_path: ../../../dataset/LJSpeech-1.1
 save_path: ./checkpoint

View File

@@ -35,7 +35,7 @@ class EncoderPrenet(dg.Layer):
             self.add_sublayer("conv_list_{}".format(i), layer)

         self.batch_norm_list = [dg.BatchNorm(num_hidden,
-                                             data_layout='NCHW', epsilon=1e-30) for _ in range(3)]
+                                             data_layout='NCHW') for _ in range(3)]
         for i, layer in enumerate(self.batch_norm_list):
             self.add_sublayer("batch_norm_list_{}".format(i), layer)
@@ -57,6 +57,7 @@ class CBHG(dg.Layer):
         super(CBHG, self).__init__()
         """
         :param hidden_size: dimension of hidden unit
+        :param batch_size: batch size
         :param K: # of convolution banks
         :param projection_size: dimension of projection unit
         :param num_gru_layers: # of layers of GRUcell

View File

@@ -10,7 +10,7 @@ from parakeet.modules.post_convnet import PostConvNet

 class Encoder(dg.Layer):
-    def __init__(self, embedding_size, num_hidden, config):
+    def __init__(self, embedding_size, num_hidden, config, num_head=4):
         super(Encoder, self).__init__()
         self.num_hidden = num_hidden
         param = fluid.ParamAttr(initializer=fluid.initializer.Constant(value=1.0))
@@ -24,10 +24,10 @@ class Encoder(dg.Layer):
         self.encoder_prenet = EncoderPrenet(embedding_size = embedding_size,
                                             num_hidden = num_hidden,
                                             use_cudnn=config.use_gpu)
-        self.layers = [MultiheadAttention(num_hidden, num_hidden//4, num_hidden//4) for _ in range(3)]
+        self.layers = [MultiheadAttention(num_hidden, num_hidden//num_head, num_hidden//num_head) for _ in range(3)]
         for i, layer in enumerate(self.layers):
             self.add_sublayer("self_attn_{}".format(i), layer)
-        self.ffns = [PositionwiseFeedForward(num_hidden, num_hidden*4, filter_size=1, use_cudnn = config.use_gpu) for _ in range(3)]
+        self.ffns = [PositionwiseFeedForward(num_hidden, num_hidden*num_head, filter_size=1, use_cudnn = config.use_gpu) for _ in range(3)]
         for i, layer in enumerate(self.ffns):
             self.add_sublayer("ffns_{}".format(i), layer)
@@ -61,7 +61,7 @@ class Encoder(dg.Layer):
         return x, query_mask, attentions

 class Decoder(dg.Layer):
-    def __init__(self, num_hidden, config):
+    def __init__(self, num_hidden, config, num_head=4):
         super(Decoder, self).__init__()
         self.num_hidden = num_hidden
         param = fluid.ParamAttr()
@@ -79,13 +79,13 @@ class Decoder(dg.Layer):
                                  dropout_rate=0.2)
         self.linear = Linear(num_hidden, num_hidden)

-        self.selfattn_layers = [MultiheadAttention(num_hidden, num_hidden//4, num_hidden//4) for _ in range(3)]
+        self.selfattn_layers = [MultiheadAttention(num_hidden, num_hidden//num_head, num_hidden//num_head) for _ in range(3)]
         for i, layer in enumerate(self.selfattn_layers):
             self.add_sublayer("self_attn_{}".format(i), layer)
-        self.attn_layers = [MultiheadAttention(num_hidden, num_hidden//4, num_hidden//4) for _ in range(3)]
+        self.attn_layers = [MultiheadAttention(num_hidden, num_hidden//num_head, num_hidden//num_head) for _ in range(3)]
         for i, layer in enumerate(self.attn_layers):
             self.add_sublayer("attn_{}".format(i), layer)
-        self.ffns = [PositionwiseFeedForward(num_hidden, num_hidden*4, filter_size=1) for _ in range(3)]
+        self.ffns = [PositionwiseFeedForward(num_hidden, num_hidden*num_head, filter_size=1) for _ in range(3)]
         for i, layer in enumerate(self.ffns):
             self.add_sublayer("ffns_{}".format(i), layer)
         self.mel_linear = Linear(num_hidden, config.audio.num_mels * config.audio.outputs_per_step)
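
Note: with the default num_head=4 these expressions reproduce the old constants, but the FFN inner size is now coupled to the head count even though the two are conceptually independent hyperparameters. For example, taking num_hidden = 256 (illustrative only):

    num_hidden, num_head = 256, 4
    d_k = num_hidden // num_head      # 64, the previously hardcoded head width
    d_inner = num_hidden * num_head   # 1024, formerly num_hidden * 4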

View File

@@ -28,12 +28,15 @@ def synthesis(text_input, cfg):
     writer = SummaryWriter(path)

     with dg.guard(place):
-        model = TransformerTTS(cfg)
-        model_postnet = ModelPostNet(cfg)
-        model.set_dict(load_checkpoint(str(cfg.transformer_step), os.path.join(cfg.checkpoint_path, "transformer")))
-        model_postnet.set_dict(load_checkpoint(str(cfg.postnet_step), os.path.join(cfg.checkpoint_path, "postnet")))
+        with fluid.unique_name.guard():
+            model = TransformerTTS(cfg)
+            model.set_dict(load_checkpoint(str(cfg.transformer_step), os.path.join(cfg.checkpoint_path, "transformer")))
+            model.eval()
+
+        with fluid.unique_name.guard():
+            model_postnet = ModelPostNet(cfg)
+            model_postnet.set_dict(load_checkpoint(str(cfg.postnet_step), os.path.join(cfg.checkpoint_path, "postnet")))
+            model_postnet.eval()

         # init input
         text = np.asarray(text_to_sequence(text_input))
         text = fluid.layers.unsqueeze(dg.to_variable(text),[0])
@@ -42,9 +45,6 @@ def synthesis(text_input, cfg):
         pos_text = fluid.layers.unsqueeze(dg.to_variable(pos_text),[0])

-        model.eval()
-        model_postnet.eval()
-
         pbar = tqdm(range(cfg.max_len))
         for i in pbar:
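
For reference, load_checkpoint is defined elsewhere in the repo; a hypothetical sketch of what it does, mirroring the load_dygraph call in train.py above (name handling and body are assumptions):

    def load_checkpoint(step, model_path):
        # Hypothetical helper: the step string is the checkpoint file prefix
        # inside model_path, exactly as fluid.dygraph.load_dygraph expects.
        model_dict, _ = fluid.dygraph.load_dygraph(os.path.join(model_path, step))
        return model_dict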

View File

@@ -86,17 +86,17 @@ def main(cfg):
             mel_pred, postnet_pred, attn_probs, stop_preds, attn_enc, attn_dec = model(character, mel_input, pos_text, pos_mel)

-            label = np.zeros(stop_preds.shape).astype(np.float32)
-            text_length = text_length.numpy()
-            for i in range(label.shape[0]):
-                label[i][text_length[i] - 1] = 1
+            label = (pos_mel == 0).astype(np.float32)
+            #label = np.zeros(stop_preds.shape).astype(np.float32)
+            #text_length = text_length.numpy()
+            #for i in range(label.shape[0]):
+            #    label[i][text_length[i] - 1] = 1

             mel_loss = layers.mean(layers.abs(layers.elementwise_sub(mel_pred, mel)))
             post_mel_loss = layers.mean(layers.abs(layers.elementwise_sub(postnet_pred, mel)))
-            stop_loss = cross_entropy(stop_preds, dg.to_variable(label))
+            stop_loss = cross_entropy(stop_preds, label)
             loss = mel_loss + post_mel_loss + stop_loss
             if local_rank==0:
                 writer.add_scalars('training_loss', {
                     'mel_loss':mel_loss.numpy(),
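
Note: pos_mel holds 1-based mel-frame positions with 0 marking padding, so the new one-liner labels every padded frame as "stop", replacing the old loop that flagged a single mel frame indexed by the text length (which looks like the bug this change fixes). A toy batch:

    import numpy as np

    pos_mel = np.array([[1, 2, 3, 0, 0],    # utterance padded to 5 frames
                        [1, 2, 3, 4, 5]])   # full-length utterance
    label = (pos_mel == 0).astype(np.float32)
    # [[0. 0. 0. 1. 1.]
    #  [0. 0. 0. 0. 0.]]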

View File

@@ -105,7 +105,6 @@ class MultiheadAttention(dg.Layer):
         # concat all multihead result
         result = layers.reshape(result, [self.num_head, batch_size, seq_len_query, self.d_q])
         result = layers.reshape(layers.transpose(result, [1,2,0,3]),[batch_size, seq_len_query, -1])
-
         result = layers.concat([query_input,result], axis=-1)
         result = layers.dropout(self.fc(result), self.dropout)
         result = result + query_input
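
For readers following the shapes: the two reshapes merge the per-head outputs back into one (batch, time, num_head * d_q) tensor before the output projection. The same arithmetic in numpy, with illustrative sizes:

    import numpy as np

    num_head, batch, t_q, d_q = 4, 2, 5, 64
    result = np.zeros([num_head * batch, t_q, d_q])       # stacked head outputs
    result = result.reshape([num_head, batch, t_q, d_q])  # split out the head axis
    result = result.transpose([1, 2, 0, 3]).reshape([batch, t_q, num_head * d_q])
    assert result.shape == (2, 5, 256)  # ready for the concat + fc above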

View File

@@ -65,9 +65,10 @@ def guided_attention(N, T, g=0.2):
     return W

-def cross_entropy(input, label, position_weight=5.0, epsilon=0.0001):
-    input = -1 * label * layers.log(input + epsilon) - (1-label) * layers.log(1 - input + epsilon)
-    label = input * (label * (position_weight - 1) + 1)
-    return layers.reduce_sum(label, dim=[0, 1])
+def cross_entropy(input, label, position_weight=1.0, epsilon=1e-30):
+    output = -1 * label * layers.log(input + epsilon) - (1-label) * layers.log(1 - input + epsilon)
+    output = output * (label * (position_weight - 1) + 1)
+
+    return layers.reduce_sum(output, dim=[0, 1])
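
The rewrite computes the same position-weighted binary cross-entropy, summed over batch and time, but stops clobbering input and label mid-computation; the functional changes are the defaults (position_weight 5.0 -> 1.0, epsilon 1e-4 -> 1e-30). A numpy check of the formula:

    import numpy as np

    def cross_entropy_np(inp, label, position_weight=1.0, epsilon=1e-30):
        out = -label * np.log(inp + epsilon) - (1 - label) * np.log(1 - inp + epsilon)
        out = out * (label * (position_weight - 1) + 1)  # scale positives only
        return out.sum(axis=(0, 1))

    inp = np.array([[[0.9], [0.1]]])    # predicted stop probabilities
    label = np.array([[[1.0], [0.0]]])  # ground-truth stop flags
    print(cross_entropy_np(inp, label)) # ~0.2107, i.e. -log(0.9) - log(0.9)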