diff --git a/parakeet/models/fastspeech/config/fastspeech.yaml b/parakeet/models/fastspeech/config/fastspeech.yaml index 37fac16..2a58569 100644 --- a/parakeet/models/fastspeech/config/fastspeech.yaml +++ b/parakeet/models/fastspeech/config/fastspeech.yaml @@ -14,7 +14,6 @@ encoder_n_layer: 6 encoder_head: 2 encoder_conv1d_filter_size: 1536 max_sep_len: 2048 -fs_embedding_size: 384 decoder_n_layer: 6 decoder_head: 2 decoder_conv1d_filter_size: 1536 @@ -39,6 +38,6 @@ use_gpu: True use_data_parallel: False data_path: ../../../dataset/LJSpeech-1.1 -transtts_path: ../transformerTTS/checkpoint -transformer_step: 1 +transtts_path: ../transformerTTS/checkpoint/ +transformer_step: 10 log_dir: ./log \ No newline at end of file diff --git a/parakeet/models/fastspeech/modules.py b/parakeet/models/fastspeech/modules.py index 137b2d5..633b2bb 100644 --- a/parakeet/models/fastspeech/modules.py +++ b/parakeet/models/fastspeech/modules.py @@ -4,7 +4,7 @@ import utils import paddle.fluid.dygraph as dg import paddle.fluid.layers as layers import paddle.fluid as fluid -from parakeet.modules.layers import Conv1D +from parakeet.modules.layers import Conv, Linear from parakeet.modules.multihead_attention import MultiheadAttention from parakeet.modules.feed_forward import PositionwiseFeedForward @@ -113,12 +113,12 @@ class DurationPredictor(dg.Layer): self.filter_size = filter_size self.dropout = dropout - self.conv1 = Conv1D(in_channels = self.input_size, + self.conv1 = Conv(in_channels = self.input_size, out_channels = self.out_channels, filter_size = self.filter_size, padding=1, data_format='NTC') - self.conv2 = Conv1D(in_channels = self.out_channels, + self.conv2 = Conv(in_channels = self.out_channels, out_channels = self.out_channels, filter_size = self.filter_size, padding=1, @@ -126,7 +126,7 @@ class DurationPredictor(dg.Layer): self.layer_norm1 = dg.LayerNorm(self.out_channels) self.layer_norm2 = dg.LayerNorm(self.out_channels) - self.linear =dg.Linear(self.out_channels, 1) + self.linear =Linear(self.out_channels, 1) def forward(self, encoder_output): """ diff --git a/parakeet/models/fastspeech/network.py b/parakeet/models/fastspeech/network.py index b2d6ca0..dcb76c1 100644 --- a/parakeet/models/fastspeech/network.py +++ b/parakeet/models/fastspeech/network.py @@ -5,12 +5,12 @@ import paddle.fluid as fluid from parakeet.g2p.text.symbols import symbols from parakeet.modules.utils import * from parakeet.modules.post_convnet import PostConvNet +from parakeet.modules.layers import Linear class Encoder(dg.Layer): def __init__(self, n_src_vocab, len_max_seq, - d_word_vec, n_layers, n_head, d_k, @@ -23,9 +23,9 @@ class Encoder(dg.Layer): super(Encoder, self).__init__() n_position = len_max_seq + 1 - self.src_word_emb = dg.Embedding(size=[n_src_vocab, d_word_vec], padding_idx=0) - self.pos_inp = get_sinusoid_encoding_table(n_position, d_word_vec, padding_idx=0) - self.position_enc = dg.Embedding(size=[n_position, d_word_vec], + self.src_word_emb = dg.Embedding(size=[n_src_vocab, d_model], padding_idx=0) + self.pos_inp = get_sinusoid_encoding_table(n_position, d_model, padding_idx=0) + self.position_enc = dg.Embedding(size=[n_position, d_model], padding_idx=0, param_attr=fluid.ParamAttr( initializer=fluid.initializer.NumpyArrayInitializer(self.pos_inp), @@ -70,7 +70,6 @@ class Encoder(dg.Layer): class Decoder(dg.Layer): def __init__(self, len_max_seq, - d_word_vec, n_layers, n_head, d_k, @@ -83,8 +82,8 @@ class Decoder(dg.Layer): super(Decoder, self).__init__() n_position = len_max_seq + 1 - self.pos_inp = get_sinusoid_encoding_table(n_position, d_word_vec, padding_idx=0) - self.position_enc = dg.Embedding(size=[n_position, d_word_vec], + self.pos_inp = get_sinusoid_encoding_table(n_position, d_model, padding_idx=0) + self.position_enc = dg.Embedding(size=[n_position, d_model], padding_idx=0, param_attr=fluid.ParamAttr( initializer=fluid.initializer.NumpyArrayInitializer(self.pos_inp), @@ -131,11 +130,10 @@ class FastSpeech(dg.Layer): self.encoder = Encoder(n_src_vocab=len(symbols)+1, len_max_seq=cfg.max_sep_len, - d_word_vec=cfg.fs_embedding_size, n_layers=cfg.encoder_n_layer, n_head=cfg.encoder_head, - d_k=64, - d_v=64, + d_k=cfg.fs_hidden_size // cfg.encoder_head, + d_v=cfg.fs_hidden_size // cfg.encoder_head, d_model=cfg.fs_hidden_size, d_inner=cfg.encoder_conv1d_filter_size, fft_conv1d_kernel=cfg.fft_conv1d_filter, @@ -146,17 +144,16 @@ class FastSpeech(dg.Layer): filter_size=cfg.duration_predictor_filter_size, dropout=cfg.dropout) self.decoder = Decoder(len_max_seq=cfg.max_sep_len, - d_word_vec=cfg.fs_embedding_size, n_layers=cfg.decoder_n_layer, n_head=cfg.decoder_head, - d_k=64, - d_v=64, + d_k=cfg.fs_hidden_size // cfg.decoder_head, + d_v=cfg.fs_hidden_size // cfg.decoder_head, d_model=cfg.fs_hidden_size, d_inner=cfg.decoder_conv1d_filter_size, fft_conv1d_kernel=cfg.fft_conv1d_filter, fft_conv1d_padding=cfg.fft_conv1d_padding, dropout=0.1) - self.mel_linear = dg.Linear(cfg.fs_hidden_size, cfg.audio.num_mels * cfg.audio.outputs_per_step) + self.mel_linear = Linear(cfg.fs_hidden_size, cfg.audio.num_mels * cfg.audio.outputs_per_step) self.postnet = PostConvNet(n_mels=cfg.audio.num_mels, num_hidden=512, filter_size=5, diff --git a/parakeet/models/fastspeech/parse.py b/parakeet/models/fastspeech/parse.py index 65e1eb3..b43a8af 100644 --- a/parakeet/models/fastspeech/parse.py +++ b/parakeet/models/fastspeech/parse.py @@ -22,8 +22,6 @@ def add_config_options_to_parser(parser): parser.add_argument('--audio.outputs_per_step', type=int, default=1, help="the outputs per step.") - parser.add_argument('--fs_embedding_size', type=int, default=256, - help="the dim size of embedding of fastspeech.") parser.add_argument('--encoder_n_layer', type=int, default=6, help="the number of FFT Block in encoder.") parser.add_argument('--encoder_head', type=int, default=2, diff --git a/parakeet/models/fastspeech/train.py b/parakeet/models/fastspeech/train.py index 243631c..84b467a 100644 --- a/parakeet/models/fastspeech/train.py +++ b/parakeet/models/fastspeech/train.py @@ -55,14 +55,13 @@ def main(cfg): writer = SummaryWriter(path) if local_rank == 0 else None with dg.guard(place): - transformerTTS = TransformerTTS(cfg) - model_path = os.path.join(cfg.transtts_path, "transformer") - model_dict, _ = fluid.dygraph.load_dygraph(os.path.join(model_path, str(cfg.transformer_step))) - #for param in transformerTTS.state_dict(): - # print(param) - - transformerTTS.set_dict(model_dict) - transformerTTS.eval() + with fluid.unique_name.guard(): + transformerTTS = TransformerTTS(cfg) + model_path = os.path.join(cfg.transtts_path, "transformer") + model_dict, _ = fluid.dygraph.load_dygraph(os.path.join(model_path, str(cfg.transformer_step))) + + transformerTTS.set_dict(model_dict) + transformerTTS.eval() model = FastSpeech(cfg) model.train() @@ -89,7 +88,6 @@ def main(cfg): _, _, attn_probs, _, _, _ = transformerTTS(character, mel_input, pos_text, pos_mel) alignment = dg.to_variable(get_alignment(attn_probs, cfg.transformer_head)).astype(np.float32) - global_step += 1 #Forward @@ -104,8 +102,7 @@ def main(cfg): total_loss = mel_loss + mel_postnet_loss + duration_loss if local_rank==0: - print('epoch:{}, step:{}, mel_loss:{}, mel_postnet_loss:{}, duration_loss:{}'.format(epoch, global_step, mel_loss.numpy(), mel_postnet_loss.numpy(), duration_loss.numpy())) - + #print('epoch:{}, step:{}, mel_loss:{}, mel_postnet_loss:{}, duration_loss:{}'.format(epoch, global_step, mel_loss.numpy(), mel_postnet_loss.numpy(), duration_loss.numpy())) writer.add_scalar('mel_loss', mel_loss.numpy(), global_step) writer.add_scalar('post_mel_loss', mel_postnet_loss.numpy(), global_step) writer.add_scalar('duration_loss', duration_loss.numpy(), global_step) diff --git a/parakeet/models/transformerTTS/config/synthesis.yaml b/parakeet/models/transformerTTS/config/synthesis.yaml index c3c3f8c..413e816 100644 --- a/parakeet/models/transformerTTS/config/synthesis.yaml +++ b/parakeet/models/transformerTTS/config/synthesis.yaml @@ -11,8 +11,8 @@ audio: outputs_per_step: 1 max_len: 50 -transformer_step: 1 -postnet_step: 1 +transformer_step: 10 +postnet_step: 10 use_gpu: True checkpoint_path: ./checkpoint diff --git a/parakeet/models/transformerTTS/config/train_postnet.yaml b/parakeet/models/transformerTTS/config/train_postnet.yaml index 091758f..c9bd487 100644 --- a/parakeet/models/transformerTTS/config/train_postnet.yaml +++ b/parakeet/models/transformerTTS/config/train_postnet.yaml @@ -18,9 +18,9 @@ grad_clip_thresh: 1.0 batch_size: 32 epochs: 10000 lr: 0.001 -save_step: 500 +save_step: 10 use_gpu: True -use_data_parallel: True +use_data_parallel: False data_path: ../../../dataset/LJSpeech-1.1 save_path: ./checkpoint diff --git a/parakeet/models/transformerTTS/module.py b/parakeet/models/transformerTTS/module.py index 6318e7c..1a1ebd1 100644 --- a/parakeet/models/transformerTTS/module.py +++ b/parakeet/models/transformerTTS/module.py @@ -35,7 +35,7 @@ class EncoderPrenet(dg.Layer): self.add_sublayer("conv_list_{}".format(i), layer) self.batch_norm_list = [dg.BatchNorm(num_hidden, - data_layout='NCHW', epsilon=1e-30) for _ in range(3)] + data_layout='NCHW') for _ in range(3)] for i, layer in enumerate(self.batch_norm_list): self.add_sublayer("batch_norm_list_{}".format(i), layer) @@ -57,6 +57,7 @@ class CBHG(dg.Layer): super(CBHG, self).__init__() """ :param hidden_size: dimension of hidden unit + :param batch_size: batch size :param K: # of convolution banks :param projection_size: dimension of projection unit :param num_gru_layers: # of layers of GRUcell diff --git a/parakeet/models/transformerTTS/network.py b/parakeet/models/transformerTTS/network.py index 4d122ff..5bbe115 100644 --- a/parakeet/models/transformerTTS/network.py +++ b/parakeet/models/transformerTTS/network.py @@ -10,7 +10,7 @@ from parakeet.modules.post_convnet import PostConvNet class Encoder(dg.Layer): - def __init__(self, embedding_size, num_hidden, config): + def __init__(self, embedding_size, num_hidden, config, num_head=4): super(Encoder, self).__init__() self.num_hidden = num_hidden param = fluid.ParamAttr(initializer=fluid.initializer.Constant(value=1.0)) @@ -24,10 +24,10 @@ class Encoder(dg.Layer): self.encoder_prenet = EncoderPrenet(embedding_size = embedding_size, num_hidden = num_hidden, use_cudnn=config.use_gpu) - self.layers = [MultiheadAttention(num_hidden, num_hidden//4, num_hidden//4) for _ in range(3)] + self.layers = [MultiheadAttention(num_hidden, num_hidden//num_head, num_hidden//num_head) for _ in range(3)] for i, layer in enumerate(self.layers): self.add_sublayer("self_attn_{}".format(i), layer) - self.ffns = [PositionwiseFeedForward(num_hidden, num_hidden*4, filter_size=1, use_cudnn = config.use_gpu) for _ in range(3)] + self.ffns = [PositionwiseFeedForward(num_hidden, num_hidden*num_head, filter_size=1, use_cudnn = config.use_gpu) for _ in range(3)] for i, layer in enumerate(self.ffns): self.add_sublayer("ffns_{}".format(i), layer) @@ -61,7 +61,7 @@ class Encoder(dg.Layer): return x, query_mask, attentions class Decoder(dg.Layer): - def __init__(self, num_hidden, config): + def __init__(self, num_hidden, config, num_head=4): super(Decoder, self).__init__() self.num_hidden = num_hidden param = fluid.ParamAttr() @@ -79,13 +79,13 @@ class Decoder(dg.Layer): dropout_rate=0.2) self.linear = Linear(num_hidden, num_hidden) - self.selfattn_layers = [MultiheadAttention(num_hidden, num_hidden//4, num_hidden//4) for _ in range(3)] + self.selfattn_layers = [MultiheadAttention(num_hidden, num_hidden//num_head, num_hidden//num_head) for _ in range(3)] for i, layer in enumerate(self.selfattn_layers): self.add_sublayer("self_attn_{}".format(i), layer) - self.attn_layers = [MultiheadAttention(num_hidden, num_hidden//4, num_hidden//4) for _ in range(3)] + self.attn_layers = [MultiheadAttention(num_hidden, num_hidden//num_head, num_hidden//num_head) for _ in range(3)] for i, layer in enumerate(self.attn_layers): self.add_sublayer("attn_{}".format(i), layer) - self.ffns = [PositionwiseFeedForward(num_hidden, num_hidden*4, filter_size=1) for _ in range(3)] + self.ffns = [PositionwiseFeedForward(num_hidden, num_hidden*num_head, filter_size=1) for _ in range(3)] for i, layer in enumerate(self.ffns): self.add_sublayer("ffns_{}".format(i), layer) self.mel_linear = Linear(num_hidden, config.audio.num_mels * config.audio.outputs_per_step) diff --git a/parakeet/models/transformerTTS/synthesis.py b/parakeet/models/transformerTTS/synthesis.py index ee45611..aeb9697 100644 --- a/parakeet/models/transformerTTS/synthesis.py +++ b/parakeet/models/transformerTTS/synthesis.py @@ -28,12 +28,15 @@ def synthesis(text_input, cfg): writer = SummaryWriter(path) with dg.guard(place): - model = TransformerTTS(cfg) - model_postnet = ModelPostNet(cfg) - - model.set_dict(load_checkpoint(str(cfg.transformer_step), os.path.join(cfg.checkpoint_path, "transformer"))) - model_postnet.set_dict(load_checkpoint(str(cfg.postnet_step), os.path.join(cfg.checkpoint_path, "postnet"))) - + with fluid.unique_name.guard(): + model = TransformerTTS(cfg) + model.set_dict(load_checkpoint(str(cfg.transformer_step), os.path.join(cfg.checkpoint_path, "transformer"))) + model.eval() + + with fluid.unique_name.guard(): + model_postnet = ModelPostNet(cfg) + model_postnet.set_dict(load_checkpoint(str(cfg.postnet_step), os.path.join(cfg.checkpoint_path, "postnet"))) + model_postnet.eval() # init input text = np.asarray(text_to_sequence(text_input)) text = fluid.layers.unsqueeze(dg.to_variable(text),[0]) @@ -42,9 +45,6 @@ def synthesis(text_input, cfg): pos_text = fluid.layers.unsqueeze(dg.to_variable(pos_text),[0]) - model.eval() - model_postnet.eval() - pbar = tqdm(range(cfg.max_len)) for i in pbar: diff --git a/parakeet/models/transformerTTS/train_transformer.py b/parakeet/models/transformerTTS/train_transformer.py index d8b7389..81fae56 100644 --- a/parakeet/models/transformerTTS/train_transformer.py +++ b/parakeet/models/transformerTTS/train_transformer.py @@ -86,17 +86,17 @@ def main(cfg): mel_pred, postnet_pred, attn_probs, stop_preds, attn_enc, attn_dec = model(character, mel_input, pos_text, pos_mel) - label = np.zeros(stop_preds.shape).astype(np.float32) - text_length = text_length.numpy() - for i in range(label.shape[0]): - label[i][text_length[i] - 1] = 1 + label = (pos_mel == 0).astype(np.float32) + #label = np.zeros(stop_preds.shape).astype(np.float32) + #text_length = text_length.numpy() + #for i in range(label.shape[0]): + # label[i][text_length[i] - 1] = 1 mel_loss = layers.mean(layers.abs(layers.elementwise_sub(mel_pred, mel))) post_mel_loss = layers.mean(layers.abs(layers.elementwise_sub(postnet_pred, mel))) - stop_loss = cross_entropy(stop_preds, dg.to_variable(label)) + stop_loss = cross_entropy(stop_preds, label) loss = mel_loss + post_mel_loss + stop_loss - if local_rank==0: writer.add_scalars('training_loss', { 'mel_loss':mel_loss.numpy(), diff --git a/parakeet/modules/multihead_attention.py b/parakeet/modules/multihead_attention.py index 27a155f..627ca32 100644 --- a/parakeet/modules/multihead_attention.py +++ b/parakeet/modules/multihead_attention.py @@ -105,7 +105,6 @@ class MultiheadAttention(dg.Layer): # concat all multihead result result = layers.reshape(result, [self.num_head, batch_size, seq_len_query, self.d_q]) result = layers.reshape(layers.transpose(result, [1,2,0,3]),[batch_size, seq_len_query, -1]) - result = layers.concat([query_input,result], axis=-1) result = layers.dropout(self.fc(result), self.dropout) result = result + query_input diff --git a/parakeet/modules/utils.py b/parakeet/modules/utils.py index 626d5f2..ab575f9 100644 --- a/parakeet/modules/utils.py +++ b/parakeet/modules/utils.py @@ -65,9 +65,10 @@ def guided_attention(N, T, g=0.2): return W -def cross_entropy(input, label, position_weight=5.0, epsilon=0.0001): - input = -1 * label * layers.log(input + epsilon) - (1-label) * layers.log(1 - input + epsilon) - label = input * (label * (position_weight - 1) + 1) - return layers.reduce_sum(label, dim=[0, 1]) +def cross_entropy(input, label, position_weight=1.0, epsilon=1e-30): + output = -1 * label * layers.log(input + epsilon) - (1-label) * layers.log(1 - input + epsilon) + output = output * (label * (position_weight - 1) + 1) + + return layers.reduce_sum(output, dim=[0, 1])