From ab0fe8f3043842d79b1efbf9d038487cb96f4017 Mon Sep 17 00:00:00 2001 From: lifuchen Date: Mon, 13 Jan 2020 12:37:49 +0000 Subject: [PATCH] TransformerTTS precision alignment --- parakeet/g2p/text/cleaners.py | 2 +- .../models/fastspeech/config/fastspeech.yaml | 11 +++-- parakeet/models/fastspeech/modules.py | 2 - parakeet/models/fastspeech/network.py | 18 ++++---- parakeet/models/fastspeech/parse.py | 17 +++---- parakeet/models/transformerTTS/module.py | 46 ++++--------------- parakeet/models/transformerTTS/network.py | 25 +++++----- parakeet/models/transformerTTS/synthesis.py | 4 +- .../models/transformerTTS/train_postnet.py | 2 - .../transformerTTS/train_transformer.py | 11 ++--- parakeet/modules/layers.py | 19 ++++++++ parakeet/modules/multihead_attention.py | 14 +++--- parakeet/modules/post_convnet.py | 21 ++++----- parakeet/modules/prenet.py | 5 +- 14 files changed, 93 insertions(+), 104 deletions(-) diff --git a/parakeet/g2p/text/cleaners.py b/parakeet/g2p/text/cleaners.py index 86bf9f3..779a977 100644 --- a/parakeet/g2p/text/cleaners.py +++ b/parakeet/g2p/text/cleaners.py @@ -89,7 +89,7 @@ def transliteration_cleaners(text): def english_cleaners(text): '''Pipeline for English text, including number and abbreviation expansion.''' text = convert_to_ascii(text) - text = add_punctuation(text) + #text = add_punctuation(text) text = lowercase(text) text = expand_numbers(text) text = expand_abbreviations(text) diff --git a/parakeet/models/fastspeech/config/fastspeech.yaml b/parakeet/models/fastspeech/config/fastspeech.yaml index 947457b..37fac16 100644 --- a/parakeet/models/fastspeech/config/fastspeech.yaml +++ b/parakeet/models/fastspeech/config/fastspeech.yaml @@ -14,13 +14,11 @@ encoder_n_layer: 6 encoder_head: 2 encoder_conv1d_filter_size: 1536 max_sep_len: 2048 -encoder_output_size: 384 -embedding_size: 384 +fs_embedding_size: 384 decoder_n_layer: 6 decoder_head: 2 decoder_conv1d_filter_size: 1536 -decoder_output_size: 384 -hidden_size: 384 +fs_hidden_size: 384 duration_predictor_output_size: 256 duration_predictor_filter_size: 3 fft_conv1d_filter: 3 @@ -28,6 +26,9 @@ fft_conv1d_padding: 1 dropout: 0.1 transformer_head: 4 +embedding_size: 512 +hidden_size: 256 + warm_up_step: 4000 grad_clip_thresh: 0.1 batch_size: 32 @@ -39,5 +40,5 @@ use_data_parallel: False data_path: ../../../dataset/LJSpeech-1.1 transtts_path: ../transformerTTS/checkpoint -transformer_step: 20 +transformer_step: 1 log_dir: ./log \ No newline at end of file diff --git a/parakeet/models/fastspeech/modules.py b/parakeet/models/fastspeech/modules.py index 621b5c1..137b2d5 100644 --- a/parakeet/models/fastspeech/modules.py +++ b/parakeet/models/fastspeech/modules.py @@ -8,8 +8,6 @@ from parakeet.modules.layers import Conv1D from parakeet.modules.multihead_attention import MultiheadAttention from parakeet.modules.feed_forward import PositionwiseFeedForward - - class FFTBlock(dg.Layer): def __init__(self, d_model, d_inner, n_head, d_k, d_v, filter_size, padding, dropout=0.2): super(FFTBlock, self).__init__() diff --git a/parakeet/models/fastspeech/network.py b/parakeet/models/fastspeech/network.py index 2f8dc9a..b2d6ca0 100644 --- a/parakeet/models/fastspeech/network.py +++ b/parakeet/models/fastspeech/network.py @@ -1,5 +1,5 @@ from utils import * -from modules import * +from modules import FFTBlock, LengthRegulator import paddle.fluid.dygraph as dg import paddle.fluid as fluid from parakeet.g2p.text.symbols import symbols @@ -131,38 +131,38 @@ class FastSpeech(dg.Layer): self.encoder = Encoder(n_src_vocab=len(symbols)+1, len_max_seq=cfg.max_sep_len, - d_word_vec=cfg.embedding_size, + d_word_vec=cfg.fs_embedding_size, n_layers=cfg.encoder_n_layer, n_head=cfg.encoder_head, d_k=64, d_v=64, - d_model=cfg.hidden_size, + d_model=cfg.fs_hidden_size, d_inner=cfg.encoder_conv1d_filter_size, fft_conv1d_kernel=cfg.fft_conv1d_filter, fft_conv1d_padding=cfg.fft_conv1d_padding, dropout=0.1) - self.length_regulator = LengthRegulator(input_size=cfg.hidden_size, + self.length_regulator = LengthRegulator(input_size=cfg.fs_hidden_size, out_channels=cfg.duration_predictor_output_size, filter_size=cfg.duration_predictor_filter_size, dropout=cfg.dropout) self.decoder = Decoder(len_max_seq=cfg.max_sep_len, - d_word_vec=cfg.embedding_size, + d_word_vec=cfg.fs_embedding_size, n_layers=cfg.decoder_n_layer, n_head=cfg.decoder_head, d_k=64, d_v=64, - d_model=cfg.hidden_size, + d_model=cfg.fs_hidden_size, d_inner=cfg.decoder_conv1d_filter_size, fft_conv1d_kernel=cfg.fft_conv1d_filter, fft_conv1d_padding=cfg.fft_conv1d_padding, dropout=0.1) - self.mel_linear = dg.Linear(cfg.decoder_output_size, cfg.audio.num_mels) - self.postnet = PostConvNet(n_mels=80, + self.mel_linear = dg.Linear(cfg.fs_hidden_size, cfg.audio.num_mels * cfg.audio.outputs_per_step) + self.postnet = PostConvNet(n_mels=cfg.audio.num_mels, num_hidden=512, filter_size=5, padding=int(5 / 2), num_conv=5, - outputs_per_step=1, + outputs_per_step=cfg.audio.outputs_per_step, use_cudnn=True, dropout=0.1) diff --git a/parakeet/models/fastspeech/parse.py b/parakeet/models/fastspeech/parse.py index 4132674..65e1eb3 100644 --- a/parakeet/models/fastspeech/parse.py +++ b/parakeet/models/fastspeech/parse.py @@ -22,8 +22,8 @@ def add_config_options_to_parser(parser): parser.add_argument('--audio.outputs_per_step', type=int, default=1, help="the outputs per step.") - parser.add_argument('--embedding_size', type=int, default=256, - help="the dim size of embedding.") + parser.add_argument('--fs_embedding_size', type=int, default=256, + help="the dim size of embedding of fastspeech.") parser.add_argument('--encoder_n_layer', type=int, default=6, help="the number of FFT Block in encoder.") parser.add_argument('--encoder_head', type=int, default=2, @@ -32,18 +32,14 @@ def add_config_options_to_parser(parser): help="the filter size of conv1d in encoder.") parser.add_argument('--max_sep_len', type=int, default=2048, help="the max length of sequence.") - parser.add_argument('--encoder_output_size', type=int, default=256, - help="the output channel size of encoder.") parser.add_argument('--decoder_n_layer', type=int, default=6, help="the number of FFT Block in decoder.") parser.add_argument('--decoder_head', type=int, default=2, help="the attention head number in decoder.") parser.add_argument('--decoder_conv1d_filter_size', type=int, default=1024, help="the filter size of conv1d in decoder.") - parser.add_argument('--decoder_output_size', type=int, default=256, - help="the output channel size of decoder.") - parser.add_argument('--hidden_size', type=int, default=256, - help="the hidden size in model.") + parser.add_argument('--fs_hidden_size', type=int, default=256, + help="the hidden size in model of fastspeech.") parser.add_argument('--duration_predictor_output_size', type=int, default=256, help="the output size of duration predictior.") parser.add_argument('--duration_predictor_filter_size', type=int, default=3, @@ -57,6 +53,11 @@ def add_config_options_to_parser(parser): parser.add_argument('--transformer_head', type=int, default=4, help="the attention head num of transformerTTS.") + parser.add_argument('--hidden_size', type=int, default=256, + help="the hidden size in model of transformerTTS.") + parser.add_argument('--embedding_size', type=int, default=256, + help="the dim size of embedding of transformerTTS.") + parser.add_argument('--warm_up_step', type=int, default=4000, help="the warm up step of learning rate.") parser.add_argument('--grad_clip_thresh', type=float, default=1.0, diff --git a/parakeet/models/transformerTTS/module.py b/parakeet/models/transformerTTS/module.py index ecacb1b..6318e7c 100644 --- a/parakeet/models/transformerTTS/module.py +++ b/parakeet/models/transformerTTS/module.py @@ -3,20 +3,18 @@ from parakeet.g2p.text.symbols import symbols import paddle.fluid.dygraph as dg import paddle.fluid as fluid import paddle.fluid.layers as layers -from parakeet.modules.layers import Conv, Pool1D +from parakeet.modules.layers import Conv, Pool1D, Linear from parakeet.modules.dynamicGRU import DynamicGRU import numpy as np - class EncoderPrenet(dg.Layer): def __init__(self, embedding_size, num_hidden, use_cudnn=True): super(EncoderPrenet, self).__init__() self.embedding_size = embedding_size self.num_hidden = num_hidden self.use_cudnn = use_cudnn - self.embedding = dg.Embedding( size = [len(symbols), embedding_size], - param_attr = fluid.ParamAttr(name='weight'), + self.embedding = dg.Embedding( size = [len(symbols), embedding_size], padding_idx = None) self.conv_list = [] self.conv_list.append(Conv(in_channels = embedding_size, @@ -37,16 +35,12 @@ class EncoderPrenet(dg.Layer): self.add_sublayer("conv_list_{}".format(i), layer) self.batch_norm_list = [dg.BatchNorm(num_hidden, - param_attr = fluid.ParamAttr(name='weight'), - bias_attr = fluid.ParamAttr(name='bias'), - moving_mean_name = 'moving_mean', - moving_variance_name = 'moving_var', - data_layout='NCHW') for _ in range(3)] + data_layout='NCHW', epsilon=1e-30) for _ in range(3)] for i, layer in enumerate(self.batch_norm_list): self.add_sublayer("batch_norm_list_{}".format(i), layer) - self.projection = dg.Linear(num_hidden, num_hidden) + self.projection = Linear(num_hidden, num_hidden) def forward(self, x): x = self.embedding(x) #(batch_size, seq_len, embending_size) @@ -90,10 +84,6 @@ class CBHG(dg.Layer): self.batchnorm_list = [] for i in range(K): self.batchnorm_list.append(dg.BatchNorm(hidden_size, - param_attr = fluid.ParamAttr(name='weight'), - bias_attr = fluid.ParamAttr(name='bias'), - moving_mean_name = 'moving_mean', - moving_variance_name = 'moving_var', data_layout='NCHW')) for i, layer in enumerate(self.batchnorm_list): @@ -114,16 +104,8 @@ class CBHG(dg.Layer): data_format = "NCT") self.batchnorm_proj_1 = dg.BatchNorm(hidden_size, - param_attr = fluid.ParamAttr(name='weight'), - bias_attr = fluid.ParamAttr(name='bias'), - moving_mean_name = 'moving_mean', - moving_variance_name = 'moving_var', data_layout='NCHW') self.batchnorm_proj_2 = dg.BatchNorm(projection_size, - param_attr = fluid.ParamAttr(name='weight'), - bias_attr = fluid.ParamAttr(name='bias'), - moving_mean_name = 'moving_mean', - moving_variance_name = 'moving_var', data_layout='NCHW') self.max_pool = Pool1D(pool_size = max_pool_kernel_size, pool_type='max', @@ -134,32 +116,24 @@ class CBHG(dg.Layer): h_0 = np.zeros((batch_size, hidden_size // 2), dtype="float32") h_0 = dg.to_variable(h_0) - self.fc_forward1 = dg.Linear(hidden_size, hidden_size // 2 * 3) - self.fc_reverse1 = dg.Linear(hidden_size, hidden_size // 2 * 3) + self.fc_forward1 = Linear(hidden_size, hidden_size // 2 * 3) + self.fc_reverse1 = Linear(hidden_size, hidden_size // 2 * 3) self.gru_forward1 = DynamicGRU(size = self.hidden_size // 2, - param_attr = fluid.ParamAttr(name='weight'), - bias_attr = fluid.ParamAttr(name='bias'), is_reverse = False, origin_mode = True, h_0 = h_0) self.gru_reverse1 = DynamicGRU(size = self.hidden_size // 2, - param_attr = fluid.ParamAttr(name='weight'), - bias_attr = fluid.ParamAttr(name='bias'), is_reverse=True, origin_mode=True, h_0 = h_0) - self.fc_forward2 = dg.Linear(hidden_size, hidden_size // 2 * 3) - self.fc_reverse2 = dg.Linear(hidden_size, hidden_size // 2 * 3) + self.fc_forward2 = Linear(hidden_size, hidden_size // 2 * 3) + self.fc_reverse2 = Linear(hidden_size, hidden_size // 2 * 3) self.gru_forward2 = DynamicGRU(size = self.hidden_size // 2, - param_attr = fluid.ParamAttr(name='weight'), - bias_attr = fluid.ParamAttr(name='bias'), is_reverse = False, origin_mode = True, h_0 = h_0) self.gru_reverse2 = DynamicGRU(size = self.hidden_size // 2, - param_attr = fluid.ParamAttr(name='weight'), - bias_attr = fluid.ParamAttr(name='bias'), is_reverse=True, origin_mode=True, h_0 = h_0) @@ -216,8 +190,8 @@ class Highwaynet(dg.Layer): self.linears = [] for i in range(num_layers): - self.linears.append(dg.Linear(num_units, num_units)) - self.gates.append(dg.Linear(num_units, num_units)) + self.linears.append(Linear(num_units, num_units)) + self.gates.append(Linear(num_units, num_units)) for i, (linear, gate) in enumerate(zip(self.linears,self.gates)): self.add_sublayer("linears_{}".format(i), linear) diff --git a/parakeet/models/transformerTTS/network.py b/parakeet/models/transformerTTS/network.py index 5f353f8..4d122ff 100644 --- a/parakeet/models/transformerTTS/network.py +++ b/parakeet/models/transformerTTS/network.py @@ -1,7 +1,7 @@ from parakeet.models.transformerTTS.module import * import paddle.fluid.dygraph as dg import paddle.fluid as fluid -from parakeet.modules.layers import Conv1D +from parakeet.modules.layers import Conv1D, Linear from parakeet.modules.utils import * from parakeet.modules.multihead_attention import MultiheadAttention from parakeet.modules.feed_forward import PositionwiseFeedForward @@ -13,8 +13,7 @@ class Encoder(dg.Layer): def __init__(self, embedding_size, num_hidden, config): super(Encoder, self).__init__() self.num_hidden = num_hidden - param = fluid.ParamAttr(name='alpha', - initializer=fluid.initializer.Constant(value=1.0)) + param = fluid.ParamAttr(initializer=fluid.initializer.Constant(value=1.0)) self.alpha = self.create_parameter(shape=(1, ), attr=param, dtype='float32') self.pos_inp = get_sinusoid_encoding_table(1024, self.num_hidden, padding_idx=0) self.pos_emb = dg.Embedding(size=[1024, num_hidden], @@ -39,13 +38,13 @@ class Encoder(dg.Layer): else: query_mask, mask = None, None - # Encoder pre_network x = self.encoder_prenet(x) #(N,T,C) # Get positional encoding positional = self.pos_emb(positional) + x = positional * self.alpha + x #(N, T, C) @@ -65,21 +64,20 @@ class Decoder(dg.Layer): def __init__(self, num_hidden, config): super(Decoder, self).__init__() self.num_hidden = num_hidden - param = fluid.ParamAttr(name='alpha') + param = fluid.ParamAttr() self.alpha = self.create_parameter(shape=(1,), attr=param, dtype='float32', default_initializer = fluid.initializer.ConstantInitializer(value=1.0)) self.pos_inp = get_sinusoid_encoding_table(1024, self.num_hidden, padding_idx=0) self.pos_emb = dg.Embedding(size=[1024, num_hidden], padding_idx=0, param_attr=fluid.ParamAttr( - name='weight', initializer=fluid.initializer.NumpyArrayInitializer(self.pos_inp), trainable=False)) self.decoder_prenet = PreNet(input_size = config.audio.num_mels, hidden_size = num_hidden * 2, output_size = num_hidden, dropout_rate=0.2) - self.linear = dg.Linear(num_hidden, num_hidden) + self.linear = Linear(num_hidden, num_hidden) self.selfattn_layers = [MultiheadAttention(num_hidden, num_hidden//4, num_hidden//4) for _ in range(3)] for i, layer in enumerate(self.selfattn_layers): @@ -90,8 +88,8 @@ class Decoder(dg.Layer): self.ffns = [PositionwiseFeedForward(num_hidden, num_hidden*4, filter_size=1) for _ in range(3)] for i, layer in enumerate(self.ffns): self.add_sublayer("ffns_{}".format(i), layer) - self.mel_linear = dg.Linear(num_hidden, config.audio.num_mels * config.audio.outputs_per_step) - self.stop_linear = dg.Linear(num_hidden, 1) + self.mel_linear = Linear(num_hidden, config.audio.num_mels * config.audio.outputs_per_step) + self.stop_linear = Linear(num_hidden, 1) self.postconvnet = PostConvNet(config.audio.num_mels, config.hidden_size, filter_size = 5, padding = 4, num_conv=5, @@ -115,10 +113,10 @@ class Decoder(dg.Layer): mask = get_triu_tensor(query.numpy(), query.numpy()).astype(np.float32) mask = fluid.layers.cast(dg.to_variable(mask == 0), np.float32) m_mask, zero_mask = None, None - + # Decoder pre-network query = self.decoder_prenet(query) - + # Centered position query = self.linear(query) @@ -132,14 +130,13 @@ class Decoder(dg.Layer): # Attention decoder-decoder, encoder-decoder selfattn_list = list() attn_list = list() - + for selfattn, attn, ffn in zip(self.selfattn_layers, self.attn_layers, self.ffns): query, attn_dec = selfattn(query, query, query, mask = mask, query_mask = m_mask) query, attn_dot = attn(key, value, query, mask = zero_mask, query_mask = m_mask) query = ffn(query) selfattn_list.append(attn_dec) attn_list.append(attn_dot) - # Mel linear projection mel_out = self.mel_linear(query) # Post Mel Network @@ -164,7 +161,7 @@ class TransformerTTS(dg.Layer): # key (batch_size, seq_len, channel) # c_mask (batch_size, seq_len) # attns_enc (channel / 2, seq_len, seq_len) - + key, c_mask, attns_enc = self.encoder(characters, pos_text) # mel_output/postnet_output (batch_size, mel_len, n_mel) diff --git a/parakeet/models/transformerTTS/synthesis.py b/parakeet/models/transformerTTS/synthesis.py index 9c89d16..ee45611 100644 --- a/parakeet/models/transformerTTS/synthesis.py +++ b/parakeet/models/transformerTTS/synthesis.py @@ -2,7 +2,7 @@ import os from scipy.io.wavfile import write from parakeet.g2p.en import text_to_sequence import numpy as np -from network import Model, ModelPostNet +from network import TransformerTTS, ModelPostNet from tqdm import tqdm from tensorboardX import SummaryWriter import paddle.fluid as fluid @@ -28,7 +28,7 @@ def synthesis(text_input, cfg): writer = SummaryWriter(path) with dg.guard(place): - model = Model(cfg) + model = TransformerTTS(cfg) model_postnet = ModelPostNet(cfg) model.set_dict(load_checkpoint(str(cfg.transformer_step), os.path.join(cfg.checkpoint_path, "transformer"))) diff --git a/parakeet/models/transformerTTS/train_postnet.py b/parakeet/models/transformerTTS/train_postnet.py index d45a4c6..6870caa 100644 --- a/parakeet/models/transformerTTS/train_postnet.py +++ b/parakeet/models/transformerTTS/train_postnet.py @@ -89,8 +89,6 @@ def main(cfg): else: loss.backward() optimizer.minimize(loss, grad_clip = fluid.dygraph_grad_clip.GradClipByGlobalNorm(cfg.grad_clip_thresh)) - print("===============",model.pre_proj.conv.weight.numpy()) - print("===============",model.pre_proj.conv.weight.gradient()) model.clear_gradients() if local_rank==0: diff --git a/parakeet/models/transformerTTS/train_transformer.py b/parakeet/models/transformerTTS/train_transformer.py index 844c56c..d8b7389 100644 --- a/parakeet/models/transformerTTS/train_transformer.py +++ b/parakeet/models/transformerTTS/train_transformer.py @@ -63,7 +63,7 @@ def main(cfg): optimizer = fluid.optimizer.AdamOptimizer(learning_rate=dg.NoamDecay(1/(cfg.warm_up_step *( cfg.lr ** 2)), cfg.warm_up_step), parameter_list=model.parameters()) - reader = LJSpeechLoader(cfg, nranks, local_rank).reader() + reader = LJSpeechLoader(cfg, nranks, local_rank, shuffle=True).reader() if cfg.checkpoint_path is not None: model_dict, opti_dict = load_checkpoint(str(cfg.transformer_step), os.path.join(cfg.checkpoint_path, "transformer")) @@ -78,26 +78,25 @@ def main(cfg): for epoch in range(cfg.epochs): pbar = tqdm(reader) - - for i, data in enumerate(pbar): pbar.set_description('Processing at epoch %d'%epoch) character, mel, mel_input, pos_text, pos_mel, text_length = data global_step += 1 - mel_pred, postnet_pred, attn_probs, stop_preds, attn_enc, attn_dec = model(character, mel_input, pos_text, pos_mel) + label = np.zeros(stop_preds.shape).astype(np.float32) text_length = text_length.numpy() for i in range(label.shape[0]): label[i][text_length[i] - 1] = 1 - + mel_loss = layers.mean(layers.abs(layers.elementwise_sub(mel_pred, mel))) post_mel_loss = layers.mean(layers.abs(layers.elementwise_sub(postnet_pred, mel))) stop_loss = cross_entropy(stop_preds, dg.to_variable(label)) loss = mel_loss + post_mel_loss + stop_loss - + + if local_rank==0: writer.add_scalars('training_loss', { 'mel_loss':mel_loss.numpy(), diff --git a/parakeet/modules/layers.py b/parakeet/modules/layers.py index 29a10db..488d7fa 100644 --- a/parakeet/modules/layers.py +++ b/parakeet/modules/layers.py @@ -5,6 +5,25 @@ import paddle from paddle import fluid import paddle.fluid.dygraph as dg +class Linear(dg.Layer): + def __init__(self, in_features, out_features, is_bias=True, dtype="float32"): + super(Linear, self).__init__() + self.in_features = in_features + self.out_features = out_features + self.dtype = dtype + self.weight = fluid.ParamAttr(initializer = fluid.initializer.XavierInitializer()) + self.bias = is_bias + + if is_bias is not False: + k = math.sqrt(1 / in_features) + self.bias = fluid.ParamAttr(initializer = fluid.initializer.Uniform(low=-k, high=k)) + + self.linear = dg.Linear(in_features, out_features, param_attr = self.weight, + bias_attr = self.bias,) + + def forward(self, x): + x = self.linear(x) + return x class Conv(dg.Layer): def __init__(self, in_channels, out_channels, filter_size=1, diff --git a/parakeet/modules/multihead_attention.py b/parakeet/modules/multihead_attention.py index b2592bb..27a155f 100644 --- a/parakeet/modules/multihead_attention.py +++ b/parakeet/modules/multihead_attention.py @@ -2,6 +2,7 @@ import math import numpy as np import paddle.fluid.dygraph as dg import paddle.fluid.layers as layers +from parakeet.modules.layers import Linear class ScaledDotProductAttention(dg.Layer): def __init__(self, d_key): @@ -34,10 +35,10 @@ class ScaledDotProductAttention(dg.Layer): attention = attention * mask mask = (mask == 0).astype(np.float32) * (-2 ** 32 + 1) attention = attention + mask - - + attention = layers.softmax(attention) attention = layers.dropout(attention, dropout) + # Mask query to ignore padding if query_mask is not None: attention = attention * query_mask @@ -54,13 +55,13 @@ class MultiheadAttention(dg.Layer): self.d_q = d_q self.dropout = dropout - self.key = dg.Linear(num_hidden, num_head * d_k) - self.value = dg.Linear(num_hidden, num_head * d_k) - self.query = dg.Linear(num_hidden, num_head * d_q) + self.key = Linear(num_hidden, num_head * d_k, is_bias=False) + self.value = Linear(num_hidden, num_head * d_k, is_bias=False) + self.query = Linear(num_hidden, num_head * d_q, is_bias=False) self.scal_attn = ScaledDotProductAttention(d_k) - self.fc = dg.Linear(num_head * d_q, num_hidden) + self.fc = Linear(num_head * d_q * 2, num_hidden) self.layer_norm = dg.LayerNorm(num_hidden) @@ -105,6 +106,7 @@ class MultiheadAttention(dg.Layer): result = layers.reshape(result, [self.num_head, batch_size, seq_len_query, self.d_q]) result = layers.reshape(layers.transpose(result, [1,2,0,3]),[batch_size, seq_len_query, -1]) + result = layers.concat([query_input,result], axis=-1) result = layers.dropout(self.fc(result), self.dropout) result = result + query_input diff --git a/parakeet/modules/post_convnet.py b/parakeet/modules/post_convnet.py index 559d70e..3546c7a 100644 --- a/parakeet/modules/post_convnet.py +++ b/parakeet/modules/post_convnet.py @@ -16,6 +16,7 @@ class PostConvNet(dg.Layer): super(PostConvNet, self).__init__() self.dropout = dropout + self.num_conv = num_conv self.conv_list = [] self.conv_list.append(Conv(in_channels = n_mels * outputs_per_step, out_channels = num_hidden, @@ -43,17 +44,9 @@ class PostConvNet(dg.Layer): self.add_sublayer("conv_list_{}".format(i), layer) self.batch_norm_list = [dg.BatchNorm(num_hidden, - param_attr = fluid.ParamAttr(name='weight'), - bias_attr = fluid.ParamAttr(name='bias'), - moving_mean_name = 'moving_mean', - moving_variance_name = 'moving_var', data_layout='NCHW') for _ in range(num_conv-1)] - self.batch_norm_list.append(dg.BatchNorm(n_mels * outputs_per_step, - param_attr = fluid.ParamAttr(name='weight'), - bias_attr = fluid.ParamAttr(name='bias'), - moving_mean_name = 'moving_mean', - moving_variance_name = 'moving_var', - data_layout='NCHW')) + #self.batch_norm_list.append(dg.BatchNorm(n_mels * outputs_per_step, + # data_layout='NCHW')) for i, layer in enumerate(self.batch_norm_list): self.add_sublayer("batch_norm_list_{}".format(i), layer) @@ -67,9 +60,15 @@ class PostConvNet(dg.Layer): Returns: output (Variable), Shape(B, T, C), the result after postconvnet. """ + input = layers.transpose(input, [0,2,1]) len = input.shape[-1] - for batch_norm, conv in zip(self.batch_norm_list, self.conv_list): + for i in range(self.num_conv-1): + batch_norm = self.batch_norm_list[i] + conv = self.conv_list[i] + input = layers.dropout(layers.tanh(batch_norm(conv(input)[:,:,:len])), self.dropout) + conv = self.conv_list[self.num_conv-1] + input = conv(input)[:,:,:len] output = layers.transpose(input, [0,2,1]) return output \ No newline at end of file diff --git a/parakeet/modules/prenet.py b/parakeet/modules/prenet.py index 4ea50e1..32eefd0 100644 --- a/parakeet/modules/prenet.py +++ b/parakeet/modules/prenet.py @@ -1,5 +1,6 @@ import paddle.fluid.dygraph as dg import paddle.fluid.layers as layers +from parakeet.modules.layers import Linear class PreNet(dg.Layer): def __init__(self, input_size, hidden_size, output_size, dropout_rate=0.2): @@ -14,8 +15,8 @@ class PreNet(dg.Layer): self.output_size = output_size self.dropout_rate = dropout_rate - self.linear1 = dg.Linear(input_size, hidden_size) - self.linear2 = dg.Linear(hidden_size, output_size) + self.linear1 = Linear(input_size, hidden_size) + self.linear2 = Linear(hidden_size, output_size) def forward(self, x): """