diff --git a/examples/deepvoice3/train.py b/examples/deepvoice3/train.py index 0c77402..ee64fea 100644 --- a/examples/deepvoice3/train.py +++ b/examples/deepvoice3/train.py @@ -227,8 +227,12 @@ if __name__ == "__main__": lin_specs, done_flags, text_lengths, frames) l = criterion.compose_loss(losses) l.backward() + # record learning rate before updating + writer.add_scalar("learning_rate", + optim._learning_rate.step().numpy(), + global_step) optim.minimize(l, grad_clip=gradient_clipper) - dv3.clear_gradients() + optim.clear_gradients() # ==================all kinds of tedious things================= for k in epoch_loss.keys(): @@ -237,6 +241,7 @@ if __name__ == "__main__": # record step loss into tensorboard step_loss = {k: v.numpy()[0] for k, v in losses.items()} + print(step_loss) for k, v in step_loss.items(): writer.add_scalar(k, v, global_step) @@ -276,7 +281,7 @@ if __name__ == "__main__": "Please call Stella.", "Some have accepted this as a miracle without any physical explanation.", ] - for idx, sent in sentences: + for idx, sent in enumerate(sentences): wav, attn = eval_model(dv3, sent, replace_pronounciation_prob, min_level_db, ref_level_db, diff --git a/examples/deepvoice3/utils.py b/examples/deepvoice3/utils.py index 91cf89b..4e9f5cf 100644 --- a/examples/deepvoice3/utils.py +++ b/examples/deepvoice3/utils.py @@ -50,7 +50,7 @@ def make_model(n_speakers, speaker_dim, speaker_embed_std, embed_dim, embed_dim, n_speakers, speaker_dim, - padding_idx=padding_idx, + padding_idx=None, embedding_weight_std=embedding_std, convolutions=encoder_convolutions, max_positions=max_positions, @@ -122,6 +122,7 @@ def eval_model(model, text, replace_pronounciation_prob, min_level_db, text = np.expand_dims(text, 0) text_positions = np.expand_dims(text_positions, 0) + model.eval() mel_outputs, linear_outputs, alignments, done = model.transduce( dg.to_variable(text), dg.to_variable(text_positions)) linear_outputs_np = linear_outputs.numpy()[0].T # (C, T) diff --git a/parakeet/models/deepvoice3/attention.py b/parakeet/models/deepvoice3/attention.py index df532ac..8f2c2c5 100644 --- a/parakeet/models/deepvoice3/attention.py +++ b/parakeet/models/deepvoice3/attention.py @@ -3,6 +3,7 @@ from collections import namedtuple from paddle import fluid import paddle.fluid.dygraph as dg import paddle.fluid.layers as F +import paddle.fluid.initializer as I from parakeet.modules.weight_norm import Linear WindowRange = namedtuple("WindowRange", ["backward", "ahead"]) @@ -17,12 +18,24 @@ class Attention(dg.Layer): key_projection=True, value_projection=True): super(Attention, self).__init__() - self.query_proj = Linear(query_dim, embed_dim) + std = np.sqrt(1 / query_dim) + self.query_proj = Linear(query_dim, + embed_dim, + param_attr=I.Normal(scale=std)) if key_projection: - self.key_proj = Linear(embed_dim, embed_dim) + std = np.sqrt(1 / embed_dim) + self.key_proj = Linear(embed_dim, + embed_dim, + param_attr=I.Normal(scale=std)) if value_projection: - self.value_proj = Linear(embed_dim, embed_dim) - self.out_proj = Linear(embed_dim, query_dim) + std = np.sqrt(1 / embed_dim) + self.value_proj = Linear(embed_dim, + embed_dim, + param_attr=I.Normal(scale=std)) + std = np.sqrt(1 / embed_dim) + self.out_proj = Linear(embed_dim, + query_dim, + param_attr=I.Normal(scale=std)) self.key_projection = key_projection self.value_projection = value_projection diff --git a/parakeet/models/deepvoice3/conv1dglu.py b/parakeet/models/deepvoice3/conv1dglu.py index 728b13e..23f0109 100644 --- a/parakeet/models/deepvoice3/conv1dglu.py +++ b/parakeet/models/deepvoice3/conv1dglu.py @@ -42,8 +42,6 @@ class Conv1DGLU(dg.Layer): # weight init and dropout self.std_mul = std_mul self.dropout = dropout - c_in = filter_size * in_channels - std = np.sqrt(std_mul * (1 - dropout) / c_in) self.residual = residual if residual: @@ -51,6 +49,7 @@ class Conv1DGLU(dg.Layer): in_channels == num_filters ), "this block uses residual connection"\ "the input_channes should equals num_filters" + std = np.sqrt(std_mul * (1 - dropout) / (filter_size * in_channels)) self.conv = Conv1DCell(in_channels, 2 * num_filters, filter_size, diff --git a/parakeet/models/deepvoice3/converter.py b/parakeet/models/deepvoice3/converter.py index b88cdb7..7f94805 100644 --- a/parakeet/models/deepvoice3/converter.py +++ b/parakeet/models/deepvoice3/converter.py @@ -13,11 +13,12 @@ from parakeet.models.deepvoice3.encoder import ConvSpec def upsampling_4x_blocks(n_speakers, speaker_dim, target_channels, dropout): # upsampling convolitions upsampling_convolutions = [ - Conv1DTranspose(target_channels, - target_channels, - 2, - stride=2, - param_attr=I.Normal(np.sqrt(1 / target_channels))), + Conv1DTranspose( + target_channels, + target_channels, + 2, + stride=2, + param_attr=I.Normal(scale=np.sqrt(1 / (2 * target_channels)))), Conv1DGLU(n_speakers, speaker_dim, target_channels, @@ -34,12 +35,12 @@ def upsampling_4x_blocks(n_speakers, speaker_dim, target_channels, dropout): dilation=3, std_mul=4., dropout=dropout), - Conv1DTranspose(target_channels, - target_channels, - 2, - stride=2, - param_attr=I.Normal(scale=np.sqrt(4. / - target_channels))), + Conv1DTranspose( + target_channels, + target_channels, + 2, + stride=2, + param_attr=I.Normal(scale=np.sqrt(4. / (2 * target_channels)))), Conv1DGLU(n_speakers, speaker_dim, target_channels, @@ -62,12 +63,12 @@ def upsampling_4x_blocks(n_speakers, speaker_dim, target_channels, dropout): def upsampling_2x_blocks(n_speakers, speaker_dim, target_channels, dropout): upsampling_convolutions = [ - Conv1DTranspose(target_channels, - target_channels, - 2, - stride=2, - param_attr=I.Normal(scale=np.sqrt(1. / - target_channels))), + Conv1DTranspose( + target_channels, + target_channels, + 2, + stride=2, + param_attr=I.Normal(scale=np.sqrt(1. / (2 * target_channels)))), Conv1DGLU(n_speakers, speaker_dim, target_channels, diff --git a/parakeet/models/deepvoice3/encoder.py b/parakeet/models/deepvoice3/encoder.py index c89cf3d..a50ae83 100644 --- a/parakeet/models/deepvoice3/encoder.py +++ b/parakeet/models/deepvoice3/encoder.py @@ -32,7 +32,7 @@ class Encoder(dg.Layer): self.dropout = dropout if n_speakers > 1: - std = np.sqrt((1 - dropout) / speaker_dim) # CAUTION: keep_prob + std = np.sqrt((1 - dropout) / speaker_dim) self.sp_proj1 = Linear(speaker_dim, embed_dim, param_attr=I.Normal(scale=std)) diff --git a/parakeet/models/deepvoice3/loss.py b/parakeet/models/deepvoice3/loss.py index 88362a0..0832c07 100644 --- a/parakeet/models/deepvoice3/loss.py +++ b/parakeet/models/deepvoice3/loss.py @@ -203,16 +203,21 @@ class TTSLoss(object): result = { "mel": mel_loss if compute_mel_loss else None, + "mel_l1_loss": mel_l1_loss if compute_mel_loss else None, + "mel_bce_loss": mel_bce_loss if compute_mel_loss else None, "lin": lin_loss if compute_lin_loss else None, + "lin_l1_loss": lin_l1_loss if compute_lin_loss else None, + "lin_bce_loss": lin_bce_loss if compute_lin_loss else None, "done": done_loss if compute_done_loss else None, "attn": attn_loss if compute_attn_loss else None, } + return result @staticmethod def compose_loss(result): total_loss = 0. - for v in result.values(): - if v is not None: - total_loss += v + for k in ["mel", "lin", "done", "attn"]: + if result[k] is not None: + total_loss += result[k] return total_loss \ No newline at end of file diff --git a/parakeet/models/deepvoice3/position_embedding.py b/parakeet/models/deepvoice3/position_embedding.py index a42ff88..aefb00c 100644 --- a/parakeet/models/deepvoice3/position_embedding.py +++ b/parakeet/models/deepvoice3/position_embedding.py @@ -42,7 +42,7 @@ def position_encoding_init(n_position, embed_range = 2 * (np.arange(d_pos_vec) // 2) radians = position_rate \ * indices_range \ - * np.power(1e4, embed_range / d_pos_vec) + / np.power(1.e4, embed_range / d_pos_vec) if padding_idx is not None: radians[padding_idx] = 0. return radians