deepvoice3: fix a bug in position embedding, fix initialization details for converter and attention.

This commit is contained in:
chenfeiyu 2020-02-17 05:03:39 +00:00
parent 5ad005fd9a
commit 70e271ed95
8 changed files with 55 additions and 31 deletions

View File

@ -227,8 +227,12 @@ if __name__ == "__main__":
lin_specs, done_flags, text_lengths, frames) lin_specs, done_flags, text_lengths, frames)
l = criterion.compose_loss(losses) l = criterion.compose_loss(losses)
l.backward() l.backward()
# record learning rate before updating
writer.add_scalar("learning_rate",
optim._learning_rate.step().numpy(),
global_step)
optim.minimize(l, grad_clip=gradient_clipper) optim.minimize(l, grad_clip=gradient_clipper)
dv3.clear_gradients() optim.clear_gradients()
# ==================all kinds of tedious things================= # ==================all kinds of tedious things=================
for k in epoch_loss.keys(): for k in epoch_loss.keys():
@ -237,6 +241,7 @@ if __name__ == "__main__":
# record step loss into tensorboard # record step loss into tensorboard
step_loss = {k: v.numpy()[0] for k, v in losses.items()} step_loss = {k: v.numpy()[0] for k, v in losses.items()}
print(step_loss)
for k, v in step_loss.items(): for k, v in step_loss.items():
writer.add_scalar(k, v, global_step) writer.add_scalar(k, v, global_step)
@ -276,7 +281,7 @@ if __name__ == "__main__":
"Please call Stella.", "Please call Stella.",
"Some have accepted this as a miracle without any physical explanation.", "Some have accepted this as a miracle without any physical explanation.",
] ]
for idx, sent in sentences: for idx, sent in enumerate(sentences):
wav, attn = eval_model(dv3, sent, wav, attn = eval_model(dv3, sent,
replace_pronounciation_prob, replace_pronounciation_prob,
min_level_db, ref_level_db, min_level_db, ref_level_db,

View File

@ -50,7 +50,7 @@ def make_model(n_speakers, speaker_dim, speaker_embed_std, embed_dim,
embed_dim, embed_dim,
n_speakers, n_speakers,
speaker_dim, speaker_dim,
padding_idx=padding_idx, padding_idx=None,
embedding_weight_std=embedding_std, embedding_weight_std=embedding_std,
convolutions=encoder_convolutions, convolutions=encoder_convolutions,
max_positions=max_positions, max_positions=max_positions,
@ -122,6 +122,7 @@ def eval_model(model, text, replace_pronounciation_prob, min_level_db,
text = np.expand_dims(text, 0) text = np.expand_dims(text, 0)
text_positions = np.expand_dims(text_positions, 0) text_positions = np.expand_dims(text_positions, 0)
model.eval()
mel_outputs, linear_outputs, alignments, done = model.transduce( mel_outputs, linear_outputs, alignments, done = model.transduce(
dg.to_variable(text), dg.to_variable(text_positions)) dg.to_variable(text), dg.to_variable(text_positions))
linear_outputs_np = linear_outputs.numpy()[0].T # (C, T) linear_outputs_np = linear_outputs.numpy()[0].T # (C, T)

View File

@ -3,6 +3,7 @@ from collections import namedtuple
from paddle import fluid from paddle import fluid
import paddle.fluid.dygraph as dg import paddle.fluid.dygraph as dg
import paddle.fluid.layers as F import paddle.fluid.layers as F
import paddle.fluid.initializer as I
from parakeet.modules.weight_norm import Linear from parakeet.modules.weight_norm import Linear
WindowRange = namedtuple("WindowRange", ["backward", "ahead"]) WindowRange = namedtuple("WindowRange", ["backward", "ahead"])
@ -17,12 +18,24 @@ class Attention(dg.Layer):
key_projection=True, key_projection=True,
value_projection=True): value_projection=True):
super(Attention, self).__init__() super(Attention, self).__init__()
self.query_proj = Linear(query_dim, embed_dim) std = np.sqrt(1 / query_dim)
self.query_proj = Linear(query_dim,
embed_dim,
param_attr=I.Normal(scale=std))
if key_projection: if key_projection:
self.key_proj = Linear(embed_dim, embed_dim) std = np.sqrt(1 / embed_dim)
self.key_proj = Linear(embed_dim,
embed_dim,
param_attr=I.Normal(scale=std))
if value_projection: if value_projection:
self.value_proj = Linear(embed_dim, embed_dim) std = np.sqrt(1 / embed_dim)
self.out_proj = Linear(embed_dim, query_dim) self.value_proj = Linear(embed_dim,
embed_dim,
param_attr=I.Normal(scale=std))
std = np.sqrt(1 / embed_dim)
self.out_proj = Linear(embed_dim,
query_dim,
param_attr=I.Normal(scale=std))
self.key_projection = key_projection self.key_projection = key_projection
self.value_projection = value_projection self.value_projection = value_projection

View File

@ -42,8 +42,6 @@ class Conv1DGLU(dg.Layer):
# weight init and dropout # weight init and dropout
self.std_mul = std_mul self.std_mul = std_mul
self.dropout = dropout self.dropout = dropout
c_in = filter_size * in_channels
std = np.sqrt(std_mul * (1 - dropout) / c_in)
self.residual = residual self.residual = residual
if residual: if residual:
@ -51,6 +49,7 @@ class Conv1DGLU(dg.Layer):
in_channels == num_filters in_channels == num_filters
), "this block uses residual connection"\ ), "this block uses residual connection"\
"the input_channes should equals num_filters" "the input_channes should equals num_filters"
std = np.sqrt(std_mul * (1 - dropout) / (filter_size * in_channels))
self.conv = Conv1DCell(in_channels, self.conv = Conv1DCell(in_channels,
2 * num_filters, 2 * num_filters,
filter_size, filter_size,

View File

@ -13,11 +13,12 @@ from parakeet.models.deepvoice3.encoder import ConvSpec
def upsampling_4x_blocks(n_speakers, speaker_dim, target_channels, dropout): def upsampling_4x_blocks(n_speakers, speaker_dim, target_channels, dropout):
# upsampling convolutions # upsampling convolutions
upsampling_convolutions = [ upsampling_convolutions = [
Conv1DTranspose(target_channels, Conv1DTranspose(
target_channels, target_channels,
2, target_channels,
stride=2, 2,
param_attr=I.Normal(np.sqrt(1 / target_channels))), stride=2,
param_attr=I.Normal(scale=np.sqrt(1 / (2 * target_channels)))),
Conv1DGLU(n_speakers, Conv1DGLU(n_speakers,
speaker_dim, speaker_dim,
target_channels, target_channels,
@ -34,12 +35,12 @@ def upsampling_4x_blocks(n_speakers, speaker_dim, target_channels, dropout):
dilation=3, dilation=3,
std_mul=4., std_mul=4.,
dropout=dropout), dropout=dropout),
Conv1DTranspose(target_channels, Conv1DTranspose(
target_channels, target_channels,
2, target_channels,
stride=2, 2,
param_attr=I.Normal(scale=np.sqrt(4. / stride=2,
target_channels))), param_attr=I.Normal(scale=np.sqrt(4. / (2 * target_channels)))),
Conv1DGLU(n_speakers, Conv1DGLU(n_speakers,
speaker_dim, speaker_dim,
target_channels, target_channels,
@ -62,12 +63,12 @@ def upsampling_4x_blocks(n_speakers, speaker_dim, target_channels, dropout):
def upsampling_2x_blocks(n_speakers, speaker_dim, target_channels, dropout): def upsampling_2x_blocks(n_speakers, speaker_dim, target_channels, dropout):
upsampling_convolutions = [ upsampling_convolutions = [
Conv1DTranspose(target_channels, Conv1DTranspose(
target_channels, target_channels,
2, target_channels,
stride=2, 2,
param_attr=I.Normal(scale=np.sqrt(1. / stride=2,
target_channels))), param_attr=I.Normal(scale=np.sqrt(1. / (2 * target_channels)))),
Conv1DGLU(n_speakers, Conv1DGLU(n_speakers,
speaker_dim, speaker_dim,
target_channels, target_channels,

View File

@ -32,7 +32,7 @@ class Encoder(dg.Layer):
self.dropout = dropout self.dropout = dropout
if n_speakers > 1: if n_speakers > 1:
std = np.sqrt((1 - dropout) / speaker_dim) # CAUTION: keep_prob std = np.sqrt((1 - dropout) / speaker_dim)
self.sp_proj1 = Linear(speaker_dim, self.sp_proj1 = Linear(speaker_dim,
embed_dim, embed_dim,
param_attr=I.Normal(scale=std)) param_attr=I.Normal(scale=std))

View File

@ -203,16 +203,21 @@ class TTSLoss(object):
result = { result = {
"mel": mel_loss if compute_mel_loss else None, "mel": mel_loss if compute_mel_loss else None,
"mel_l1_loss": mel_l1_loss if compute_mel_loss else None,
"mel_bce_loss": mel_bce_loss if compute_mel_loss else None,
"lin": lin_loss if compute_lin_loss else None, "lin": lin_loss if compute_lin_loss else None,
"lin_l1_loss": lin_l1_loss if compute_lin_loss else None,
"lin_bce_loss": lin_bce_loss if compute_lin_loss else None,
"done": done_loss if compute_done_loss else None, "done": done_loss if compute_done_loss else None,
"attn": attn_loss if compute_attn_loss else None, "attn": attn_loss if compute_attn_loss else None,
} }
return result return result
@staticmethod @staticmethod
def compose_loss(result): def compose_loss(result):
total_loss = 0. total_loss = 0.
for v in result.values(): for k in ["mel", "lin", "done", "attn"]:
if v is not None: if result[k] is not None:
total_loss += v total_loss += result[k]
return total_loss return total_loss

View File

@ -42,7 +42,7 @@ def position_encoding_init(n_position,
embed_range = 2 * (np.arange(d_pos_vec) // 2) embed_range = 2 * (np.arange(d_pos_vec) // 2)
radians = position_rate \ radians = position_rate \
* indices_range \ * indices_range \
* np.power(1e4, embed_range / d_pos_vec) / np.power(1.e4, embed_range / d_pos_vec)
if padding_idx is not None: if padding_idx is not None:
radians[padding_idx] = 0. radians[padding_idx] = 0.
return radians return radians