deepvoice3: fix a bug in position embedding, fix initialization details for converter and attention.

This commit is contained in:
chenfeiyu 2020-02-17 05:03:39 +00:00
parent 5ad005fd9a
commit 70e271ed95
8 changed files with 55 additions and 31 deletions

View File

@ -227,8 +227,12 @@ if __name__ == "__main__":
lin_specs, done_flags, text_lengths, frames)
l = criterion.compose_loss(losses)
l.backward()
# record learning rate before updating
writer.add_scalar("learning_rate",
optim._learning_rate.step().numpy(),
global_step)
optim.minimize(l, grad_clip=gradient_clipper)
dv3.clear_gradients()
optim.clear_gradients()
# ==================all kinds of tedious things=================
for k in epoch_loss.keys():
@ -237,6 +241,7 @@ if __name__ == "__main__":
# record step loss into tensorboard
step_loss = {k: v.numpy()[0] for k, v in losses.items()}
print(step_loss)
for k, v in step_loss.items():
writer.add_scalar(k, v, global_step)
@ -276,7 +281,7 @@ if __name__ == "__main__":
"Please call Stella.",
"Some have accepted this as a miracle without any physical explanation.",
]
for idx, sent in sentences:
for idx, sent in enumerate(sentences):
wav, attn = eval_model(dv3, sent,
replace_pronounciation_prob,
min_level_db, ref_level_db,

View File

@ -50,7 +50,7 @@ def make_model(n_speakers, speaker_dim, speaker_embed_std, embed_dim,
embed_dim,
n_speakers,
speaker_dim,
padding_idx=padding_idx,
padding_idx=None,
embedding_weight_std=embedding_std,
convolutions=encoder_convolutions,
max_positions=max_positions,
@ -122,6 +122,7 @@ def eval_model(model, text, replace_pronounciation_prob, min_level_db,
text = np.expand_dims(text, 0)
text_positions = np.expand_dims(text_positions, 0)
model.eval()
mel_outputs, linear_outputs, alignments, done = model.transduce(
dg.to_variable(text), dg.to_variable(text_positions))
linear_outputs_np = linear_outputs.numpy()[0].T # (C, T)

View File

@ -3,6 +3,7 @@ from collections import namedtuple
from paddle import fluid
import paddle.fluid.dygraph as dg
import paddle.fluid.layers as F
import paddle.fluid.initializer as I
from parakeet.modules.weight_norm import Linear
WindowRange = namedtuple("WindowRange", ["backward", "ahead"])
@ -17,12 +18,24 @@ class Attention(dg.Layer):
key_projection=True,
value_projection=True):
super(Attention, self).__init__()
self.query_proj = Linear(query_dim, embed_dim)
std = np.sqrt(1 / query_dim)
self.query_proj = Linear(query_dim,
embed_dim,
param_attr=I.Normal(scale=std))
if key_projection:
self.key_proj = Linear(embed_dim, embed_dim)
std = np.sqrt(1 / embed_dim)
self.key_proj = Linear(embed_dim,
embed_dim,
param_attr=I.Normal(scale=std))
if value_projection:
self.value_proj = Linear(embed_dim, embed_dim)
self.out_proj = Linear(embed_dim, query_dim)
std = np.sqrt(1 / embed_dim)
self.value_proj = Linear(embed_dim,
embed_dim,
param_attr=I.Normal(scale=std))
std = np.sqrt(1 / embed_dim)
self.out_proj = Linear(embed_dim,
query_dim,
param_attr=I.Normal(scale=std))
self.key_projection = key_projection
self.value_projection = value_projection

View File

@ -42,8 +42,6 @@ class Conv1DGLU(dg.Layer):
# weight init and dropout
self.std_mul = std_mul
self.dropout = dropout
c_in = filter_size * in_channels
std = np.sqrt(std_mul * (1 - dropout) / c_in)
self.residual = residual
if residual:
@ -51,6 +49,7 @@ class Conv1DGLU(dg.Layer):
in_channels == num_filters
), "this block uses residual connection"\
"the input_channes should equals num_filters"
std = np.sqrt(std_mul * (1 - dropout) / (filter_size * in_channels))
self.conv = Conv1DCell(in_channels,
2 * num_filters,
filter_size,

View File

@ -13,11 +13,12 @@ from parakeet.models.deepvoice3.encoder import ConvSpec
def upsampling_4x_blocks(n_speakers, speaker_dim, target_channels, dropout):
# upsampling convolutions
upsampling_convolutions = [
Conv1DTranspose(target_channels,
Conv1DTranspose(
target_channels,
target_channels,
2,
stride=2,
param_attr=I.Normal(np.sqrt(1 / target_channels))),
param_attr=I.Normal(scale=np.sqrt(1 / (2 * target_channels)))),
Conv1DGLU(n_speakers,
speaker_dim,
target_channels,
@ -34,12 +35,12 @@ def upsampling_4x_blocks(n_speakers, speaker_dim, target_channels, dropout):
dilation=3,
std_mul=4.,
dropout=dropout),
Conv1DTranspose(target_channels,
Conv1DTranspose(
target_channels,
target_channels,
2,
stride=2,
param_attr=I.Normal(scale=np.sqrt(4. /
target_channels))),
param_attr=I.Normal(scale=np.sqrt(4. / (2 * target_channels)))),
Conv1DGLU(n_speakers,
speaker_dim,
target_channels,
@ -62,12 +63,12 @@ def upsampling_4x_blocks(n_speakers, speaker_dim, target_channels, dropout):
def upsampling_2x_blocks(n_speakers, speaker_dim, target_channels, dropout):
upsampling_convolutions = [
Conv1DTranspose(target_channels,
Conv1DTranspose(
target_channels,
target_channels,
2,
stride=2,
param_attr=I.Normal(scale=np.sqrt(1. /
target_channels))),
param_attr=I.Normal(scale=np.sqrt(1. / (2 * target_channels)))),
Conv1DGLU(n_speakers,
speaker_dim,
target_channels,

View File

@ -32,7 +32,7 @@ class Encoder(dg.Layer):
self.dropout = dropout
if n_speakers > 1:
std = np.sqrt((1 - dropout) / speaker_dim) # CAUTION: keep_prob
std = np.sqrt((1 - dropout) / speaker_dim)
self.sp_proj1 = Linear(speaker_dim,
embed_dim,
param_attr=I.Normal(scale=std))

View File

@ -203,16 +203,21 @@ class TTSLoss(object):
result = {
"mel": mel_loss if compute_mel_loss else None,
"mel_l1_loss": mel_l1_loss if compute_mel_loss else None,
"mel_bce_loss": mel_bce_loss if compute_mel_loss else None,
"lin": lin_loss if compute_lin_loss else None,
"lin_l1_loss": lin_l1_loss if compute_lin_loss else None,
"lin_bce_loss": lin_bce_loss if compute_lin_loss else None,
"done": done_loss if compute_done_loss else None,
"attn": attn_loss if compute_attn_loss else None,
}
return result
@staticmethod
def compose_loss(result):
total_loss = 0.
for v in result.values():
if v is not None:
total_loss += v
for k in ["mel", "lin", "done", "attn"]:
if result[k] is not None:
total_loss += result[k]
return total_loss

View File

@ -42,7 +42,7 @@ def position_encoding_init(n_position,
embed_range = 2 * (np.arange(d_pos_vec) // 2)
radians = position_rate \
* indices_range \
* np.power(1e4, embed_range / d_pos_vec)
/ np.power(1.e4, embed_range / d_pos_vec)
if padding_idx is not None:
radians[padding_idx] = 0.
return radians