deepvoice3: fix a bug in position embedding, fix initialization details for converter and attention.
This commit is contained in:
parent
5ad005fd9a
commit
70e271ed95
|
@ -227,8 +227,12 @@ if __name__ == "__main__":
|
|||
lin_specs, done_flags, text_lengths, frames)
|
||||
l = criterion.compose_loss(losses)
|
||||
l.backward()
|
||||
# record learning rate before updating
|
||||
writer.add_scalar("learning_rate",
|
||||
optim._learning_rate.step().numpy(),
|
||||
global_step)
|
||||
optim.minimize(l, grad_clip=gradient_clipper)
|
||||
dv3.clear_gradients()
|
||||
optim.clear_gradients()
|
||||
|
||||
# ==================all kinds of tedious things=================
|
||||
for k in epoch_loss.keys():
|
||||
|
@ -237,6 +241,7 @@ if __name__ == "__main__":
|
|||
|
||||
# record step loss into tensorboard
|
||||
step_loss = {k: v.numpy()[0] for k, v in losses.items()}
|
||||
print(step_loss)
|
||||
for k, v in step_loss.items():
|
||||
writer.add_scalar(k, v, global_step)
|
||||
|
||||
|
@ -276,7 +281,7 @@ if __name__ == "__main__":
|
|||
"Please call Stella.",
|
||||
"Some have accepted this as a miracle without any physical explanation.",
|
||||
]
|
||||
for idx, sent in sentences:
|
||||
for idx, sent in enumerate(sentences):
|
||||
wav, attn = eval_model(dv3, sent,
|
||||
replace_pronounciation_prob,
|
||||
min_level_db, ref_level_db,
|
||||
|
|
|
@ -50,7 +50,7 @@ def make_model(n_speakers, speaker_dim, speaker_embed_std, embed_dim,
|
|||
embed_dim,
|
||||
n_speakers,
|
||||
speaker_dim,
|
||||
padding_idx=padding_idx,
|
||||
padding_idx=None,
|
||||
embedding_weight_std=embedding_std,
|
||||
convolutions=encoder_convolutions,
|
||||
max_positions=max_positions,
|
||||
|
@ -122,6 +122,7 @@ def eval_model(model, text, replace_pronounciation_prob, min_level_db,
|
|||
|
||||
text = np.expand_dims(text, 0)
|
||||
text_positions = np.expand_dims(text_positions, 0)
|
||||
model.eval()
|
||||
mel_outputs, linear_outputs, alignments, done = model.transduce(
|
||||
dg.to_variable(text), dg.to_variable(text_positions))
|
||||
linear_outputs_np = linear_outputs.numpy()[0].T # (C, T)
|
||||
|
|
|
@ -3,6 +3,7 @@ from collections import namedtuple
|
|||
from paddle import fluid
|
||||
import paddle.fluid.dygraph as dg
|
||||
import paddle.fluid.layers as F
|
||||
import paddle.fluid.initializer as I
|
||||
|
||||
from parakeet.modules.weight_norm import Linear
|
||||
WindowRange = namedtuple("WindowRange", ["backward", "ahead"])
|
||||
|
@ -17,12 +18,24 @@ class Attention(dg.Layer):
|
|||
key_projection=True,
|
||||
value_projection=True):
|
||||
super(Attention, self).__init__()
|
||||
self.query_proj = Linear(query_dim, embed_dim)
|
||||
std = np.sqrt(1 / query_dim)
|
||||
self.query_proj = Linear(query_dim,
|
||||
embed_dim,
|
||||
param_attr=I.Normal(scale=std))
|
||||
if key_projection:
|
||||
self.key_proj = Linear(embed_dim, embed_dim)
|
||||
std = np.sqrt(1 / embed_dim)
|
||||
self.key_proj = Linear(embed_dim,
|
||||
embed_dim,
|
||||
param_attr=I.Normal(scale=std))
|
||||
if value_projection:
|
||||
self.value_proj = Linear(embed_dim, embed_dim)
|
||||
self.out_proj = Linear(embed_dim, query_dim)
|
||||
std = np.sqrt(1 / embed_dim)
|
||||
self.value_proj = Linear(embed_dim,
|
||||
embed_dim,
|
||||
param_attr=I.Normal(scale=std))
|
||||
std = np.sqrt(1 / embed_dim)
|
||||
self.out_proj = Linear(embed_dim,
|
||||
query_dim,
|
||||
param_attr=I.Normal(scale=std))
|
||||
|
||||
self.key_projection = key_projection
|
||||
self.value_projection = value_projection
|
||||
|
|
|
@ -42,8 +42,6 @@ class Conv1DGLU(dg.Layer):
|
|||
# weight init and dropout
|
||||
self.std_mul = std_mul
|
||||
self.dropout = dropout
|
||||
c_in = filter_size * in_channels
|
||||
std = np.sqrt(std_mul * (1 - dropout) / c_in)
|
||||
|
||||
self.residual = residual
|
||||
if residual:
|
||||
|
@ -51,6 +49,7 @@ class Conv1DGLU(dg.Layer):
|
|||
in_channels == num_filters
|
||||
), "this block uses residual connection"\
|
||||
"the input_channes should equals num_filters"
|
||||
std = np.sqrt(std_mul * (1 - dropout) / (filter_size * in_channels))
|
||||
self.conv = Conv1DCell(in_channels,
|
||||
2 * num_filters,
|
||||
filter_size,
|
||||
|
|
|
@ -13,11 +13,12 @@ from parakeet.models.deepvoice3.encoder import ConvSpec
|
|||
def upsampling_4x_blocks(n_speakers, speaker_dim, target_channels, dropout):
|
||||
# upsampling convolutions
|
||||
upsampling_convolutions = [
|
||||
Conv1DTranspose(target_channels,
|
||||
Conv1DTranspose(
|
||||
target_channels,
|
||||
target_channels,
|
||||
2,
|
||||
stride=2,
|
||||
param_attr=I.Normal(np.sqrt(1 / target_channels))),
|
||||
param_attr=I.Normal(scale=np.sqrt(1 / (2 * target_channels)))),
|
||||
Conv1DGLU(n_speakers,
|
||||
speaker_dim,
|
||||
target_channels,
|
||||
|
@ -34,12 +35,12 @@ def upsampling_4x_blocks(n_speakers, speaker_dim, target_channels, dropout):
|
|||
dilation=3,
|
||||
std_mul=4.,
|
||||
dropout=dropout),
|
||||
Conv1DTranspose(target_channels,
|
||||
Conv1DTranspose(
|
||||
target_channels,
|
||||
target_channels,
|
||||
2,
|
||||
stride=2,
|
||||
param_attr=I.Normal(scale=np.sqrt(4. /
|
||||
target_channels))),
|
||||
param_attr=I.Normal(scale=np.sqrt(4. / (2 * target_channels)))),
|
||||
Conv1DGLU(n_speakers,
|
||||
speaker_dim,
|
||||
target_channels,
|
||||
|
@ -62,12 +63,12 @@ def upsampling_4x_blocks(n_speakers, speaker_dim, target_channels, dropout):
|
|||
|
||||
def upsampling_2x_blocks(n_speakers, speaker_dim, target_channels, dropout):
|
||||
upsampling_convolutions = [
|
||||
Conv1DTranspose(target_channels,
|
||||
Conv1DTranspose(
|
||||
target_channels,
|
||||
target_channels,
|
||||
2,
|
||||
stride=2,
|
||||
param_attr=I.Normal(scale=np.sqrt(1. /
|
||||
target_channels))),
|
||||
param_attr=I.Normal(scale=np.sqrt(1. / (2 * target_channels)))),
|
||||
Conv1DGLU(n_speakers,
|
||||
speaker_dim,
|
||||
target_channels,
|
||||
|
|
|
@ -32,7 +32,7 @@ class Encoder(dg.Layer):
|
|||
|
||||
self.dropout = dropout
|
||||
if n_speakers > 1:
|
||||
std = np.sqrt((1 - dropout) / speaker_dim) # CAUTION: keep_prob
|
||||
std = np.sqrt((1 - dropout) / speaker_dim)
|
||||
self.sp_proj1 = Linear(speaker_dim,
|
||||
embed_dim,
|
||||
param_attr=I.Normal(scale=std))
|
||||
|
|
|
@ -203,16 +203,21 @@ class TTSLoss(object):
|
|||
|
||||
result = {
|
||||
"mel": mel_loss if compute_mel_loss else None,
|
||||
"mel_l1_loss": mel_l1_loss if compute_mel_loss else None,
|
||||
"mel_bce_loss": mel_bce_loss if compute_mel_loss else None,
|
||||
"lin": lin_loss if compute_lin_loss else None,
|
||||
"lin_l1_loss": lin_l1_loss if compute_lin_loss else None,
|
||||
"lin_bce_loss": lin_bce_loss if compute_lin_loss else None,
|
||||
"done": done_loss if compute_done_loss else None,
|
||||
"attn": attn_loss if compute_attn_loss else None,
|
||||
}
|
||||
|
||||
return result
|
||||
|
||||
@staticmethod
|
||||
def compose_loss(result):
|
||||
total_loss = 0.
|
||||
for v in result.values():
|
||||
if v is not None:
|
||||
total_loss += v
|
||||
for k in ["mel", "lin", "done", "attn"]:
|
||||
if result[k] is not None:
|
||||
total_loss += result[k]
|
||||
return total_loss
|
|
@ -42,7 +42,7 @@ def position_encoding_init(n_position,
|
|||
embed_range = 2 * (np.arange(d_pos_vec) // 2)
|
||||
radians = position_rate \
|
||||
* indices_range \
|
||||
* np.power(1e4, embed_range / d_pos_vec)
|
||||
/ np.power(1.e4, embed_range / d_pos_vec)
|
||||
if padding_idx is not None:
|
||||
radians[padding_idx] = 0.
|
||||
return radians
|
||||
|
|
Loading…
Reference in New Issue