Merge branch 'master' into 'master'
deepvoice3: fix a bug in position embedding, fix initialization details for…
See merge request !13
This commit is contained in:
commit
5b442aaae2
|
@ -227,8 +227,12 @@ if __name__ == "__main__":
|
||||||
lin_specs, done_flags, text_lengths, frames)
|
lin_specs, done_flags, text_lengths, frames)
|
||||||
l = criterion.compose_loss(losses)
|
l = criterion.compose_loss(losses)
|
||||||
l.backward()
|
l.backward()
|
||||||
|
# record learning rate before updating
|
||||||
|
writer.add_scalar("learning_rate",
|
||||||
|
optim._learning_rate.step().numpy(),
|
||||||
|
global_step)
|
||||||
optim.minimize(l, grad_clip=gradient_clipper)
|
optim.minimize(l, grad_clip=gradient_clipper)
|
||||||
dv3.clear_gradients()
|
optim.clear_gradients()
|
||||||
|
|
||||||
# ==================all kinds of tedious things=================
|
# ==================all kinds of tedious things=================
|
||||||
for k in epoch_loss.keys():
|
for k in epoch_loss.keys():
|
||||||
|
@ -237,6 +241,7 @@ if __name__ == "__main__":
|
||||||
|
|
||||||
# record step loss into tensorboard
|
# record step loss into tensorboard
|
||||||
step_loss = {k: v.numpy()[0] for k, v in losses.items()}
|
step_loss = {k: v.numpy()[0] for k, v in losses.items()}
|
||||||
|
print(step_loss)
|
||||||
for k, v in step_loss.items():
|
for k, v in step_loss.items():
|
||||||
writer.add_scalar(k, v, global_step)
|
writer.add_scalar(k, v, global_step)
|
||||||
|
|
||||||
|
@ -276,7 +281,7 @@ if __name__ == "__main__":
|
||||||
"Please call Stella.",
|
"Please call Stella.",
|
||||||
"Some have accepted this as a miracle without any physical explanation.",
|
"Some have accepted this as a miracle without any physical explanation.",
|
||||||
]
|
]
|
||||||
for idx, sent in sentences:
|
for idx, sent in enumerate(sentences):
|
||||||
wav, attn = eval_model(dv3, sent,
|
wav, attn = eval_model(dv3, sent,
|
||||||
replace_pronounciation_prob,
|
replace_pronounciation_prob,
|
||||||
min_level_db, ref_level_db,
|
min_level_db, ref_level_db,
|
||||||
|
|
|
@ -50,7 +50,7 @@ def make_model(n_speakers, speaker_dim, speaker_embed_std, embed_dim,
|
||||||
embed_dim,
|
embed_dim,
|
||||||
n_speakers,
|
n_speakers,
|
||||||
speaker_dim,
|
speaker_dim,
|
||||||
padding_idx=padding_idx,
|
padding_idx=None,
|
||||||
embedding_weight_std=embedding_std,
|
embedding_weight_std=embedding_std,
|
||||||
convolutions=encoder_convolutions,
|
convolutions=encoder_convolutions,
|
||||||
max_positions=max_positions,
|
max_positions=max_positions,
|
||||||
|
@ -122,6 +122,7 @@ def eval_model(model, text, replace_pronounciation_prob, min_level_db,
|
||||||
|
|
||||||
text = np.expand_dims(text, 0)
|
text = np.expand_dims(text, 0)
|
||||||
text_positions = np.expand_dims(text_positions, 0)
|
text_positions = np.expand_dims(text_positions, 0)
|
||||||
|
model.eval()
|
||||||
mel_outputs, linear_outputs, alignments, done = model.transduce(
|
mel_outputs, linear_outputs, alignments, done = model.transduce(
|
||||||
dg.to_variable(text), dg.to_variable(text_positions))
|
dg.to_variable(text), dg.to_variable(text_positions))
|
||||||
linear_outputs_np = linear_outputs.numpy()[0].T # (C, T)
|
linear_outputs_np = linear_outputs.numpy()[0].T # (C, T)
|
||||||
|
|
|
@ -3,6 +3,7 @@ from collections import namedtuple
|
||||||
from paddle import fluid
|
from paddle import fluid
|
||||||
import paddle.fluid.dygraph as dg
|
import paddle.fluid.dygraph as dg
|
||||||
import paddle.fluid.layers as F
|
import paddle.fluid.layers as F
|
||||||
|
import paddle.fluid.initializer as I
|
||||||
|
|
||||||
from parakeet.modules.weight_norm import Linear
|
from parakeet.modules.weight_norm import Linear
|
||||||
WindowRange = namedtuple("WindowRange", ["backward", "ahead"])
|
WindowRange = namedtuple("WindowRange", ["backward", "ahead"])
|
||||||
|
@ -17,12 +18,24 @@ class Attention(dg.Layer):
|
||||||
key_projection=True,
|
key_projection=True,
|
||||||
value_projection=True):
|
value_projection=True):
|
||||||
super(Attention, self).__init__()
|
super(Attention, self).__init__()
|
||||||
self.query_proj = Linear(query_dim, embed_dim)
|
std = np.sqrt(1 / query_dim)
|
||||||
|
self.query_proj = Linear(query_dim,
|
||||||
|
embed_dim,
|
||||||
|
param_attr=I.Normal(scale=std))
|
||||||
if key_projection:
|
if key_projection:
|
||||||
self.key_proj = Linear(embed_dim, embed_dim)
|
std = np.sqrt(1 / embed_dim)
|
||||||
|
self.key_proj = Linear(embed_dim,
|
||||||
|
embed_dim,
|
||||||
|
param_attr=I.Normal(scale=std))
|
||||||
if value_projection:
|
if value_projection:
|
||||||
self.value_proj = Linear(embed_dim, embed_dim)
|
std = np.sqrt(1 / embed_dim)
|
||||||
self.out_proj = Linear(embed_dim, query_dim)
|
self.value_proj = Linear(embed_dim,
|
||||||
|
embed_dim,
|
||||||
|
param_attr=I.Normal(scale=std))
|
||||||
|
std = np.sqrt(1 / embed_dim)
|
||||||
|
self.out_proj = Linear(embed_dim,
|
||||||
|
query_dim,
|
||||||
|
param_attr=I.Normal(scale=std))
|
||||||
|
|
||||||
self.key_projection = key_projection
|
self.key_projection = key_projection
|
||||||
self.value_projection = value_projection
|
self.value_projection = value_projection
|
||||||
|
|
|
@ -42,8 +42,6 @@ class Conv1DGLU(dg.Layer):
|
||||||
# weight init and dropout
|
# weight init and dropout
|
||||||
self.std_mul = std_mul
|
self.std_mul = std_mul
|
||||||
self.dropout = dropout
|
self.dropout = dropout
|
||||||
c_in = filter_size * in_channels
|
|
||||||
std = np.sqrt(std_mul * (1 - dropout) / c_in)
|
|
||||||
|
|
||||||
self.residual = residual
|
self.residual = residual
|
||||||
if residual:
|
if residual:
|
||||||
|
@ -51,6 +49,7 @@ class Conv1DGLU(dg.Layer):
|
||||||
in_channels == num_filters
|
in_channels == num_filters
|
||||||
), "this block uses residual connection"\
|
), "this block uses residual connection"\
|
||||||
"the input_channes should equals num_filters"
|
"the input_channes should equals num_filters"
|
||||||
|
std = np.sqrt(std_mul * (1 - dropout) / (filter_size * in_channels))
|
||||||
self.conv = Conv1DCell(in_channels,
|
self.conv = Conv1DCell(in_channels,
|
||||||
2 * num_filters,
|
2 * num_filters,
|
||||||
filter_size,
|
filter_size,
|
||||||
|
|
|
@ -13,11 +13,12 @@ from parakeet.models.deepvoice3.encoder import ConvSpec
|
||||||
def upsampling_4x_blocks(n_speakers, speaker_dim, target_channels, dropout):
|
def upsampling_4x_blocks(n_speakers, speaker_dim, target_channels, dropout):
|
||||||
# upsampling convolitions
|
# upsampling convolitions
|
||||||
upsampling_convolutions = [
|
upsampling_convolutions = [
|
||||||
Conv1DTranspose(target_channels,
|
Conv1DTranspose(
|
||||||
|
target_channels,
|
||||||
target_channels,
|
target_channels,
|
||||||
2,
|
2,
|
||||||
stride=2,
|
stride=2,
|
||||||
param_attr=I.Normal(np.sqrt(1 / target_channels))),
|
param_attr=I.Normal(scale=np.sqrt(1 / (2 * target_channels)))),
|
||||||
Conv1DGLU(n_speakers,
|
Conv1DGLU(n_speakers,
|
||||||
speaker_dim,
|
speaker_dim,
|
||||||
target_channels,
|
target_channels,
|
||||||
|
@ -34,12 +35,12 @@ def upsampling_4x_blocks(n_speakers, speaker_dim, target_channels, dropout):
|
||||||
dilation=3,
|
dilation=3,
|
||||||
std_mul=4.,
|
std_mul=4.,
|
||||||
dropout=dropout),
|
dropout=dropout),
|
||||||
Conv1DTranspose(target_channels,
|
Conv1DTranspose(
|
||||||
|
target_channels,
|
||||||
target_channels,
|
target_channels,
|
||||||
2,
|
2,
|
||||||
stride=2,
|
stride=2,
|
||||||
param_attr=I.Normal(scale=np.sqrt(4. /
|
param_attr=I.Normal(scale=np.sqrt(4. / (2 * target_channels)))),
|
||||||
target_channels))),
|
|
||||||
Conv1DGLU(n_speakers,
|
Conv1DGLU(n_speakers,
|
||||||
speaker_dim,
|
speaker_dim,
|
||||||
target_channels,
|
target_channels,
|
||||||
|
@ -62,12 +63,12 @@ def upsampling_4x_blocks(n_speakers, speaker_dim, target_channels, dropout):
|
||||||
|
|
||||||
def upsampling_2x_blocks(n_speakers, speaker_dim, target_channels, dropout):
|
def upsampling_2x_blocks(n_speakers, speaker_dim, target_channels, dropout):
|
||||||
upsampling_convolutions = [
|
upsampling_convolutions = [
|
||||||
Conv1DTranspose(target_channels,
|
Conv1DTranspose(
|
||||||
|
target_channels,
|
||||||
target_channels,
|
target_channels,
|
||||||
2,
|
2,
|
||||||
stride=2,
|
stride=2,
|
||||||
param_attr=I.Normal(scale=np.sqrt(1. /
|
param_attr=I.Normal(scale=np.sqrt(1. / (2 * target_channels)))),
|
||||||
target_channels))),
|
|
||||||
Conv1DGLU(n_speakers,
|
Conv1DGLU(n_speakers,
|
||||||
speaker_dim,
|
speaker_dim,
|
||||||
target_channels,
|
target_channels,
|
||||||
|
|
|
@ -32,7 +32,7 @@ class Encoder(dg.Layer):
|
||||||
|
|
||||||
self.dropout = dropout
|
self.dropout = dropout
|
||||||
if n_speakers > 1:
|
if n_speakers > 1:
|
||||||
std = np.sqrt((1 - dropout) / speaker_dim) # CAUTION: keep_prob
|
std = np.sqrt((1 - dropout) / speaker_dim)
|
||||||
self.sp_proj1 = Linear(speaker_dim,
|
self.sp_proj1 = Linear(speaker_dim,
|
||||||
embed_dim,
|
embed_dim,
|
||||||
param_attr=I.Normal(scale=std))
|
param_attr=I.Normal(scale=std))
|
||||||
|
|
|
@ -203,16 +203,21 @@ class TTSLoss(object):
|
||||||
|
|
||||||
result = {
|
result = {
|
||||||
"mel": mel_loss if compute_mel_loss else None,
|
"mel": mel_loss if compute_mel_loss else None,
|
||||||
|
"mel_l1_loss": mel_l1_loss if compute_mel_loss else None,
|
||||||
|
"mel_bce_loss": mel_bce_loss if compute_mel_loss else None,
|
||||||
"lin": lin_loss if compute_lin_loss else None,
|
"lin": lin_loss if compute_lin_loss else None,
|
||||||
|
"lin_l1_loss": lin_l1_loss if compute_lin_loss else None,
|
||||||
|
"lin_bce_loss": lin_bce_loss if compute_lin_loss else None,
|
||||||
"done": done_loss if compute_done_loss else None,
|
"done": done_loss if compute_done_loss else None,
|
||||||
"attn": attn_loss if compute_attn_loss else None,
|
"attn": attn_loss if compute_attn_loss else None,
|
||||||
}
|
}
|
||||||
|
|
||||||
return result
|
return result
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def compose_loss(result):
|
def compose_loss(result):
|
||||||
total_loss = 0.
|
total_loss = 0.
|
||||||
for v in result.values():
|
for k in ["mel", "lin", "done", "attn"]:
|
||||||
if v is not None:
|
if result[k] is not None:
|
||||||
total_loss += v
|
total_loss += result[k]
|
||||||
return total_loss
|
return total_loss
|
|
@ -42,7 +42,7 @@ def position_encoding_init(n_position,
|
||||||
embed_range = 2 * (np.arange(d_pos_vec) // 2)
|
embed_range = 2 * (np.arange(d_pos_vec) // 2)
|
||||||
radians = position_rate \
|
radians = position_rate \
|
||||||
* indices_range \
|
* indices_range \
|
||||||
* np.power(1e4, embed_range / d_pos_vec)
|
/ np.power(1.e4, embed_range / d_pos_vec)
|
||||||
if padding_idx is not None:
|
if padding_idx is not None:
|
||||||
radians[padding_idx] = 0.
|
radians[padding_idx] = 0.
|
||||||
return radians
|
return radians
|
||||||
|
|
Loading…
Reference in New Issue