update fastspeech
This commit is contained in:
parent
ab0fe8f304
commit
f009411b57
|
@ -14,7 +14,6 @@ encoder_n_layer: 6
|
||||||
encoder_head: 2
|
encoder_head: 2
|
||||||
encoder_conv1d_filter_size: 1536
|
encoder_conv1d_filter_size: 1536
|
||||||
max_sep_len: 2048
|
max_sep_len: 2048
|
||||||
fs_embedding_size: 384
|
|
||||||
decoder_n_layer: 6
|
decoder_n_layer: 6
|
||||||
decoder_head: 2
|
decoder_head: 2
|
||||||
decoder_conv1d_filter_size: 1536
|
decoder_conv1d_filter_size: 1536
|
||||||
|
@ -39,6 +38,6 @@ use_gpu: True
|
||||||
use_data_parallel: False
|
use_data_parallel: False
|
||||||
|
|
||||||
data_path: ../../../dataset/LJSpeech-1.1
|
data_path: ../../../dataset/LJSpeech-1.1
|
||||||
transtts_path: ../transformerTTS/checkpoint
|
transtts_path: ../transformerTTS/checkpoint/
|
||||||
transformer_step: 1
|
transformer_step: 10
|
||||||
log_dir: ./log
|
log_dir: ./log
|
|
@ -4,7 +4,7 @@ import utils
|
||||||
import paddle.fluid.dygraph as dg
|
import paddle.fluid.dygraph as dg
|
||||||
import paddle.fluid.layers as layers
|
import paddle.fluid.layers as layers
|
||||||
import paddle.fluid as fluid
|
import paddle.fluid as fluid
|
||||||
from parakeet.modules.layers import Conv1D
|
from parakeet.modules.layers import Conv, Linear
|
||||||
from parakeet.modules.multihead_attention import MultiheadAttention
|
from parakeet.modules.multihead_attention import MultiheadAttention
|
||||||
from parakeet.modules.feed_forward import PositionwiseFeedForward
|
from parakeet.modules.feed_forward import PositionwiseFeedForward
|
||||||
|
|
||||||
|
@ -113,12 +113,12 @@ class DurationPredictor(dg.Layer):
|
||||||
self.filter_size = filter_size
|
self.filter_size = filter_size
|
||||||
self.dropout = dropout
|
self.dropout = dropout
|
||||||
|
|
||||||
self.conv1 = Conv1D(in_channels = self.input_size,
|
self.conv1 = Conv(in_channels = self.input_size,
|
||||||
out_channels = self.out_channels,
|
out_channels = self.out_channels,
|
||||||
filter_size = self.filter_size,
|
filter_size = self.filter_size,
|
||||||
padding=1,
|
padding=1,
|
||||||
data_format='NTC')
|
data_format='NTC')
|
||||||
self.conv2 = Conv1D(in_channels = self.out_channels,
|
self.conv2 = Conv(in_channels = self.out_channels,
|
||||||
out_channels = self.out_channels,
|
out_channels = self.out_channels,
|
||||||
filter_size = self.filter_size,
|
filter_size = self.filter_size,
|
||||||
padding=1,
|
padding=1,
|
||||||
|
@ -126,7 +126,7 @@ class DurationPredictor(dg.Layer):
|
||||||
self.layer_norm1 = dg.LayerNorm(self.out_channels)
|
self.layer_norm1 = dg.LayerNorm(self.out_channels)
|
||||||
self.layer_norm2 = dg.LayerNorm(self.out_channels)
|
self.layer_norm2 = dg.LayerNorm(self.out_channels)
|
||||||
|
|
||||||
self.linear =dg.Linear(self.out_channels, 1)
|
self.linear =Linear(self.out_channels, 1)
|
||||||
|
|
||||||
def forward(self, encoder_output):
|
def forward(self, encoder_output):
|
||||||
"""
|
"""
|
||||||
|
|
|
@ -5,12 +5,12 @@ import paddle.fluid as fluid
|
||||||
from parakeet.g2p.text.symbols import symbols
|
from parakeet.g2p.text.symbols import symbols
|
||||||
from parakeet.modules.utils import *
|
from parakeet.modules.utils import *
|
||||||
from parakeet.modules.post_convnet import PostConvNet
|
from parakeet.modules.post_convnet import PostConvNet
|
||||||
|
from parakeet.modules.layers import Linear
|
||||||
|
|
||||||
class Encoder(dg.Layer):
|
class Encoder(dg.Layer):
|
||||||
def __init__(self,
|
def __init__(self,
|
||||||
n_src_vocab,
|
n_src_vocab,
|
||||||
len_max_seq,
|
len_max_seq,
|
||||||
d_word_vec,
|
|
||||||
n_layers,
|
n_layers,
|
||||||
n_head,
|
n_head,
|
||||||
d_k,
|
d_k,
|
||||||
|
@ -23,9 +23,9 @@ class Encoder(dg.Layer):
|
||||||
super(Encoder, self).__init__()
|
super(Encoder, self).__init__()
|
||||||
n_position = len_max_seq + 1
|
n_position = len_max_seq + 1
|
||||||
|
|
||||||
self.src_word_emb = dg.Embedding(size=[n_src_vocab, d_word_vec], padding_idx=0)
|
self.src_word_emb = dg.Embedding(size=[n_src_vocab, d_model], padding_idx=0)
|
||||||
self.pos_inp = get_sinusoid_encoding_table(n_position, d_word_vec, padding_idx=0)
|
self.pos_inp = get_sinusoid_encoding_table(n_position, d_model, padding_idx=0)
|
||||||
self.position_enc = dg.Embedding(size=[n_position, d_word_vec],
|
self.position_enc = dg.Embedding(size=[n_position, d_model],
|
||||||
padding_idx=0,
|
padding_idx=0,
|
||||||
param_attr=fluid.ParamAttr(
|
param_attr=fluid.ParamAttr(
|
||||||
initializer=fluid.initializer.NumpyArrayInitializer(self.pos_inp),
|
initializer=fluid.initializer.NumpyArrayInitializer(self.pos_inp),
|
||||||
|
@ -70,7 +70,6 @@ class Encoder(dg.Layer):
|
||||||
class Decoder(dg.Layer):
|
class Decoder(dg.Layer):
|
||||||
def __init__(self,
|
def __init__(self,
|
||||||
len_max_seq,
|
len_max_seq,
|
||||||
d_word_vec,
|
|
||||||
n_layers,
|
n_layers,
|
||||||
n_head,
|
n_head,
|
||||||
d_k,
|
d_k,
|
||||||
|
@ -83,8 +82,8 @@ class Decoder(dg.Layer):
|
||||||
super(Decoder, self).__init__()
|
super(Decoder, self).__init__()
|
||||||
|
|
||||||
n_position = len_max_seq + 1
|
n_position = len_max_seq + 1
|
||||||
self.pos_inp = get_sinusoid_encoding_table(n_position, d_word_vec, padding_idx=0)
|
self.pos_inp = get_sinusoid_encoding_table(n_position, d_model, padding_idx=0)
|
||||||
self.position_enc = dg.Embedding(size=[n_position, d_word_vec],
|
self.position_enc = dg.Embedding(size=[n_position, d_model],
|
||||||
padding_idx=0,
|
padding_idx=0,
|
||||||
param_attr=fluid.ParamAttr(
|
param_attr=fluid.ParamAttr(
|
||||||
initializer=fluid.initializer.NumpyArrayInitializer(self.pos_inp),
|
initializer=fluid.initializer.NumpyArrayInitializer(self.pos_inp),
|
||||||
|
@ -131,11 +130,10 @@ class FastSpeech(dg.Layer):
|
||||||
|
|
||||||
self.encoder = Encoder(n_src_vocab=len(symbols)+1,
|
self.encoder = Encoder(n_src_vocab=len(symbols)+1,
|
||||||
len_max_seq=cfg.max_sep_len,
|
len_max_seq=cfg.max_sep_len,
|
||||||
d_word_vec=cfg.fs_embedding_size,
|
|
||||||
n_layers=cfg.encoder_n_layer,
|
n_layers=cfg.encoder_n_layer,
|
||||||
n_head=cfg.encoder_head,
|
n_head=cfg.encoder_head,
|
||||||
d_k=64,
|
d_k=cfg.fs_hidden_size // cfg.encoder_head,
|
||||||
d_v=64,
|
d_v=cfg.fs_hidden_size // cfg.encoder_head,
|
||||||
d_model=cfg.fs_hidden_size,
|
d_model=cfg.fs_hidden_size,
|
||||||
d_inner=cfg.encoder_conv1d_filter_size,
|
d_inner=cfg.encoder_conv1d_filter_size,
|
||||||
fft_conv1d_kernel=cfg.fft_conv1d_filter,
|
fft_conv1d_kernel=cfg.fft_conv1d_filter,
|
||||||
|
@ -146,17 +144,16 @@ class FastSpeech(dg.Layer):
|
||||||
filter_size=cfg.duration_predictor_filter_size,
|
filter_size=cfg.duration_predictor_filter_size,
|
||||||
dropout=cfg.dropout)
|
dropout=cfg.dropout)
|
||||||
self.decoder = Decoder(len_max_seq=cfg.max_sep_len,
|
self.decoder = Decoder(len_max_seq=cfg.max_sep_len,
|
||||||
d_word_vec=cfg.fs_embedding_size,
|
|
||||||
n_layers=cfg.decoder_n_layer,
|
n_layers=cfg.decoder_n_layer,
|
||||||
n_head=cfg.decoder_head,
|
n_head=cfg.decoder_head,
|
||||||
d_k=64,
|
d_k=cfg.fs_hidden_size // cfg.decoder_head,
|
||||||
d_v=64,
|
d_v=cfg.fs_hidden_size // cfg.decoder_head,
|
||||||
d_model=cfg.fs_hidden_size,
|
d_model=cfg.fs_hidden_size,
|
||||||
d_inner=cfg.decoder_conv1d_filter_size,
|
d_inner=cfg.decoder_conv1d_filter_size,
|
||||||
fft_conv1d_kernel=cfg.fft_conv1d_filter,
|
fft_conv1d_kernel=cfg.fft_conv1d_filter,
|
||||||
fft_conv1d_padding=cfg.fft_conv1d_padding,
|
fft_conv1d_padding=cfg.fft_conv1d_padding,
|
||||||
dropout=0.1)
|
dropout=0.1)
|
||||||
self.mel_linear = dg.Linear(cfg.fs_hidden_size, cfg.audio.num_mels * cfg.audio.outputs_per_step)
|
self.mel_linear = Linear(cfg.fs_hidden_size, cfg.audio.num_mels * cfg.audio.outputs_per_step)
|
||||||
self.postnet = PostConvNet(n_mels=cfg.audio.num_mels,
|
self.postnet = PostConvNet(n_mels=cfg.audio.num_mels,
|
||||||
num_hidden=512,
|
num_hidden=512,
|
||||||
filter_size=5,
|
filter_size=5,
|
||||||
|
|
|
@ -22,8 +22,6 @@ def add_config_options_to_parser(parser):
|
||||||
parser.add_argument('--audio.outputs_per_step', type=int, default=1,
|
parser.add_argument('--audio.outputs_per_step', type=int, default=1,
|
||||||
help="the outputs per step.")
|
help="the outputs per step.")
|
||||||
|
|
||||||
parser.add_argument('--fs_embedding_size', type=int, default=256,
|
|
||||||
help="the dim size of embedding of fastspeech.")
|
|
||||||
parser.add_argument('--encoder_n_layer', type=int, default=6,
|
parser.add_argument('--encoder_n_layer', type=int, default=6,
|
||||||
help="the number of FFT Block in encoder.")
|
help="the number of FFT Block in encoder.")
|
||||||
parser.add_argument('--encoder_head', type=int, default=2,
|
parser.add_argument('--encoder_head', type=int, default=2,
|
||||||
|
|
|
@ -55,14 +55,13 @@ def main(cfg):
|
||||||
writer = SummaryWriter(path) if local_rank == 0 else None
|
writer = SummaryWriter(path) if local_rank == 0 else None
|
||||||
|
|
||||||
with dg.guard(place):
|
with dg.guard(place):
|
||||||
transformerTTS = TransformerTTS(cfg)
|
with fluid.unique_name.guard():
|
||||||
model_path = os.path.join(cfg.transtts_path, "transformer")
|
transformerTTS = TransformerTTS(cfg)
|
||||||
model_dict, _ = fluid.dygraph.load_dygraph(os.path.join(model_path, str(cfg.transformer_step)))
|
model_path = os.path.join(cfg.transtts_path, "transformer")
|
||||||
#for param in transformerTTS.state_dict():
|
model_dict, _ = fluid.dygraph.load_dygraph(os.path.join(model_path, str(cfg.transformer_step)))
|
||||||
# print(param)
|
|
||||||
|
|
||||||
transformerTTS.set_dict(model_dict)
|
transformerTTS.set_dict(model_dict)
|
||||||
transformerTTS.eval()
|
transformerTTS.eval()
|
||||||
|
|
||||||
model = FastSpeech(cfg)
|
model = FastSpeech(cfg)
|
||||||
model.train()
|
model.train()
|
||||||
|
@ -89,7 +88,6 @@ def main(cfg):
|
||||||
|
|
||||||
_, _, attn_probs, _, _, _ = transformerTTS(character, mel_input, pos_text, pos_mel)
|
_, _, attn_probs, _, _, _ = transformerTTS(character, mel_input, pos_text, pos_mel)
|
||||||
alignment = dg.to_variable(get_alignment(attn_probs, cfg.transformer_head)).astype(np.float32)
|
alignment = dg.to_variable(get_alignment(attn_probs, cfg.transformer_head)).astype(np.float32)
|
||||||
|
|
||||||
global_step += 1
|
global_step += 1
|
||||||
|
|
||||||
#Forward
|
#Forward
|
||||||
|
@ -104,8 +102,7 @@ def main(cfg):
|
||||||
total_loss = mel_loss + mel_postnet_loss + duration_loss
|
total_loss = mel_loss + mel_postnet_loss + duration_loss
|
||||||
|
|
||||||
if local_rank==0:
|
if local_rank==0:
|
||||||
print('epoch:{}, step:{}, mel_loss:{}, mel_postnet_loss:{}, duration_loss:{}'.format(epoch, global_step, mel_loss.numpy(), mel_postnet_loss.numpy(), duration_loss.numpy()))
|
#print('epoch:{}, step:{}, mel_loss:{}, mel_postnet_loss:{}, duration_loss:{}'.format(epoch, global_step, mel_loss.numpy(), mel_postnet_loss.numpy(), duration_loss.numpy()))
|
||||||
|
|
||||||
writer.add_scalar('mel_loss', mel_loss.numpy(), global_step)
|
writer.add_scalar('mel_loss', mel_loss.numpy(), global_step)
|
||||||
writer.add_scalar('post_mel_loss', mel_postnet_loss.numpy(), global_step)
|
writer.add_scalar('post_mel_loss', mel_postnet_loss.numpy(), global_step)
|
||||||
writer.add_scalar('duration_loss', duration_loss.numpy(), global_step)
|
writer.add_scalar('duration_loss', duration_loss.numpy(), global_step)
|
||||||
|
|
|
@ -11,8 +11,8 @@ audio:
|
||||||
outputs_per_step: 1
|
outputs_per_step: 1
|
||||||
|
|
||||||
max_len: 50
|
max_len: 50
|
||||||
transformer_step: 1
|
transformer_step: 10
|
||||||
postnet_step: 1
|
postnet_step: 10
|
||||||
use_gpu: True
|
use_gpu: True
|
||||||
|
|
||||||
checkpoint_path: ./checkpoint
|
checkpoint_path: ./checkpoint
|
||||||
|
|
|
@ -18,9 +18,9 @@ grad_clip_thresh: 1.0
|
||||||
batch_size: 32
|
batch_size: 32
|
||||||
epochs: 10000
|
epochs: 10000
|
||||||
lr: 0.001
|
lr: 0.001
|
||||||
save_step: 500
|
save_step: 10
|
||||||
use_gpu: True
|
use_gpu: True
|
||||||
use_data_parallel: True
|
use_data_parallel: False
|
||||||
|
|
||||||
data_path: ../../../dataset/LJSpeech-1.1
|
data_path: ../../../dataset/LJSpeech-1.1
|
||||||
save_path: ./checkpoint
|
save_path: ./checkpoint
|
||||||
|
|
|
@ -35,7 +35,7 @@ class EncoderPrenet(dg.Layer):
|
||||||
self.add_sublayer("conv_list_{}".format(i), layer)
|
self.add_sublayer("conv_list_{}".format(i), layer)
|
||||||
|
|
||||||
self.batch_norm_list = [dg.BatchNorm(num_hidden,
|
self.batch_norm_list = [dg.BatchNorm(num_hidden,
|
||||||
data_layout='NCHW', epsilon=1e-30) for _ in range(3)]
|
data_layout='NCHW') for _ in range(3)]
|
||||||
|
|
||||||
for i, layer in enumerate(self.batch_norm_list):
|
for i, layer in enumerate(self.batch_norm_list):
|
||||||
self.add_sublayer("batch_norm_list_{}".format(i), layer)
|
self.add_sublayer("batch_norm_list_{}".format(i), layer)
|
||||||
|
@ -57,6 +57,7 @@ class CBHG(dg.Layer):
|
||||||
super(CBHG, self).__init__()
|
super(CBHG, self).__init__()
|
||||||
"""
|
"""
|
||||||
:param hidden_size: dimension of hidden unit
|
:param hidden_size: dimension of hidden unit
|
||||||
|
:param batch_size: batch size
|
||||||
:param K: # of convolution banks
|
:param K: # of convolution banks
|
||||||
:param projection_size: dimension of projection unit
|
:param projection_size: dimension of projection unit
|
||||||
:param num_gru_layers: # of layers of GRUcell
|
:param num_gru_layers: # of layers of GRUcell
|
||||||
|
|
|
@ -10,7 +10,7 @@ from parakeet.modules.post_convnet import PostConvNet
|
||||||
|
|
||||||
|
|
||||||
class Encoder(dg.Layer):
|
class Encoder(dg.Layer):
|
||||||
def __init__(self, embedding_size, num_hidden, config):
|
def __init__(self, embedding_size, num_hidden, config, num_head=4):
|
||||||
super(Encoder, self).__init__()
|
super(Encoder, self).__init__()
|
||||||
self.num_hidden = num_hidden
|
self.num_hidden = num_hidden
|
||||||
param = fluid.ParamAttr(initializer=fluid.initializer.Constant(value=1.0))
|
param = fluid.ParamAttr(initializer=fluid.initializer.Constant(value=1.0))
|
||||||
|
@ -24,10 +24,10 @@ class Encoder(dg.Layer):
|
||||||
self.encoder_prenet = EncoderPrenet(embedding_size = embedding_size,
|
self.encoder_prenet = EncoderPrenet(embedding_size = embedding_size,
|
||||||
num_hidden = num_hidden,
|
num_hidden = num_hidden,
|
||||||
use_cudnn=config.use_gpu)
|
use_cudnn=config.use_gpu)
|
||||||
self.layers = [MultiheadAttention(num_hidden, num_hidden//4, num_hidden//4) for _ in range(3)]
|
self.layers = [MultiheadAttention(num_hidden, num_hidden//num_head, num_hidden//num_head) for _ in range(3)]
|
||||||
for i, layer in enumerate(self.layers):
|
for i, layer in enumerate(self.layers):
|
||||||
self.add_sublayer("self_attn_{}".format(i), layer)
|
self.add_sublayer("self_attn_{}".format(i), layer)
|
||||||
self.ffns = [PositionwiseFeedForward(num_hidden, num_hidden*4, filter_size=1, use_cudnn = config.use_gpu) for _ in range(3)]
|
self.ffns = [PositionwiseFeedForward(num_hidden, num_hidden*num_head, filter_size=1, use_cudnn = config.use_gpu) for _ in range(3)]
|
||||||
for i, layer in enumerate(self.ffns):
|
for i, layer in enumerate(self.ffns):
|
||||||
self.add_sublayer("ffns_{}".format(i), layer)
|
self.add_sublayer("ffns_{}".format(i), layer)
|
||||||
|
|
||||||
|
@ -61,7 +61,7 @@ class Encoder(dg.Layer):
|
||||||
return x, query_mask, attentions
|
return x, query_mask, attentions
|
||||||
|
|
||||||
class Decoder(dg.Layer):
|
class Decoder(dg.Layer):
|
||||||
def __init__(self, num_hidden, config):
|
def __init__(self, num_hidden, config, num_head=4):
|
||||||
super(Decoder, self).__init__()
|
super(Decoder, self).__init__()
|
||||||
self.num_hidden = num_hidden
|
self.num_hidden = num_hidden
|
||||||
param = fluid.ParamAttr()
|
param = fluid.ParamAttr()
|
||||||
|
@ -79,13 +79,13 @@ class Decoder(dg.Layer):
|
||||||
dropout_rate=0.2)
|
dropout_rate=0.2)
|
||||||
self.linear = Linear(num_hidden, num_hidden)
|
self.linear = Linear(num_hidden, num_hidden)
|
||||||
|
|
||||||
self.selfattn_layers = [MultiheadAttention(num_hidden, num_hidden//4, num_hidden//4) for _ in range(3)]
|
self.selfattn_layers = [MultiheadAttention(num_hidden, num_hidden//num_head, num_hidden//num_head) for _ in range(3)]
|
||||||
for i, layer in enumerate(self.selfattn_layers):
|
for i, layer in enumerate(self.selfattn_layers):
|
||||||
self.add_sublayer("self_attn_{}".format(i), layer)
|
self.add_sublayer("self_attn_{}".format(i), layer)
|
||||||
self.attn_layers = [MultiheadAttention(num_hidden, num_hidden//4, num_hidden//4) for _ in range(3)]
|
self.attn_layers = [MultiheadAttention(num_hidden, num_hidden//num_head, num_hidden//num_head) for _ in range(3)]
|
||||||
for i, layer in enumerate(self.attn_layers):
|
for i, layer in enumerate(self.attn_layers):
|
||||||
self.add_sublayer("attn_{}".format(i), layer)
|
self.add_sublayer("attn_{}".format(i), layer)
|
||||||
self.ffns = [PositionwiseFeedForward(num_hidden, num_hidden*4, filter_size=1) for _ in range(3)]
|
self.ffns = [PositionwiseFeedForward(num_hidden, num_hidden*num_head, filter_size=1) for _ in range(3)]
|
||||||
for i, layer in enumerate(self.ffns):
|
for i, layer in enumerate(self.ffns):
|
||||||
self.add_sublayer("ffns_{}".format(i), layer)
|
self.add_sublayer("ffns_{}".format(i), layer)
|
||||||
self.mel_linear = Linear(num_hidden, config.audio.num_mels * config.audio.outputs_per_step)
|
self.mel_linear = Linear(num_hidden, config.audio.num_mels * config.audio.outputs_per_step)
|
||||||
|
|
|
@ -28,12 +28,15 @@ def synthesis(text_input, cfg):
|
||||||
writer = SummaryWriter(path)
|
writer = SummaryWriter(path)
|
||||||
|
|
||||||
with dg.guard(place):
|
with dg.guard(place):
|
||||||
model = TransformerTTS(cfg)
|
with fluid.unique_name.guard():
|
||||||
model_postnet = ModelPostNet(cfg)
|
model = TransformerTTS(cfg)
|
||||||
|
model.set_dict(load_checkpoint(str(cfg.transformer_step), os.path.join(cfg.checkpoint_path, "transformer")))
|
||||||
model.set_dict(load_checkpoint(str(cfg.transformer_step), os.path.join(cfg.checkpoint_path, "transformer")))
|
model.eval()
|
||||||
model_postnet.set_dict(load_checkpoint(str(cfg.postnet_step), os.path.join(cfg.checkpoint_path, "postnet")))
|
|
||||||
|
|
||||||
|
with fluid.unique_name.guard():
|
||||||
|
model_postnet = ModelPostNet(cfg)
|
||||||
|
model_postnet.set_dict(load_checkpoint(str(cfg.postnet_step), os.path.join(cfg.checkpoint_path, "postnet")))
|
||||||
|
model_postnet.eval()
|
||||||
# init input
|
# init input
|
||||||
text = np.asarray(text_to_sequence(text_input))
|
text = np.asarray(text_to_sequence(text_input))
|
||||||
text = fluid.layers.unsqueeze(dg.to_variable(text),[0])
|
text = fluid.layers.unsqueeze(dg.to_variable(text),[0])
|
||||||
|
@ -42,9 +45,6 @@ def synthesis(text_input, cfg):
|
||||||
pos_text = fluid.layers.unsqueeze(dg.to_variable(pos_text),[0])
|
pos_text = fluid.layers.unsqueeze(dg.to_variable(pos_text),[0])
|
||||||
|
|
||||||
|
|
||||||
model.eval()
|
|
||||||
model_postnet.eval()
|
|
||||||
|
|
||||||
pbar = tqdm(range(cfg.max_len))
|
pbar = tqdm(range(cfg.max_len))
|
||||||
|
|
||||||
for i in pbar:
|
for i in pbar:
|
||||||
|
|
|
@ -86,17 +86,17 @@ def main(cfg):
|
||||||
mel_pred, postnet_pred, attn_probs, stop_preds, attn_enc, attn_dec = model(character, mel_input, pos_text, pos_mel)
|
mel_pred, postnet_pred, attn_probs, stop_preds, attn_enc, attn_dec = model(character, mel_input, pos_text, pos_mel)
|
||||||
|
|
||||||
|
|
||||||
label = np.zeros(stop_preds.shape).astype(np.float32)
|
label = (pos_mel == 0).astype(np.float32)
|
||||||
text_length = text_length.numpy()
|
#label = np.zeros(stop_preds.shape).astype(np.float32)
|
||||||
for i in range(label.shape[0]):
|
#text_length = text_length.numpy()
|
||||||
label[i][text_length[i] - 1] = 1
|
#for i in range(label.shape[0]):
|
||||||
|
# label[i][text_length[i] - 1] = 1
|
||||||
|
|
||||||
mel_loss = layers.mean(layers.abs(layers.elementwise_sub(mel_pred, mel)))
|
mel_loss = layers.mean(layers.abs(layers.elementwise_sub(mel_pred, mel)))
|
||||||
post_mel_loss = layers.mean(layers.abs(layers.elementwise_sub(postnet_pred, mel)))
|
post_mel_loss = layers.mean(layers.abs(layers.elementwise_sub(postnet_pred, mel)))
|
||||||
stop_loss = cross_entropy(stop_preds, dg.to_variable(label))
|
stop_loss = cross_entropy(stop_preds, label)
|
||||||
loss = mel_loss + post_mel_loss + stop_loss
|
loss = mel_loss + post_mel_loss + stop_loss
|
||||||
|
|
||||||
|
|
||||||
if local_rank==0:
|
if local_rank==0:
|
||||||
writer.add_scalars('training_loss', {
|
writer.add_scalars('training_loss', {
|
||||||
'mel_loss':mel_loss.numpy(),
|
'mel_loss':mel_loss.numpy(),
|
||||||
|
|
|
@ -105,7 +105,6 @@ class MultiheadAttention(dg.Layer):
|
||||||
# concat all multihead result
|
# concat all multihead result
|
||||||
result = layers.reshape(result, [self.num_head, batch_size, seq_len_query, self.d_q])
|
result = layers.reshape(result, [self.num_head, batch_size, seq_len_query, self.d_q])
|
||||||
result = layers.reshape(layers.transpose(result, [1,2,0,3]),[batch_size, seq_len_query, -1])
|
result = layers.reshape(layers.transpose(result, [1,2,0,3]),[batch_size, seq_len_query, -1])
|
||||||
|
|
||||||
result = layers.concat([query_input,result], axis=-1)
|
result = layers.concat([query_input,result], axis=-1)
|
||||||
result = layers.dropout(self.fc(result), self.dropout)
|
result = layers.dropout(self.fc(result), self.dropout)
|
||||||
result = result + query_input
|
result = result + query_input
|
||||||
|
|
|
@ -65,9 +65,10 @@ def guided_attention(N, T, g=0.2):
|
||||||
return W
|
return W
|
||||||
|
|
||||||
|
|
||||||
def cross_entropy(input, label, position_weight=5.0, epsilon=0.0001):
|
def cross_entropy(input, label, position_weight=1.0, epsilon=1e-30):
|
||||||
input = -1 * label * layers.log(input + epsilon) - (1-label) * layers.log(1 - input + epsilon)
|
output = -1 * label * layers.log(input + epsilon) - (1-label) * layers.log(1 - input + epsilon)
|
||||||
label = input * (label * (position_weight - 1) + 1)
|
output = output * (label * (position_weight - 1) + 1)
|
||||||
return layers.reduce_sum(label, dim=[0, 1])
|
|
||||||
|
return layers.reduce_sum(output, dim=[0, 1])
|
||||||
|
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue