TransformerTTS precision alignment

lifuchen 2020-01-13 12:37:49 +00:00 committed by chenfeiyu
parent ae88be3419
commit ab0fe8f304
14 changed files with 93 additions and 104 deletions

View File

@@ -89,7 +89,7 @@ def transliteration_cleaners(text):
 def english_cleaners(text):
     '''Pipeline for English text, including number and abbreviation expansion.'''
     text = convert_to_ascii(text)
-    text = add_punctuation(text)
+    #text = add_punctuation(text)
     text = lowercase(text)
     text = expand_numbers(text)
     text = expand_abbreviations(text)

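A rough sketch of what the remaining pipeline produces, assuming the usual convert_to_ascii/lowercase/expand_numbers/expand_abbreviations helpers; with add_punctuation disabled, no trailing punctuation is appended. The example output is hypothetical and depends on the helper implementations in this module:

    # inside parakeet's cleaners module (illustrative only)
    english_cleaners("Dr. Smith bought 2 apples")   # -> roughly "doctor smith bought two apples"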
View File

@@ -14,13 +14,11 @@ encoder_n_layer: 6
 encoder_head: 2
 encoder_conv1d_filter_size: 1536
 max_sep_len: 2048
-encoder_output_size: 384
-embedding_size: 384
+fs_embedding_size: 384
 decoder_n_layer: 6
 decoder_head: 2
 decoder_conv1d_filter_size: 1536
-decoder_output_size: 384
-hidden_size: 384
+fs_hidden_size: 384
 duration_predictor_output_size: 256
 duration_predictor_filter_size: 3
 fft_conv1d_filter: 3
@@ -28,6 +26,9 @@ fft_conv1d_padding: 1
 dropout: 0.1
 transformer_head: 4
+embedding_size: 512
+hidden_size: 256
 warm_up_step: 4000
 grad_clip_thresh: 0.1
 batch_size: 32
@@ -39,5 +40,5 @@ use_data_parallel: False
 data_path: ../../../dataset/LJSpeech-1.1
 transtts_path: ../transformerTTS/checkpoint
-transformer_step: 20
+transformer_step: 1
 log_dir: ./log

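A minimal sketch of how the split key names are meant to be read after this change: fs_embedding_size/fs_hidden_size size FastSpeech itself, while embedding_size/hidden_size describe the TransformerTTS teacher whose checkpoint (transtts_path, transformer_step) supplies the alignments. The file name and the PyYAML loader below are only illustrative; the repo has its own config parsing:

    import yaml  # PyYAML, used here purely for illustration

    with open("fastspeech.yaml") as f:          # hypothetical path to the config above
        cfg = yaml.safe_load(f)

    fastspeech_dims = (cfg["fs_embedding_size"], cfg["fs_hidden_size"])   # 384, 384
    teacher_dims = (cfg["embedding_size"], cfg["hidden_size"])            # 512, 256 (transformerTTS teacher)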
View File

@@ -8,8 +8,6 @@ from parakeet.modules.layers import Conv1D
 from parakeet.modules.multihead_attention import MultiheadAttention
 from parakeet.modules.feed_forward import PositionwiseFeedForward

 class FFTBlock(dg.Layer):
     def __init__(self, d_model, d_inner, n_head, d_k, d_v, filter_size, padding, dropout=0.2):
         super(FFTBlock, self).__init__()

View File

@@ -1,5 +1,5 @@
 from utils import *
-from modules import *
+from modules import FFTBlock, LengthRegulator
 import paddle.fluid.dygraph as dg
 import paddle.fluid as fluid
 from parakeet.g2p.text.symbols import symbols
@@ -131,38 +131,38 @@ class FastSpeech(dg.Layer):
         self.encoder = Encoder(n_src_vocab=len(symbols)+1,
                                len_max_seq=cfg.max_sep_len,
-                               d_word_vec=cfg.embedding_size,
+                               d_word_vec=cfg.fs_embedding_size,
                                n_layers=cfg.encoder_n_layer,
                                n_head=cfg.encoder_head,
                                d_k=64,
                                d_v=64,
-                               d_model=cfg.hidden_size,
+                               d_model=cfg.fs_hidden_size,
                                d_inner=cfg.encoder_conv1d_filter_size,
                                fft_conv1d_kernel=cfg.fft_conv1d_filter,
                                fft_conv1d_padding=cfg.fft_conv1d_padding,
                                dropout=0.1)
-        self.length_regulator = LengthRegulator(input_size=cfg.hidden_size,
+        self.length_regulator = LengthRegulator(input_size=cfg.fs_hidden_size,
                                                 out_channels=cfg.duration_predictor_output_size,
                                                 filter_size=cfg.duration_predictor_filter_size,
                                                 dropout=cfg.dropout)
         self.decoder = Decoder(len_max_seq=cfg.max_sep_len,
-                               d_word_vec=cfg.embedding_size,
+                               d_word_vec=cfg.fs_embedding_size,
                                n_layers=cfg.decoder_n_layer,
                                n_head=cfg.decoder_head,
                                d_k=64,
                                d_v=64,
-                               d_model=cfg.hidden_size,
+                               d_model=cfg.fs_hidden_size,
                                d_inner=cfg.decoder_conv1d_filter_size,
                                fft_conv1d_kernel=cfg.fft_conv1d_filter,
                                fft_conv1d_padding=cfg.fft_conv1d_padding,
                                dropout=0.1)
-        self.mel_linear = dg.Linear(cfg.decoder_output_size, cfg.audio.num_mels)
-        self.postnet = PostConvNet(n_mels=80,
+        self.mel_linear = dg.Linear(cfg.fs_hidden_size, cfg.audio.num_mels * cfg.audio.outputs_per_step)
+        self.postnet = PostConvNet(n_mels=cfg.audio.num_mels,
                                    num_hidden=512,
                                    filter_size=5,
                                    padding=int(5 / 2),
                                    num_conv=5,
-                                   outputs_per_step=1,
+                                   outputs_per_step=cfg.audio.outputs_per_step,
                                    use_cudnn=True,
                                    dropout=0.1)

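A small sketch of the shape bookkeeping behind the new mel_linear/postnet arguments, under the usual reduction-factor convention (an assumption about intent): with r = cfg.audio.outputs_per_step, mel_linear emits r mel frames per decoder position, and the postnet is built from the same num_mels and r so the channel counts stay consistent:

    import numpy as np
    import paddle.fluid.dygraph as dg

    with dg.guard():
        B, T, fs_hidden_size, num_mels, r = 2, 12, 384, 80, 1   # r = cfg.audio.outputs_per_step
        mel_linear = dg.Linear(fs_hidden_size, num_mels * r)
        decoder_out = dg.to_variable(np.zeros((B, T, fs_hidden_size), dtype="float32"))
        mel_out = mel_linear(decoder_out)                        # (B, T, num_mels * r)
        # the postnet consumes/produces num_mels * r channels as well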
View File

@@ -22,8 +22,8 @@ def add_config_options_to_parser(parser):
     parser.add_argument('--audio.outputs_per_step', type=int, default=1,
                         help="the outputs per step.")
-    parser.add_argument('--embedding_size', type=int, default=256,
-                        help="the dim size of embedding.")
+    parser.add_argument('--fs_embedding_size', type=int, default=256,
+                        help="the dim size of embedding of fastspeech.")
     parser.add_argument('--encoder_n_layer', type=int, default=6,
                         help="the number of FFT Block in encoder.")
     parser.add_argument('--encoder_head', type=int, default=2,
@@ -32,18 +32,14 @@ def add_config_options_to_parser(parser):
                         help="the filter size of conv1d in encoder.")
     parser.add_argument('--max_sep_len', type=int, default=2048,
                         help="the max length of sequence.")
-    parser.add_argument('--encoder_output_size', type=int, default=256,
-                        help="the output channel size of encoder.")
     parser.add_argument('--decoder_n_layer', type=int, default=6,
                         help="the number of FFT Block in decoder.")
     parser.add_argument('--decoder_head', type=int, default=2,
                         help="the attention head number in decoder.")
     parser.add_argument('--decoder_conv1d_filter_size', type=int, default=1024,
                         help="the filter size of conv1d in decoder.")
-    parser.add_argument('--decoder_output_size', type=int, default=256,
-                        help="the output channel size of decoder.")
-    parser.add_argument('--hidden_size', type=int, default=256,
-                        help="the hidden size in model.")
+    parser.add_argument('--fs_hidden_size', type=int, default=256,
+                        help="the hidden size in model of fastspeech.")
     parser.add_argument('--duration_predictor_output_size', type=int, default=256,
                         help="the output size of duration predictior.")
     parser.add_argument('--duration_predictor_filter_size', type=int, default=3,
@@ -57,6 +53,11 @@ def add_config_options_to_parser(parser):
     parser.add_argument('--transformer_head', type=int, default=4,
                         help="the attention head num of transformerTTS.")
+    parser.add_argument('--hidden_size', type=int, default=256,
+                        help="the hidden size in model of transformerTTS.")
+    parser.add_argument('--embedding_size', type=int, default=256,
+                        help="the dim size of embedding of transformerTTS.")
     parser.add_argument('--warm_up_step', type=int, default=4000,
                         help="the warm up step of learning rate.")
     parser.add_argument('--grad_clip_thresh', type=float, default=1.0,

View File

@@ -3,20 +3,18 @@ from parakeet.g2p.text.symbols import symbols
 import paddle.fluid.dygraph as dg
 import paddle.fluid as fluid
 import paddle.fluid.layers as layers
-from parakeet.modules.layers import Conv, Pool1D
+from parakeet.modules.layers import Conv, Pool1D, Linear
 from parakeet.modules.dynamicGRU import DynamicGRU
 import numpy as np

 class EncoderPrenet(dg.Layer):
     def __init__(self, embedding_size, num_hidden, use_cudnn=True):
         super(EncoderPrenet, self).__init__()
         self.embedding_size = embedding_size
         self.num_hidden = num_hidden
         self.use_cudnn = use_cudnn
         self.embedding = dg.Embedding( size = [len(symbols), embedding_size],
-                                       param_attr = fluid.ParamAttr(name='weight'),
                                        padding_idx = None)
         self.conv_list = []
         self.conv_list.append(Conv(in_channels = embedding_size,
@@ -37,16 +35,12 @@ class EncoderPrenet(dg.Layer):
             self.add_sublayer("conv_list_{}".format(i), layer)

         self.batch_norm_list = [dg.BatchNorm(num_hidden,
-                                             param_attr = fluid.ParamAttr(name='weight'),
-                                             bias_attr = fluid.ParamAttr(name='bias'),
-                                             moving_mean_name = 'moving_mean',
-                                             moving_variance_name = 'moving_var',
-                                             data_layout='NCHW') for _ in range(3)]
+                                             data_layout='NCHW', epsilon=1e-30) for _ in range(3)]
         for i, layer in enumerate(self.batch_norm_list):
             self.add_sublayer("batch_norm_list_{}".format(i), layer)

-        self.projection = dg.Linear(num_hidden, num_hidden)
+        self.projection = Linear(num_hidden, num_hidden)

     def forward(self, x):
         x = self.embedding(x) #(batch_size, seq_len, embending_size)
@@ -90,10 +84,6 @@ class CBHG(dg.Layer):
         self.batchnorm_list = []
         for i in range(K):
             self.batchnorm_list.append(dg.BatchNorm(hidden_size,
-                                                    param_attr = fluid.ParamAttr(name='weight'),
-                                                    bias_attr = fluid.ParamAttr(name='bias'),
-                                                    moving_mean_name = 'moving_mean',
-                                                    moving_variance_name = 'moving_var',
                                                     data_layout='NCHW'))

         for i, layer in enumerate(self.batchnorm_list):
@@ -114,16 +104,8 @@ class CBHG(dg.Layer):
                                data_format = "NCT")

         self.batchnorm_proj_1 = dg.BatchNorm(hidden_size,
-                                             param_attr = fluid.ParamAttr(name='weight'),
-                                             bias_attr = fluid.ParamAttr(name='bias'),
-                                             moving_mean_name = 'moving_mean',
-                                             moving_variance_name = 'moving_var',
                                              data_layout='NCHW')
         self.batchnorm_proj_2 = dg.BatchNorm(projection_size,
-                                             param_attr = fluid.ParamAttr(name='weight'),
-                                             bias_attr = fluid.ParamAttr(name='bias'),
-                                             moving_mean_name = 'moving_mean',
-                                             moving_variance_name = 'moving_var',
                                              data_layout='NCHW')
         self.max_pool = Pool1D(pool_size = max_pool_kernel_size,
                                pool_type='max',
@@ -134,32 +116,24 @@ class CBHG(dg.Layer):
         h_0 = np.zeros((batch_size, hidden_size // 2), dtype="float32")
         h_0 = dg.to_variable(h_0)
-        self.fc_forward1 = dg.Linear(hidden_size, hidden_size // 2 * 3)
-        self.fc_reverse1 = dg.Linear(hidden_size, hidden_size // 2 * 3)
+        self.fc_forward1 = Linear(hidden_size, hidden_size // 2 * 3)
+        self.fc_reverse1 = Linear(hidden_size, hidden_size // 2 * 3)
         self.gru_forward1 = DynamicGRU(size = self.hidden_size // 2,
-                                       param_attr = fluid.ParamAttr(name='weight'),
-                                       bias_attr = fluid.ParamAttr(name='bias'),
                                        is_reverse = False,
                                        origin_mode = True,
                                        h_0 = h_0)
         self.gru_reverse1 = DynamicGRU(size = self.hidden_size // 2,
-                                       param_attr = fluid.ParamAttr(name='weight'),
-                                       bias_attr = fluid.ParamAttr(name='bias'),
                                        is_reverse=True,
                                        origin_mode=True,
                                        h_0 = h_0)
-        self.fc_forward2 = dg.Linear(hidden_size, hidden_size // 2 * 3)
-        self.fc_reverse2 = dg.Linear(hidden_size, hidden_size // 2 * 3)
+        self.fc_forward2 = Linear(hidden_size, hidden_size // 2 * 3)
+        self.fc_reverse2 = Linear(hidden_size, hidden_size // 2 * 3)
         self.gru_forward2 = DynamicGRU(size = self.hidden_size // 2,
-                                       param_attr = fluid.ParamAttr(name='weight'),
-                                       bias_attr = fluid.ParamAttr(name='bias'),
                                        is_reverse = False,
                                        origin_mode = True,
                                        h_0 = h_0)
         self.gru_reverse2 = DynamicGRU(size = self.hidden_size // 2,
-                                       param_attr = fluid.ParamAttr(name='weight'),
-                                       bias_attr = fluid.ParamAttr(name='bias'),
                                        is_reverse=True,
                                        origin_mode=True,
                                        h_0 = h_0)
@@ -216,8 +190,8 @@ class Highwaynet(dg.Layer):
         self.linears = []
         for i in range(num_layers):
-            self.linears.append(dg.Linear(num_units, num_units))
-            self.gates.append(dg.Linear(num_units, num_units))
+            self.linears.append(Linear(num_units, num_units))
+            self.gates.append(Linear(num_units, num_units))

         for i, (linear, gate) in enumerate(zip(self.linears,self.gates)):
             self.add_sublayer("linears_{}".format(i), linear)

View File

@@ -1,7 +1,7 @@
 from parakeet.models.transformerTTS.module import *
 import paddle.fluid.dygraph as dg
 import paddle.fluid as fluid
-from parakeet.modules.layers import Conv1D
+from parakeet.modules.layers import Conv1D, Linear
 from parakeet.modules.utils import *
 from parakeet.modules.multihead_attention import MultiheadAttention
 from parakeet.modules.feed_forward import PositionwiseFeedForward
@@ -13,8 +13,7 @@ class Encoder(dg.Layer):
     def __init__(self, embedding_size, num_hidden, config):
         super(Encoder, self).__init__()
         self.num_hidden = num_hidden
-        param = fluid.ParamAttr(name='alpha',
-                                initializer=fluid.initializer.Constant(value=1.0))
+        param = fluid.ParamAttr(initializer=fluid.initializer.Constant(value=1.0))
         self.alpha = self.create_parameter(shape=(1, ), attr=param, dtype='float32')
         self.pos_inp = get_sinusoid_encoding_table(1024, self.num_hidden, padding_idx=0)
         self.pos_emb = dg.Embedding(size=[1024, num_hidden],
@@ -39,13 +38,13 @@ class Encoder(dg.Layer):
         else:
             query_mask, mask = None, None

         # Encoder pre_network
         x = self.encoder_prenet(x) #(N,T,C)

         # Get positional encoding
         positional = self.pos_emb(positional)
         x = positional * self.alpha + x #(N, T, C)
@@ -65,21 +64,20 @@ class Decoder(dg.Layer):
     def __init__(self, num_hidden, config):
         super(Decoder, self).__init__()
         self.num_hidden = num_hidden
-        param = fluid.ParamAttr(name='alpha')
+        param = fluid.ParamAttr()
         self.alpha = self.create_parameter(shape=(1,), attr=param, dtype='float32',
                                            default_initializer = fluid.initializer.ConstantInitializer(value=1.0))
         self.pos_inp = get_sinusoid_encoding_table(1024, self.num_hidden, padding_idx=0)
         self.pos_emb = dg.Embedding(size=[1024, num_hidden],
                                     padding_idx=0,
                                     param_attr=fluid.ParamAttr(
-                                        name='weight',
                                         initializer=fluid.initializer.NumpyArrayInitializer(self.pos_inp),
                                         trainable=False))
         self.decoder_prenet = PreNet(input_size = config.audio.num_mels,
                                      hidden_size = num_hidden * 2,
                                      output_size = num_hidden,
                                      dropout_rate=0.2)
-        self.linear = dg.Linear(num_hidden, num_hidden)
+        self.linear = Linear(num_hidden, num_hidden)

         self.selfattn_layers = [MultiheadAttention(num_hidden, num_hidden//4, num_hidden//4) for _ in range(3)]
         for i, layer in enumerate(self.selfattn_layers):
@@ -90,8 +88,8 @@ class Decoder(dg.Layer):
         self.ffns = [PositionwiseFeedForward(num_hidden, num_hidden*4, filter_size=1) for _ in range(3)]
         for i, layer in enumerate(self.ffns):
             self.add_sublayer("ffns_{}".format(i), layer)
-        self.mel_linear = dg.Linear(num_hidden, config.audio.num_mels * config.audio.outputs_per_step)
-        self.stop_linear = dg.Linear(num_hidden, 1)
+        self.mel_linear = Linear(num_hidden, config.audio.num_mels * config.audio.outputs_per_step)
+        self.stop_linear = Linear(num_hidden, 1)

         self.postconvnet = PostConvNet(config.audio.num_mels, config.hidden_size,
                                        filter_size = 5, padding = 4, num_conv=5,
@@ -115,10 +113,10 @@ class Decoder(dg.Layer):
             mask = get_triu_tensor(query.numpy(), query.numpy()).astype(np.float32)
             mask = fluid.layers.cast(dg.to_variable(mask == 0), np.float32)
             m_mask, zero_mask = None, None

         # Decoder pre-network
         query = self.decoder_prenet(query)

         # Centered position
         query = self.linear(query)
@@ -132,14 +130,13 @@ class Decoder(dg.Layer):
         # Attention decoder-decoder, encoder-decoder
         selfattn_list = list()
         attn_list = list()

         for selfattn, attn, ffn in zip(self.selfattn_layers, self.attn_layers, self.ffns):
             query, attn_dec = selfattn(query, query, query, mask = mask, query_mask = m_mask)
             query, attn_dot = attn(key, value, query, mask = zero_mask, query_mask = m_mask)
             query = ffn(query)
             selfattn_list.append(attn_dec)
             attn_list.append(attn_dot)

         # Mel linear projection
         mel_out = self.mel_linear(query)
         # Post Mel Network
@@ -164,7 +161,7 @@ class TransformerTTS(dg.Layer):
         # key (batch_size, seq_len, channel)
         # c_mask (batch_size, seq_len)
         # attns_enc (channel / 2, seq_len, seq_len)
         key, c_mask, attns_enc = self.encoder(characters, pos_text)

         # mel_output/postnet_output (batch_size, mel_len, n_mel)

View File

@@ -2,7 +2,7 @@ import os
 from scipy.io.wavfile import write
 from parakeet.g2p.en import text_to_sequence
 import numpy as np
-from network import Model, ModelPostNet
+from network import TransformerTTS, ModelPostNet
 from tqdm import tqdm
 from tensorboardX import SummaryWriter
 import paddle.fluid as fluid
@@ -28,7 +28,7 @@ def synthesis(text_input, cfg):
     writer = SummaryWriter(path)

     with dg.guard(place):
-        model = Model(cfg)
+        model = TransformerTTS(cfg)
         model_postnet = ModelPostNet(cfg)

         model.set_dict(load_checkpoint(str(cfg.transformer_step), os.path.join(cfg.checkpoint_path, "transformer")))

View File

@@ -89,8 +89,6 @@ def main(cfg):
             else:
                 loss.backward()
             optimizer.minimize(loss, grad_clip = fluid.dygraph_grad_clip.GradClipByGlobalNorm(cfg.grad_clip_thresh))
-            print("===============",model.pre_proj.conv.weight.numpy())
-            print("===============",model.pre_proj.conv.weight.gradient())
             model.clear_gradients()

             if local_rank==0:

View File

@@ -63,7 +63,7 @@ def main(cfg):
     optimizer = fluid.optimizer.AdamOptimizer(learning_rate=dg.NoamDecay(1/(cfg.warm_up_step *( cfg.lr ** 2)), cfg.warm_up_step),
                                               parameter_list=model.parameters())

-    reader = LJSpeechLoader(cfg, nranks, local_rank).reader()
+    reader = LJSpeechLoader(cfg, nranks, local_rank, shuffle=True).reader()

     if cfg.checkpoint_path is not None:
         model_dict, opti_dict = load_checkpoint(str(cfg.transformer_step), os.path.join(cfg.checkpoint_path, "transformer"))
@@ -78,26 +78,25 @@ def main(cfg):
     for epoch in range(cfg.epochs):
         pbar = tqdm(reader)
         for i, data in enumerate(pbar):
             pbar.set_description('Processing at epoch %d'%epoch)
             character, mel, mel_input, pos_text, pos_mel, text_length = data

             global_step += 1

             mel_pred, postnet_pred, attn_probs, stop_preds, attn_enc, attn_dec = model(character, mel_input, pos_text, pos_mel)

             label = np.zeros(stop_preds.shape).astype(np.float32)
             text_length = text_length.numpy()
             for i in range(label.shape[0]):
                 label[i][text_length[i] - 1] = 1

             mel_loss = layers.mean(layers.abs(layers.elementwise_sub(mel_pred, mel)))
             post_mel_loss = layers.mean(layers.abs(layers.elementwise_sub(postnet_pred, mel)))
             stop_loss = cross_entropy(stop_preds, dg.to_variable(label))
             loss = mel_loss + post_mel_loss + stop_loss

             if local_rank==0:
                 writer.add_scalars('training_loss', {
                     'mel_loss':mel_loss.numpy(),

View File

@@ -5,6 +5,25 @@ import paddle
 from paddle import fluid
 import paddle.fluid.dygraph as dg

+class Linear(dg.Layer):
+    def __init__(self, in_features, out_features, is_bias=True, dtype="float32"):
+        super(Linear, self).__init__()
+        self.in_features = in_features
+        self.out_features = out_features
+        self.dtype = dtype
+        self.weight = fluid.ParamAttr(initializer = fluid.initializer.XavierInitializer())
+        self.bias = is_bias
+
+        if is_bias is not False:
+            k = math.sqrt(1 / in_features)
+            self.bias = fluid.ParamAttr(initializer = fluid.initializer.Uniform(low=-k, high=k))
+
+        self.linear = dg.Linear(in_features, out_features, param_attr = self.weight,
+                                bias_attr = self.bias,)
+
+    def forward(self, x):
+        x = self.linear(x)
+        return x
+
 class Conv(dg.Layer):
     def __init__(self, in_channels, out_channels, filter_size=1,

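A short usage sketch for the new Linear wrapper. My reading (an assumption, not stated in the commit) is that the Xavier weight initializer plus the U(-1/sqrt(in_features), 1/sqrt(in_features)) bias initializer are chosen so initialization matches the reference implementation this "precision alignment" work compares against:

    import math
    import numpy as np
    import paddle.fluid.dygraph as dg
    from parakeet.modules.layers import Linear

    with dg.guard():
        proj = Linear(256, 80)                      # drop-in replacement for dg.Linear(256, 80)
        x = dg.to_variable(np.ones((4, 256), dtype="float32"))
        y = proj(x)                                 # shape (4, 80)
        k = math.sqrt(1 / 256)                      # bias entries are sampled from U(-k, k)
        no_bias = Linear(256, 80, is_bias=False)    # passes bias_attr=False, i.e. no bias at all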
View File

@@ -2,6 +2,7 @@ import math
 import numpy as np
 import paddle.fluid.dygraph as dg
 import paddle.fluid.layers as layers
+from parakeet.modules.layers import Linear

 class ScaledDotProductAttention(dg.Layer):
     def __init__(self, d_key):
@@ -34,10 +35,10 @@ class ScaledDotProductAttention(dg.Layer):
             attention = attention * mask
             mask = (mask == 0).astype(np.float32) * (-2 ** 32 + 1)
             attention = attention + mask

         attention = layers.softmax(attention)
         attention = layers.dropout(attention, dropout)

         # Mask query to ignore padding
         if query_mask is not None:
             attention = attention * query_mask
@@ -54,13 +55,13 @@ class MultiheadAttention(dg.Layer):
         self.d_q = d_q
         self.dropout = dropout

-        self.key = dg.Linear(num_hidden, num_head * d_k)
-        self.value = dg.Linear(num_hidden, num_head * d_k)
-        self.query = dg.Linear(num_hidden, num_head * d_q)
+        self.key = Linear(num_hidden, num_head * d_k, is_bias=False)
+        self.value = Linear(num_hidden, num_head * d_k, is_bias=False)
+        self.query = Linear(num_hidden, num_head * d_q, is_bias=False)

         self.scal_attn = ScaledDotProductAttention(d_k)

-        self.fc = dg.Linear(num_head * d_q, num_hidden)
+        self.fc = Linear(num_head * d_q * 2, num_hidden)

         self.layer_norm = dg.LayerNorm(num_hidden)
@@ -105,6 +106,7 @@ class MultiheadAttention(dg.Layer):
         result = layers.reshape(result, [self.num_head, batch_size, seq_len_query, self.d_q])
         result = layers.reshape(layers.transpose(result, [1,2,0,3]),[batch_size, seq_len_query, -1])
+        result = layers.concat([query_input,result], axis=-1)

         result = layers.dropout(self.fc(result), self.dropout)
         result = result + query_input

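A sketch of why the output projection grew to num_head * d_q * 2 inputs. My reading of the change: the flattened per-head context is concatenated with the original query input before self.fc, which is then followed by dropout and the existing residual add. The shapes below are illustrative:

    import numpy as np
    import paddle.fluid.dygraph as dg
    import paddle.fluid.layers as layers

    with dg.guard():
        B, T, num_head, d_q = 2, 5, 4, 64
        num_hidden = num_head * d_q                  # as in MultiheadAttention(num_hidden, num_hidden//4, num_hidden//4)
        heads = dg.to_variable(np.zeros((B, T, num_head * d_q), dtype="float32"))    # flattened attention output
        query_input = dg.to_variable(np.zeros((B, T, num_hidden), dtype="float32"))
        result = layers.concat([query_input, heads], axis=-1)   # (B, T, num_head * d_q * 2)
        # self.fc = Linear(num_head * d_q * 2, num_hidden) projects this back to num_hidden,
        # then dropout and "result + query_input" finish the block.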
View File

@@ -16,6 +16,7 @@ class PostConvNet(dg.Layer):
         super(PostConvNet, self).__init__()

         self.dropout = dropout
+        self.num_conv = num_conv
         self.conv_list = []
         self.conv_list.append(Conv(in_channels = n_mels * outputs_per_step,
                                    out_channels = num_hidden,
@@ -43,17 +44,9 @@ class PostConvNet(dg.Layer):
             self.add_sublayer("conv_list_{}".format(i), layer)

         self.batch_norm_list = [dg.BatchNorm(num_hidden,
-                                             param_attr = fluid.ParamAttr(name='weight'),
-                                             bias_attr = fluid.ParamAttr(name='bias'),
-                                             moving_mean_name = 'moving_mean',
-                                             moving_variance_name = 'moving_var',
                                              data_layout='NCHW') for _ in range(num_conv-1)]
-        self.batch_norm_list.append(dg.BatchNorm(n_mels * outputs_per_step,
-                                                 param_attr = fluid.ParamAttr(name='weight'),
-                                                 bias_attr = fluid.ParamAttr(name='bias'),
-                                                 moving_mean_name = 'moving_mean',
-                                                 moving_variance_name = 'moving_var',
-                                                 data_layout='NCHW'))
+        #self.batch_norm_list.append(dg.BatchNorm(n_mels * outputs_per_step,
+        #                                         data_layout='NCHW'))
         for i, layer in enumerate(self.batch_norm_list):
             self.add_sublayer("batch_norm_list_{}".format(i), layer)
@@ -67,9 +60,15 @@ class PostConvNet(dg.Layer):
         Returns:
             output (Variable), Shape(B, T, C), the result after postconvnet.
         """

         input = layers.transpose(input, [0,2,1])
         len = input.shape[-1]
-        for batch_norm, conv in zip(self.batch_norm_list, self.conv_list):
+        for i in range(self.num_conv-1):
+            batch_norm = self.batch_norm_list[i]
+            conv = self.conv_list[i]
             input = layers.dropout(layers.tanh(batch_norm(conv(input)[:,:,:len])), self.dropout)
+
+        conv = self.conv_list[self.num_conv-1]
+        input = conv(input)[:,:,:len]
+
         output = layers.transpose(input, [0,2,1])
         return output

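A condensed restatement of the rewritten forward pass, as I read the diff: with the last BatchNorm commented out, only the first num_conv - 1 convolutions keep the BatchNorm/tanh/dropout stack, while the final convolution is applied bare. The helper name below is hypothetical; it only mirrors the logic:

    import paddle.fluid.layers as layers

    def postconv_forward(x, conv_list, batch_norm_list, num_conv, dropout):
        # x: (B, T, C); conv_list holds num_conv convs, batch_norm_list holds num_conv - 1 norms
        x = layers.transpose(x, [0, 2, 1])                 # (B, C, T)
        length = x.shape[-1]
        for i in range(num_conv - 1):
            x = layers.dropout(
                layers.tanh(batch_norm_list[i](conv_list[i](x)[:, :, :length])),
                dropout)
        x = conv_list[num_conv - 1](x)[:, :, :length]      # last conv: no BatchNorm, no tanh
        return layers.transpose(x, [0, 2, 1])              # back to (B, T, C)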
View File

@@ -1,5 +1,6 @@
 import paddle.fluid.dygraph as dg
 import paddle.fluid.layers as layers
+from parakeet.modules.layers import Linear

 class PreNet(dg.Layer):
     def __init__(self, input_size, hidden_size, output_size, dropout_rate=0.2):
@@ -14,8 +15,8 @@ class PreNet(dg.Layer):
         self.output_size = output_size
         self.dropout_rate = dropout_rate

-        self.linear1 = dg.Linear(input_size, hidden_size)
-        self.linear2 = dg.Linear(hidden_size, output_size)
+        self.linear1 = Linear(input_size, hidden_size)
+        self.linear2 = Linear(hidden_size, output_size)

     def forward(self, x):
         """