add_TransformerTTS

This commit is contained in:
lifuchen 2019-12-16 09:04:22 +00:00 committed by chenfeiyu
parent fd9e198ab6
commit 8a9bbc2634
14 changed files with 1593 additions and 2 deletions

View File

@ -88,7 +88,7 @@ def batch_spec(minibatch, pad_value=0., dtype=np.float32):
mono_channel = False
lengths = [example.shape[-1] for example in minibatch] # assume (channel, F, n_frame) or (F, n_frame)
max_len = np.max(lengths)
max_len = np.max(lengths)
batch = []
for example in minibatch:

View File

@ -0,0 +1,20 @@
audio:
num_mels: 80
n_fft: 2048
sr: 22050
preemphasis: 0.97
hop_length: 275
win_length: 1102
power: 1.2
min_level_db: -100
ref_level_db: 20
outputs_per_step: 1
max_len: 50
transformer_step: 1
postnet_step: 1
use_gpu: True
checkpoint_path: ./checkpoint
log_dir: ./log
sample_path: ./sample

View File

@ -0,0 +1,27 @@
audio:
num_mels: 80
n_fft: 2048
sr: 22050
preemphasis: 0.97
hop_length: 275
win_length: 1102
power: 1.2
min_level_db: -100
ref_level_db: 20
outputs_per_step: 1
network:
hidden_size: 256
embedding_size: 512
batch_size: 32
epochs: 10000
lr: 0.001
save_step: 500
use_gpu: True
use_data_parallel: False
data_path: ../../../dataset/LJSpeech-1.1
save_path: ./checkpoint
log_dir: ./log

View File

@ -0,0 +1,32 @@
audio:
num_mels: 80
n_fft: 2048
sr: 22050
preemphasis: 0.97
hop_length: 275
win_length: 1102
power: 1.2
min_level_db: -100
ref_level_db: 20
outputs_per_step: 1
network:
hidden_size: 256
embedding_size: 512
batch_size: 32
epochs: 10000
lr: 0.001
save_step: 500
image_step: 2000
use_gpu: True
use_data_parallel: False
data_path: ../../../dataset/LJSpeech-1.1
save_path: ./checkpoint
log_dir: ./log

View File

@ -0,0 +1,170 @@
import math
import numpy as np
import paddle
from paddle import fluid
import paddle.fluid.dygraph as dg
class Conv1D(dg.Layer):
    """
    A 1-D convolution block implemented with Conv2D.

    The 3-D input gets a dummy axis of size 1 inserted at position 2, is
    convolved with a ``(1, filter_size)`` kernel and is squeezed back to 3-D.
    The original note on this class says it is meant to be used with
    ``stride == 1`` so the output keeps the input length; this is NOT
    enforced here, ``stride`` is simply forwarded to Conv2D.
    """

    def __init__(self,
                 name_scope,
                 in_channels,
                 num_filters,
                 filter_size=3,
                 padding=0,
                 dilation=1,
                 stride=1,
                 groups=None,
                 param_attr=None,
                 bias_attr=None,
                 use_cudnn=True,
                 act=None,
                 data_format='NCT',
                 dtype="float32"):
        """
        Args:
            name_scope (str): name scope forwarded to ``dg.Layer``.
            in_channels (int): number of input channels. Only stored on the
                instance; it is not passed to the wrapped Conv2D.
            num_filters (int): number of output channels.
            filter_size (int): kernel width along the time axis.
            padding (int): padding applied along the time axis.
            dilation (int): dilation along the time axis.
            stride (int): stride along the time axis.
            groups, param_attr, bias_attr, use_cudnn, act, dtype: forwarded
                unchanged to ``dg.Conv2D``.
            data_format (str): 'NCT' or 'NTC'; with 'NTC' the last two axes
                are swapped before and after the convolution.
        """
        super(Conv1D, self).__init__(name_scope, dtype=dtype)

        self.in_channels = in_channels
        self.num_filters = num_filters
        self.filter_size = filter_size
        self.stride = stride
        self.dilation = dilation
        self.padding = padding
        self.act = act
        self.data_format = data_format

        # The height axis is a dummy of size 1, so every 2-D hyper-parameter
        # is (1, value) and no padding is applied on that axis.
        self.conv = dg.Conv2D(
            self.full_name(),
            num_filters=num_filters,
            filter_size=(1, filter_size),
            stride=(1, stride),
            dilation=(1, dilation),
            padding=(0, padding),
            groups=groups,
            param_attr=param_attr,
            bias_attr=bias_attr,
            use_cudnn=use_cudnn,
            act=act,
            dtype=dtype)

    def forward(self, x):
        """
        Args:
            x (Variable): 3-D input. Shape (B, C_in, T) when
                ``data_format == 'NCT'`` or (B, T, C_in) when it is 'NTC'.
        Returns:
            x (Variable): 3-D output in the same layout as the input, with
                C_out (``num_filters``) channels.
        """
        if self.data_format == 'NTC':
            x = fluid.layers.transpose(x, [0, 2, 1])
        # (B, C, T) -> (B, C, 1, T) so that Conv2D can be applied.
        x = fluid.layers.unsqueeze(x, [2])
        x = self.conv(x)
        x = fluid.layers.squeeze(x, [2])
        if self.data_format == 'NTC':
            x = fluid.layers.transpose(x, [0, 2, 1])
        return x
class Pool1D(dg.Layer):
    """
    1-D pooling built on top of Pool2D.

    A dummy axis of size 1 is inserted at position 2, the input is pooled
    with a ``[1, pool_size]`` window, and the dummy axis is removed again.
    """

    def __init__(self,
                 name_scope,
                 pool_size=-1,
                 pool_type='max',
                 pool_stride=1,
                 pool_padding=0,
                 global_pooling=False,
                 use_cudnn=True,
                 ceil_mode=False,
                 exclusive=True,
                 data_format='NCT',
                 dtype='float32'):
        """
        Args:
            name_scope (str): name scope forwarded to ``dg.Layer``.
            pool_size (int): pooling window along the time axis.
            pool_type (str): pooling type forwarded to Pool2D.
            pool_stride (int): stride along the time axis.
            pool_padding (int): padding along the time axis.
            global_pooling, use_cudnn, ceil_mode, exclusive, dtype:
                forwarded unchanged to ``dg.Pool2D``.
            data_format (str): 'NCT' or 'NTC'; with 'NTC' the last two axes
                are swapped before and after pooling.
        """
        super(Pool1D, self).__init__(name_scope, dtype=dtype)

        # Keep a record of every hyper-parameter on the instance.
        self.pool_size = pool_size
        self.pool_type = pool_type
        self.pool_stride = pool_stride
        self.pool_padding = pool_padding
        self.global_pooling = global_pooling
        self.use_cudnn = use_cudnn
        self.ceil_mode = ceil_mode
        self.exclusive = exclusive
        self.data_format = data_format
        self.dtype = dtype

        # The dummy height axis is never pooled, strided or padded.
        self.pool2d = dg.Pool2D(
            self.full_name(),
            [1, pool_size],
            pool_type=pool_type,
            pool_stride=[1, pool_stride],
            pool_padding=[0, pool_padding],
            global_pooling=global_pooling,
            use_cudnn=use_cudnn,
            ceil_mode=ceil_mode,
            exclusive=exclusive,
            dtype=dtype)

    def forward(self, x):
        """
        Args:
            x (Variable): 3-D input, (B, C, T) for 'NCT' or (B, T, C) for
                'NTC'.
        Returns:
            x (Variable): pooled 3-D output in the same layout as the input.
        """
        channels_last = self.data_format == 'NTC'
        if channels_last:
            x = fluid.layers.transpose(x, [0, 2, 1])
        expanded = fluid.layers.unsqueeze(x, [2])
        pooled = self.pool2d(expanded)
        x = fluid.layers.squeeze(pooled, [2])
        if channels_last:
            x = fluid.layers.transpose(x, [0, 2, 1])
        return x
class DynamicGRU(dg.Layer):
    """
    Unrolls a ``dg.GRUUnit`` over axis 1 of a 3-D input.

    The input is sliced step by step along axis 1; every slice is fed to the
    GRU unit together with the previous hidden state, and the hidden states
    are concatenated back along axis 1.
    """

    def __init__(self,
                 scope_name,
                 size,
                 param_attr=None,
                 bias_attr=None,
                 is_reverse=False,
                 gate_activation='sigmoid',
                 candidate_activation='tanh',
                 h_0=None,
                 origin_mode=False,
                 init_size=None):
        """
        Args:
            scope_name (str): name scope forwarded to ``dg.Layer``.
            size (int): hidden size; the GRU unit is built with ``size * 3``.
            param_attr, bias_attr: forwarded to ``dg.GRUUnit``.
            is_reverse (bool): process the sequence from the last step to
                the first.
            gate_activation (str): gate activation of the GRU unit.
            candidate_activation (str): candidate activation of the GRU unit.
            h_0 (Variable): initial hidden state used at the first step.
            origin_mode (bool): forwarded to ``dg.GRUUnit``.
            init_size: accepted but not used by this class.
        """
        super(DynamicGRU, self).__init__(scope_name)
        self.gru_unit = dg.GRUUnit(
            self.full_name(),
            size * 3,
            param_attr=param_attr,
            bias_attr=bias_attr,
            activation=candidate_activation,
            gate_activation=gate_activation,
            origin_mode=origin_mode)
        self.size = size
        self.h_0 = h_0
        self.is_reverse = is_reverse

    def forward(self, inputs):
        """
        Args:
            inputs (Variable): 3-D input indexed as [batch, step, feature].
        Returns:
            Variable: hidden states of all steps concatenated along axis 1,
                always in the original (forward) step order.
        """
        num_steps = inputs.shape[1]
        if self.is_reverse:
            step_order = range(num_steps - 1, -1, -1)
        else:
            step_order = range(num_steps)

        state = self.h_0
        outputs = []
        for t in step_order:
            frame = inputs[:, t:t + 1, :]
            frame = fluid.layers.reshape(
                frame, [-1, frame.shape[2]], inplace=False)
            # Only the new hidden state is needed; the other two outputs of
            # the GRU unit are discarded.
            state, _, _ = self.gru_unit(frame, state)
            outputs.append(
                fluid.layers.reshape(
                    state, [-1, 1, state.shape[1]], inplace=False))

        # States were collected in processing order; restore time order.
        if self.is_reverse:
            outputs = outputs[::-1]
        return fluid.layers.concat(outputs, axis=1)

View File

@ -0,0 +1,525 @@
import math
from parakeet.g2p.text.symbols import symbols
import paddle.fluid.dygraph as dg
import paddle.fluid as fluid
import paddle.fluid.layers as layers
from layers import Conv1D, Pool1D, DynamicGRU
import numpy as np
class FC(dg.Layer):
    """
    Fully connected layer: ``matmul(x, weight)`` plus an optional bias.

    The weight uses Xavier initialisation. The bias is drawn from a uniform
    distribution over (-k, k) with ``k = sqrt(1 / in_features)``.
    """

    def __init__(self, name_scope, in_features, out_features, is_bias=True, dtype="float32", gain=1):
        """
        Args:
            name_scope (str): name scope forwarded to ``dg.Layer``.
            in_features (int): size of the last axis of the input.
            out_features (int): size of the last axis of the output.
            is_bias (bool): a bias is created unless this is exactly False.
            dtype (str): parameter dtype.
            gain: stored on the instance only; it is not applied to the
                weight.
        """
        super(FC, self).__init__(name_scope)

        self.in_features = in_features
        self.out_features = out_features
        self.is_bias = is_bias
        self.dtype = dtype
        self.gain = gain

        weight_init = fluid.initializer.XavierInitializer()
        self.weight = self.create_parameter(
            fluid.ParamAttr(name='weight'),
            shape=(in_features, out_features),
            dtype=dtype,
            default_initializer=weight_init)

        # Mind the implicit conversion to ParamAttr for many cases.
        if is_bias is not False:
            bound = math.sqrt(1 / in_features)
            bias_init = fluid.initializer.Uniform(low=-bound, high=bound)
            self.bias = self.create_parameter(
                fluid.ParamAttr(name='bias'),
                shape=(out_features, ),
                is_bias=True,
                dtype=dtype,
                default_initializer=bias_init)

    def forward(self, x):
        """Apply the linear projection (and the bias when one was created)."""
        projected = fluid.layers.matmul(x, self.weight)
        if not hasattr(self, "bias"):
            return projected
        return fluid.layers.elementwise_add(projected, self.bias)
class Conv(dg.Layer):
    """
    Thin wrapper around ``Conv1D`` that fixes the parameter initialisers.

    The weight uses Xavier initialisation; the bias (when enabled) is drawn
    uniformly from (-k, k) with ``k = sqrt(1 / in_channels)``.
    """

    def __init__(self, name_scope, in_channels, out_channels, filter_size=1,
                 padding=0, dilation=1, stride=1, use_cudnn=True,
                 data_format="NCT", is_bias=True, gain=1):
        """
        Args:
            name_scope (str): name scope forwarded to ``dg.Layer``.
            in_channels (int): number of input channels; also sets the bias
                initialiser range.
            out_channels (int): number of output channels.
            filter_size, padding, dilation, stride, use_cudnn, data_format:
                forwarded to ``Conv1D``.
            is_bias (bool): a bias is created unless this is exactly False.
            gain: stored on the instance only; it is not applied anywhere.
        """
        super(Conv, self).__init__(name_scope)

        self.in_channels = in_channels
        self.out_channels = out_channels
        self.filter_size = filter_size
        self.padding = padding
        self.dilation = dilation
        self.stride = stride
        self.use_cudnn = use_cudnn
        self.data_format = data_format
        self.is_bias = is_bias
        self.gain = gain

        self.weight_attr = fluid.ParamAttr(
            name='weight',
            initializer=fluid.initializer.XavierInitializer())

        if is_bias is not False:
            bound = math.sqrt(1 / in_channels)
            self.bias_attr = fluid.ParamAttr(
                name='bias',
                initializer=fluid.initializer.Uniform(low=-bound, high=bound))
        else:
            self.bias_attr = None

        self.conv = Conv1D(
            self.full_name(),
            in_channels=in_channels,
            num_filters=out_channels,
            filter_size=filter_size,
            padding=padding,
            dilation=dilation,
            stride=stride,
            param_attr=self.weight_attr,
            bias_attr=self.bias_attr,
            use_cudnn=use_cudnn,
            data_format=data_format)

    def forward(self, x):
        """Run the wrapped 1-D convolution on ``x``."""
        return self.conv(x)
class EncoderPrenet(dg.Layer):
    """
    Encoder pre-net: symbol embedding, three (conv -> batch norm -> relu ->
    dropout) stages and a final linear projection.
    """

    def __init__(self, name_scope, embedding_size, num_hidden, use_cudnn=True):
        """
        Args:
            name_scope (str): name scope forwarded to ``dg.Layer``.
            embedding_size (int): size of the symbol embedding.
            num_hidden (int): channels of the convolutions and projection.
            use_cudnn (bool): forwarded to the convolutions.
        """
        super(EncoderPrenet, self).__init__(name_scope)
        self.embedding_size = embedding_size
        self.num_hidden = num_hidden
        self.use_cudnn = use_cudnn

        def make_conv(in_channels):
            # All three convolutions share everything but the input width.
            return Conv(self.full_name(),
                        in_channels=in_channels,
                        out_channels=num_hidden,
                        filter_size=5,
                        padding=int(np.floor(5 / 2)),
                        use_cudnn=use_cudnn,
                        data_format="NCT",
                        gain=math.sqrt(2))

        def make_batch_norm():
            # Fresh ParamAttr objects are created for every batch norm.
            return dg.BatchNorm(self.full_name(), num_hidden,
                                param_attr=fluid.ParamAttr(name='weight'),
                                bias_attr=fluid.ParamAttr(name='bias'),
                                moving_mean_name='moving_mean',
                                moving_variance_name='moving_var',
                                data_layout='NCHW')

        self.embedding = dg.Embedding(self.full_name(),
                                      size=[len(symbols), embedding_size],
                                      param_attr=fluid.ParamAttr(name='weight'),
                                      padding_idx=None)

        # Sublayers are created in the same order as before: the three
        # convolutions first, then the three batch norms, then the projection.
        self.conv1 = make_conv(embedding_size)
        self.conv2 = make_conv(num_hidden)
        self.conv3 = make_conv(num_hidden)

        self.batch_norm1 = make_batch_norm()
        self.batch_norm2 = make_batch_norm()
        self.batch_norm3 = make_batch_norm()

        self.projection = FC(self.full_name(), num_hidden, num_hidden)

    def forward(self, x):
        """
        Args:
            x (Variable): symbol ids; a trailing axis is appended before the
                embedding lookup.
        Returns:
            Variable: pre-net output in (N, T, C) layout.
        """
        ids = fluid.layers.unsqueeze(x, axes=[-1])
        x = self.embedding(ids)  # (batch_size, seq_len, embedding_size)
        x = layers.transpose(x, [0, 2, 1])  # (N, C, T) for the convolutions

        stages = (
            (self.conv1, self.batch_norm1),
            (self.conv2, self.batch_norm2),
            (self.conv3, self.batch_norm3),
        )
        for conv, batch_norm in stages:
            x = layers.dropout(layers.relu(batch_norm(conv(x))), 0.2)

        x = layers.transpose(x, [0, 2, 1])  # (N, T, C)
        return self.projection(x)
class FFN(dg.Layer):
    """
    Position-wise feed-forward block made of two kernel-size-1 convolutions
    (num_hidden -> 4 * num_hidden -> num_hidden), followed by a residual
    connection and layer normalisation.
    """

    def __init__(self, name_scope, num_hidden, use_cudnn=True):
        """
        Args:
            name_scope (str): name scope forwarded to ``dg.Layer``.
            num_hidden (int): input and output feature size.
            use_cudnn (bool): forwarded to both convolutions.
        """
        super(FFN, self).__init__(name_scope)
        self.num_hidden = num_hidden
        self.use_cudnn = use_cudnn

        inner_size = num_hidden * 4
        self.w_1 = Conv(self.full_name(),
                        in_channels=num_hidden,
                        out_channels=inner_size,
                        filter_size=1,
                        use_cudnn=use_cudnn,
                        data_format="NCT",
                        gain=math.sqrt(2))
        self.w_2 = Conv(self.full_name(),
                        in_channels=inner_size,
                        out_channels=num_hidden,
                        filter_size=1,
                        use_cudnn=use_cudnn,
                        data_format="NCT",
                        gain=math.sqrt(2))
        self.layer_norm = dg.LayerNorm(self.full_name(), begin_norm_axis=2)

    def forward(self, input):
        """
        Args:
            input (Variable): 3-D input; its last two axes are swapped
                before and after the convolutions.
        Returns:
            Variable: layer-normalised ``FFN(input) + input``.
        """
        # Feed-forward network, computed in channel-first layout.
        hidden = layers.transpose(input, [0, 2, 1])
        hidden = layers.relu(self.w_1(hidden))
        hidden = self.w_2(hidden)
        hidden = layers.transpose(hidden, [0, 2, 1])

        # NOTE: no dropout is applied here. The original author was not sure
        # where it belongs: the paper puts it before the residual, but with
        # it the diagonal alignment did not appear correctly in the
        # attention plot.

        # Residual connection followed by layer normalisation.
        return self.layer_norm(hidden + input)
class DecoderPrenet(dg.Layer):
    """
    Decoder pre-net: two fully connected layers, each followed by relu and
    dropout.
    """

    def __init__(self, name_scope, input_size, hidden_size, output_size, dropout_rate=0.5):
        """
        Args:
            name_scope (str): name scope forwarded to ``dg.Layer``.
            input_size (int): feature size of the input.
            hidden_size (int): feature size after the first layer.
            output_size (int): feature size after the second layer.
            dropout_rate (float): dropout probability used after each layer.
        """
        super(DecoderPrenet, self).__init__(name_scope)
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.dropout_rate = dropout_rate

        # Both layers use the default gain of FC.
        self.fc1 = FC(self.full_name(), input_size, hidden_size)
        self.fc2 = FC(self.full_name(), hidden_size, output_size)

    def forward(self, x):
        """Apply fc -> relu -> dropout twice and return the result."""
        for fc in (self.fc1, self.fc2):
            x = layers.dropout(layers.relu(fc(x)), self.dropout_rate)
        return x
class ScaledDotProductAttention(dg.Layer):
    """
    Scaled dot-product attention with optional key and query masks.

    Note that the mask convention differs from the PyTorch reference: here
    the mask is multiplied into the scores, and positions where the mask is
    0 additionally receive a large negative bias before the softmax.
    """

    def __init__(self, name_scope, d_key):
        """
        Args:
            name_scope (str): name scope forwarded to ``dg.Layer``.
            d_key (int): key size; scores are divided by ``sqrt(d_key)``.
        """
        super(ScaledDotProductAttention, self).__init__(name_scope)
        self.d_key = d_key

    def forward(self, key, value, query, mask=None, query_mask=None):
        """
        Args:
            key, value, query (Variable): attention inputs.
            mask (Variable, optional): multiplicative mask on the scores;
                zeros mark positions to be ignored.
            query_mask (Variable, optional): multiplied into the attention
                weights after the softmax.
        Returns:
            tuple: ``(result, attention)``, the attended values and the
                attention weights.
        """
        # Raw scores; transpose_y transposes the last two axes of the key.
        scores = layers.matmul(query, key, transpose_y=True)
        scores = scores / math.sqrt(self.d_key)

        # Mask keys to ignore padding.
        if mask is not None:
            scores = scores * mask
            penalty = (mask == 0).astype(float) * (-2 ** 32 + 1)
            scores = scores + penalty

        weights = layers.softmax(scores)

        # Mask queries to ignore padding (the original author noted being
        # unsure how this part works).
        if query_mask is not None:
            weights = weights * query_mask

        context = layers.matmul(weights, value)
        return context, weights
class MultiheadAttention(dg.Layer):
    """
    Multi-head attention whose output is concatenated with the query input,
    projected back to ``num_hidden``, added to the query input and layer
    normalised.
    """

    def __init__(self, name_scope, num_hidden, num_head=4):
        """
        Args:
            name_scope (str): name scope forwarded to ``dg.Layer``.
            num_hidden (int): feature size of key, value and query.
            num_head (int): number of heads; each head works on
                ``num_hidden // num_head`` features.
        """
        super(MultiheadAttention, self).__init__(name_scope)
        self.num_hidden = num_hidden
        self.num_hidden_per_attn = num_hidden // num_head
        self.num_head = num_head

        self.key = FC(self.full_name(), num_hidden, num_hidden, is_bias=False)
        self.value = FC(self.full_name(), num_hidden, num_hidden, is_bias=False)
        self.query = FC(self.full_name(), num_hidden, num_hidden, is_bias=False)

        self.scal_attn = ScaledDotProductAttention(self.full_name(), self.num_hidden_per_attn)

        # Consumes [query_input, attention result], hence num_hidden * 2.
        self.fc = FC(self.full_name(), num_hidden * 2, num_hidden)

        self.layer_norm = dg.LayerNorm(self.full_name(), begin_norm_axis=2)

    def _split_heads(self, projected, batch_size, seq_len):
        """Reshape (batch, seq, feature) into (num_head * batch, seq, per_head)."""
        per_head = self.num_hidden_per_attn
        heads = layers.reshape(projected, [batch_size, seq_len, self.num_head, per_head])
        heads = layers.transpose(heads, [2, 0, 1, 3])
        return layers.reshape(heads, [-1, seq_len, per_head])

    def forward(self, key, value, query_input, mask=None, query_mask=None):
        """
        Args:
            key, value (Variable): shape (batch_size, seq_len, feature) with
                feature = num_head * num_hidden_per_attn.
            query_input (Variable): query, same feature size.
            mask (Variable, optional): key mask, tiled ``num_head`` times
                along axis 0.
            query_mask (Variable, optional): query mask; a trailing axis is
                added and it is tiled to the key length and ``num_head``
                times along axis 0.
        Returns:
            tuple: ``(result, attention)``.
        """
        batch_size = key.shape[0]
        seq_len_key = key.shape[1]
        seq_len_query = query_input.shape[1]

        # Repeat the masks once per head.
        if query_mask is not None:
            query_mask = layers.unsqueeze(query_mask, axes=[-1])
            query_mask = layers.expand(query_mask, [self.num_head, 1, seq_len_key])
        if mask is not None:
            mask = layers.expand(mask, (self.num_head, 1, 1))

        # Project, then fold the heads into the batch axis.
        key = self._split_heads(self.key(key), batch_size, seq_len_key)
        value = self._split_heads(self.value(value), batch_size, seq_len_key)
        query = self._split_heads(self.query(query_input), batch_size, seq_len_query)

        result, attention = self.scal_attn(key, value, query, mask=mask, query_mask=query_mask)

        # Undo the head folding and concatenate the per-head results.
        result = layers.reshape(
            result, [self.num_head, batch_size, seq_len_query, self.num_hidden_per_attn])
        result = layers.transpose(result, [1, 2, 0, 3])
        result = layers.reshape(result, [batch_size, seq_len_query, -1])

        # Concatenate with the input, project, add the residual, normalise.
        result = layers.concat([query_input, result], axis=-1)
        result = self.fc(result)
        result = self.layer_norm(result + query_input)
        return result, attention
class PostConvNet(dg.Layer):
    """
    Post-net of five 1-D convolutions applied to the mel output.

    Every convolution pads by 4 with a kernel of 5, and the last 4 frames of
    its output are dropped, so the time length is preserved. All but the
    last convolution are followed by batch norm, tanh and dropout.
    """

    def __init__(self, name_scope, config):
        """
        Args:
            name_scope (str): name scope forwarded to ``dg.Layer``.
            config: configuration object; reads ``network.hidden_size``,
                ``audio.num_mels``, ``audio.outputs_per_step`` and
                ``use_gpu``.
        """
        super(PostConvNet, self).__init__(name_scope)

        num_hidden = config.network.hidden_size
        mel_channels = config.audio.num_mels * config.audio.outputs_per_step
        self.num_hidden = num_hidden

        def make_batch_norm():
            return dg.BatchNorm(self.full_name(), num_hidden,
                                param_attr=fluid.ParamAttr(name='weight'),
                                bias_attr=fluid.ParamAttr(name='bias'),
                                moving_mean_name='moving_mean',
                                moving_variance_name='moving_var',
                                data_layout='NCHW')

        self.conv1 = Conv(self.full_name(),
                          in_channels=mel_channels,
                          out_channels=num_hidden,
                          filter_size=5,
                          padding=4,
                          use_cudnn=config.use_gpu,
                          data_format="NCT",
                          gain=5 / 3)

        self.conv_list = []
        for _ in range(3):
            self.conv_list.append(Conv(self.full_name(),
                                       in_channels=num_hidden,
                                       out_channels=num_hidden,
                                       filter_size=5,
                                       padding=4,
                                       use_cudnn=config.use_gpu,
                                       data_format="NCT",
                                       gain=5 / 3))
        # Plain Python lists are not tracked, so register each layer.
        for idx, conv in enumerate(self.conv_list):
            self.add_sublayer("conv_list_{}".format(idx), conv)

        self.conv5 = Conv(self.full_name(),
                          in_channels=num_hidden,
                          out_channels=mel_channels,
                          filter_size=5,
                          padding=4,
                          use_cudnn=config.use_gpu,
                          data_format="NCT")

        self.batch_norm_list = []
        for _ in range(3):
            self.batch_norm_list.append(make_batch_norm())
        for idx, batch_norm in enumerate(self.batch_norm_list):
            self.add_sublayer("batch_norm_list_{}".format(idx), batch_norm)

        self.batch_norm1 = make_batch_norm()

    def forward(self, input):
        """
        Args:
            input (Variable): 3-D input in channel-first layout.
        Returns:
            Variable: post-net output with the same time length.
        """
        stages = [(self.batch_norm1, self.conv1)]
        stages.extend(zip(self.batch_norm_list, self.conv_list))

        hidden = input
        for batch_norm, conv in stages:
            trimmed = conv(hidden)[:, :, :-4]  # drop the 4 extra frames
            hidden = layers.dropout(layers.tanh(batch_norm(trimmed)), 0.1)

        return self.conv5(hidden)[:, :, :-4]
class CBHG(dg.Layer):
    """
    CBHG module: a bank of 1-D convolutions, max pooling, two projection
    convolutions with a residual connection, a highway network and two
    stacked bidirectional GRU layers.
    """

    def __init__(self, name_scope, config, K=16, projection_size = 256, num_gru_layers=2,
                 max_pool_kernel_size=2, is_post=False):
        """
        :param name_scope: name scope forwarded to ``dg.Layer``
        :param config: configuration object; reads ``network.hidden_size``
            and ``batch_size``
        :param K: # of convolution banks
        :param projection_size: dimension of projection unit
        :param num_gru_layers: # of layers of GRUcell (accepted but not used;
            two GRU layers are always built)
        :param max_pool_kernel_size: max pooling kernel size
        :param is_post: whether post processing or not (accepted but not used)
        """
        super(CBHG, self).__init__(name_scope)

        hidden_size = config.network.hidden_size
        self.hidden_size = hidden_size
        self.projection_size = projection_size

        def make_batch_norm(num_channels):
            return dg.BatchNorm(self.full_name(), num_channels,
                                param_attr=fluid.ParamAttr(name='weight'),
                                bias_attr=fluid.ParamAttr(name='bias'),
                                moving_mean_name='moving_mean',
                                moving_variance_name='moving_var',
                                data_layout='NCHW')

        def make_gru(reverse):
            return DynamicGRU(self.full_name(),
                              size=self.hidden_size // 2,
                              param_attr=fluid.ParamAttr(name='weight'),
                              bias_attr=fluid.ParamAttr(name='bias'),
                              is_reverse=reverse,
                              origin_mode=True,
                              h_0=h_0)

        # Convolution bank: kernel sizes 1..K. The first one reads the
        # projection-sized input, the others read hidden-sized features.
        self.conv_list = [Conv(self.full_name(),
                               in_channels=projection_size,
                               out_channels=hidden_size,
                               filter_size=1,
                               padding=int(np.floor(1 / 2)),
                               data_format="NCT")]
        for width in range(2, K + 1):
            self.conv_list.append(Conv(self.full_name(),
                                       in_channels=hidden_size,
                                       out_channels=hidden_size,
                                       filter_size=width,
                                       padding=int(np.floor(width / 2)),
                                       data_format="NCT"))
        for idx, conv in enumerate(self.conv_list):
            self.add_sublayer("conv_list_{}".format(idx), conv)

        self.batchnorm_list = []
        for _ in range(K):
            self.batchnorm_list.append(make_batch_norm(hidden_size))
        for idx, batch_norm in enumerate(self.batchnorm_list):
            self.add_sublayer("batchnorm_list_{}".format(idx), batch_norm)

        # The bank outputs are concatenated on the channel axis.
        conv_outdim = hidden_size * K

        self.conv_projection_1 = Conv(self.full_name(),
                                      in_channels=conv_outdim,
                                      out_channels=hidden_size,
                                      filter_size=3,
                                      padding=int(np.floor(3 / 2)),
                                      data_format="NCT")
        self.conv_projection_2 = Conv(self.full_name(),
                                      in_channels=hidden_size,
                                      out_channels=projection_size,
                                      filter_size=3,
                                      padding=int(np.floor(3 / 2)),
                                      data_format="NCT")

        self.batchnorm_proj_1 = make_batch_norm(hidden_size)
        self.batchnorm_proj_2 = make_batch_norm(projection_size)

        self.max_pool = Pool1D(self.full_name(), pool_size=max_pool_kernel_size,
                               pool_type='max',
                               pool_stride=1,
                               pool_padding=1,
                               data_format="NCT")

        self.highway = Highwaynet(self.full_name(), self.projection_size)

        # One zero initial state, sized by the configured batch size, is
        # shared by all four GRUs.
        h_0 = dg.to_variable(
            np.zeros((config.batch_size, hidden_size // 2), dtype="float32"))

        gru_input_size = hidden_size // 2 * 3

        self.fc_forward1 = FC(self.full_name(), hidden_size, gru_input_size)
        self.fc_reverse1 = FC(self.full_name(), hidden_size, gru_input_size)
        self.gru_forward1 = make_gru(False)
        self.gru_reverse1 = make_gru(True)

        self.fc_forward2 = FC(self.full_name(), hidden_size, gru_input_size)
        self.fc_reverse2 = FC(self.full_name(), hidden_size, gru_input_size)
        self.gru_forward2 = make_gru(False)
        self.gru_reverse2 = make_gru(True)

    def _conv_fit_dim(self, x, filter_size=3):
        """Drop the last frame when ``filter_size`` is even, else return ``x``."""
        return x[:, :, :-1] if filter_size % 2 == 0 else x

    def forward(self, input_):
        """
        Args:
            input_ (Variable): input of shape [N, C, T].
        Returns:
            Variable: output of the second bidirectional GRU layer, with the
                last two axes swapped back to channel-first.
        """
        # Convolution bank. Each convolution consumes the previous one's
        # output; all intermediate outputs are kept for concatenation.
        bank_outputs = []
        features = input_
        for idx, (conv, batchnorm) in enumerate(zip(self.conv_list, self.batchnorm_list)):
            # idx + 1 is the kernel size of this bank entry.
            features = self._conv_fit_dim(conv(features), idx + 1)
            features = layers.relu(batchnorm(features))
            bank_outputs.append(features)
        conv_cat = layers.concat(bank_outputs, axis=1)

        # Pooling pads by 1, so drop the extra last frame.
        conv_pool = self.max_pool(conv_cat)[:, :, :-1]

        conv_proj = self._conv_fit_dim(self.conv_projection_1(conv_pool))
        conv_proj = layers.relu(self.batchnorm_proj_1(conv_proj))
        conv_proj = self._conv_fit_dim(self.conv_projection_2(conv_proj))
        conv_proj = self.batchnorm_proj_2(conv_proj) + input_
        # conv_proj.shape = [N, C, T]

        highway = self.highway(layers.transpose(conv_proj, [0, 2, 1]))
        # highway.shape = [N, T, C]

        # First bidirectional GRU layer.
        fc_forward = self.fc_forward1(highway)
        fc_reverse = self.fc_reverse1(highway)
        out_forward = self.gru_forward1(fc_forward)
        out_reverse = self.gru_reverse1(fc_reverse)
        out = layers.concat([out_forward, out_reverse], axis=-1)

        # Second bidirectional GRU layer.
        fc_forward = self.fc_forward2(out)
        fc_reverse = self.fc_reverse2(out)
        out_forward = self.gru_forward2(fc_forward)
        out_reverse = self.gru_reverse2(fc_reverse)
        out = layers.concat([out_forward, out_reverse], axis=-1)

        return layers.transpose(out, [0, 2, 1])
class Highwaynet(dg.Layer):
    """
    Stack of highway layers: ``out = H(x) * T(x) + x * (1 - T(x))`` where
    ``H`` is a relu-activated linear layer and ``T`` a sigmoid gate.
    """

    def __init__(self, name_scope, num_units, num_layers=4):
        """
        Args:
            name_scope (str): name scope forwarded to ``dg.Layer``.
            num_units (int): feature size of every linear and gate layer.
            num_layers (int): number of highway layers.
        """
        super(Highwaynet, self).__init__(name_scope)
        self.num_units = num_units
        self.num_layers = num_layers

        self.gates = []
        self.linears = []
        for _ in range(num_layers):
            self.linears.append(FC(self.full_name(), num_units, num_units))
            self.gates.append(FC(self.full_name(), num_units, num_units))

        # Layers held in plain lists must be registered explicitly.
        for idx in range(len(self.linears)):
            self.add_sublayer("linears_{}".format(idx), self.linears[idx])
            self.add_sublayer("gates_{}".format(idx), self.gates[idx])

    def forward(self, input_):
        """Run ``input_`` through every highway layer in order."""
        out = input_
        for linear, gate in zip(self.linears, self.gates):
            transformed = fluid.layers.relu(linear(out))
            transform_gate = fluid.layers.sigmoid(gate(out))
            carry_gate = 1 - transform_gate
            out = transformed * transform_gate + out * carry_gate
        return out

View File

@ -0,0 +1,207 @@
from module import *
from utils import get_positional_table, get_sinusoid_encoding_table
import paddle.fluid.dygraph as dg
import paddle.fluid as fluid
class Encoder(dg.Layer):
    """
    Transformer-TTS encoder: encoder pre-net, scaled sinusoidal position
    embedding, then three (self-attention, feed-forward) blocks.
    """

    def __init__(self, name_scope, embedding_size, num_hidden, config):
        """
        Args:
            name_scope (str): name scope forwarded to ``dg.Layer``.
            embedding_size (int): symbol embedding size of the pre-net.
            num_hidden (int): hidden size of the encoder.
            config: configuration object; reads ``use_gpu``.
        """
        super(Encoder, self).__init__(name_scope)
        self.num_hidden = num_hidden

        # Trainable scalar that scales the positional encoding, starts at 1.
        self.alpha = self.create_parameter(
            fluid.ParamAttr(name='alpha'),
            shape=(1, ),
            dtype='float32',
            default_initializer=fluid.initializer.ConstantInitializer(value=1.0))

        # Frozen sinusoidal table for 1024 positions; index 0 is padding.
        self.pos_inp = get_sinusoid_encoding_table(1024, self.num_hidden, padding_idx=0)
        pos_attr = fluid.ParamAttr(
            name='weight',
            initializer=fluid.initializer.NumpyArrayInitializer(self.pos_inp),
            trainable=False)
        self.pos_emb = dg.Embedding(name_scope=self.full_name(),
                                    size=[1024, num_hidden],
                                    padding_idx=0,
                                    param_attr=pos_attr)

        self.encoder_prenet = EncoderPrenet(name_scope=self.full_name(),
                                            embedding_size=embedding_size,
                                            num_hidden=num_hidden,
                                            use_cudnn=config.use_gpu)

        self.layers = []
        for _ in range(3):
            self.layers.append(MultiheadAttention(self.full_name(), num_hidden))
        for idx, attn in enumerate(self.layers):
            self.add_sublayer("self_attn_{}".format(idx), attn)

        self.ffns = []
        for _ in range(3):
            self.ffns.append(FFN(self.full_name(), num_hidden, use_cudnn=config.use_gpu))
        for idx, ffn in enumerate(self.ffns):
            self.add_sublayer("ffns_{}".format(idx), ffn)

    def forward(self, x, positional):
        """
        Args:
            x (Variable): symbol ids fed to the encoder pre-net.
            positional (Variable): position ids; 0 marks padding.
        Returns:
            tuple: ``(x, query_mask, attentions)``: the encoder output in
                (N, T, C) layout, the padding mask (None outside training)
                and the list of attention weights of every block.
        """
        # Masks are only built in training mode.
        if fluid.framework._dygraph_tracer()._train_mode:
            query_mask = (positional != 0).astype(float)
            key_mask = (positional != 0).astype(float)
            # Repeat the key mask for every query position.
            mask = fluid.layers.expand(
                fluid.layers.unsqueeze(key_mask, [1]), [1, x.shape[1], 1])
        else:
            query_mask = None
            mask = None

        # Encoder pre-network.
        x = self.encoder_prenet(x)  # (N, T, C)

        # Add the scaled positional encoding.
        pos_encoding = self.pos_emb(fluid.layers.unsqueeze(positional, axes=[-1]))
        x = pos_encoding * self.alpha + x  # (N, T, C)

        # Positional dropout.
        x = layers.dropout(x, 0.1)

        # Self-attention encoder blocks.
        attentions = []
        for self_attn, ffn in zip(self.layers, self.ffns):
            x, attention = self_attn(x, x, x, mask=mask, query_mask=query_mask)
            x = ffn(x)
            attentions.append(attention)

        return x, query_mask, attentions
class Decoder(dg.Layer):
    """
    Transformer-TTS decoder: decoder pre-net, scaled sinusoidal position
    embedding, three (masked self-attention, encoder-decoder attention,
    feed-forward) blocks, then the mel projection, the stop-token projection
    and a residual post-net.
    """

    def __init__(self, name_scope, num_hidden, config):
        """
        Args:
            name_scope (str): name scope forwarded to ``dg.Layer``.
            num_hidden (int): hidden size of the decoder.
            config: configuration object; reads ``audio.num_mels``,
                ``audio.outputs_per_step`` and ``use_gpu`` (the post-net
                reads further fields).
        """
        super(Decoder, self).__init__(name_scope)
        self.num_hidden = num_hidden
        # Trainable scalar that scales the positional encoding, starts at 1.
        param = fluid.ParamAttr(name='alpha')
        self.alpha = self.create_parameter(param, shape=(1,), dtype='float32',
                                           default_initializer=fluid.initializer.ConstantInitializer(value=1.0))
        # Frozen sinusoidal table for 1024 positions; index 0 is padding.
        self.pos_inp = get_sinusoid_encoding_table(1024, self.num_hidden, padding_idx=0)
        self.pos_emb = dg.Embedding(name_scope=self.full_name(),
                                    size=[1024, num_hidden],
                                    padding_idx=0,
                                    param_attr=fluid.ParamAttr(
                                        name='weight',
                                        initializer=fluid.initializer.NumpyArrayInitializer(self.pos_inp),
                                        trainable=False))
        self.decoder_prenet = DecoderPrenet(self.full_name(),
                                            input_size=config.audio.num_mels,
                                            hidden_size=num_hidden * 2,
                                            output_size=num_hidden,
                                            dropout_rate=0.2)
        self.linear = FC(self.full_name(), num_hidden, num_hidden)

        self.selfattn_layers = [MultiheadAttention(self.full_name(), num_hidden) for _ in range(3)]
        for i, layer in enumerate(self.selfattn_layers):
            self.add_sublayer("self_attn_{}".format(i), layer)
        self.attn_layers = [MultiheadAttention(self.full_name(), num_hidden) for _ in range(3)]
        for i, layer in enumerate(self.attn_layers):
            self.add_sublayer("attn_{}".format(i), layer)
        # Honour config.use_gpu for cuDNN, as the encoder FFNs and the
        # post-net convolutions already do.
        self.ffns = [FFN(self.full_name(), num_hidden, use_cudnn=config.use_gpu) for _ in range(3)]
        for i, layer in enumerate(self.ffns):
            self.add_sublayer("ffns_{}".format(i), layer)
        self.mel_linear = FC(self.full_name(), num_hidden, config.audio.num_mels * config.audio.outputs_per_step)
        self.stop_linear = FC(self.full_name(), num_hidden, 1, gain=1)

        self.postconvnet = PostConvNet(self.full_name(), config)

    def forward(self, key, value, query, c_mask, positional):
        """
        Args:
            key, value (Variable): encoder outputs attended to by the
                encoder-decoder attention.
            query (Variable): decoder input (mel frames) fed to the pre-net.
            c_mask (Variable): encoder padding mask; only used in training
                mode.
            positional (Variable): decoder position ids; 0 marks padding.
        Returns:
            tuple: ``(mel_out, out, attn_list, stop_tokens, selfattn_list)``:
                the mel projection, the post-net refined mel, the
                encoder-decoder attention weights of every block, the
                stop-token projection and the self-attention weights of
                every block.
        """
        batch_size = key.shape[0]
        decoder_len = query.shape[1]

        # Get the decoder mask with a triangular matrix.
        if fluid.framework._dygraph_tracer()._train_mode:
            m_mask = (positional != 0).astype(float)
            # Non-zero where the key position is padding ...
            mask = np.repeat(np.expand_dims(m_mask.numpy() == 0, axis=1), decoder_len, axis=1)
            # ... or lies in the future (strict upper triangle).
            mask = mask + np.repeat(np.expand_dims(np.triu(np.ones([decoder_len, decoder_len]), 1), axis=0), batch_size, axis=0)
            # Flip to 1 = may attend, 0 = blocked.
            mask = fluid.layers.cast(dg.to_variable(mask == 0), np.float32)

            # (batch_size, decoder_len, decoder_len)
            zero_mask = fluid.layers.expand(fluid.layers.unsqueeze((c_mask != 0).astype(float), axes=2), [1, 1, decoder_len])
            # (batch_size, decoder_len, seq_len)
            zero_mask = fluid.layers.transpose(zero_mask, [0, 2, 1])
        else:
            # Outside training only the causal (triangular) mask is applied.
            mask = np.repeat(np.expand_dims(np.triu(np.ones([decoder_len, decoder_len]), 1), axis=0), batch_size, axis=0)
            mask = fluid.layers.cast(dg.to_variable(mask == 0), np.float32)
            m_mask, zero_mask = None, None

        # Decoder pre-network
        query = self.decoder_prenet(query)

        # Centered position
        query = self.linear(query)

        # Get position embedding
        positional = self.pos_emb(fluid.layers.unsqueeze(positional, axes=[-1]))
        query = positional * self.alpha + query

        # Positional dropout
        query = fluid.layers.dropout(query, 0.1)

        # Attention decoder-decoder, encoder-decoder
        selfattn_list = list()
        attn_list = list()

        for selfattn, attn, ffn in zip(self.selfattn_layers, self.attn_layers, self.ffns):
            query, attn_dec = selfattn(query, query, query, mask=mask, query_mask=m_mask)
            query, attn_dot = attn(key, value, query, mask=zero_mask, query_mask=m_mask)
            query = ffn(query)
            selfattn_list.append(attn_dec)
            attn_list.append(attn_dot)

        # Mel linear projection
        mel_out = self.mel_linear(query)
        # Post Mel Network: refine the mel in channel-first layout and add
        # the result back as a residual.
        postnet_input = layers.transpose(mel_out, [0, 2, 1])
        out = self.postconvnet(postnet_input)
        out = postnet_input + out
        out = layers.transpose(out, [0, 2, 1])

        # Stop tokens
        stop_tokens = self.stop_linear(query)

        return mel_out, out, attn_list, stop_tokens, selfattn_list
class Model(dg.Layer):
    """
    Transformer-TTS acoustic model: an ``Encoder`` over the characters
    followed by a ``Decoder`` that predicts mel frames and stop tokens.
    """

    def __init__(self, name_scope, config):
        """
        Args:
            name_scope (str): name scope forwarded to ``dg.Layer``.
            config: configuration object; reads ``network.embedding_size``
                and ``network.hidden_size`` and is passed on to both parts.
        """
        super(Model, self).__init__(name_scope)
        hidden_size = config.network.hidden_size
        self.encoder = Encoder(self.full_name(), config.network.embedding_size, hidden_size, config)
        self.decoder = Decoder(self.full_name(), hidden_size, config)
        self.config = config

    def forward(self, characters, mel_input, pos_text, pos_mel):
        """
        Args:
            characters (Variable): input symbol ids.
            mel_input (Variable): decoder input mel frames.
            pos_text (Variable): position ids of the characters.
            pos_mel (Variable): position ids of the mel frames.
        Returns:
            tuple: ``(mel_output, postnet_output, attn_probs, stop_preds,
                attns_enc, attns_dec)``.
        """
        # Shapes as noted by the original author:
        #   memory    (batch_size, seq_len, channel)
        #   c_mask    (batch_size, seq_len)
        #   attns_enc (channel / 2, seq_len, seq_len)
        memory, c_mask, attns_enc = self.encoder(characters, pos_text)

        #   mel_output / postnet_output (batch_size, mel_len, n_mel)
        #   attn_probs (128, mel_len, seq_len)
        #   stop_preds (batch_size, mel_len, 1)
        #   attns_dec  (128, mel_len, mel_len)
        decoded = self.decoder(memory, memory, mel_input, c_mask, pos_mel)
        mel_output, postnet_output, attn_probs, stop_preds, attns_dec = decoded

        return mel_output, postnet_output, attn_probs, stop_preds, attns_enc, attns_dec
class ModelPostNet(dg.Layer):
    """
    CBHG Network (mel -> linear)

    Projects the mel spectrogram to ``hidden_size`` channels, runs the CBHG
    module and projects its output to ``n_fft // 2 + 1`` magnitude bins.
    """

    def __init__(self, name_scope, config):
        """
        Args:
            name_scope (str): name scope forwarded to ``dg.Layer``.
            config: configuration object; reads ``audio.num_mels``,
                ``audio.n_fft`` and ``network.hidden_size`` (the CBHG reads
                further fields).
        """
        super(ModelPostNet, self).__init__(name_scope)
        self.pre_proj = Conv(self.full_name(),
                             in_channels=config.audio.num_mels,
                             out_channels=config.network.hidden_size,
                             data_format="NCT")
        self.cbhg = CBHG(self.full_name(), config)
        # post_proj consumes the CBHG output, which concatenates two GRU
        # directions of hidden_size // 2 each, so its input width is
        # hidden_size rather than num_mels. Conv uses in_channels to size the
        # bias initialiser range, so the value matters.
        self.post_proj = Conv(self.full_name(),
                              in_channels=config.network.hidden_size,
                              out_channels=(config.audio.n_fft // 2) + 1,
                              data_format="NCT")

    def forward(self, mel):
        """
        Args:
            mel (Variable): mel spectrogram; its last two axes are swapped
                to channel-first before the convolutions.
        Returns:
            Variable: predicted magnitude spectrogram, swapped back to the
                layout of the input.
        """
        mel = layers.transpose(mel, [0, 2, 1])
        mel = self.pre_proj(mel)
        mel = self.cbhg(mel)
        mag_pred = self.post_proj(mel)
        mag_pred = layers.transpose(mag_pred, [0, 2, 1])
        return mag_pred

View File

@ -0,0 +1,63 @@
import jsonargparse
def add_config_options_to_parser(parser):
    """Register every TransformerTTS option on ``parser``.

    Options are grouped with dotted names (``audio.*``, ``network.*``) plus
    flat training / synthesis options, and a ``-c/--config`` option that
    loads values from a config file through ``jsonargparse.ActionConfigFile``.

    Args:
        parser: argument parser exposing ``add_argument``; it has to accept
            the ``jsonargparse.ActionConfigFile`` action.
    """
    # Audio front-end options.
    parser.add_argument('--audio.num_mels', type=int, default=80,
                        help="the number of mel bands when calculating mel spectrograms.")
    parser.add_argument('--audio.n_fft', type=int, default=2048,
                        help="the number of fft components.")
    parser.add_argument('--audio.sr', type=int, default=22050,
                        help="the sampling rate of audio data file.")
    parser.add_argument('--audio.preemphasis', type=float, default=0.97,
                        help="the preemphasis coefficient.")
    # hop_length and win_length are sample counts, so they are parsed as
    # integers (they were previously declared as float).
    parser.add_argument('--audio.hop_length', type=int, default=128,
                        help="the number of samples to advance between frames.")
    parser.add_argument('--audio.win_length', type=int, default=1024,
                        help="the length (width) of the window function.")
    parser.add_argument('--audio.power', type=float, default=1.4,
                        help="the power to raise before griffin-lim.")
    parser.add_argument('--audio.min_level_db', type=int, default=-100,
                        help="the minimum level db.")
    parser.add_argument('--audio.ref_level_db', type=int, default=20,
                        help="the reference level db.")
    parser.add_argument('--audio.outputs_per_step', type=int, default=1,
                        help="the outputs per step.")

    # Network size options.
    parser.add_argument('--network.hidden_size', type=int, default=256,
                        help="the hidden size in network.")
    parser.add_argument('--network.embedding_size', type=int, default=512,
                        help="the embedding vector size.")

    # Training options.
    parser.add_argument('--batch_size', type=int, default=32,
                        help="batch size for training.")
    parser.add_argument('--epochs', type=int, default=10000,
                        help="the number of epoch for training.")
    parser.add_argument('--lr', type=float, default=0.001,
                        help="the learning rate for training.")
    parser.add_argument('--save_step', type=int, default=500,
                        help="checkpointing interval during training.")
    parser.add_argument('--image_step', type=int, default=2000,
                        help="attention image interval during training.")

    # Synthesis options.
    parser.add_argument('--max_len', type=int, default=400,
                        help="The max length of audio when synthesis.")
    parser.add_argument('--transformer_step', type=int, default=160000,
                        help="Global step to restore checkpoint of transformer in synthesis.")
    parser.add_argument('--postnet_step', type=int, default=100000,
                        help="Global step to restore checkpoint of postnet in synthesis.")

    # Device options.
    parser.add_argument('--use_gpu', type=bool, default=True,
                        help="use gpu or not during training.")
    parser.add_argument('--use_data_parallel', type=bool, default=False,
                        help="use data parallel or not during training.")

    # Paths.
    parser.add_argument('--data_path', type=str, default='./dataset/LJSpeech-1.1',
                        help="the path of dataset.")
    parser.add_argument('--checkpoint_path', type=str, default=None,
                        help="the path to load checkpoint or pretrain model.")
    parser.add_argument('--save_path', type=str, default='./checkpoint',
                        help="the path to save checkpoint.")
    parser.add_argument('--log_dir', type=str, default='./log',
                        help="the directory to save tensorboard log.")
    parser.add_argument('--sample_path', type=str, default='./log',
                        help="the directory to save audio sample in synthesis.")

    # Values can also be supplied through a config file.
    parser.add_argument('-c', '--config', action=jsonargparse.ActionConfigFile)

View File

@ -0,0 +1,137 @@
from pathlib import Path
import numpy as np
import pandas as pd
import librosa
from parakeet import g2p
from parakeet import audio
from parakeet.data.sampler import SequentialSampler, RandomSampler, BatchSampler
from parakeet.data.dataset import Dataset
from parakeet.data.datacargo import DataCargo
from parakeet.data.batch import TextIDBatcher, SpecBatcher
# Module-wide audio front-end used by the LJSpeech dataset and by synthesis.
# The values match the `audio` section of the YAML configs shipped with this
# model (22.05 kHz, 80 mel bands, 2048-point FFT, hop 275, window 1102).
_ljspeech_processor = audio.AudioProcessor(
    # Sampling and framing.
    sample_rate=22050,
    n_fft=2048,
    win_length=int(22050 * 0.05),  # 0.05 s of samples -> 1102
    hop_length=int(22050 * 0.0125),  # 0.0125 s of samples -> 275
    # Mel filter bank.
    num_mels=80,
    mel_fmin=0,
    mel_fmax=None,
    # Level handling and normalization.
    preemphasis=0.97,
    min_level_db=-100,
    ref_level_db=20,
    signal_norm=True,
    symmetric_norm=False,
    max_norm=1.,
    clip_norm=True,
    # Waveform reconstruction.
    power=1.2,
    griffin_lim_iters=60,
    # Input conditioning switches.
    do_trim_silence=False,
    sound_norm=False,
)
class LJSpeech(Dataset):
    """LJSpeech dataset producing ``(mag, mel, phonemes)`` examples.

    ``root`` is the dataset directory. It is expected to contain a
    ``metadata.csv`` file and a ``wavs`` directory with one ``<fname>.wav``
    file per metadata row.
    """

    def __init__(self, root):
        super(LJSpeech, self).__init__()
        assert isinstance(root, (str, Path)), "root should be a string or Path object"
        if not isinstance(root, Path):
            root = Path(root)
        self.root = root
        self.metadata = self._prepare_metadata()

    def _prepare_metadata(self):
        """Load ``metadata.csv`` as a DataFrame with one row per utterance."""
        column_names = ["fname", "raw_text", "normalized_text"]
        # quoting=3 is csv.QUOTE_NONE, so quote characters inside the
        # transcripts are kept as ordinary text.
        return pd.read_csv(
            self.root.joinpath("metadata.csv"),
            sep="|",
            header=None,
            quoting=3,
            names=column_names)

    def _get_example(self, metadatum):
        """Turn one metadata row into a ``(mag, mel, phonemes)`` tuple.

        Override this method to plug in a different preprocessing pipeline.
        If that pipeline needs several heavily parameterized processors,
        compose them into one transform and hand it to ``__init__``.

        ``mag`` and ``mel`` are float32 outputs of the module-level audio
        processor; ``phonemes`` is an int64 array built from the normalized
        text.
        """
        fname, _raw_text, normalized_text = metadatum
        wav_file = self.root.joinpath("wavs", fname + ".wav")

        # Per the original author's note the processor pipeline is:
        # load -> trim -> preemphasis -> stft -> magnitude -> mel_scale
        # -> logscale -> normalize
        waveform = _ljspeech_processor.load_wav(str(wav_file))
        magnitude = _ljspeech_processor.spectrogram(waveform).astype(np.float32)
        mel_spec = _ljspeech_processor.melspectrogram(waveform).astype(np.float32)
        phoneme_ids = np.array(
            g2p.en.text_to_sequence(normalized_text), dtype=np.int64)
        # A mapping may be a better return type in the future.
        return (magnitude, mel_spec, phoneme_ids)

    def _batch_examples(self, minibatch):
        """Pad a list of examples into ``(mag_batch, mel_batch, phoneme_batch)``."""
        mags, mels, phonemes = [], [], []
        for mag, mel, phoneme in minibatch:
            mags.append(mag)
            mels.append(mel)
            phonemes.append(phoneme)

        padded_mags = SpecBatcher(pad_value=0.)(mags)
        padded_mels = SpecBatcher(pad_value=0.)(mels)
        padded_phonemes = TextIDBatcher(pad_id=0)(phonemes)
        return (padded_mags, padded_mels, padded_phonemes)

    def __getitem__(self, index):
        """Return the example built from metadata row ``index``."""
        return self._get_example(self.metadata.iloc[index])

    def __iter__(self):
        """Yield every example in metadata order."""
        total = len(self)
        for position in range(total):
            yield self[position]

    def __len__(self):
        """Number of utterances listed in the metadata."""
        return len(self.metadata)
def batch_examples(batch):
    """Collate ``(mag, mel, text)`` examples for transformer training.

    The magnitude spectrogram of each example is ignored. Examples are
    ordered by text length, longest first (ties keep their input order),
    then padded to the longest item in the batch.

    Returns:
        tuple: ``(texts, mels, mel_inputs, pos_texts, pos_mels, text_lens)``.
        ``mel_inputs`` is ``mels`` shifted right by one frame with a leading
        all-zero frame; ``pos_*`` hold 1-based positions with 0 used as
        padding; ``mels`` and ``mel_inputs`` have their last two axes swapped
        after padding.
    """
    texts, mels, mel_inputs = [], [], []
    pos_texts, pos_mels, text_lens = [], [], []

    for _, mel, text in batch:
        # Decoder input: drop the last frame and prepend a zero frame.
        go_frame = np.zeros([mel.shape[0], 1], np.float32)
        mel_inputs.append(np.concatenate([go_frame, mel[:, :-1]], axis=-1))
        text_lens.append(len(text))
        pos_texts.append(np.arange(1, len(text) + 1))
        pos_mels.append(np.arange(1, mel.shape[1] + 1))
        mels.append(mel)
        texts.append(text)

    # One stable, descending-by-text-length permutation applied to all lists.
    order = sorted(
        range(len(text_lens)), key=lambda idx: text_lens[idx], reverse=True)

    def reorder(items):
        return [items[idx] for idx in order]

    texts = reorder(texts)
    mels = reorder(mels)
    mel_inputs = reorder(mel_inputs)
    pos_texts = reorder(pos_texts)
    pos_mels = reorder(pos_mels)
    text_lens = reorder(text_lens)

    # Pad every sequence to the largest length in the batch.
    texts = TextIDBatcher(pad_id=0)(texts)
    pos_texts = TextIDBatcher(pad_id=0)(pos_texts)
    pos_mels = TextIDBatcher(pad_id=0)(pos_mels)
    mels = np.transpose(SpecBatcher(pad_value=0.)(mels), axes=(0, 2, 1))
    mel_inputs = np.transpose(
        SpecBatcher(pad_value=0.)(mel_inputs), axes=(0, 2, 1))
    return (texts, mels, mel_inputs, pos_texts, pos_mels, np.array(text_lens))
def batch_examples_postnet(batch):
    """Collate ``(mag, mel, text)`` examples for postnet training.

    The text of each example is ignored. Mel and magnitude spectrograms are
    padded with zeros to the longest item in the batch and their last two
    axes are swapped.

    Returns:
        tuple: ``(mels, mags)``.
    """
    mel_list, mag_list = [], []
    for mag, mel, _ in batch:
        mel_list.append(mel)
        mag_list.append(mag)

    padded_mels = SpecBatcher(pad_value=0.)(mel_list)
    mels = np.transpose(padded_mels, axes=(0, 2, 1))
    padded_mags = SpecBatcher(pad_value=0.)(mag_list)
    mags = np.transpose(padded_mags, axes=(0, 2, 1))
    return (mels, mags)

View File

@ -0,0 +1,67 @@
import os
from scipy.io.wavfile import write
from parakeet.g2p.en import text_to_sequence
import numpy as np
from network import Model, ModelPostNet
from tqdm import tqdm
from tensorboardX import SummaryWriter
import paddle.fluid as fluid
import paddle.fluid.dygraph as dg
from preprocess import _ljspeech_processor
from pathlib import Path
import jsonargparse
from parse import add_config_options_to_parser
from pprint import pprint
def load_checkpoint(step, model_path):
    """Return the model state stored at ``<model_path>/<step>``.

    ``step`` must already be a string. The optimizer state returned by
    ``load_dygraph`` alongside the model state is discarded.
    """
    checkpoint_prefix = os.path.join(model_path, step)
    parameters, _ = fluid.dygraph.load_dygraph(checkpoint_prefix)
    return parameters
def synthesis(text_input, cfg):
    """Synthesize ``text_input`` and store the audio on disk and in TensorBoard.

    The transformer and postnet checkpoints selected by
    ``cfg.transformer_step`` and ``cfg.postnet_step`` are restored from
    ``<cfg.checkpoint_path>/transformer`` and ``<cfg.checkpoint_path>/postnet``.
    The transformer is then run ``cfg.max_len`` times, each time appending the
    last predicted frame to its own input. The final prediction is passed
    through the postnet and ``_ljspeech_processor.inv_spectrogram`` to obtain
    a waveform, which is logged under ``<cfg.log_dir>/synthesis`` and written
    to ``<cfg.sample_path>/test.wav``.

    Args:
        text_input (str): the text to synthesize.
        cfg: parsed configuration; reads ``use_gpu``, ``log_dir``,
            ``checkpoint_path``, ``transformer_step``, ``postnet_step``,
            ``max_len``, ``sample_path`` and ``audio.sr``.

    Raises:
        ValueError: if ``cfg.max_len`` is smaller than 1 (no frame would be
            decoded, so there would be nothing to convert to audio).
    """
    if cfg.max_len < 1:
        raise ValueError("cfg.max_len must be at least 1, got %r" % (cfg.max_len,))

    place = fluid.CUDAPlace(0) if cfg.use_gpu else fluid.CPUPlace()

    # tensorboard
    # makedirs(exist_ok=True) also creates missing parents and does not fail
    # when the directory already exists.
    os.makedirs(cfg.log_dir, exist_ok=True)
    writer = SummaryWriter(os.path.join(cfg.log_dir, 'synthesis'))

    try:
        with dg.guard(place):
            model = Model('transtts', cfg)
            model_postnet = ModelPostNet('postnet', cfg)

            model.set_dict(load_checkpoint(
                str(cfg.transformer_step),
                os.path.join(cfg.checkpoint_path, "transformer")))
            model_postnet.set_dict(load_checkpoint(
                str(cfg.postnet_step),
                os.path.join(cfg.checkpoint_path, "postnet")))

            # init input
            text = np.asarray(text_to_sequence(text_input))
            text = fluid.layers.unsqueeze(dg.to_variable(text), [0])
            # A single all-zero frame starts the decoder input.
            # NOTE(review): 80 presumably equals audio.num_mels - confirm.
            mel_input = dg.to_variable(np.zeros([1, 1, 80])).astype(np.float32)
            pos_text = np.arange(1, text.shape[1] + 1)
            pos_text = fluid.layers.unsqueeze(dg.to_variable(pos_text), [0])

            model.eval()
            model_postnet.eval()

            for _ in tqdm(range(cfg.max_len)):
                pos_mel = np.arange(1, mel_input.shape[1] + 1)
                pos_mel = fluid.layers.unsqueeze(dg.to_variable(pos_mel), [0])
                _, postnet_pred, _, _, _, _ = model(
                    text, mel_input, pos_text, pos_mel)
                # Feed the newest predicted frame back as the next input frame.
                mel_input = fluid.layers.concat(
                    [mel_input, postnet_pred[:, -1:, :]], axis=1)

            mag_pred = model_postnet(postnet_pred)

            # Drop the batch axis and swap the remaining two axes before
            # inverting the spectrogram.
            wav = _ljspeech_processor.inv_spectrogram(
                fluid.layers.transpose(
                    fluid.layers.squeeze(mag_pred, [0]), [1, 0]).numpy())
            writer.add_audio(text_input, wav, 0, cfg.audio.sr)

            os.makedirs(cfg.sample_path, exist_ok=True)
            write(os.path.join(cfg.sample_path, 'test.wav'), cfg.audio.sr, wav)
    finally:
        # Closing flushes pending events; without it the audio summary may
        # never reach the event file.
        writer.close()
if __name__ == '__main__':
    # Options always come from the bundled synthesis config; the process
    # command line is not consulted.
    arg_parser = jsonargparse.ArgumentParser(
        description="Synthesis model",
        formatter_class='default_argparse')
    add_config_options_to_parser(arg_parser)
    config_args = '-c ./config/synthesis.yaml'.split()
    synthesis_cfg = arg_parser.parse_args(config_args)
    synthesis("Transformer model is so fast!", synthesis_cfg)

View File

@ -0,0 +1,135 @@
from network import *
from preprocess import batch_examples_postnet, LJSpeech
from tensorboardX import SummaryWriter
import os
from tqdm import tqdm
from parakeet.data.datacargo import DataCargo
from pathlib import Path
import jsonargparse
from parse import add_config_options_to_parser
from pprint import pprint
class MyDataParallel(dg.parallel.DataParallel):
    """
    A data parallel proxy for model.

    Attributes that are not found on the wrapper itself are looked up on the
    wrapped model (stored under ``_sub_layers["_layers"]``), so the wrapper
    can be used wherever the bare model is expected.
    """

    def __init__(self, layers, strategy):
        super(MyDataParallel, self).__init__(layers, strategy)

    def __getattr__(self, key):
        """Resolve ``key`` on the wrapper first, then on the wrapped model."""
        if key in self.__dict__:
            return object.__getattribute__(self, key)
        wrapped = object.__getattribute__(self, "_sub_layers")["_layers"]
        # Compare by value: `is` on a str literal tests object identity,
        # which only works by accident of string interning.
        if key == "_layers":
            return wrapped
        return getattr(wrapped, key)
def main():
    """Train the postnet (mel spectrogram -> magnitude spectrogram).

    Options are read from ``./config/train_postnet.yaml``. The model is
    trained with an L1 loss between the predicted and the target magnitude
    spectrogram. Only the process with local rank 0 writes TensorBoard logs
    and checkpoints, so data-parallel workers do not write the same files
    concurrently.
    """
    parser = jsonargparse.ArgumentParser(
        description="Train postnet model", formatter_class='default_argparse')
    add_config_options_to_parser(parser)
    cfg = parser.parse_args('-c ./config/train_postnet.yaml'.split())

    local_rank = dg.parallel.Env().local_rank

    if local_rank == 0:
        # Print the whole config setting.
        pprint(jsonargparse.namespace_to_dict(cfg))

    LJSPEECH_ROOT = Path(cfg.data_path)
    dataset = LJSpeech(LJSPEECH_ROOT)
    dataloader = DataCargo(
        dataset,
        batch_size=cfg.batch_size,
        shuffle=True,
        collate_fn=batch_examples_postnet,
        drop_last=True)

    global_step = 0
    # Data parallel takes precedence over use_gpu when choosing the device.
    place = (fluid.CUDAPlace(dg.parallel.Env().dev_id)
             if cfg.use_data_parallel else fluid.CUDAPlace(0)
             if cfg.use_gpu else fluid.CPUPlace())

    writer = None
    if local_rank == 0:
        # exist_ok avoids the exists()/mkdir() race and creates parents.
        os.makedirs(cfg.log_dir, exist_ok=True)
        writer = SummaryWriter(os.path.join(cfg.log_dir, 'postnet'))

    try:
        with dg.guard(place):
            # dataloader
            # NOTE(review): the magnitude width 257 looks inconsistent with
            # n_fft=2048 used by the audio processor - confirm whether these
            # declared shapes are actually enforced in dygraph mode.
            input_fields = {
                'names': ['mel', 'mag'],
                'shapes':
                [[cfg.batch_size, None, 80], [cfg.batch_size, None, 257]],
                'dtypes': ['float32', 'float32'],
                'lod_levels': [0, 0]
            }

            inputs = [
                fluid.data(
                    name=input_fields['names'][i],
                    shape=input_fields['shapes'][i],
                    dtype=input_fields['dtypes'][i],
                    lod_level=input_fields['lod_levels'][i])
                for i in range(len(input_fields['names']))
            ]

            reader = fluid.io.DataLoader.from_generator(
                feed_list=inputs,
                capacity=32,
                iterable=True,
                use_double_buffer=True,
                return_list=True)

            model = ModelPostNet('postnet', cfg)
            model.train()
            # NOTE(review): the first NoamDecay argument appears chosen so
            # that the schedule peaks at cfg.lr after 4000 warm-up steps -
            # confirm against the NoamDecay documentation.
            optimizer = fluid.optimizer.AdamOptimizer(
                learning_rate=dg.NoamDecay(1 / (4000 * (cfg.lr ** 2)), 4000))

            if cfg.checkpoint_path is not None:
                model_dict, opti_dict = fluid.dygraph.load_dygraph(
                    cfg.checkpoint_path)
                model.set_dict(model_dict)
                optimizer.set_dict(opti_dict)
                print("load checkpoint!!!")

            if cfg.use_data_parallel:
                strategy = dg.parallel.prepare_context()
                model = MyDataParallel(model, strategy)

            for epoch in range(cfg.epochs):
                reader.set_batch_generator(dataloader, place)
                pbar = tqdm(reader())
                for data in pbar:
                    pbar.set_description('Processing at epoch %d' % epoch)
                    mel, mag = data
                    mag = dg.to_variable(mag.numpy())
                    mel = dg.to_variable(mel.numpy())
                    global_step += 1

                    mag_pred = model(mel)
                    # L1 distance between prediction and target.
                    loss = layers.mean(
                        layers.abs(layers.elementwise_sub(mag_pred, mag)))
                    if cfg.use_data_parallel:
                        loss = model.scale_loss(loss)

                    if writer is not None:
                        writer.add_scalars('training_loss', {
                            'loss': loss.numpy(),
                        }, global_step)

                    loss.backward()
                    if cfg.use_data_parallel:
                        model.apply_collective_grads()
                    optimizer.minimize(
                        loss,
                        grad_clip=fluid.dygraph_grad_clip.GradClipByGlobalNorm(1))
                    model.clear_gradients()

                    # save checkpoint (rank 0 only)
                    if local_rank == 0 and global_step % cfg.save_step == 0:
                        os.makedirs(cfg.save_path, exist_ok=True)
                        save_path = os.path.join(
                            cfg.save_path, 'postnet/%d' % global_step)
                        dg.save_dygraph(model.state_dict(), save_path)
                        dg.save_dygraph(optimizer.state_dict(), save_path)
    finally:
        # Flush and release the event file even if training is interrupted.
        if writer is not None:
            writer.close()


if __name__ == '__main__':
    main()

View File

@ -0,0 +1,166 @@
from preprocess import batch_examples, LJSpeech
import os
from tqdm import tqdm
import paddle.fluid.dygraph as dg
import paddle.fluid.layers as layers
from network import *
from tensorboardX import SummaryWriter
from parakeet.data.datacargo import DataCargo
from pathlib import Path
import jsonargparse
from parse import add_config_options_to_parser
from pprint import pprint
from matplotlib import cm
class MyDataParallel(dg.parallel.DataParallel):
    """
    A data parallel proxy for model.

    Attributes that are not found on the wrapper itself are looked up on the
    wrapped model (stored under ``_sub_layers["_layers"]``), which is what
    lets training code read e.g. ``model.encoder`` through the wrapper.
    """

    def __init__(self, layers, strategy):
        super(MyDataParallel, self).__init__(layers, strategy)

    def __getattr__(self, key):
        """Resolve ``key`` on the wrapper first, then on the wrapped model."""
        if key in self.__dict__:
            return object.__getattribute__(self, key)
        wrapped = object.__getattribute__(self, "_sub_layers")["_layers"]
        # Compare by value: `is` on a str literal tests object identity,
        # which only works by accident of string interning.
        if key == "_layers":
            return wrapped
        return getattr(wrapped, key)
def _log_attention_images(writer, tag, attentions, global_step):
    """Log four attention maps per entry of ``attentions`` under ``tag``."""
    for layer_idx, prob in enumerate(attentions):
        for j in range(4):
            # NOTE(review): rows 0, 16, 32 and 48 of the first axis are
            # sampled; this presumably assumes at least 49 rows - confirm.
            image = np.uint8(cm.viridis(prob.numpy()[j * 16]) * 255)
            writer.add_image(tag, image, layer_idx * 4 + j, dataformats="HWC")


def main():
    """Train the TransformerTTS model (text -> mel spectrogram).

    Options are read from ``./config/train_transformer.yaml``. The loss is
    the sum of the L1 losses of the mel prediction and of the postnet
    prediction against the target mel spectrogram. Only the process with
    local rank 0 writes TensorBoard logs and checkpoints.
    """
    parser = jsonargparse.ArgumentParser(
        description="Train TransformerTTS model",
        formatter_class='default_argparse')
    add_config_options_to_parser(parser)
    cfg = parser.parse_args('-c ./config/train_transformer.yaml'.split())

    local_rank = dg.parallel.Env().local_rank

    if local_rank == 0:
        # Print the whole config setting.
        pprint(jsonargparse.namespace_to_dict(cfg))

    LJSPEECH_ROOT = Path(cfg.data_path)
    dataset = LJSpeech(LJSPEECH_ROOT)
    dataloader = DataCargo(
        dataset,
        batch_size=cfg.batch_size,
        shuffle=True,
        collate_fn=batch_examples,
        drop_last=True)

    global_step = 0
    # Data parallel takes precedence over use_gpu when choosing the device.
    place = (fluid.CUDAPlace(dg.parallel.Env().dev_id)
             if cfg.use_data_parallel else fluid.CUDAPlace(0)
             if cfg.use_gpu else fluid.CPUPlace())

    # exist_ok avoids the exists()/mkdir() race between parallel workers.
    os.makedirs(cfg.log_dir, exist_ok=True)
    path = os.path.join(cfg.log_dir, 'transformer')

    # Only rank 0 owns a writer; every use below is guarded accordingly.
    writer = SummaryWriter(path) if local_rank == 0 else None

    try:
        with dg.guard(place):
            if cfg.use_data_parallel:
                strategy = dg.parallel.prepare_context()

            # dataloader
            # NOTE(review): 'character' is declared float32 and the position
            # fields as [batch, 1]; presumably these declarations are not
            # enforced in dygraph mode - confirm.
            input_fields = {
                'names': ['character', 'mel', 'mel_input', 'pos_text', 'pos_mel', 'text_len'],
                'shapes':
                [[cfg.batch_size, None], [cfg.batch_size, None, 80], [cfg.batch_size, None, 80], [cfg.batch_size, 1], [cfg.batch_size, 1], [cfg.batch_size, 1]],
                'dtypes': ['float32', 'float32', 'float32', 'int64', 'int64', 'int64'],
                'lod_levels': [0, 0, 0, 0, 0, 0]
            }

            inputs = [
                fluid.data(
                    name=input_fields['names'][i],
                    shape=input_fields['shapes'][i],
                    dtype=input_fields['dtypes'][i],
                    lod_level=input_fields['lod_levels'][i])
                for i in range(len(input_fields['names']))
            ]

            reader = fluid.io.DataLoader.from_generator(
                feed_list=inputs,
                capacity=32,
                iterable=True,
                use_double_buffer=True,
                return_list=True)

            model = Model('transtts', cfg)
            model.train()
            # NOTE(review): the first NoamDecay argument appears chosen so
            # that the schedule peaks at cfg.lr after 4000 warm-up steps -
            # confirm against the NoamDecay documentation.
            optimizer = fluid.optimizer.AdamOptimizer(
                learning_rate=dg.NoamDecay(1 / (4000 * (cfg.lr ** 2)), 4000))

            if cfg.checkpoint_path is not None:
                model_dict, opti_dict = fluid.dygraph.load_dygraph(
                    cfg.checkpoint_path)
                model.set_dict(model_dict)
                optimizer.set_dict(opti_dict)
                print("load checkpoint!!!")

            if cfg.use_data_parallel:
                model = MyDataParallel(model, strategy)

            for epoch in range(cfg.epochs):
                reader.set_batch_generator(dataloader, place)
                pbar = tqdm(reader())
                for data in pbar:
                    pbar.set_description('Processing at epoch %d' % epoch)
                    character, mel, mel_input, pos_text, pos_mel, text_length = data

                    global_step += 1

                    mel_pred, postnet_pred, attn_probs, stop_preds, attn_enc, attn_dec = model(
                        character, mel_input, pos_text, pos_mel)

                    mel_loss = layers.mean(
                        layers.abs(layers.elementwise_sub(mel_pred, mel)))
                    post_mel_loss = layers.mean(
                        layers.abs(layers.elementwise_sub(postnet_pred, mel)))
                    loss = mel_loss + post_mel_loss

                    if cfg.use_data_parallel:
                        loss = model.scale_loss(loss)

                    if local_rank == 0:
                        writer.add_scalars('training_loss', {
                            'mel_loss': mel_loss.numpy(),
                            'post_mel_loss': post_mel_loss.numpy(),
                        }, global_step)

                        writer.add_scalars('alphas', {
                            'encoder_alpha': model.encoder.alpha.numpy(),
                            'decoder_alpha': model.decoder.alpha.numpy(),
                        }, global_step)

                        writer.add_scalar(
                            'learning_rate',
                            optimizer._learning_rate.step().numpy(),
                            global_step)

                        if global_step % cfg.image_step == 1:
                            # attn_probs gets its own tag; it used to share
                            # the 'Attention_enc_' tag (and step indices)
                            # with attn_enc, so the two sets collided.
                            _log_attention_images(
                                writer, 'Attention_%d_0' % global_step,
                                attn_probs, global_step)
                            _log_attention_images(
                                writer, 'Attention_enc_%d_0' % global_step,
                                attn_enc, global_step)
                            _log_attention_images(
                                writer, 'Attention_dec_%d_0' % global_step,
                                attn_dec, global_step)

                    loss.backward()
                    if cfg.use_data_parallel:
                        model.apply_collective_grads()
                    optimizer.minimize(
                        loss,
                        grad_clip=fluid.dygraph_grad_clip.GradClipByGlobalNorm(1))
                    model.clear_gradients()

                    # save checkpoint
                    if local_rank == 0 and global_step % cfg.save_step == 0:
                        os.makedirs(cfg.save_path, exist_ok=True)
                        save_path = os.path.join(
                            cfg.save_path, 'transformer/%d' % global_step)
                        dg.save_dygraph(model.state_dict(), save_path)
                        dg.save_dygraph(optimizer.state_dict(), save_path)
    finally:
        # Flush and release the event file even if training is interrupted.
        if writer is not None:
            writer.close()


if __name__ == '__main__':
    main()

View File

@ -0,0 +1,42 @@
import numpy as np
import librosa
import os, copy
from scipy import signal
def get_positional_table(d_pos_vec, n_position=1024):
    """Build a ``(n_position, d_pos_vec)`` sinusoidal position table.

    Row 0 is all zeros. For every position ``pos >= 1`` column ``i`` starts
    as the angle ``pos / 10000 ** (2 * i / d_pos_vec)``; even columns are then
    replaced by its sine and odd columns by its cosine. Note that the
    exponent uses ``i`` itself rather than ``i // 2``, so the values differ
    from those of ``get_sinusoid_encoding_table``.
    """
    rows = []
    for pos in range(n_position):
        if pos == 0:
            rows.append(np.zeros(d_pos_vec))
            continue
        rows.append([
            pos / np.power(10000, 2 * i / d_pos_vec)
            for i in range(d_pos_vec)
        ])
    table = np.array(rows)

    even_angles = table[1:, 0::2]
    table[1:, 0::2] = np.sin(even_angles)  # dim 2i
    odd_angles = table[1:, 1::2]
    table[1:, 1::2] = np.cos(odd_angles)  # dim 2i+1
    return table
def get_sinusoid_encoding_table(n_position, d_hid, padding_idx=None):
    ''' Sinusoid position encoding table

    Returns an ``(n_position, d_hid)`` array. Column ``j`` of row ``pos`` is
    derived from the angle ``pos / 10000 ** (2 * (j // 2) / d_hid)``: even
    columns hold its sine, odd columns its cosine. When ``padding_idx`` is
    given, that row is set to zero.
    '''
    table = np.array([
        [pos / np.power(10000, 2 * (col // 2) / d_hid) for col in range(d_hid)]
        for pos in range(n_position)
    ])

    table[:, 0::2] = np.sin(table[:, 0::2])  # dim 2i
    table[:, 1::2] = np.cos(table[:, 1::2])  # dim 2i+1

    if padding_idx is None:
        return table

    # zero vector for padding dimension
    table[padding_idx] = 0.
    return table
def guided_attention(N, T, g=0.2):
    '''Guided attention. Refer to page 3 on the paper.

    Returns a float32 ``(N, T)`` matrix whose entry ``[n, t]`` is
    ``1 - exp(-(t / T - n / N) ** 2 / (2 * g * g))``: zero where the
    relative positions ``n / N`` and ``t / T`` coincide, growing towards one
    as they move apart.
    '''
    weights = np.zeros((N, T), dtype=np.float32)
    n_total = float(N)
    t_total = float(T)
    spread = 2 * g * g
    for n_pos, t_pos in np.ndindex(N, T):
        offset = t_pos / t_total - n_pos / n_total
        weights[n_pos, t_pos] = 1 - np.exp(-offset ** 2 / spread)
    return weights

View File

@ -7,4 +7,4 @@ LJSPEECH_ROOT = Path("/workspace/datasets/LJSpeech-1.1")
ljspeech = LJSpeech(LJSPEECH_ROOT)
ljspeech_cargo = DataCargo(ljspeech, batch_size=16, shuffle=True)
for i, batch in enumerate(ljspeech_cargo):
print(i)
print(i)