From 8a9bbc2634637ded7461bf111d4b2c067d9a4a1b Mon Sep 17 00:00:00 2001 From: lifuchen Date: Mon, 16 Dec 2019 09:04:22 +0000 Subject: [PATCH] add_TransformerTTS --- parakeet/data/batch.py | 2 +- .../transformerTTS/config/synthesis.yaml | 20 + .../transformerTTS/config/train_postnet.yaml | 27 + .../config/train_transformer.yaml | 32 ++ parakeet/models/transformerTTS/layers.py | 170 ++++++ parakeet/models/transformerTTS/module.py | 525 ++++++++++++++++++ parakeet/models/transformerTTS/network.py | 207 +++++++ parakeet/models/transformerTTS/parse.py | 63 +++ parakeet/models/transformerTTS/preprocess.py | 137 +++++ parakeet/models/transformerTTS/synthesis.py | 67 +++ .../models/transformerTTS/train_postnet.py | 135 +++++ .../transformerTTS/train_transformer.py | 166 ++++++ parakeet/models/transformerTTS/utils.py | 42 ++ tests/test_ljspeech.py | 2 +- 14 files changed, 1593 insertions(+), 2 deletions(-) create mode 100644 parakeet/models/transformerTTS/config/synthesis.yaml create mode 100644 parakeet/models/transformerTTS/config/train_postnet.yaml create mode 100644 parakeet/models/transformerTTS/config/train_transformer.yaml create mode 100644 parakeet/models/transformerTTS/layers.py create mode 100644 parakeet/models/transformerTTS/module.py create mode 100644 parakeet/models/transformerTTS/network.py create mode 100644 parakeet/models/transformerTTS/parse.py create mode 100644 parakeet/models/transformerTTS/preprocess.py create mode 100644 parakeet/models/transformerTTS/synthesis.py create mode 100644 parakeet/models/transformerTTS/train_postnet.py create mode 100644 parakeet/models/transformerTTS/train_transformer.py create mode 100644 parakeet/models/transformerTTS/utils.py diff --git a/parakeet/data/batch.py b/parakeet/data/batch.py index 9303b46..8777472 100644 --- a/parakeet/data/batch.py +++ b/parakeet/data/batch.py @@ -88,7 +88,7 @@ def batch_spec(minibatch, pad_value=0., dtype=np.float32): mono_channel = False lengths = [example.shape[-1] for example in 
minibatch] # assume (channel, F, n_frame) or (F, n_frame) - max_len = np.max(lengths) + max_len = np.max(lengths) batch = [] for example in minibatch: diff --git a/parakeet/models/transformerTTS/config/synthesis.yaml b/parakeet/models/transformerTTS/config/synthesis.yaml new file mode 100644 index 0000000..c3c3f8c --- /dev/null +++ b/parakeet/models/transformerTTS/config/synthesis.yaml @@ -0,0 +1,20 @@ +audio: + num_mels: 80 + n_fft: 2048 + sr: 22050 + preemphasis: 0.97 + hop_length: 275 + win_length: 1102 + power: 1.2 + min_level_db: -100 + ref_level_db: 20 + outputs_per_step: 1 + +max_len: 50 +transformer_step: 1 +postnet_step: 1 +use_gpu: True + +checkpoint_path: ./checkpoint +log_dir: ./log +sample_path: ./sample \ No newline at end of file diff --git a/parakeet/models/transformerTTS/config/train_postnet.yaml b/parakeet/models/transformerTTS/config/train_postnet.yaml new file mode 100644 index 0000000..90ac94e --- /dev/null +++ b/parakeet/models/transformerTTS/config/train_postnet.yaml @@ -0,0 +1,27 @@ +audio: + num_mels: 80 + n_fft: 2048 + sr: 22050 + preemphasis: 0.97 + hop_length: 275 + win_length: 1102 + power: 1.2 + min_level_db: -100 + ref_level_db: 20 + outputs_per_step: 1 + +network: + hidden_size: 256 + embedding_size: 512 + + +batch_size: 32 +epochs: 10000 +lr: 0.001 +save_step: 500 +use_gpu: True +use_data_parallel: False + +data_path: ../../../dataset/LJSpeech-1.1 +save_path: ./checkpoint +log_dir: ./log \ No newline at end of file diff --git a/parakeet/models/transformerTTS/config/train_transformer.yaml b/parakeet/models/transformerTTS/config/train_transformer.yaml new file mode 100644 index 0000000..17db190 --- /dev/null +++ b/parakeet/models/transformerTTS/config/train_transformer.yaml @@ -0,0 +1,32 @@ +audio: + num_mels: 80 + n_fft: 2048 + sr: 22050 + preemphasis: 0.97 + hop_length: 275 + win_length: 1102 + power: 1.2 + min_level_db: -100 + ref_level_db: 20 + outputs_per_step: 1 + +network: + hidden_size: 256 + embedding_size: 512 + + 
class Conv1D(dg.Layer):
    """1-D convolution implemented on top of ``dg.Conv2D``.

    The input gets a dummy height axis of size 1, is convolved with a
    ``(1, filter_size)`` kernel, and the dummy axis is squeezed away again, so
    the layer consumes and produces 3-D tensors.

    ``stride``, ``dilation`` and ``padding`` act on the time axis only and are
    forwarded to ``dg.Conv2D`` unchanged. No length-preserving padding is added
    automatically: the caller chooses ``padding`` (and keeps ``stride=1``) when
    the output must have the same length as the input.

    Args:
        name_scope (str): scope name passed to ``dg.Layer``.
        in_channels (int): number of input channels. Stored on the instance
            only; it is not passed to ``dg.Conv2D``.
        num_filters (int): number of output channels.
        filter_size (int): kernel width along the time axis.
        padding (int): padding applied on the time axis.
        dilation (int): dilation along the time axis.
        stride (int): stride along the time axis.
        groups, param_attr, bias_attr, use_cudnn, act, dtype: forwarded to
            ``dg.Conv2D`` as given.
        data_format (str): ``'NCT'`` (channel-first, the default) or ``'NTC'``
            (channel-last). Any value other than ``'NTC'`` is treated as
            channel-first.
    """

    def __init__(self,
                 name_scope,
                 in_channels,
                 num_filters,
                 filter_size=3,
                 padding=0,
                 dilation=1,
                 stride=1,
                 groups=None,
                 param_attr=None,
                 bias_attr=None,
                 use_cudnn=True,
                 act=None,
                 data_format='NCT',
                 dtype="float32"):
        super(Conv1D, self).__init__(name_scope, dtype=dtype)

        self.in_channels = in_channels
        self.num_filters = num_filters
        self.filter_size = filter_size
        self.stride = stride
        self.dilation = dilation
        self.padding = padding
        self.act = act
        self.data_format = data_format

        # Every 2-D hyper-parameter is (height, width) with a neutral height
        # component, so the convolution only ever slides along time.
        self.conv = dg.Conv2D(
            self.full_name(),
            num_filters=num_filters,
            filter_size=(1, filter_size),
            stride=(1, stride),
            dilation=(1, dilation),
            padding=(0, padding),
            groups=groups,
            param_attr=param_attr,
            bias_attr=bias_attr,
            use_cudnn=use_cudnn,
            act=act,
            dtype=dtype)

    def forward(self, x):
        """Apply the convolution along the time axis.

        Args:
            x (Variable): 3-D input, ``(B, C_in, T)`` when ``data_format`` is
                ``'NCT'`` or ``(B, T, C_in)`` when it is ``'NTC'``.

        Returns:
            Variable: 3-D output in the same layout as the input, with
            ``num_filters`` channels.
        """
        channel_last = self.data_format == 'NTC'
        if channel_last:
            # Conv2D expects channel-first data.
            x = fluid.layers.transpose(x, [0, 2, 1])
        # (B, C, T) -> (B, C, 1, T) -> conv -> (B, C_out, 1, T') -> (B, C_out, T')
        x = fluid.layers.unsqueeze(x, [2])
        x = self.conv(x)
        x = fluid.layers.squeeze(x, [2])
        if channel_last:
            x = fluid.layers.transpose(x, [0, 2, 1])
        return x
class DynamicGRU(dg.Layer):
    """GRU unrolled over the time axis with a single shared ``dg.GRUUnit``.

    Args:
        scope_name (str): scope name passed to ``dg.Layer``.
        size (int): hidden size. The wrapped ``dg.GRUUnit`` is built with
            ``size * 3``.
        param_attr, bias_attr: forwarded to ``dg.GRUUnit``.
        is_reverse (bool): process the sequence from the last step to the
            first. The returned sequence is still in input order.
        gate_activation (str): forwarded as ``gate_activation``.
        candidate_activation (str): forwarded as ``activation``.
        h_0 (Variable|None): initial hidden state. When ``None`` a zero state
            sized from the incoming batch is created on every call.
        origin_mode (bool): forwarded to ``dg.GRUUnit``.
        init_size: accepted for signature compatibility; not used.
    """

    def __init__(self,
                 scope_name,
                 size,
                 param_attr=None,
                 bias_attr=None,
                 is_reverse=False,
                 gate_activation='sigmoid',
                 candidate_activation='tanh',
                 h_0=None,
                 origin_mode=False,
                 init_size=None):
        super(DynamicGRU, self).__init__(scope_name)
        self.gru_unit = dg.GRUUnit(
            self.full_name(),
            size * 3,
            param_attr=param_attr,
            bias_attr=bias_attr,
            activation=candidate_activation,
            gate_activation=gate_activation,
            origin_mode=origin_mode)
        self.size = size
        self.h_0 = h_0
        self.is_reverse = is_reverse

    def forward(self, inputs):
        """Run the GRU over ``inputs``.

        Args:
            inputs (Variable): 3-D tensor indexed as ``(batch, time, feature)``.
                Presumably ``feature`` must equal ``3 * size`` to match the
                wrapped ``GRUUnit`` -- confirm against the GRUUnit docs.

        Returns:
            Variable: hidden states of every step concatenated along axis 1,
            in the same time order as ``inputs``.
        """
        num_steps = inputs.shape[1]
        feature_dim = inputs.shape[2]

        hidden = self.h_0
        if hidden is None:
            # The default h_0=None used to be handed straight to GRUUnit as the
            # previous state. Start from zeros that match the actual batch.
            hidden = dg.to_variable(
                np.zeros((inputs.shape[0], self.size), dtype="float32"))

        step_order = range(num_steps)
        if self.is_reverse:
            step_order = reversed(step_order)

        outputs = []
        for t in step_order:
            # Slice one step and drop the time axis: (B, 1, D) -> (B, D).
            step_input = fluid.layers.reshape(
                inputs[:, t:t + 1, :], [-1, feature_dim], inplace=False)
            hidden, _reset, _gate = self.gru_unit(step_input, hidden)
            # Restore the time axis so the steps can be concatenated.
            outputs.append(
                fluid.layers.reshape(
                    hidden, [-1, 1, hidden.shape[1]], inplace=False))

        if self.is_reverse:
            # Steps were produced last-to-first; flip back to input order.
            outputs = outputs[::-1]
        return fluid.layers.concat(outputs, axis=1)
class Conv(dg.Layer):
    """``Conv1D`` wrapper with Xavier weights and a uniform bias initializer.

    Args:
        name_scope (str): scope name passed to ``dg.Layer``.
        in_channels (int): number of input channels; also sets the bias
            initialization range.
        out_channels (int): number of output channels.
        filter_size (int): kernel width.
        padding (int): padding along the time axis.
        dilation (int): dilation along the time axis.
        stride (int): stride along the time axis.
        use_cudnn (bool): forwarded to ``Conv1D``.
        data_format (str): ``"NCT"`` or ``"NTC"``, forwarded to ``Conv1D``.
        is_bias (bool): ``False`` disables the bias term.
        gain (float): stored on the instance; not used by this class.
    """

    def __init__(self, name_scope, in_channels, out_channels, filter_size=1,
                 padding=0, dilation=1, stride=1, use_cudnn=True,
                 data_format="NCT", is_bias=True, gain=1):
        super(Conv, self).__init__(name_scope)
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.filter_size = filter_size
        self.padding = padding
        self.dilation = dilation
        self.stride = stride
        self.use_cudnn = use_cudnn
        self.data_format = data_format
        self.is_bias = is_bias
        self.gain = gain

        self.weight_attr = fluid.ParamAttr(
            name='weight',
            initializer=fluid.initializer.XavierInitializer())

        if is_bias is not False:
            # Bias ~ U(-k, k) with k = sqrt(1 / in_channels). The float literal
            # keeps this a true division on Python 2 as well, where
            # ``1 / in_channels`` would truncate to 0.
            k = math.sqrt(1.0 / in_channels)
            self.bias_attr = fluid.ParamAttr(
                name='bias',
                initializer=fluid.initializer.Uniform(low=-k, high=k))
        else:
            # Paddle creates a default bias when bias_attr is None; only an
            # explicit False turns the bias off.
            self.bias_attr = False

        self.conv = Conv1D(self.full_name(),
                           in_channels=in_channels,
                           num_filters=out_channels,
                           filter_size=filter_size,
                           padding=padding,
                           dilation=dilation,
                           stride=stride,
                           param_attr=self.weight_attr,
                           bias_attr=self.bias_attr,
                           use_cudnn=use_cudnn,
                           data_format=data_format)

    def forward(self, x):
        """Apply the wrapped 1-D convolution to ``x`` and return the result."""
        return self.conv(x)
self.embedding(fluid.layers.unsqueeze(x, axes=[-1])) #(batch_size, seq_len, embending_size) + x = layers.transpose(x,[0,2,1]) + x = layers.dropout(layers.relu(self.batch_norm1(self.conv1(x))), 0.2) + x = layers.dropout(layers.relu(self.batch_norm2(self.conv2(x))), 0.2) + x = layers.dropout(layers.relu(self.batch_norm3(self.conv3(x))), 0.2) + x = layers.transpose(x,[0,2,1]) #(N,T,C) + x = self.projection(x) + return x + +class FFN(dg.Layer): + def __init__(self, name_scope, num_hidden, use_cudnn=True): + super(FFN, self).__init__(name_scope) + self.num_hidden = num_hidden + self.use_cudnn = use_cudnn + self.w_1 = Conv(self.full_name(), + in_channels = num_hidden, + out_channels = num_hidden * 4, + filter_size = 1, + use_cudnn = use_cudnn, + data_format = "NCT", + gain = math.sqrt(2)) + self.w_2 = Conv(self.full_name(), + in_channels = num_hidden * 4, + out_channels = num_hidden, + filter_size = 1, + use_cudnn = use_cudnn, + data_format = "NCT", + gain = math.sqrt(2)) + self.layer_norm = dg.LayerNorm(self.full_name(), begin_norm_axis=2) + + def forward(self, input): + #FFN Networt + x = layers.transpose(input, [0,2,1]) + x = self.w_2(layers.relu(self.w_1(x))) + x = layers.transpose(x,[0,2,1]) + + # dropout + # x = layers.dropout(x, 0.1) + # not sure where dropout should be placed, in paper should before residual, + # but the diagonal alignment did not appear correctly in the attention plot. 
class ScaledDotProductAttention(dg.Layer):
    """Scaled dot-product attention: ``softmax(Q K^T / sqrt(d_key)) V``.

    Args:
        name_scope (str): scope name passed to ``dg.Layer``.
        d_key (int): per-head key dimension used for the scaling factor.
    """

    def __init__(self, name_scope, d_key):
        super(ScaledDotProductAttention, self).__init__(name_scope)
        self.d_key = d_key

    def forward(self, key, value, query, mask=None, query_mask=None):
        """Compute the attention output and the attention weights.

        Args:
            key, value, query (Variable): attention operands; ``query`` is
                multiplied with the transpose of ``key`` over the last axis.
            mask (Variable|None): multiplicative mask on the raw scores.
                Positions where it equals 0 additionally receive a large
                negative offset so they vanish after the softmax.
            query_mask (Variable|None): multiplied into the weights after the
                softmax.

        Returns:
            tuple: ``(result, attention)`` -- the weighted sum of ``value`` and
            the weights that produced it.
        """
        scale = math.sqrt(self.d_key)
        # transpose_y swaps the last two axes of key.
        scores = layers.matmul(query, key, transpose_y=True) / scale

        if mask is not None:
            kept = scores * mask
            # Unary minus binds looser than **, so this is -(2 ** 32) + 1.
            penalty = (mask == 0).astype(float) * (-2 ** 32 + 1)
            scores = kept + penalty

        weights = layers.softmax(scores)

        if query_mask is not None:
            weights = weights * query_mask

        context = layers.matmul(weights, value)
        return context, weights
is_bias=False) + self.query = FC(self.full_name(), num_hidden, num_hidden, is_bias=False) + + self.scal_attn = ScaledDotProductAttention(self.full_name(), self.num_hidden_per_attn) + + self.fc = FC(self.full_name(), num_hidden * 2, num_hidden) + + self.layer_norm = dg.LayerNorm(self.full_name(), begin_norm_axis=2) + + def forward(self, key, value, query_input, mask=None, query_mask=None): + batch_size = key.shape[0] + seq_len_key = key.shape[1] + seq_len_query = query_input.shape[1] + + # repeat masks h times + if query_mask is not None: + query_mask = layers.unsqueeze(query_mask, axes=[-1]) + query_mask = layers.expand(query_mask, [self.num_head, 1, seq_len_key]) + if mask is not None: + mask = layers.expand(mask, (self.num_head, 1, 1)) + + # Make multihead attention + # key & value.shape = (batch_size, seq_len, feature)(feature = num_head * num_hidden_per_attn) + key = layers.reshape(self.key(key), [batch_size, seq_len_key, self.num_head, self.num_hidden_per_attn]) + value = layers.reshape(self.value(value), [batch_size, seq_len_key, self.num_head, self.num_hidden_per_attn]) + query = layers.reshape(self.query(query_input), [batch_size, seq_len_query, self.num_head, self.num_hidden_per_attn]) + + key = layers.reshape(layers.transpose(key, [2, 0, 1, 3]), [-1, seq_len_key, self.num_hidden_per_attn]) + value = layers.reshape(layers.transpose(value, [2, 0, 1, 3]), [-1, seq_len_key, self.num_hidden_per_attn]) + query = layers.reshape(layers.transpose(query, [2, 0, 1, 3]), [-1, seq_len_query, self.num_hidden_per_attn]) + + result, attention = self.scal_attn(key, value, query, mask=mask, query_mask=query_mask) + + # concat all multihead result + result = layers.reshape(result, [self.num_head, batch_size, seq_len_query, self.num_hidden_per_attn]) + result = layers.reshape(layers.transpose(result, [1,2,0,3]),[batch_size, seq_len_query, -1]) + #print(result.().shape) + # concat result with input + result = layers.concat([query_input, result], axis=-1) + + result = 
class PostConvNet(dg.Layer):
    """Five-layer convolutional post-net applied to the mel prediction.

    Reads ``config.network.hidden_size``, ``config.audio.num_mels``,
    ``config.audio.outputs_per_step`` and ``config.use_gpu``.
    """

    def __init__(self, name_scope, config):
        super(PostConvNet, self).__init__(name_scope)

        num_hidden = config.network.hidden_size
        mel_channels = config.audio.num_mels * config.audio.outputs_per_step
        self.num_hidden = num_hidden

        def make_conv(in_ch, out_ch, **extra):
            # All five convolutions share kernel 5 / padding 4 / NCT layout.
            return Conv(self.full_name(),
                        in_channels=in_ch,
                        out_channels=out_ch,
                        filter_size=5,
                        padding=4,
                        use_cudnn=config.use_gpu,
                        data_format="NCT",
                        **extra)

        def make_batch_norm():
            return dg.BatchNorm(self.full_name(), num_hidden,
                                param_attr=fluid.ParamAttr(name='weight'),
                                bias_attr=fluid.ParamAttr(name='bias'),
                                moving_mean_name='moving_mean',
                                moving_variance_name='moving_var',
                                data_layout='NCHW')

        # Sub-layers are created and registered in a fixed order: first conv,
        # three middle convs, last conv, three middle batch norms, first
        # batch norm.
        self.conv1 = make_conv(mel_channels, num_hidden, gain=5 / 3)

        self.conv_list = []
        for _ in range(3):
            self.conv_list.append(make_conv(num_hidden, num_hidden, gain=5 / 3))
        for idx, conv in enumerate(self.conv_list):
            self.add_sublayer("conv_list_{}".format(idx), conv)

        # The output convolution keeps Conv's default gain.
        self.conv5 = make_conv(num_hidden, mel_channels)

        self.batch_norm_list = []
        for _ in range(3):
            self.batch_norm_list.append(make_batch_norm())
        for idx, norm in enumerate(self.batch_norm_list):
            self.add_sublayer("batch_norm_list_{}".format(idx), norm)

        self.batch_norm1 = make_batch_norm()

    def forward(self, input):
        """Run the post-net.

        Args:
            input (Variable): channel-first tensor ``(N, C, T)`` -- it is fed
                to convolutions built with ``data_format="NCT"``.

        Returns:
            Variable: output of the last convolution with its final 4 time
            steps removed.
        """
        def drop_tail(x):
            # Each conv pads by 4 and its last 4 steps are discarded,
            # presumably to keep the convolution causal -- TODO confirm.
            return x[:, :, :-4]

        hidden = drop_tail(self.conv1(input))
        hidden = layers.dropout(layers.tanh(self.batch_norm1(hidden)), 0.1)

        for batch_norm, conv in zip(self.batch_norm_list, self.conv_list):
            hidden = drop_tail(conv(hidden))
            hidden = layers.dropout(layers.tanh(batch_norm(hidden)), 0.1)

        return drop_tail(self.conv5(hidden))
+ data_format = "NCT") + + self.batchnorm_proj_1 = dg.BatchNorm(self.full_name(), hidden_size, + param_attr = fluid.ParamAttr(name='weight'), + bias_attr = fluid.ParamAttr(name='bias'), + moving_mean_name = 'moving_mean', + moving_variance_name = 'moving_var', + data_layout='NCHW') + self.batchnorm_proj_2 = dg.BatchNorm(self.full_name(), projection_size, + param_attr = fluid.ParamAttr(name='weight'), + bias_attr = fluid.ParamAttr(name='bias'), + moving_mean_name = 'moving_mean', + moving_variance_name = 'moving_var', + data_layout='NCHW') + self.max_pool = Pool1D(self.full_name(), pool_size = max_pool_kernel_size, + pool_type='max', + pool_stride=1, + pool_padding=1, + data_format = "NCT") + self.highway = Highwaynet(self.full_name(), self.projection_size) + + h_0 = np.zeros((config.batch_size, hidden_size // 2), dtype="float32") + h_0 = dg.to_variable(h_0) + self.fc_forward1 = FC(self.full_name(), hidden_size, hidden_size // 2 * 3) + self.fc_reverse1 = FC(self.full_name(), hidden_size, hidden_size // 2 * 3) + self.gru_forward1 = DynamicGRU(self.full_name(), + size = self.hidden_size // 2, + param_attr = fluid.ParamAttr(name='weight'), + bias_attr = fluid.ParamAttr(name='bias'), + is_reverse = False, + origin_mode = True, + h_0 = h_0) + self.gru_reverse1 = DynamicGRU(self.full_name(), + size = self.hidden_size // 2, + param_attr = fluid.ParamAttr(name='weight'), + bias_attr = fluid.ParamAttr(name='bias'), + is_reverse=True, + origin_mode=True, + h_0 = h_0) + + self.fc_forward2 = FC(self.full_name(), hidden_size, hidden_size // 2 * 3) + self.fc_reverse2 = FC(self.full_name(), hidden_size, hidden_size // 2 * 3) + self.gru_forward2 = DynamicGRU(self.full_name(), + size = self.hidden_size // 2, + param_attr = fluid.ParamAttr(name='weight'), + bias_attr = fluid.ParamAttr(name='bias'), + is_reverse = False, + origin_mode = True, + h_0 = h_0) + self.gru_reverse2 = DynamicGRU(self.full_name(), + size = self.hidden_size // 2, + param_attr = fluid.ParamAttr(name='weight'), + 
class Highwaynet(dg.Layer):
    """Stack of highway layers: ``out = H(x) * T(x) + x * (1 - T(x))``.

    Args:
        name_scope (str): scope name passed to ``dg.Layer``.
        num_units (int): input and output width of every ``FC`` layer.
        num_layers (int): number of highway layers.
    """

    def __init__(self, name_scope, num_units, num_layers=4):
        super(Highwaynet, self).__init__(name_scope)
        self.num_units = num_units
        self.num_layers = num_layers

        self.gates = []
        self.linears = []

        # Layers are created as (linear, gate) pairs and registered under
        # "linears_<i>" / "gates_<i>".
        for idx in range(num_layers):
            linear = FC(self.full_name(), num_units, num_units)
            gate = FC(self.full_name(), num_units, num_units)
            self.linears.append(linear)
            self.gates.append(gate)
            self.add_sublayer("linears_{}".format(idx), linear)
            self.add_sublayer("gates_{}".format(idx), gate)

    def forward(self, input_):
        """Apply every highway layer in turn and return the final tensor."""
        out = input_
        for linear, gate in zip(self.linears, self.gates):
            transformed = fluid.layers.relu(linear(out))
            transform_gate = fluid.layers.sigmoid(gate(out))
            carry_gate = 1 - transform_gate
            # Blend the transformed signal with the untouched input.
            out = transformed * transform_gate + out * carry_gate
        return out
fluid.framework._dygraph_tracer()._train_mode: + query_mask = (positional != 0).astype(float) + mask = (positional != 0).astype(float) + mask = fluid.layers.expand(fluid.layers.unsqueeze(mask,[1]), [1,x.shape[1], 1]) + else: + query_mask, mask = None, None + + # Encoder pre_network + x = self.encoder_prenet(x) #(N,T,C) + + + # Get positional encoding + positional = self.pos_emb(fluid.layers.unsqueeze(positional, axes=[-1])) + x = positional * self.alpha + x #(N, T, C) + + + # Positional dropout + x = layers.dropout(x, 0.1) + + # Self attention encoder + attentions = list() + for layer, ffn in zip(self.layers, self.ffns): + x, attention = layer(x, x, x, mask = mask, query_mask = query_mask) + x = ffn(x) + attentions.append(attention) + + return x, query_mask, attentions + +class Decoder(dg.Layer): + def __init__(self, name_scope, num_hidden, config): + super(Decoder, self).__init__(name_scope) + self.num_hidden = num_hidden + param = fluid.ParamAttr(name='alpha') + self.alpha = self.create_parameter(param, shape=(1,), dtype='float32', + default_initializer = fluid.initializer.ConstantInitializer(value=1.0)) + self.pos_inp = get_sinusoid_encoding_table(1024, self.num_hidden, padding_idx=0) + self.pos_emb = dg.Embedding(name_scope=self.full_name(), + size=[1024, num_hidden], + padding_idx=0, + param_attr=fluid.ParamAttr( + name='weight', + initializer=fluid.initializer.NumpyArrayInitializer(self.pos_inp), + trainable=False)) + self.decoder_prenet = DecoderPrenet(self.full_name(), + input_size = config.audio.num_mels, + hidden_size = num_hidden * 2, + output_size = num_hidden, + dropout_rate=0.2) + self.linear = FC(self.full_name(), num_hidden, num_hidden) + + self.selfattn_layers = [MultiheadAttention(self.full_name(), num_hidden) for _ in range(3)] + for i, layer in enumerate(self.selfattn_layers): + self.add_sublayer("self_attn_{}".format(i), layer) + self.attn_layers = [MultiheadAttention(self.full_name(), num_hidden) for _ in range(3)] + for i, layer in 
enumerate(self.attn_layers): + self.add_sublayer("attn_{}".format(i), layer) + self.ffns = [FFN(self.full_name(), num_hidden) for _ in range(3)] + for i, layer in enumerate(self.ffns): + self.add_sublayer("ffns_{}".format(i), layer) + self.mel_linear = FC(self.full_name(), num_hidden, config.audio.num_mels * config.audio.outputs_per_step) + self.stop_linear = FC(self.full_name(), num_hidden, 1, gain = 1) + + self.postconvnet = PostConvNet(self.full_name(), config) + + def forward(self, key, value, query, c_mask, positional): + batch_size = key.shape[0] + decoder_len = query.shape[1] + + # get decoder mask with triangular matrix + + if fluid.framework._dygraph_tracer()._train_mode: + #zeros = np.zeros(positional.shape, dtype=np.float32) + m_mask = (positional != 0).astype(float) + mask = np.repeat(np.expand_dims(m_mask.numpy() == 0, axis=1), decoder_len, axis=1) + mask = mask + np.repeat(np.expand_dims(np.triu(np.ones([decoder_len, decoder_len]), 1), axis=0) ,batch_size, axis=0) + mask = fluid.layers.cast(dg.to_variable(mask == 0), np.float32) + + + # (batch_size, decoder_len, decoder_len) + zero_mask = fluid.layers.expand(fluid.layers.unsqueeze((c_mask != 0).astype(float), axes=2), [1,1,decoder_len]) + # (batch_size, decoder_len, seq_len) + zero_mask = fluid.layers.transpose(zero_mask, [0,2,1]) + + else: + mask = np.repeat(np.expand_dims(np.triu(np.ones([decoder_len, decoder_len]), 1), axis=0) ,batch_size, axis=0) + mask = fluid.layers.cast(dg.to_variable(mask == 0), np.float32) + m_mask, zero_mask = None, None + #import pdb; pdb.set_trace() + # Decoder pre-network + query = self.decoder_prenet(query) + + # Centered position + query = self.linear(query) + + # Get position embedding + positional = self.pos_emb(fluid.layers.unsqueeze(positional, axes=[-1])) + query = positional * self.alpha + query + + #positional dropout + query = fluid.layers.dropout(query, 0.1) + + # Attention decoder-decoder, encoder-decoder + selfattn_list = list() + attn_list = list() + + for 
class Model(dg.Layer):
    """Text-to-mel network: an ``Encoder`` followed by a ``Decoder``.

    Args:
        name_scope: scope name passed to ``dg.Layer``.
        config: configuration namespace; ``config.network.embedding_size`` and
            ``config.network.hidden_size`` size the two sub-networks.
    """

    def __init__(self, name_scope, config):
        super(Model, self).__init__(name_scope)
        net_cfg = config.network
        # Encoder first, decoder second: creation order is kept unchanged.
        self.encoder = Encoder(self.full_name(),
                               net_cfg.embedding_size,
                               net_cfg.hidden_size,
                               config)
        self.decoder = Decoder(self.full_name(), net_cfg.hidden_size, config)
        self.config = config

    def forward(self, characters, mel_input, pos_text, pos_mel):
        """Run the encoder on the text and the decoder on the mel input.

        Returns:
            Tuple ``(mel_output, postnet_output, attn_probs, stop_preds,
            attns_enc, attns_dec)``.
        """
        # The encoder also hands back the mask the decoder uses for
        # encoder-decoder attention.
        memory, c_mask, attns_enc = self.encoder(characters, pos_text)

        # The encoder output serves as both key and value of the decoder.
        decoded = self.decoder(memory, memory, mel_input, c_mask, pos_mel)
        mel_output, postnet_output, attn_probs, stop_preds, attns_dec = decoded

        return (mel_output, postnet_output, attn_probs, stop_preds,
                attns_enc, attns_dec)


class ModelPostNet(dg.Layer):
    """
    CBHG Network (mel -> linear)

    Projects a mel spectrogram to ``config.network.hidden_size`` channels,
    runs it through a CBHG module and projects the result to
    ``config.audio.n_fft // 2 + 1`` linear-spectrogram bins.
    """

    def __init__(self, name_scope, config):
        super(ModelPostNet, self).__init__(name_scope)
        audio_cfg = config.audio
        n_linear_bins = (audio_cfg.n_fft // 2) + 1

        self.pre_proj = Conv(self.full_name(),
                             in_channels=audio_cfg.num_mels,
                             out_channels=config.network.hidden_size,
                             data_format="NCT")
        self.cbhg = CBHG(self.full_name(), config)
        # NOTE(review): in_channels is num_mels although this layer consumes
        # the CBHG output -- confirm whether hidden_size was intended.
        self.post_proj = Conv(self.full_name(),
                              in_channels=audio_cfg.num_mels,
                              out_channels=n_linear_bins,
                              data_format="NCT")

    def forward(self, mel):
        """Map ``mel`` to a predicted magnitude spectrogram.

        The input is transposed on its last two axes before the convolutions
        and the result is transposed back before being returned.
        """
        channels_first = layers.transpose(mel, [0, 2, 1])
        hidden = self.cbhg(self.pre_proj(channels_first))
        projected = self.post_proj(hidden)
        return layers.transpose(projected, [0, 2, 1])
def add_config_options_to_parser(parser):
    """Register every TransformerTTS option on ``parser``.

    Options are grouped as ``audio.*`` (feature extraction), ``network.*``
    (model sizes), training/synthesis hyper-parameters and filesystem paths.
    A final ``-c/--config`` option lets a config file supply the values.

    Args:
        parser: a ``jsonargparse.ArgumentParser`` (dotted option names and
            ``ActionConfigFile`` rely on it); it is modified in place.
    """
    # ---- audio ----
    parser.add_argument('--audio.num_mels', type=int, default=80,
        help="the number of mel bands when calculating mel spectrograms.")
    parser.add_argument('--audio.n_fft', type=int, default=2048,
        help="the number of fft components.")
    parser.add_argument('--audio.sr', type=int, default=22050,
        help="the sampling rate of audio data file.")
    parser.add_argument('--audio.preemphasis', type=float, default=0.97,
        help="the preemphasis coefficient.")
    # hop/window lengths are sample counts, so they are parsed as integers
    # (they were previously declared as float).
    parser.add_argument('--audio.hop_length', type=int, default=128,
        help="the number of samples to advance between frames.")
    parser.add_argument('--audio.win_length', type=int, default=1024,
        help="the length (width) of the window function.")
    parser.add_argument('--audio.power', type=float, default=1.4,
        help="the power to raise before griffin-lim.")
    parser.add_argument('--audio.min_level_db', type=int, default=-100,
        help="the minimum level db.")
    parser.add_argument('--audio.ref_level_db', type=int, default=20,
        help="the reference level db.")
    parser.add_argument('--audio.outputs_per_step', type=int, default=1,
        help="the outputs per step.")

    # ---- network ----
    parser.add_argument('--network.hidden_size', type=int, default=256,
        help="the hidden size in network.")
    parser.add_argument('--network.embedding_size', type=int, default=512,
        help="the embedding vector size.")

    # ---- training / synthesis ----
    parser.add_argument('--batch_size', type=int, default=32,
        help="batch size for training.")
    parser.add_argument('--epochs', type=int, default=10000,
        help="the number of epoch for training.")
    parser.add_argument('--lr', type=float, default=0.001,
        help="the learning rate for training.")
    parser.add_argument('--save_step', type=int, default=500,
        help="checkpointing interval during training.")
    parser.add_argument('--image_step', type=int, default=2000,
        help="attention image interval during training.")
    parser.add_argument('--max_len', type=int, default=400,
        help="The max length of audio when synthsis.")
    parser.add_argument('--transformer_step', type=int, default=160000,
        help="Global step to restore checkpoint of transformer in synthesis.")
    parser.add_argument('--postnet_step', type=int, default=100000,
        help="Global step to restore checkpoint of postnet in synthesis.")
    # NOTE(review): with plain argparse ``type=bool`` turns any non-empty
    # string (including "False") into True; confirm that the jsonargparse
    # version in use parses booleans as intended.
    parser.add_argument('--use_gpu', type=bool, default=True,
        help="use gpu or not during training.")
    parser.add_argument('--use_data_parallel', type=bool, default=False,
        help="use data parallel or not during training.")

    # ---- paths ----
    parser.add_argument('--data_path', type=str, default='./dataset/LJSpeech-1.1',
        help="the path of dataset.")
    parser.add_argument('--checkpoint_path', type=str, default=None,
        help="the path to load checkpoint or pretrain model.")
    parser.add_argument('--save_path', type=str, default='./checkpoint',
        help="the path to save checkpoint.")
    parser.add_argument('--log_dir', type=str, default='./log',
        help="the directory to save tensorboard log.")
    parser.add_argument('--sample_path', type=str, default='./log',
        help="the directory to save audio sample in synthesis.")

    parser.add_argument('-c', '--config', action=jsonargparse.ActionConfigFile)
# Module-wide audio front-end used for LJSpeech feature extraction.  The
# window and hop are 50 ms and 12.5 ms expressed in samples at 22050 Hz.
_ljspeech_processor = audio.AudioProcessor(
    sample_rate=22050,
    num_mels=80,
    min_level_db=-100,
    ref_level_db=20,
    n_fft=2048,
    win_length=int(22050 * 0.05),
    hop_length=int(22050 * 0.0125),
    power=1.2,
    preemphasis=0.97,
    signal_norm=True,
    symmetric_norm=False,
    max_norm=1.,
    mel_fmin=0,
    mel_fmax=None,
    clip_norm=True,
    griffin_lim_iters=60,
    do_trim_silence=False,
    sound_norm=False)


class LJSpeech(Dataset):
    """LJSpeech dataset yielding ``(mag, mel, phonemes)`` examples.

    Args:
        root: dataset directory (``str`` or ``Path``) that contains
            ``metadata.csv`` and a ``wavs`` sub-directory.
    """

    def __init__(self, root):
        super(LJSpeech, self).__init__()
        assert isinstance(root, (str, Path)), "root should be a string or Path object"
        if not isinstance(root, Path):
            root = Path(root)
        self.root = root
        self.metadata = self._prepare_metadata()

    def _prepare_metadata(self):
        """Load ``metadata.csv`` as a three-column DataFrame."""
        column_names = ["fname", "raw_text", "normalized_text"]
        # quoting=3 is csv.QUOTE_NONE: quote characters are ordinary text.
        return pd.read_csv(self.root.joinpath("metadata.csv"),
                           sep="|",
                           header=None,
                           quoting=3,
                           names=column_names)

    def _get_example(self, metadatum):
        """Build one example from a metadata row.

        Override this method to plug in a different preprocessing pipeline.
        When several processors with many options are needed, prefer passing a
        composed transform to ``__init__``.

        Returns:
            ``(mag, mel, phonemes)``: float32 linear and mel spectrograms and
            an int64 array of symbol ids for the normalized text.
        """
        fname, raw_text, normalized_text = metadatum
        wav_file = str(self.root.joinpath("wavs", fname + ".wav"))

        # load -> trim -> preemphasis -> stft -> magnitude -> mel_scale -> logscale -> normalize
        waveform = _ljspeech_processor.load_wav(wav_file)
        linear_spec = _ljspeech_processor.spectrogram(waveform).astype(np.float32)
        mel_spec = _ljspeech_processor.melspectrogram(waveform).astype(np.float32)
        symbol_ids = np.array(g2p.en.text_to_sequence(normalized_text), dtype=np.int64)
        return (linear_spec, mel_spec, symbol_ids)  # maybe we need to implement it as a map in the future

    def _batch_examples(self, minibatch):
        """Pad and stack a list of examples into three batched arrays."""
        mags, mels, phonemes = [], [], []
        for mag, mel, phoneme in minibatch:
            mags.append(mag)
            mels.append(mel)
            phonemes.append(phoneme)

        spec_batcher = SpecBatcher(pad_value=0.)
        mag_batch = spec_batcher(mags)
        mel_batch = SpecBatcher(pad_value=0.)(mels)
        phoneme_batch = TextIDBatcher(pad_id=0)(phonemes)
        return (mag_batch, mel_batch, phoneme_batch)

    def __getitem__(self, index):
        return self._get_example(self.metadata.iloc[index])

    def __iter__(self):
        for index in range(len(self)):
            yield self[index]

    def __len__(self):
        return len(self.metadata)
def batch_examples(batch):
    """Collate ``(mag, mel, text)`` examples for transformer training.

    For every example the mel target is paired with a decoder input (the
    target shifted right by one frame, with an all-zero first frame) and with
    1-based position ids for text and mel; id 0 is left for padding.  The
    batch is ordered by text length, longest first, then padded.

    Args:
        batch: iterable of ``(mag, mel, text)`` triples; ``mel`` is indexed as
            (n_mels, n_frames) and ``text`` is a 1-D id sequence.

    Returns:
        ``(texts, mels, mel_inputs, pos_texts, pos_mels, text_lens)``; the two
        mel arrays are transposed to put frames before mel bins.
    """
    texts, mels, mel_inputs = [], [], []
    pos_texts, pos_mels, text_lens = [], [], []
    for _, mel, text in batch:
        go_frame = np.zeros([mel.shape[0], 1], np.float32)
        mel_inputs.append(np.concatenate([go_frame, mel[:, :-1]], axis=-1))
        text_lens.append(len(text))
        pos_texts.append(np.arange(1, len(text) + 1))
        pos_mels.append(np.arange(1, mel.shape[1] + 1))
        mels.append(mel)
        texts.append(text)

    # Sort by text_len in descending order.  One stable permutation is
    # computed and applied to every list instead of re-sorting each list;
    # ties keep their original relative order, exactly as before.
    order = sorted(range(len(text_lens)), key=text_lens.__getitem__, reverse=True)
    texts = [texts[i] for i in order]
    mels = [mels[i] for i in order]
    mel_inputs = [mel_inputs[i] for i in order]
    pos_texts = [pos_texts[i] for i in order]
    pos_mels = [pos_mels[i] for i in order]
    text_lens = [text_lens[i] for i in order]

    # Pad sequence with largest len of the batch
    texts = TextIDBatcher(pad_id=0)(texts)
    pos_texts = TextIDBatcher(pad_id=0)(pos_texts)
    pos_mels = TextIDBatcher(pad_id=0)(pos_mels)
    mels = np.transpose(SpecBatcher(pad_value=0.)(mels), axes=(0, 2, 1))
    mel_inputs = np.transpose(SpecBatcher(pad_value=0.)(mel_inputs), axes=(0, 2, 1))
    return (texts, mels, mel_inputs, pos_texts, pos_mels, np.array(text_lens))


def batch_examples_postnet(batch):
    """Collate ``(mag, mel, text)`` examples for post-net training.

    Returns:
        ``(mels, mags)``: zero-padded spectrogram batches with their last two
        axes swapped.
    """
    mels = []
    mags = []
    for mag, mel, _ in batch:
        mels.append(mel)
        mags.append(mag)

    mels = np.transpose(SpecBatcher(pad_value=0.)(mels), axes=(0, 2, 1))
    mags = np.transpose(SpecBatcher(pad_value=0.)(mags), axes=(0, 2, 1))

    return (mels, mags)
def load_checkpoint(step, model_path):
    """Load the model state dict stored at ``<model_path>/<step>``.

    Args:
        step: checkpoint name (the global step as a string).
        model_path: directory that holds the checkpoint.

    Returns:
        The model parameter dict; the optimizer dict loaded alongside it is
        discarded.
    """
    model_dict, _ = fluid.dygraph.load_dygraph(os.path.join(model_path, step))
    return model_dict


def synthesis(text_input, cfg):
    """Synthesize ``text_input`` and write it to ``<cfg.sample_path>/test.wav``.

    The transformer is run autoregressively for ``cfg.max_len`` steps, each
    step appending the last predicted frame to the decoder input; the final
    mel prediction is converted to a linear spectrogram by the post-net and
    inverted to a waveform.  The audio is also logged to tensorboard.

    Args:
        text_input: text to synthesize.
        cfg: parsed configuration namespace.
    """
    place = fluid.CUDAPlace(0) if cfg.use_gpu else fluid.CPUPlace()

    # tensorboard (makedirs also creates missing parents and tolerates an
    # already existing directory)
    os.makedirs(cfg.log_dir, exist_ok=True)
    writer = SummaryWriter(os.path.join(cfg.log_dir, 'synthesis'))

    try:
        with dg.guard(place):
            model = Model('transtts', cfg)
            model_postnet = ModelPostNet('postnet', cfg)

            model.set_dict(load_checkpoint(str(cfg.transformer_step),
                                           os.path.join(cfg.checkpoint_path, "transformer")))
            model_postnet.set_dict(load_checkpoint(str(cfg.postnet_step),
                                                   os.path.join(cfg.checkpoint_path, "postnet")))

            # init input
            text = np.asarray(text_to_sequence(text_input))
            text = fluid.layers.unsqueeze(dg.to_variable(text), [0])
            # A single all-zero frame starts decoding; its width follows the
            # configured number of mel bands instead of a hard-coded 80.
            mel_input = dg.to_variable(
                np.zeros([1, 1, cfg.audio.num_mels], dtype=np.float32))
            pos_text = np.arange(1, text.shape[1] + 1)
            pos_text = fluid.layers.unsqueeze(dg.to_variable(pos_text), [0])

            model.eval()
            model_postnet.eval()

            pbar = tqdm(range(cfg.max_len))

            for _ in pbar:
                pos_mel = np.arange(1, mel_input.shape[1] + 1)
                pos_mel = fluid.layers.unsqueeze(dg.to_variable(pos_mel), [0])
                mel_pred, postnet_pred, attn_probs, stop_preds, attn_enc, attn_dec = model(
                    text, mel_input, pos_text, pos_mel)
                # Feed the newest predicted frame back as the next input.
                mel_input = fluid.layers.concat(
                    [mel_input, postnet_pred[:, -1:, :]], axis=1)
            mag_pred = model_postnet(postnet_pred)

            wav = _ljspeech_processor.inv_spectrogram(
                fluid.layers.transpose(fluid.layers.squeeze(mag_pred, [0]), [1, 0]).numpy())
            writer.add_audio(text_input, wav, 0, cfg.audio.sr)
            os.makedirs(cfg.sample_path, exist_ok=True)
            write(os.path.join(cfg.sample_path, 'test.wav'), cfg.audio.sr, wav)
    finally:
        # Flush and release the event file even if synthesis fails.
        writer.close()


if __name__ == '__main__':
    parser = jsonargparse.ArgumentParser(description="Synthesis model", formatter_class='default_argparse')
    add_config_options_to_parser(parser)
    cfg = parser.parse_args('-c ./config/synthesis.yaml'.split())
    synthesis("Transformer model is so fast!", cfg)
class MyDataParallel(dg.parallel.DataParallel):
    """
    A data parallel proxy for model.

    Attribute lookups that the wrapper itself cannot satisfy are forwarded to
    the wrapped layer stored under ``_sub_layers["_layers"]``.
    """

    def __init__(self, layers, strategy):
        super(MyDataParallel, self).__init__(layers, strategy)

    def __getattr__(self, key):
        if key in self.__dict__:
            return object.__getattribute__(self, key)
        # Compare by value: ``is`` on a string literal tests identity and only
        # works by accident of interning.
        elif key == "_layers":
            return object.__getattribute__(self, "_sub_layers")["_layers"]
        else:
            return getattr(
                object.__getattribute__(self, "_sub_layers")["_layers"], key)


def main():
    """Train the mel -> linear post-net using ``./config/train_postnet.yaml``.

    Only the process with local rank 0 prints the config, writes tensorboard
    logs and saves checkpoints.
    """
    parser = jsonargparse.ArgumentParser(description="Train postnet model", formatter_class='default_argparse')
    add_config_options_to_parser(parser)
    cfg = parser.parse_args('-c ./config/train_postnet.yaml'.split())

    local_rank = dg.parallel.Env().local_rank
    is_chief = local_rank == 0

    if is_chief:
        # Print the whole config setting.
        pprint(jsonargparse.namespace_to_dict(cfg))

    LJSPEECH_ROOT = Path(cfg.data_path)
    dataset = LJSpeech(LJSPEECH_ROOT)
    dataloader = DataCargo(dataset, batch_size=cfg.batch_size, shuffle=True,
                           collate_fn=batch_examples_postnet, drop_last=True)

    global_step = 0
    place = (fluid.CUDAPlace(dg.parallel.Env().dev_id)
             if cfg.use_data_parallel else fluid.CUDAPlace(0)
             if cfg.use_gpu else fluid.CPUPlace())

    # exist_ok avoids a race when several ranks create the directory.
    os.makedirs(cfg.log_dir, exist_ok=True)
    path = os.path.join(cfg.log_dir, 'postnet')
    writer = SummaryWriter(path) if is_chief else None

    try:
        with dg.guard(place):
            # dataloader: feature widths follow the audio configuration
            # (linear spectrograms have n_fft // 2 + 1 bins).
            input_fields = {
                'names': ['mel', 'mag'],
                'shapes':
                [[cfg.batch_size, None, cfg.audio.num_mels],
                 [cfg.batch_size, None, cfg.audio.n_fft // 2 + 1]],
                'dtypes': ['float32', 'float32'],
                'lod_levels': [0, 0]
            }

            inputs = [
                fluid.data(
                    name=input_fields['names'][i],
                    shape=input_fields['shapes'][i],
                    dtype=input_fields['dtypes'][i],
                    lod_level=input_fields['lod_levels'][i])
                for i in range(len(input_fields['names']))
            ]

            reader = fluid.io.DataLoader.from_generator(
                feed_list=inputs,
                capacity=32,
                iterable=True,
                use_double_buffer=True,
                return_list=True)

            model = ModelPostNet('postnet', cfg)

            model.train()
            optimizer = fluid.optimizer.AdamOptimizer(
                learning_rate=dg.NoamDecay(1/(4000 *( cfg.lr ** 2)), 4000))

            if cfg.checkpoint_path is not None:
                model_dict, opti_dict = fluid.dygraph.load_dygraph(cfg.checkpoint_path)
                model.set_dict(model_dict)
                optimizer.set_dict(opti_dict)
                print("load checkpoint!!!")

            if cfg.use_data_parallel:
                strategy = dg.parallel.prepare_context()
                model = MyDataParallel(model, strategy)

            for epoch in range(cfg.epochs):
                reader.set_batch_generator(dataloader, place)
                pbar = tqdm(reader())
                for data in pbar:
                    pbar.set_description('Processing at epoch %d'%epoch)
                    mel, mag = data
                    mag = dg.to_variable(mag.numpy())
                    mel = dg.to_variable(mel.numpy())
                    global_step += 1

                    mag_pred = model(mel)

                    # L1 loss between predicted and target magnitudes.
                    loss = layers.mean(layers.abs(layers.elementwise_sub(mag_pred, mag)))
                    if cfg.use_data_parallel:
                        loss = model.scale_loss(loss)

                    if is_chief:
                        writer.add_scalars('training_loss',{
                            'loss':loss.numpy(),
                        }, global_step)

                    loss.backward()
                    if cfg.use_data_parallel:
                        model.apply_collective_grads()
                    optimizer.minimize(loss, grad_clip = fluid.dygraph_grad_clip.GradClipByGlobalNorm(1))
                    model.clear_gradients()

                    # save checkpoint (rank 0 only, so ranks do not race on
                    # the same files)
                    if is_chief and global_step % cfg.save_step == 0:
                        os.makedirs(cfg.save_path, exist_ok=True)
                        save_path = os.path.join(cfg.save_path,'postnet/%d' % global_step)
                        dg.save_dygraph(model.state_dict(), save_path)
                        dg.save_dygraph(optimizer.state_dict(), save_path)
    finally:
        if writer is not None:
            writer.close()


if __name__ == '__main__':
    main()
class MyDataParallel(dg.parallel.DataParallel):
    """
    A data parallel proxy for model.

    Attribute lookups that the wrapper itself cannot satisfy are forwarded to
    the wrapped layer stored under ``_sub_layers["_layers"]``.
    """

    def __init__(self, layers, strategy):
        super(MyDataParallel, self).__init__(layers, strategy)

    def __getattr__(self, key):
        if key in self.__dict__:
            return object.__getattribute__(self, key)
        # Compare by value: ``is`` on a string literal tests identity and only
        # works by accident of interning.
        elif key == "_layers":
            return object.__getattribute__(self, "_sub_layers")["_layers"]
        else:
            return getattr(
                object.__getattribute__(self, "_sub_layers")["_layers"], key)


def _add_attention_images(writer, tag_format, attentions, global_step):
    """Log rows 0, 16, 32 and 48 of every attention tensor as viridis images.

    ``tag_format`` is a ``%d`` template that receives ``global_step``.
    """
    for layer_idx, prob in enumerate(attentions):
        prob_np = prob.numpy()
        for j in range(4):
            image = np.uint8(cm.viridis(prob_np[j*16]) * 255)
            writer.add_image(tag_format % global_step, image, layer_idx*4+j, dataformats="HWC")


def main():
    """Train the TransformerTTS network using ``./config/train_transformer.yaml``.

    Only the process with local rank 0 prints the config, writes tensorboard
    logs and saves checkpoints.
    """
    parser = jsonargparse.ArgumentParser(description="Train TransformerTTS model", formatter_class='default_argparse')
    add_config_options_to_parser(parser)
    cfg = parser.parse_args('-c ./config/train_transformer.yaml'.split())

    local_rank = dg.parallel.Env().local_rank
    is_chief = local_rank == 0

    if is_chief:
        # Print the whole config setting.
        pprint(jsonargparse.namespace_to_dict(cfg))

    LJSPEECH_ROOT = Path(cfg.data_path)
    dataset = LJSpeech(LJSPEECH_ROOT)
    dataloader = DataCargo(dataset, batch_size=cfg.batch_size, shuffle=True,
                           collate_fn=batch_examples, drop_last=True)
    global_step = 0
    place = (fluid.CUDAPlace(dg.parallel.Env().dev_id)
             if cfg.use_data_parallel else fluid.CUDAPlace(0)
             if cfg.use_gpu else fluid.CPUPlace())

    # exist_ok avoids a race when several ranks create the directory.
    os.makedirs(cfg.log_dir, exist_ok=True)
    path = os.path.join(cfg.log_dir,'transformer')

    writer = SummaryWriter(path) if is_chief else None

    try:
        with dg.guard(place):
            if cfg.use_data_parallel:
                strategy = dg.parallel.prepare_context()

            # dataloader
            # NOTE(review): 'character' is declared float32 and the position
            # fields as [batch, 1]; confirm these declarations against what
            # batch_examples actually yields.
            num_mels = cfg.audio.num_mels
            input_fields = {
                'names': ['character', 'mel', 'mel_input', 'pos_text', 'pos_mel', 'text_len'],
                'shapes':
                [[cfg.batch_size, None], [cfg.batch_size, None, num_mels], [cfg.batch_size, None, num_mels], [cfg.batch_size, 1], [cfg.batch_size, 1], [cfg.batch_size, 1]],
                'dtypes': ['float32', 'float32', 'float32', 'int64', 'int64', 'int64'],
                'lod_levels': [0, 0, 0, 0, 0, 0]
            }

            inputs = [
                fluid.data(
                    name=input_fields['names'][i],
                    shape=input_fields['shapes'][i],
                    dtype=input_fields['dtypes'][i],
                    lod_level=input_fields['lod_levels'][i])
                for i in range(len(input_fields['names']))
            ]

            reader = fluid.io.DataLoader.from_generator(
                feed_list=inputs,
                capacity=32,
                iterable=True,
                use_double_buffer=True,
                return_list=True)

            model = Model('transtts', cfg)

            model.train()
            optimizer = fluid.optimizer.AdamOptimizer(
                learning_rate=dg.NoamDecay(1/(4000 *( cfg.lr ** 2)), 4000))

            if cfg.checkpoint_path is not None:
                model_dict, opti_dict = fluid.dygraph.load_dygraph(cfg.checkpoint_path)
                model.set_dict(model_dict)
                optimizer.set_dict(opti_dict)
                print("load checkpoint!!!")

            if cfg.use_data_parallel:
                model = MyDataParallel(model, strategy)

            for epoch in range(cfg.epochs):
                reader.set_batch_generator(dataloader, place)
                pbar = tqdm(reader())
                for data in pbar:
                    pbar.set_description('Processing at epoch %d'%epoch)
                    character, mel, mel_input, pos_text, pos_mel, text_length = data

                    global_step += 1

                    mel_pred, postnet_pred, attn_probs, stop_preds, attn_enc, attn_dec = model(character, mel_input, pos_text, pos_mel)

                    # L1 losses before and after the post-net.
                    mel_loss = layers.mean(layers.abs(layers.elementwise_sub(mel_pred, mel)))
                    post_mel_loss = layers.mean(layers.abs(layers.elementwise_sub(postnet_pred, mel)))
                    loss = mel_loss + post_mel_loss

                    if cfg.use_data_parallel:
                        loss = model.scale_loss(loss)

                    # The writer only exists on rank 0; other ranks skip logging
                    # instead of failing on a None writer.
                    if is_chief:
                        writer.add_scalars('training_loss', {
                            'mel_loss':mel_loss.numpy(),
                            'post_mel_loss':post_mel_loss.numpy(),
                        }, global_step)

                        writer.add_scalars('alphas', {
                            'encoder_alpha':model.encoder.alpha.numpy(),
                            'decoder_alpha':model.decoder.alpha.numpy(),
                        }, global_step)

                        writer.add_scalar('learning_rate', optimizer._learning_rate.step().numpy(), global_step)

                        if global_step % cfg.image_step == 1:
                            # Encoder-decoder attention gets its own tag; it was
                            # previously written under the encoder tag.
                            _add_attention_images(writer, 'Attention_%d_0', attn_probs, global_step)
                            _add_attention_images(writer, 'Attention_enc_%d_0', attn_enc, global_step)
                            _add_attention_images(writer, 'Attention_dec_%d_0', attn_dec, global_step)

                    loss.backward()
                    if cfg.use_data_parallel:
                        model.apply_collective_grads()
                    optimizer.minimize(loss, grad_clip = fluid.dygraph_grad_clip.GradClipByGlobalNorm(1))
                    model.clear_gradients()

                    # save checkpoint
                    if is_chief and global_step % cfg.save_step == 0:
                        os.makedirs(cfg.save_path, exist_ok=True)
                        save_path = os.path.join(cfg.save_path,'transformer/%d' % global_step)
                        dg.save_dygraph(model.state_dict(), save_path)
                        dg.save_dygraph(optimizer.state_dict(), save_path)
    finally:
        if writer is not None:
            writer.close()


if __name__ =='__main__':
    main()
def get_positional_table(d_pos_vec, n_position=1024):
    """Return an ``(n_position, d_pos_vec)`` float64 positional table.

    Entry ``[pos, i]`` is built from the angle
    ``pos / 10000 ** (2 * i / d_pos_vec)``: sine on even columns, cosine on
    odd columns.  Row 0 is all zeros.

    Args:
        d_pos_vec: width of each positional vector.
        n_position: number of positions (rows).
    """
    positions = np.arange(n_position, dtype=np.float64)[:, np.newaxis]
    dims = np.arange(d_pos_vec, dtype=np.float64)[np.newaxis, :]
    # Broadcasting builds the whole angle matrix at once; row 0 is 0 / x = 0.
    position_enc = positions / np.power(10000.0, 2 * dims / d_pos_vec)

    # Row 0 is skipped so it stays all-zero (cos(0) would otherwise give 1).
    position_enc[1:, 0::2] = np.sin(position_enc[1:, 0::2])  # dim 2i
    position_enc[1:, 1::2] = np.cos(position_enc[1:, 1::2])  # dim 2i+1
    return position_enc


def get_sinusoid_encoding_table(n_position, d_hid, padding_idx=None):
    """Return the ``(n_position, d_hid)`` float64 sinusoid encoding table.

    Entry ``[pos, j]`` uses the angle
    ``pos / 10000 ** (2 * (j // 2) / d_hid)``, so each sine/cosine column
    pair shares one frequency: sine on even columns, cosine on odd columns.

    Args:
        n_position: number of positions (rows).
        d_hid: width of each encoding vector.
        padding_idx: if not ``None``, that row is overwritten with zeros.
    """
    positions = np.arange(n_position, dtype=np.float64)[:, np.newaxis]
    exponents = 2 * (np.arange(d_hid) // 2) / d_hid
    sinusoid_table = positions / np.power(10000.0, exponents)[np.newaxis, :]

    sinusoid_table[:, 0::2] = np.sin(sinusoid_table[:, 0::2])  # dim 2i
    sinusoid_table[:, 1::2] = np.cos(sinusoid_table[:, 1::2])  # dim 2i+1

    if padding_idx is not None:
        # zero vector for padding dimension
        sinusoid_table[padding_idx] = 0.

    return sinusoid_table


def guided_attention(N, T, g=0.2):
    """Return the ``(N, T)`` float32 guided-attention weight matrix.

    ``W[n, t] = 1 - exp(-(t / T - n / N) ** 2 / (2 * g * g))``: zero on the
    diagonal ``t / T == n / N`` and approaching one away from it.

    Args:
        N: number of rows.
        T: number of columns.
        g: width of the low-weight band around the diagonal.
    """
    n_frac = np.arange(N, dtype=np.float64)[:, np.newaxis] / float(N)
    t_frac = np.arange(T, dtype=np.float64)[np.newaxis, :] / float(T)
    W = 1 - np.exp(-(t_frac - n_frac) ** 2 / (2 * g * g))
    return W.astype(np.float32)