diff --git a/parakeet/models/transformerTTS/post_convnet.py b/parakeet/models/transformerTTS/post_convnet.py
new file mode 100644
index 0000000..7ed905b
--- /dev/null
+++ b/parakeet/models/transformerTTS/post_convnet.py
@@ -0,0 +1,89 @@
+import math
+import paddle.fluid.dygraph as dg
+import paddle.fluid as fluid
+import paddle.fluid.layers as layers
+from parakeet.modules.customized import Conv1D
+
+class PostConvNet(dg.Layer):
+    def __init__(self,
+                 n_mels=80,
+                 num_hidden=512,
+                 filter_size=5,
+                 padding=0,
+                 num_conv=5,
+                 outputs_per_step=1,
+                 use_cudnn=True,
+                 dropout=0.1,
+                 batchnorm_last=False):
+        super(PostConvNet, self).__init__()
+
+        self.dropout = dropout
+        self.num_conv = num_conv
+        self.batchnorm_last = batchnorm_last
+        self.conv_list = []
+        k = math.sqrt(1 / (n_mels * outputs_per_step))
+        self.conv_list.append(Conv1D(in_channels=n_mels * outputs_per_step,
+                                     out_channels=num_hidden,
+                                     filter_size=filter_size,
+                                     padding=padding,
+                                     param_attr=fluid.ParamAttr(initializer=fluid.initializer.XavierInitializer()),
+                                     bias_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform(low=-k, high=k)),
+                                     use_cudnn=use_cudnn,
+                                     data_format="NCT"))
+
+        k = math.sqrt(1 / num_hidden)
+        for _ in range(1, num_conv - 1):
+            self.conv_list.append(Conv1D(in_channels=num_hidden,
+                                         out_channels=num_hidden,
+                                         filter_size=filter_size,
+                                         padding=padding,
+                                         param_attr=fluid.ParamAttr(initializer=fluid.initializer.XavierInitializer()),
+                                         bias_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform(low=-k, high=k)),
+                                         use_cudnn=use_cudnn,
+                                         data_format="NCT"))
+
+        self.conv_list.append(Conv1D(in_channels=num_hidden,
+                                     out_channels=n_mels * outputs_per_step,
+                                     filter_size=filter_size,
+                                     padding=padding,
+                                     param_attr=fluid.ParamAttr(initializer=fluid.initializer.XavierInitializer()),
+                                     bias_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform(low=-k, high=k)),
+                                     use_cudnn=use_cudnn,
+                                     data_format="NCT"))
+
+        for i, layer in enumerate(self.conv_list):
+            self.add_sublayer("conv_list_{}".format(i), layer)
+
+        self.batch_norm_list = [dg.BatchNorm(num_hidden,
+                                             data_layout='NCHW') for _ in range(num_conv - 1)]
+        if self.batchnorm_last:
+            self.batch_norm_list.append(dg.BatchNorm(n_mels * outputs_per_step,
+                                                     data_layout='NCHW'))
+        for i, layer in enumerate(self.batch_norm_list):
+            self.add_sublayer("batch_norm_list_{}".format(i), layer)
+
+    def forward(self, input):
+        """
+        Post Conv Net.
+
+        Args:
+            input (Variable): Shape(B, T, C), dtype: float32. The input value.
+        Returns:
+            output (Variable), Shape(B, T, C), the result after the postconvnet.
+        """
+        input = layers.transpose(input, [0, 2, 1])
+        len_ = input.shape[-1]  # renamed from `len` to avoid shadowing the builtin
+        for i in range(self.num_conv - 1):
+            batch_norm = self.batch_norm_list[i]
+            conv = self.conv_list[i]
+            input = layers.dropout(layers.tanh(batch_norm(conv(input)[:, :, :len_])), self.dropout)
+        conv = self.conv_list[self.num_conv - 1]
+        input = conv(input)[:, :, :len_]
+        if self.batchnorm_last:
+            batch_norm = self.batch_norm_list[self.num_conv - 1]
+            input = layers.dropout(batch_norm(input), self.dropout)
+        output = layers.transpose(input, [0, 2, 1])
+        return output
\ No newline at end of file
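A minimal usage sketch (illustrative, not part of the patch). With filter_size=5, a padding of 4 makes each convolution return at least T frames, so the [:, :, :len_] truncation restores the original time length:

    import numpy as np
    import paddle.fluid.dygraph as dg
    from parakeet.models.transformerTTS.post_convnet import PostConvNet

    with dg.guard():
        net = PostConvNet(n_mels=80, num_hidden=512, filter_size=5,
                          padding=4, num_conv=5, outputs_per_step=1)
        # (B, T, C) mel input; values are illustrative
        mel = dg.to_variable(np.random.randn(2, 100, 80).astype("float32"))
        out = net(mel)  # (B, T, C), same time length as the input
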
+ """ + + input = layers.transpose(input, [0,2,1]) + len = input.shape[-1] + for i in range(self.num_conv-1): + batch_norm = self.batch_norm_list[i] + conv = self.conv_list[i] + + input = layers.dropout(layers.tanh(batch_norm(conv(input)[:,:,:len])), self.dropout) + conv = self.conv_list[self.num_conv-1] + input = conv(input)[:,:,:len] + if self.batchnorm_last: + batch_norm = self.batch_norm_list[self.num_conv-1] + input = layers.dropout(batch_norm(input), self.dropout) + output = layers.transpose(input, [0,2,1]) + return output \ No newline at end of file diff --git a/parakeet/models/transformerTTS/prenet.py b/parakeet/models/transformerTTS/prenet.py new file mode 100644 index 0000000..e9b0667 --- /dev/null +++ b/parakeet/models/transformerTTS/prenet.py @@ -0,0 +1,39 @@ +import math +import paddle.fluid.dygraph as dg +import paddle.fluid as fluid +import paddle.fluid.layers as layers + +class PreNet(dg.Layer): + def __init__(self, input_size, hidden_size, output_size, dropout_rate=0.2): + """ + :param input_size: dimension of input + :param hidden_size: dimension of hidden unit + :param output_size: dimension of output + """ + super(PreNet, self).__init__() + self.input_size = input_size + self.hidden_size = hidden_size + self.output_size = output_size + self.dropout_rate = dropout_rate + + k = math.sqrt(1 / input_size) + self.linear1 = dg.Linear(input_size, hidden_size, + param_attr=fluid.ParamAttr(initializer = fluid.initializer.XavierInitializer()), + bias_attr=fluid.ParamAttr(initializer = fluid.initializer.Uniform(low=-k, high=k))) + k = math.sqrt(1 / hidden_size) + self.linear2 = dg.Linear(hidden_size, output_size, + param_attr=fluid.ParamAttr(initializer = fluid.initializer.XavierInitializer()), + bias_attr=fluid.ParamAttr(initializer = fluid.initializer.Uniform(low=-k, high=k))) + + def forward(self, x): + """ + Pre Net before passing through the network. + + Args: + x (Variable): Shape(B, T, C), dtype: float32. The input value. + Returns: + x (Variable), Shape(B, T, C), the result after pernet. + """ + x = layers.dropout(layers.relu(self.linear1(x)), self.dropout_rate) + x = layers.dropout(layers.relu(self.linear2(x)), self.dropout_rate) + return x diff --git a/parakeet/modules/customized.py b/parakeet/modules/customized.py new file mode 100644 index 0000000..e0eb65a --- /dev/null +++ b/parakeet/modules/customized.py @@ -0,0 +1,117 @@ +from paddle import fluid +import paddle.fluid.dygraph as dg + +class Conv1D(dg.Layer): + """ + A convolution 1D block implemented with Conv2D. Form simplicity and + ensuring the output has the same length as the input, it does not allow + stride > 1. 
+ """ + + def __init__(self, + in_channels, + out_channels, + filter_size=3, + padding=0, + dilation=1, + stride=1, + groups=None, + param_attr=None, + bias_attr=None, + use_cudnn=True, + act=None, + data_format='NCT', + dtype="float32"): + super(Conv1D, self).__init__(dtype=dtype) + + self.padding = padding + self.in_channels = in_channels + self.num_filters = out_channels + self.filter_size = filter_size + self.stride = stride + self.dilation = dilation + self.padding = padding + self.act = act + self.data_format = data_format + + self.conv = dg.Conv2D( + num_channels=in_channels, + num_filters=out_channels, + filter_size=(1, filter_size), + stride=(1, stride), + dilation=(1, dilation), + padding=(0, padding), + groups=groups, + param_attr=param_attr, + bias_attr=bias_attr, + use_cudnn=use_cudnn, + act=act, + dtype=dtype) + + def forward(self, x): + """ + Args: + x (Variable): Shape(B, C_in, 1, T), the input, where C_in means + input channels. + Returns: + x (Variable): Shape(B, C_out, 1, T), the outputs, where C_out means + output channels (num_filters). + """ + if self.data_format == 'NTC': + x = fluid.layers.transpose(x, [0, 2, 1]) + x = fluid.layers.unsqueeze(x, [2]) + x = self.conv(x) + x = fluid.layers.squeeze(x, [2]) + if self.data_format == 'NTC': + x = fluid.layers.transpose(x, [0, 2, 1]) + return x + +class Pool1D(dg.Layer): + """ + A Pool 1D block implemented with Pool2D. + """ + def __init__(self, + pool_size=-1, + pool_type='max', + pool_stride=1, + pool_padding=0, + global_pooling=False, + use_cudnn=True, + ceil_mode=False, + exclusive=True, + data_format='NCT'): + super(Pool1D, self).__init__() + self.pool_size = pool_size + self.pool_type = pool_type + self.pool_stride = pool_stride + self.pool_padding = pool_padding + self.global_pooling = global_pooling + self.use_cudnn = use_cudnn + self.ceil_mode = ceil_mode + self.exclusive = exclusive + self.data_format = data_format + + + self.pool2d = dg.Pool2D([1,pool_size], pool_type = pool_type, + pool_stride = [1,pool_stride], pool_padding = [0, pool_padding], + global_pooling = global_pooling, use_cudnn = use_cudnn, + ceil_mode = ceil_mode, exclusive = exclusive) + + + def forward(self, x): + """ + Args: + x (Variable): Shape(B, C_in, 1, T), the input, where C_in means + input channels. + Returns: + x (Variable): Shape(B, C_out, 1, T), the outputs, where C_out means + output channels (num_filters). + """ + if self.data_format == 'NTC': + x = fluid.layers.transpose(x, [0, 2, 1]) + x = fluid.layers.unsqueeze(x, [2]) + x = self.pool2d(x) + x = fluid.layers.squeeze(x, [2]) + if self.data_format == 'NTC': + x = fluid.layers.transpose(x, [0, 2, 1]) + return x diff --git a/parakeet/modules/dynamic_gru.py b/parakeet/modules/dynamic_gru.py new file mode 100644 index 0000000..e84c598 --- /dev/null +++ b/parakeet/modules/dynamic_gru.py @@ -0,0 +1,52 @@ +import paddle.fluid.dygraph as dg +import paddle.fluid.layers as layers + +class DynamicGRU(dg.Layer): + def __init__(self, + size, + param_attr=None, + bias_attr=None, + is_reverse=False, + gate_activation='sigmoid', + candidate_activation='tanh', + h_0=None, + origin_mode=False, + init_size=None): + super(DynamicGRU, self).__init__() + self.gru_unit = dg.GRUUnit( + size * 3, + param_attr=param_attr, + bias_attr=bias_attr, + activation=candidate_activation, + gate_activation=gate_activation, + origin_mode=origin_mode) + self.size = size + self.h_0 = h_0 + self.is_reverse = is_reverse + + def forward(self, inputs): + """ + Dynamic GRU block. 
diff --git a/parakeet/modules/ffn.py b/parakeet/modules/ffn.py
new file mode 100644
index 0000000..7b06dcb
--- /dev/null
+++ b/parakeet/modules/ffn.py
@@ -0,0 +1,57 @@
+import math
+import paddle.fluid.dygraph as dg
+import paddle.fluid as fluid
+import paddle.fluid.layers as layers
+from parakeet.modules.customized import Conv1D
+
+
+class PositionwiseFeedForward(dg.Layer):
+    ''' A two-layer position-wise feed-forward module. '''
+    def __init__(self, d_in, num_hidden, filter_size, padding=0, use_cudnn=True, dropout=0.1):
+        super(PositionwiseFeedForward, self).__init__()
+        self.num_hidden = num_hidden
+        self.use_cudnn = use_cudnn
+        self.dropout = dropout
+
+        k = math.sqrt(1 / d_in)
+        self.w_1 = Conv1D(in_channels=d_in,
+                          out_channels=num_hidden,
+                          filter_size=filter_size,
+                          padding=padding,
+                          param_attr=fluid.ParamAttr(initializer=fluid.initializer.XavierInitializer()),
+                          bias_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform(low=-k, high=k)),
+                          use_cudnn=use_cudnn,
+                          data_format="NTC")
+        k = math.sqrt(1 / num_hidden)
+        self.w_2 = Conv1D(in_channels=num_hidden,
+                          out_channels=d_in,
+                          filter_size=filter_size,
+                          padding=padding,
+                          param_attr=fluid.ParamAttr(initializer=fluid.initializer.XavierInitializer()),
+                          bias_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform(low=-k, high=k)),
+                          use_cudnn=use_cudnn,
+                          data_format="NTC")
+        self.layer_norm = dg.LayerNorm(d_in)
+
+    def forward(self, input):
+        """
+        Feed-forward network.
+
+        Args:
+            input (Variable): Shape(B, T, C), dtype: float32. The input value.
+        Returns:
+            output (Variable), Shape(B, T, C), the result after the FFN.
+        """
+        # position-wise feed-forward network
+        x = self.w_2(layers.relu(self.w_1(input)))
+
+        # dropout
+        x = layers.dropout(x, self.dropout)
+
+        # residual connection
+        x = x + input
+
+        # layer normalization
+        output = self.layer_norm(x)
+
+        return output
\ No newline at end of file
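A quick sanity-check sketch (illustrative, not part of the patch): with filter_size=1 and padding=0 the convolutions keep the time dimension, so the residual connection with the (B, T, C) input lines up.

    import numpy as np
    import paddle.fluid.dygraph as dg
    from parakeet.modules.ffn import PositionwiseFeedForward

    with dg.guard():
        ffn = PositionwiseFeedForward(d_in=256, num_hidden=1024, filter_size=1)
        x = dg.to_variable(np.random.randn(2, 50, 256).astype("float32"))  # (B, T, C)
        y = ffn(x)  # (B, T, C)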