"""Command-line entry point for the ``parakeet`` package.

Usage (``python -m parakeet <command>``):

* ``list-examples`` prints the names of the bundled example experiments.
* ``clone-example NAME`` copies the bundled example ``NAME`` into the
  current working directory.
"""
import argparse
import os
import shutil
import sys
from pathlib import Path


def _build_parser():
    """Return the argument parser exposing the two supported sub-commands."""
    parser = argparse.ArgumentParser()
    subparsers = parser.add_subparsers(dest="cmd")

    subparsers.add_parser("list-examples")
    clone = subparsers.add_parser("clone-example")
    clone.add_argument("experiment_name", type=str, help="experiment name")
    return parser


def _is_plain_name(name):
    """Tell whether *name* is a single path component.

    Rejects empty strings, ``.``/``..``, absolute paths and anything that
    contains a directory separator, so the name can never point outside
    the directory it is joined to.
    """
    return bool(name) and name not in (".", "..") and Path(name).name == name


def main(argv=None, package_path=None, target_root=None):
    """Run the command-line interface and return the process exit status.

    Args:
        argv: Argument list without the program name. ``None`` makes
            argparse read ``sys.argv[1:]``.
        package_path: Directory that contains the ``examples`` folder.
            Defaults to the directory holding this file.
        target_root: Directory an example is cloned into. Defaults to the
            current working directory.

    Returns:
        ``0`` when a command completed, ``1`` when no command was given
        (the help text is printed in that case).

    Raises:
        ValueError: The example name is not a plain directory name, or the
            requested example does not exist / is not a directory.
        FileExistsError: The destination directory already exists.
    """
    if package_path is None:
        package_path = Path(__file__).parent
    examples_dir = Path(package_path) / "examples"

    parser = _build_parser()
    args = parser.parse_args(argv)

    if args.cmd == "list-examples":
        # os.listdir returns entries in arbitrary order; sort for stable output.
        print(sorted(os.listdir(str(examples_dir))))
        return 0

    if args.cmd == "clone-example":
        name = args.experiment_name
        # Guard against "../x" or absolute paths escaping the examples
        # directory (source) or the working directory (target).
        if not _is_plain_name(name):
            raise ValueError("{} is not a valid example name".format(name))

        if target_root is None:
            target_root = os.getcwd()
        source = examples_dir / name
        target = Path(target_root) / name

        if not source.exists():
            raise ValueError("{} does not exist".format(str(source)))
        if not source.is_dir():
            raise ValueError("{} is not a directory".format(str(source)))
        if target.exists():
            raise FileExistsError("{} already exists".format(str(target)))

        shutil.copytree(str(source), str(target))
        print("{} copied!".format(name))
        return 0

    # No sub-command: show usage instead of silently doing nothing.
    parser.print_help()
    return 1


if __name__ == "__main__":
    # sys.exit rather than the site-provided ``exit`` builtin, which is not
    # available under ``python -S`` or in frozen applications.
    sys.exit(main())
-# See the License for the specific language governing permissions and -# limitations under the License. -from .transformer_tts import TransformerTTS -from .vocoder import Vocoder \ No newline at end of file diff --git a/parakeet/models/transformer_tts_deprecated/cbhg.py b/parakeet/models/transformer_tts_deprecated/cbhg.py deleted file mode 100644 index 9a330f9..0000000 --- a/parakeet/models/transformer_tts_deprecated/cbhg.py +++ /dev/null @@ -1,287 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import math -from parakeet.g2p.text.symbols import symbols -import paddle.fluid.dygraph as dg -import paddle.fluid as fluid -import paddle.fluid.layers as layers -from parakeet.modules.customized import Pool1D, Conv1D -from parakeet.modules.dynamic_gru import DynamicGRU -import numpy as np - - -class CBHG(dg.Layer): - def __init__(self, - hidden_size, - batch_size, - K=16, - projection_size=256, - num_gru_layers=2, - max_pool_kernel_size=2, - is_post=False): - """CBHG Module - - Args: - hidden_size (int): dimension of hidden unit. - batch_size (int): batch size of input. - K (int, optional): number of convolution banks. Defaults to 16. - projection_size (int, optional): dimension of projection unit. Defaults to 256. - num_gru_layers (int, optional): number of layers of GRUcell. Defaults to 2. - max_pool_kernel_size (int, optional): max pooling kernel size. 
Defaults to 2 - is_post (bool, optional): whether post processing or not. Defaults to False. - """ - super(CBHG, self).__init__() - - self.hidden_size = hidden_size - self.projection_size = projection_size - self.conv_list = [] - k = math.sqrt(1.0 / projection_size) - self.conv_list.append( - Conv1D( - num_channels=projection_size, - num_filters=hidden_size, - filter_size=1, - padding=int(np.floor(1 / 2)), - param_attr=fluid.ParamAttr( - initializer=fluid.initializer.XavierInitializer()), - bias_attr=fluid.ParamAttr( - initializer=fluid.initializer.Uniform( - low=-k, high=k)))) - k = math.sqrt(1.0 / hidden_size) - for i in range(2, K + 1): - self.conv_list.append( - Conv1D( - num_channels=hidden_size, - num_filters=hidden_size, - filter_size=i, - padding=int(np.floor(i / 2)), - param_attr=fluid.ParamAttr( - initializer=fluid.initializer.XavierInitializer()), - bias_attr=fluid.ParamAttr( - initializer=fluid.initializer.Uniform( - low=-k, high=k)))) - - for i, layer in enumerate(self.conv_list): - self.add_sublayer("conv_list_{}".format(i), layer) - - self.batchnorm_list = [] - for i in range(K): - self.batchnorm_list.append( - dg.BatchNorm( - hidden_size, data_layout='NCHW')) - - for i, layer in enumerate(self.batchnorm_list): - self.add_sublayer("batchnorm_list_{}".format(i), layer) - - conv_outdim = hidden_size * K - - k = math.sqrt(1.0 / conv_outdim) - self.conv_projection_1 = Conv1D( - num_channels=conv_outdim, - num_filters=hidden_size, - filter_size=3, - padding=int(np.floor(3 / 2)), - param_attr=fluid.ParamAttr( - initializer=fluid.initializer.XavierInitializer()), - bias_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform( - low=-k, high=k))) - - k = math.sqrt(1.0 / hidden_size) - self.conv_projection_2 = Conv1D( - num_channels=hidden_size, - num_filters=projection_size, - filter_size=3, - padding=int(np.floor(3 / 2)), - param_attr=fluid.ParamAttr( - initializer=fluid.initializer.XavierInitializer()), - 
bias_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform( - low=-k, high=k))) - - self.batchnorm_proj_1 = dg.BatchNorm(hidden_size, data_layout='NCHW') - self.batchnorm_proj_2 = dg.BatchNorm( - projection_size, data_layout='NCHW') - self.max_pool = Pool1D( - pool_size=max_pool_kernel_size, - pool_type='max', - pool_stride=1, - pool_padding=1, - data_format="NCT") - self.highway = Highwaynet(self.projection_size) - - h_0 = np.zeros((batch_size, hidden_size // 2), dtype="float32") - h_0 = dg.to_variable(h_0) - k = math.sqrt(1.0 / hidden_size) - self.fc_forward1 = dg.Linear( - hidden_size, - hidden_size // 2 * 3, - param_attr=fluid.ParamAttr( - initializer=fluid.initializer.XavierInitializer()), - bias_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform( - low=-k, high=k))) - self.fc_reverse1 = dg.Linear( - hidden_size, - hidden_size // 2 * 3, - param_attr=fluid.ParamAttr( - initializer=fluid.initializer.XavierInitializer()), - bias_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform( - low=-k, high=k))) - self.gru_forward1 = DynamicGRU( - size=self.hidden_size // 2, - is_reverse=False, - origin_mode=True, - h_0=h_0) - self.gru_reverse1 = DynamicGRU( - size=self.hidden_size // 2, - is_reverse=True, - origin_mode=True, - h_0=h_0) - - self.fc_forward2 = dg.Linear( - hidden_size, - hidden_size // 2 * 3, - param_attr=fluid.ParamAttr( - initializer=fluid.initializer.XavierInitializer()), - bias_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform( - low=-k, high=k))) - self.fc_reverse2 = dg.Linear( - hidden_size, - hidden_size // 2 * 3, - param_attr=fluid.ParamAttr( - initializer=fluid.initializer.XavierInitializer()), - bias_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform( - low=-k, high=k))) - self.gru_forward2 = DynamicGRU( - size=self.hidden_size // 2, - is_reverse=False, - origin_mode=True, - h_0=h_0) - self.gru_reverse2 = DynamicGRU( - size=self.hidden_size // 2, - is_reverse=True, - origin_mode=True, - h_0=h_0) - - def 
_conv_fit_dim(self, x, filter_size=3): - if filter_size % 2 == 0: - return x[:, :, :-1] - else: - return x - - def forward(self, input_): - """ - Convert linear spectrum to Mel spectrum. - - Args: - input_ (Variable): shape(B, C, T), dtype float32, the sequentially input. - - Returns: - out (Variable): shape(B, C, T), the CBHG output. - """ - - conv_list = [] - conv_input = input_ - - for i, (conv, batchnorm - ) in enumerate(zip(self.conv_list, self.batchnorm_list)): - conv_input = self._conv_fit_dim(conv(conv_input), i + 1) - conv_input = layers.relu(batchnorm(conv_input)) - conv_list.append(conv_input) - - conv_cat = layers.concat(conv_list, axis=1) - conv_pool = self.max_pool(conv_cat)[:, :, :-1] - - conv_proj = layers.relu( - self.batchnorm_proj_1( - self._conv_fit_dim(self.conv_projection_1(conv_pool)))) - conv_proj = self.batchnorm_proj_2( - self._conv_fit_dim(self.conv_projection_2(conv_proj))) + input_ - - # conv_proj.shape = [N, C, T] - highway = layers.transpose(conv_proj, [0, 2, 1]) - highway = self.highway(highway) - - # highway.shape = [N, T, C] - fc_forward = self.fc_forward1(highway) - fc_reverse = self.fc_reverse1(highway) - out_forward = self.gru_forward1(fc_forward) - out_reverse = self.gru_reverse1(fc_reverse) - out = layers.concat([out_forward, out_reverse], axis=-1) - fc_forward = self.fc_forward2(out) - fc_reverse = self.fc_reverse2(out) - out_forward = self.gru_forward2(fc_forward) - out_reverse = self.gru_reverse2(fc_reverse) - out = layers.concat([out_forward, out_reverse], axis=-1) - out = layers.transpose(out, [0, 2, 1]) - return out - - -class Highwaynet(dg.Layer): - def __init__(self, num_units, num_layers=4): - """Highway network - - Args: - num_units (int): dimension of hidden unit. - num_layers (int, optional): number of highway layers. Defaults to 4. 
- """ - super(Highwaynet, self).__init__() - self.num_units = num_units - self.num_layers = num_layers - - self.gates = [] - self.linears = [] - k = math.sqrt(1.0 / num_units) - for i in range(num_layers): - self.linears.append( - dg.Linear( - num_units, - num_units, - param_attr=fluid.ParamAttr( - initializer=fluid.initializer.XavierInitializer()), - bias_attr=fluid.ParamAttr( - initializer=fluid.initializer.Uniform( - low=-k, high=k)))) - self.gates.append( - dg.Linear( - num_units, - num_units, - param_attr=fluid.ParamAttr( - initializer=fluid.initializer.XavierInitializer()), - bias_attr=fluid.ParamAttr( - initializer=fluid.initializer.Uniform( - low=-k, high=k)))) - - for i, (linear, gate) in enumerate(zip(self.linears, self.gates)): - self.add_sublayer("linears_{}".format(i), linear) - self.add_sublayer("gates_{}".format(i), gate) - - def forward(self, input_): - """ - Compute result of Highway network. - - Args: - input_(Variable): shape(B, T, C), dtype float32, the sequentially input. - - Returns: - out(Variable): the Highway output. - """ - out = input_ - - for linear, gate in zip(self.linears, self.gates): - h = fluid.layers.relu(linear(out)) - t_ = fluid.layers.sigmoid(gate(out)) - - c = 1 - t_ - out = h * t_ + out * c - - return out diff --git a/parakeet/models/transformer_tts_deprecated/decoder.py b/parakeet/models/transformer_tts_deprecated/decoder.py deleted file mode 100644 index 41e11a0..0000000 --- a/parakeet/models/transformer_tts_deprecated/decoder.py +++ /dev/null @@ -1,193 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import math -import paddle.fluid.dygraph as dg -import paddle.fluid as fluid -from parakeet.models.transformer_tts.utils import * -from parakeet.modules.multihead_attention import MultiheadAttention -from parakeet.modules.ffn import PositionwiseFeedForward -from parakeet.models.transformer_tts.prenet import PreNet -from parakeet.models.transformer_tts.post_convnet import PostConvNet - - -class Decoder(dg.Layer): - def __init__(self, - num_hidden, - num_mels=80, - outputs_per_step=1, - num_head=4, - n_layers=3): - """Decoder layer of TransformerTTS. - - Args: - num_hidden (int): the number of source vocabulary. - n_mels (int, optional): the number of mel bands when calculating mel spectrograms. Defaults to 80. - outputs_per_step (int, optional): the num of output frames per step . Defaults to 1. - num_head (int, optional): the head number of multihead attention. Defaults to 4. - n_layers (int, optional): the layers number of multihead attention. Defaults to 3. 
- """ - super(Decoder, self).__init__() - self.num_hidden = num_hidden - self.num_head = num_head - param = fluid.ParamAttr() - self.alpha = self.create_parameter( - shape=(1, ), - attr=param, - dtype='float32', - default_initializer=fluid.initializer.ConstantInitializer( - value=1.0)) - self.pos_inp = get_sinusoid_encoding_table( - 1024, self.num_hidden, padding_idx=0) - self.pos_emb = dg.Embedding( - size=[1024, num_hidden], - padding_idx=0, - param_attr=fluid.ParamAttr( - initializer=fluid.initializer.NumpyArrayInitializer( - self.pos_inp), - trainable=False)) - self.decoder_prenet = PreNet( - input_size=num_mels, - hidden_size=num_hidden * 2, - output_size=num_hidden, - dropout_rate=0.2) - k = math.sqrt(1.0 / num_hidden) - self.linear = dg.Linear( - num_hidden, - num_hidden, - param_attr=fluid.ParamAttr( - initializer=fluid.initializer.XavierInitializer()), - bias_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform( - low=-k, high=k))) - - self.selfattn_layers = [ - MultiheadAttention(num_hidden, num_hidden // num_head, - num_hidden // num_head) for _ in range(n_layers) - ] - for i, layer in enumerate(self.selfattn_layers): - self.add_sublayer("self_attn_{}".format(i), layer) - self.attn_layers = [ - MultiheadAttention(num_hidden, num_hidden // num_head, - num_hidden // num_head) for _ in range(n_layers) - ] - for i, layer in enumerate(self.attn_layers): - self.add_sublayer("attn_{}".format(i), layer) - self.ffns = [ - PositionwiseFeedForward( - num_hidden, num_hidden * num_head, filter_size=1) - for _ in range(n_layers) - ] - for i, layer in enumerate(self.ffns): - self.add_sublayer("ffns_{}".format(i), layer) - self.mel_linear = dg.Linear( - num_hidden, - num_mels * outputs_per_step, - param_attr=fluid.ParamAttr( - initializer=fluid.initializer.XavierInitializer()), - bias_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform( - low=-k, high=k))) - self.stop_linear = dg.Linear( - num_hidden, - 1, - param_attr=fluid.ParamAttr( - 
initializer=fluid.initializer.XavierInitializer()), - bias_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform( - low=-k, high=k))) - - self.postconvnet = PostConvNet( - num_mels, - num_hidden, - filter_size=5, - padding=4, - num_conv=5, - outputs_per_step=outputs_per_step, - use_cudnn=True) - - def forward(self, key, value, query, positional, c_mask): - """ - Compute decoder outputs. - - Args: - key (Variable): shape(B, T_text, C), dtype float32, the input key of decoder, - where T_text means the timesteps of input text, - value (Variable): shape(B, T_text, C), dtype float32, the input value of decoder. - query (Variable): shape(B, T_mel, C), dtype float32, the input query of decoder, - where T_mel means the timesteps of input spectrum, - positional (Variable): shape(B, T_mel), dtype int64, the spectrum position. - c_mask (Variable): shape(B, T_text, 1), dtype float32, query mask returned from encoder. - Returns: - mel_out (Variable): shape(B, T_mel, C), the decoder output after mel linear projection. - out (Variable): shape(B, T_mel, C), the decoder output after post mel network. - stop_tokens (Variable): shape(B, T_mel, 1), the stop tokens of output. - attn_list (list[Variable]): len(n_layers), the encoder-decoder attention list. - selfattn_list (list[Variable]): len(n_layers), the decoder self attention list. 
- """ - - # get decoder mask with triangular matrix - - if fluid.framework._dygraph_tracer()._train_mode: - mask = get_dec_attn_key_pad_mask(positional, self.num_head, - query.dtype) - m_mask = get_non_pad_mask(positional, self.num_head, query.dtype) - zero_mask = layers.cast(c_mask == 0, dtype=query.dtype) * -1e30 - zero_mask = layers.transpose(zero_mask, perm=[0, 2, 1]) - - else: - len_q = query.shape[1] - mask = layers.triu( - layers.ones( - shape=[len_q, len_q], dtype=query.dtype), - diagonal=1) - mask = layers.cast(mask != 0, dtype=query.dtype) * -1e30 - m_mask, zero_mask = None, None - - # Decoder pre-network - query = self.decoder_prenet(query) - - # Centered position - query = self.linear(query) - - # Get position embedding - positional = self.pos_emb(positional) - query = positional * self.alpha + query - - #positional dropout - query = fluid.layers.dropout( - query, 0.1, dropout_implementation='upscale_in_train') - - # Attention decoder-decoder, encoder-decoder - selfattn_list = list() - attn_list = list() - - for selfattn, attn, ffn in zip(self.selfattn_layers, self.attn_layers, - self.ffns): - query, attn_dec = selfattn( - query, query, query, mask=mask, query_mask=m_mask) - query, attn_dot = attn( - key, value, query, mask=zero_mask, query_mask=m_mask) - query = ffn(query) - selfattn_list.append(attn_dec) - attn_list.append(attn_dot) - - # Mel linear projection - mel_out = self.mel_linear(query) - # Post Mel Network - out = self.postconvnet(mel_out) - out = mel_out + out - - # Stop tokens - stop_tokens = self.stop_linear(query) - stop_tokens = layers.squeeze(stop_tokens, [-1]) - stop_tokens = layers.sigmoid(stop_tokens) - - return mel_out, out, attn_list, stop_tokens, selfattn_list diff --git a/parakeet/models/transformer_tts_deprecated/encoder.py b/parakeet/models/transformer_tts_deprecated/encoder.py deleted file mode 100644 index a7a0f7a..0000000 --- a/parakeet/models/transformer_tts_deprecated/encoder.py +++ /dev/null @@ -1,106 +0,0 @@ -# Copyright 
(c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import paddle.fluid.dygraph as dg -import paddle.fluid as fluid -from parakeet.models.transformer_tts.utils import * -from parakeet.modules.multihead_attention import MultiheadAttention -from parakeet.modules.ffn import PositionwiseFeedForward -from parakeet.models.transformer_tts.encoderprenet import EncoderPrenet - - -class Encoder(dg.Layer): - def __init__(self, embedding_size, num_hidden, num_head=4, n_layers=3): - """Encoder layer of TransformerTTS. - - Args: - embedding_size (int): the size of position embedding. - num_hidden (int): the size of hidden layer in network. - num_head (int, optional): the head number of multihead attention. Defaults to 4. - n_layers (int, optional): the layers number of multihead attention. Defaults to 3. 
- """ - super(Encoder, self).__init__() - self.num_hidden = num_hidden - self.num_head = num_head - param = fluid.ParamAttr(initializer=fluid.initializer.Constant( - value=1.0)) - self.alpha = self.create_parameter( - shape=(1, ), attr=param, dtype='float32') - self.pos_inp = get_sinusoid_encoding_table( - 1024, self.num_hidden, padding_idx=0) - self.pos_emb = dg.Embedding( - size=[1024, num_hidden], - param_attr=fluid.ParamAttr( - initializer=fluid.initializer.NumpyArrayInitializer( - self.pos_inp), - trainable=False)) - self.encoder_prenet = EncoderPrenet( - embedding_size=embedding_size, - num_hidden=num_hidden, - use_cudnn=True) - self.layers = [ - MultiheadAttention(num_hidden, num_hidden // num_head, - num_hidden // num_head) for _ in range(n_layers) - ] - for i, layer in enumerate(self.layers): - self.add_sublayer("self_attn_{}".format(i), layer) - self.ffns = [ - PositionwiseFeedForward( - num_hidden, - num_hidden * num_head, - filter_size=1, - use_cudnn=True) for _ in range(n_layers) - ] - for i, layer in enumerate(self.ffns): - self.add_sublayer("ffns_{}".format(i), layer) - - def forward(self, x, positional): - """ - Encode text sequence. - - Args: - x (Variable): shape(B, T_text), dtype float32, the input character, - where T_text means the timesteps of input text, - positional (Variable): shape(B, T_text), dtype int64, the characters position. - - Returns: - x (Variable): shape(B, T_text, C), the encoder output. - attentions (list[Variable]): len(n_layers), the encoder self attention list. 
- """ - - # Encoder pre_network - x = self.encoder_prenet(x) - - if fluid.framework._dygraph_tracer()._train_mode: - mask = get_attn_key_pad_mask(positional, self.num_head, x.dtype) - query_mask = get_non_pad_mask(positional, self.num_head, x.dtype) - - else: - query_mask, mask = None, None - - # Get positional encoding - positional = self.pos_emb(positional) - - x = positional * self.alpha + x - - # Positional dropout - x = layers.dropout(x, 0.1, dropout_implementation='upscale_in_train') - - # Self attention encoder - attentions = list() - for layer, ffn in zip(self.layers, self.ffns): - x, attention = layer(x, x, x, mask=mask, query_mask=query_mask) - x = ffn(x) - attentions.append(attention) - - return x, attentions, query_mask diff --git a/parakeet/models/transformer_tts_deprecated/encoderprenet.py b/parakeet/models/transformer_tts_deprecated/encoderprenet.py deleted file mode 100644 index a32f5a8..0000000 --- a/parakeet/models/transformer_tts_deprecated/encoderprenet.py +++ /dev/null @@ -1,111 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-import math -from parakeet.g2p.text.symbols import symbols -import paddle.fluid.dygraph as dg -import paddle.fluid as fluid -import paddle.fluid.layers as layers -from parakeet.modules.customized import Conv1D -import numpy as np - - -class EncoderPrenet(dg.Layer): - def __init__(self, embedding_size, num_hidden, use_cudnn=True): - """ Encoder prenet layer of TransformerTTS. - - Args: - embedding_size (int): the size of embedding. - num_hidden (int): the size of hidden layer in network. - use_cudnn (bool, optional): use cudnn or not. Defaults to True. - """ - super(EncoderPrenet, self).__init__() - self.embedding_size = embedding_size - self.num_hidden = num_hidden - self.use_cudnn = use_cudnn - self.embedding = dg.Embedding( - size=[len(symbols), embedding_size], - padding_idx=0, - param_attr=fluid.initializer.Normal( - loc=0.0, scale=1.0)) - self.conv_list = [] - k = math.sqrt(1.0 / embedding_size) - self.conv_list.append( - Conv1D( - num_channels=embedding_size, - num_filters=num_hidden, - filter_size=5, - padding=int(np.floor(5 / 2)), - param_attr=fluid.ParamAttr( - initializer=fluid.initializer.XavierInitializer()), - bias_attr=fluid.ParamAttr( - initializer=fluid.initializer.Uniform( - low=-k, high=k)), - use_cudnn=use_cudnn)) - k = math.sqrt(1.0 / num_hidden) - for _ in range(2): - self.conv_list.append( - Conv1D( - num_channels=num_hidden, - num_filters=num_hidden, - filter_size=5, - padding=int(np.floor(5 / 2)), - param_attr=fluid.ParamAttr( - initializer=fluid.initializer.XavierInitializer()), - bias_attr=fluid.ParamAttr( - initializer=fluid.initializer.Uniform( - low=-k, high=k)), - use_cudnn=use_cudnn)) - - for i, layer in enumerate(self.conv_list): - self.add_sublayer("conv_list_{}".format(i), layer) - - self.batch_norm_list = [ - dg.BatchNorm( - num_hidden, data_layout='NCHW') for _ in range(3) - ] - - for i, layer in enumerate(self.batch_norm_list): - self.add_sublayer("batch_norm_list_{}".format(i), layer) - - k = math.sqrt(1.0 / num_hidden) - 
self.projection = dg.Linear( - num_hidden, - num_hidden, - param_attr=fluid.ParamAttr( - initializer=fluid.initializer.XavierInitializer()), - bias_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform( - low=-k, high=k))) - - def forward(self, x): - """ - Prepare encoder input. - - Args: - x (Variable): shape(B, T_text), dtype float32, the input character, where T_text means the timesteps of input text. - - Returns: - (Variable): shape(B, T_text, C), the encoder prenet output. - """ - - x = self.embedding(x) - x = layers.transpose(x, [0, 2, 1]) - for batch_norm, conv in zip(self.batch_norm_list, self.conv_list): - x = layers.dropout( - layers.relu(batch_norm(conv(x))), - 0.2, - dropout_implementation='upscale_in_train') - x = layers.transpose(x, [0, 2, 1]) #(N,T,C) - x = self.projection(x) - - return x diff --git a/parakeet/models/transformer_tts_deprecated/post_convnet.py b/parakeet/models/transformer_tts_deprecated/post_convnet.py deleted file mode 100644 index 6ad8e5d..0000000 --- a/parakeet/models/transformer_tts_deprecated/post_convnet.py +++ /dev/null @@ -1,137 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-import math -import paddle.fluid.dygraph as dg -import paddle.fluid as fluid -import paddle.fluid.layers as layers -from parakeet.modules.customized import Conv1D - - -class PostConvNet(dg.Layer): - def __init__(self, - n_mels=80, - num_hidden=512, - filter_size=5, - padding=0, - num_conv=5, - outputs_per_step=1, - use_cudnn=True, - dropout=0.1, - batchnorm_last=False): - """Decocder post conv net of TransformerTTS. - - Args: - n_mels (int, optional): the number of mel bands when calculating mel spectrograms. Defaults to 80. - num_hidden (int, optional): the size of hidden layer in network. Defaults to 512. - filter_size (int, optional): the filter size of Conv. Defaults to 5. - padding (int, optional): the padding size of Conv. Defaults to 0. - num_conv (int, optional): the num of Conv layers in network. Defaults to 5. - outputs_per_step (int, optional): the num of output frames per step . Defaults to 1. - use_cudnn (bool, optional): use cudnn in Conv or not. Defaults to True. - dropout (float, optional): dropout probability. Defaults to 0.1. - batchnorm_last (bool, optional): if batchnorm at last layer or not. Defaults to False. 
- """ - super(PostConvNet, self).__init__() - - self.dropout = dropout - self.num_conv = num_conv - self.batchnorm_last = batchnorm_last - self.conv_list = [] - k = math.sqrt(1.0 / (n_mels * outputs_per_step)) - self.conv_list.append( - Conv1D( - num_channels=n_mels * outputs_per_step, - num_filters=num_hidden, - filter_size=filter_size, - padding=padding, - param_attr=fluid.ParamAttr( - initializer=fluid.initializer.XavierInitializer()), - bias_attr=fluid.ParamAttr( - initializer=fluid.initializer.Uniform( - low=-k, high=k)), - use_cudnn=use_cudnn)) - - k = math.sqrt(1.0 / num_hidden) - for _ in range(1, num_conv - 1): - self.conv_list.append( - Conv1D( - num_channels=num_hidden, - num_filters=num_hidden, - filter_size=filter_size, - padding=padding, - param_attr=fluid.ParamAttr( - initializer=fluid.initializer.XavierInitializer()), - bias_attr=fluid.ParamAttr( - initializer=fluid.initializer.Uniform( - low=-k, high=k)), - use_cudnn=use_cudnn)) - - self.conv_list.append( - Conv1D( - num_channels=num_hidden, - num_filters=n_mels * outputs_per_step, - filter_size=filter_size, - padding=padding, - param_attr=fluid.ParamAttr( - initializer=fluid.initializer.XavierInitializer()), - bias_attr=fluid.ParamAttr( - initializer=fluid.initializer.Uniform( - low=-k, high=k)), - use_cudnn=use_cudnn)) - - for i, layer in enumerate(self.conv_list): - self.add_sublayer("conv_list_{}".format(i), layer) - - self.batch_norm_list = [ - dg.BatchNorm( - num_hidden, data_layout='NCHW') for _ in range(num_conv - 1) - ] - if self.batchnorm_last: - self.batch_norm_list.append( - dg.BatchNorm( - n_mels * outputs_per_step, data_layout='NCHW')) - for i, layer in enumerate(self.batch_norm_list): - self.add_sublayer("batch_norm_list_{}".format(i), layer) - - def forward(self, input): - """ - Compute the mel spectrum. - - Args: - input (Variable): shape(B, T, C), dtype float32, the result of mel linear projection. - - Returns: - output (Variable): shape(B, T, C), the result after postconvnet. 
- """ - - input = layers.transpose(input, [0, 2, 1]) - len = input.shape[-1] - for i in range(self.num_conv - 1): - batch_norm = self.batch_norm_list[i] - conv = self.conv_list[i] - - input = layers.dropout( - layers.tanh(batch_norm(conv(input)[:, :, :len])), - self.dropout, - dropout_implementation='upscale_in_train') - conv = self.conv_list[self.num_conv - 1] - input = conv(input)[:, :, :len] - if self.batchnorm_last: - batch_norm = self.batch_norm_list[self.num_conv - 1] - input = layers.dropout( - batch_norm(input), - self.dropout, - dropout_implementation='upscale_in_train') - output = layers.transpose(input, [0, 2, 1]) - return output diff --git a/parakeet/models/transformer_tts_deprecated/prenet.py b/parakeet/models/transformer_tts_deprecated/prenet.py deleted file mode 100644 index eaf4bc8..0000000 --- a/parakeet/models/transformer_tts_deprecated/prenet.py +++ /dev/null @@ -1,71 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import math -import paddle.fluid.dygraph as dg -import paddle.fluid as fluid -import paddle.fluid.layers as layers - - -class PreNet(dg.Layer): - def __init__(self, input_size, hidden_size, output_size, dropout_rate=0.2): - """Prenet before passing through the network. - - Args: - input_size (int): the input channel size. - hidden_size (int): the size of hidden layer in network. - output_size (int): the output channel size. 
- dropout_rate (float, optional): dropout probability. Defaults to 0.2. - """ - super(PreNet, self).__init__() - self.input_size = input_size - self.hidden_size = hidden_size - self.output_size = output_size - self.dropout_rate = dropout_rate - - k = math.sqrt(1.0 / input_size) - self.linear1 = dg.Linear( - input_size, - hidden_size, - param_attr=fluid.ParamAttr( - initializer=fluid.initializer.XavierInitializer()), - bias_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform( - low=-k, high=k))) - k = math.sqrt(1.0 / hidden_size) - self.linear2 = dg.Linear( - hidden_size, - output_size, - param_attr=fluid.ParamAttr( - initializer=fluid.initializer.XavierInitializer()), - bias_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform( - low=-k, high=k))) - - def forward(self, x): - """ - Prepare network input. - - Args: - x (Variable): shape(B, T, C), dtype float32, the input value. - - Returns: - output (Variable): shape(B, T, C), the result after pernet. - """ - x = layers.dropout( - layers.relu(self.linear1(x)), - self.dropout_rate, - dropout_implementation='upscale_in_train') - output = layers.dropout( - layers.relu(self.linear2(x)), - self.dropout_rate, - dropout_implementation='upscale_in_train') - return output diff --git a/parakeet/models/transformer_tts_deprecated/transformer_tts.py b/parakeet/models/transformer_tts_deprecated/transformer_tts.py deleted file mode 100644 index e1d9418..0000000 --- a/parakeet/models/transformer_tts_deprecated/transformer_tts.py +++ /dev/null @@ -1,71 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import paddle.fluid.dygraph as dg -import paddle.fluid as fluid -from parakeet.models.transformer_tts.encoder import Encoder -from parakeet.models.transformer_tts.decoder import Decoder - - -class TransformerTTS(dg.Layer): - def __init__(self, - embedding_size, - num_hidden, - encoder_num_head=4, - encoder_n_layers=3, - n_mels=80, - outputs_per_step=1, - decoder_num_head=4, - decoder_n_layers=3): - """TransformerTTS model. - - Args: - embedding_size (int): the size of position embedding. - num_hidden (int): the size of hidden layer in network. - encoder_num_head (int, optional): the head number of multihead attention in encoder. Defaults to 4. - encoder_n_layers (int, optional): the layers number of multihead attention in encoder. Defaults to 3. - n_mels (int, optional): the number of mel bands when calculating mel spectrograms. Defaults to 80. - outputs_per_step (int, optional): the num of output frames per step . Defaults to 1. - decoder_num_head (int, optional): the head number of multihead attention in decoder. Defaults to 4. - decoder_n_layers (int, optional): the layers number of multihead attention in decoder. Defaults to 3. - """ - super(TransformerTTS, self).__init__() - self.encoder = Encoder(embedding_size, num_hidden, encoder_num_head, - encoder_n_layers) - self.decoder = Decoder(num_hidden, n_mels, outputs_per_step, - decoder_num_head, decoder_n_layers) - - def forward(self, characters, mel_input, pos_text, pos_mel): - """ - TransformerTTS network. 
- - Args: - characters (Variable): shape(B, T_text), dtype float32, the input character, - where T_text means the timesteps of input text, - mel_input (Variable): shape(B, T_mel, C), dtype float32, the input query of decoder, - where T_mel means the timesteps of input spectrum, - pos_text (Variable): shape(B, T_text), dtype int64, the characters position. - - Returns: - mel_output (Variable): shape(B, T_mel, C), the decoder output after mel linear projection. - postnet_output (Variable): shape(B, T_mel, C), the decoder output after post mel network. - stop_preds (Variable): shape(B, T_mel, 1), the stop tokens of output. - attn_probs (list[Variable]): len(n_layers), the encoder-decoder attention list. - attns_enc (list[Variable]): len(n_layers), the encoder self attention list. - attns_dec (list[Variable]): len(n_layers), the decoder self attention list. - """ - key, attns_enc, query_mask = self.encoder(characters, pos_text) - - mel_output, postnet_output, attn_probs, stop_preds, attns_dec = self.decoder( - key, key, mel_input, pos_mel, query_mask) - return mel_output, postnet_output, attn_probs, stop_preds, attns_enc, attns_dec diff --git a/parakeet/models/transformer_tts_deprecated/utils.py b/parakeet/models/transformer_tts_deprecated/utils.py deleted file mode 100644 index 9482c23..0000000 --- a/parakeet/models/transformer_tts_deprecated/utils.py +++ /dev/null @@ -1,101 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. -import numpy as np -import librosa -import os, copy -from scipy import signal -import paddle.fluid.layers as layers - - -def get_positional_table(d_pos_vec, n_position=1024): - position_enc = np.array( - [[pos / np.power(10000, 2 * i / d_pos_vec) for i in range(d_pos_vec)] - if pos != 0 else np.zeros(d_pos_vec) for pos in range(n_position)]) - - position_enc[1:, 0::2] = np.sin(position_enc[1:, 0::2]) # dim 2i - position_enc[1:, 1::2] = np.cos(position_enc[1:, 1::2]) # dim 2i+1 - return position_enc - - -def get_sinusoid_encoding_table(n_position, d_hid, padding_idx=None): - ''' Sinusoid position encoding table ''' - - def cal_angle(position, hid_idx): - return position / np.power(10000, 2 * (hid_idx // 2) / d_hid) - - def get_posi_angle_vec(position): - return [cal_angle(position, hid_j) for hid_j in range(d_hid)] - - sinusoid_table = np.array( - [get_posi_angle_vec(pos_i) for pos_i in range(n_position)]) - - sinusoid_table[:, 0::2] = np.sin(sinusoid_table[:, 0::2]) # dim 2i - sinusoid_table[:, 1::2] = np.cos(sinusoid_table[:, 1::2]) # dim 2i+1 - - if padding_idx is not None: - # zero vector for padding dimension - sinusoid_table[padding_idx] = 0. - - return sinusoid_table - - -def get_non_pad_mask(seq, num_head, dtype): - mask = layers.cast(seq != 0, dtype=dtype) - mask = layers.unsqueeze(mask, axes=[-1]) - mask = layers.expand(mask, [num_head, 1, 1]) - return mask - - -def get_attn_key_pad_mask(seq_k, num_head, dtype): - ''' For masking out the padding part of key sequence. ''' - # Expand to fit the shape of key query attention matrix. - padding_mask = layers.cast(seq_k == 0, dtype=dtype) * -1e30 - padding_mask = layers.unsqueeze(padding_mask, axes=[1]) - padding_mask = layers.expand(padding_mask, [num_head, 1, 1]) - return padding_mask - - -def get_dec_attn_key_pad_mask(seq_k, num_head, dtype): - ''' For masking out the padding part of key sequence. 
''' - - # Expand to fit the shape of key query attention matrix. - padding_mask = layers.cast(seq_k == 0, dtype=dtype) - padding_mask = layers.unsqueeze(padding_mask, axes=[1]) - len_k = seq_k.shape[1] - triu = layers.triu( - layers.ones( - shape=[len_k, len_k], dtype=dtype), diagonal=1) - padding_mask = padding_mask + triu - padding_mask = layers.cast( - padding_mask != 0, dtype=dtype) * -1e30 #* (-2**32 + 1) - padding_mask = layers.expand(padding_mask, [num_head, 1, 1]) - return padding_mask - - -def guided_attention(N, T, g=0.2): - '''Guided attention. Refer to page 3 on the paper.''' - W = np.zeros((N, T), dtype=np.float32) - for n_pos in range(W.shape[0]): - for t_pos in range(W.shape[1]): - W[n_pos, t_pos] = 1 - np.exp(-(t_pos / float(T) - n_pos / float(N)) - **2 / (2 * g * g)) - return W - - -def cross_entropy(input, label, weight=1.0, epsilon=1e-30): - output = -1 * label * layers.log(input + epsilon) - ( - 1 - label) * layers.log(1 - input + epsilon) - output = output * (label * (weight - 1) + 1) - - return layers.reduce_mean(output, dim=[0, 1]) diff --git a/parakeet/models/transformer_tts_deprecated/vocoder.py b/parakeet/models/transformer_tts_deprecated/vocoder.py deleted file mode 100644 index 4b40ebb..0000000 --- a/parakeet/models/transformer_tts_deprecated/vocoder.py +++ /dev/null @@ -1,55 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-import paddle.fluid.dygraph as dg -import paddle.fluid as fluid -from parakeet.modules.customized import Conv1D -from parakeet.models.transformer_tts.utils import * -from parakeet.models.transformer_tts.cbhg import CBHG - - -class Vocoder(dg.Layer): - def __init__(self, batch_size, hidden_size, num_mels=80, n_fft=2048): - """CBHG Network (mel -> linear) - - Args: - batch_size (int): the batch size of input. - hidden_size (int): the size of hidden layer in network. - n_mels (int, optional): the number of mel bands when calculating mel spectrograms. Defaults to 80. - n_fft (int, optional): length of the windowed signal after padding with zeros. Defaults to 2048. - """ - super(Vocoder, self).__init__() - self.pre_proj = Conv1D( - num_channels=num_mels, num_filters=hidden_size, filter_size=1) - self.cbhg = CBHG(hidden_size, batch_size) - self.post_proj = Conv1D( - num_channels=hidden_size, - num_filters=(n_fft // 2) + 1, - filter_size=1) - - def forward(self, mel): - """ - Compute mel spectrum to linear spectrum. - - Args: - mel (Variable): shape(B, C, T), dtype float32, the input mel spectrum. - - Returns: - mag_pred (Variable): shape(B, T, C), the linear output. - """ - mel = layers.transpose(mel, [0, 2, 1]) - mel = self.pre_proj(mel) - mel = self.cbhg(mel) - mag_pred = self.post_proj(mel) - mag_pred = layers.transpose(mag_pred, [0, 2, 1]) - return mag_pred diff --git a/parakeet/models/waveflow_deprecated/__init__.py b/parakeet/models/waveflow_deprecated/__init__.py deleted file mode 100644 index b068b59..0000000 --- a/parakeet/models/waveflow_deprecated/__init__.py +++ /dev/null @@ -1,15 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from parakeet.models.waveflow.waveflow_modules import WaveFlowLoss, WaveFlowModule diff --git a/parakeet/models/waveflow_deprecated/waveflow_modules.py b/parakeet/models/waveflow_deprecated/waveflow_modules.py deleted file mode 100644 index 96c5715..0000000 --- a/parakeet/models/waveflow_deprecated/waveflow_modules.py +++ /dev/null @@ -1,443 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import itertools -import numpy as np -import paddle.fluid.dygraph as dg -from paddle import fluid -from parakeet.modules import weight_norm - - -def get_param_attr(layer_type, filter_size, c_in=1): - if layer_type == "weight_norm": - k = np.sqrt(1.0 / (c_in * np.prod(filter_size))) - weight_init = fluid.initializer.UniformInitializer(low=-k, high=k) - bias_init = fluid.initializer.UniformInitializer(low=-k, high=k) - elif layer_type == "common": - weight_init = fluid.initializer.ConstantInitializer(0.0) - bias_init = fluid.initializer.ConstantInitializer(0.0) - else: - raise TypeError("Unsupported layer type.") - - param_attr = fluid.ParamAttr(initializer=weight_init) - bias_attr = fluid.ParamAttr(initializer=bias_init) - return param_attr, bias_attr - - -def unfold(x, n_group): - length = x.shape[-1] - new_shape = x.shape[:-1] + [length // n_group, n_group] - return fluid.layers.reshape(x, new_shape) - - -class WaveFlowLoss: - def __init__(self, sigma=1.0): - self.sigma = sigma - - def __call__(self, model_output): - z, log_s_list = model_output - for i, log_s in enumerate(log_s_list): - if i == 0: - log_s_total = fluid.layers.reduce_sum(log_s) - else: - log_s_total = log_s_total + fluid.layers.reduce_sum(log_s) - - loss = fluid.layers.reduce_sum(z * z) / (2 * self.sigma * self.sigma) \ - - log_s_total - loss = loss / np.prod(z.shape) - const = 0.5 * np.log(2 * np.pi) + np.log(self.sigma) - - return loss + const - - -class Conditioner(dg.Layer): - def __init__(self, dtype, upsample_factors): - super(Conditioner, self).__init__() - - self.upsample_conv2d = [] - for s in upsample_factors: - in_channel = 1 - param_attr, bias_attr = get_param_attr( - "weight_norm", (3, 2 * s), c_in=in_channel) - conv_trans2d = weight_norm.Conv2DTranspose( - num_channels=in_channel, - num_filters=1, - filter_size=(3, 2 * s), - padding=(1, s // 2), - stride=(1, s), - param_attr=param_attr, - bias_attr=bias_attr, - dtype=dtype) - self.upsample_conv2d.append(conv_trans2d) - - for i, 
layer in enumerate(self.upsample_conv2d): - self.add_sublayer("conv2d_transpose_{}".format(i), layer) - - def forward(self, x): - x = fluid.layers.unsqueeze(x, 1) - for layer in self.upsample_conv2d: - x = layer(x) - x = fluid.layers.leaky_relu(x, alpha=0.4) - - return fluid.layers.squeeze(x, [1]) - - def infer(self, x): - x = fluid.layers.unsqueeze(x, 1) - for layer in self.upsample_conv2d: - x = layer(x) - # Trim conv artifacts. - time_cutoff = layer._filter_size[1] - layer._stride[1] - x = fluid.layers.leaky_relu(x[:, :, :, :-time_cutoff], alpha=0.4) - - return fluid.layers.squeeze(x, [1]) - - -class Flow(dg.Layer): - def __init__(self, config): - super(Flow, self).__init__() - self.n_layers = config.n_layers - self.n_channels = config.n_channels - self.kernel_h = config.kernel_h - self.kernel_w = config.kernel_w - self.dtype = "float16" if config.use_fp16 else "float32" - - # Transform audio: [batch, 1, n_group, time/n_group] - # => [batch, n_channels, n_group, time/n_group] - param_attr, bias_attr = get_param_attr("weight_norm", (1, 1), c_in=1) - self.start = weight_norm.Conv2D( - num_channels=1, - num_filters=self.n_channels, - filter_size=(1, 1), - param_attr=param_attr, - bias_attr=bias_attr, - dtype=self.dtype) - - # Initializing last layer to 0 makes the affine coupling layers - # do nothing at first. 
This helps with training stability - # output shape: [batch, 2, n_group, time/n_group] - param_attr, bias_attr = get_param_attr( - "common", (1, 1), c_in=self.n_channels) - self.end = dg.Conv2D( - num_channels=self.n_channels, - num_filters=2, - filter_size=(1, 1), - param_attr=param_attr, - bias_attr=bias_attr, - dtype=self.dtype) - - # receiptive fileds: (kernel - 1) * sum(dilations) + 1 >= squeeze - dilation_dict = { - 8: [1, 1, 1, 1, 1, 1, 1, 1], - 16: [1, 1, 1, 1, 1, 1, 1, 1], - 32: [1, 2, 4, 1, 2, 4, 1, 2], - 64: [1, 2, 4, 8, 16, 1, 2, 4], - 128: [1, 2, 4, 8, 16, 32, 64, 1] - } - self.dilation_h_list = dilation_dict[config.n_group] - - self.in_layers = [] - self.cond_layers = [] - self.res_skip_layers = [] - for i in range(self.n_layers): - dilation_h = self.dilation_h_list[i] - dilation_w = 2**i - - param_attr, bias_attr = get_param_attr( - "weight_norm", (self.kernel_h, self.kernel_w), - c_in=self.n_channels) - in_layer = weight_norm.Conv2D( - num_channels=self.n_channels, - num_filters=2 * self.n_channels, - filter_size=(self.kernel_h, self.kernel_w), - dilation=(dilation_h, dilation_w), - param_attr=param_attr, - bias_attr=bias_attr, - dtype=self.dtype) - self.in_layers.append(in_layer) - - param_attr, bias_attr = get_param_attr( - "weight_norm", (1, 1), c_in=config.mel_bands) - cond_layer = weight_norm.Conv2D( - num_channels=config.mel_bands, - num_filters=2 * self.n_channels, - filter_size=(1, 1), - param_attr=param_attr, - bias_attr=bias_attr, - dtype=self.dtype) - self.cond_layers.append(cond_layer) - - if i < self.n_layers - 1: - res_skip_channels = 2 * self.n_channels - else: - res_skip_channels = self.n_channels - param_attr, bias_attr = get_param_attr( - "weight_norm", (1, 1), c_in=self.n_channels) - res_skip_layer = weight_norm.Conv2D( - num_channels=self.n_channels, - num_filters=res_skip_channels, - filter_size=(1, 1), - param_attr=param_attr, - bias_attr=bias_attr, - dtype=self.dtype) - self.res_skip_layers.append(res_skip_layer) - - 
self.add_sublayer("in_layer_{}".format(i), in_layer) - self.add_sublayer("cond_layer_{}".format(i), cond_layer) - self.add_sublayer("res_skip_layer_{}".format(i), res_skip_layer) - - def forward(self, audio, mel): - # audio: [bs, 1, n_group, time/group] - # mel: [bs, mel_bands, n_group, time/n_group] - audio = self.start(audio) - - for i in range(self.n_layers): - dilation_h = self.dilation_h_list[i] - dilation_w = 2**i - - # Pad height dim (n_group): causal convolution - # Pad width dim (time): dialated non-causal convolution - pad_top, pad_bottom = (self.kernel_h - 1) * dilation_h, 0 - pad_left = pad_right = int((self.kernel_w - 1) * dilation_w / 2) - # Using pad2d is a bit faster than using padding in Conv2D directly - audio_pad = fluid.layers.pad2d( - audio, paddings=[pad_top, pad_bottom, pad_left, pad_right]) - hidden = self.in_layers[i](audio_pad) - cond_hidden = self.cond_layers[i](mel) - in_acts = hidden + cond_hidden - out_acts = fluid.layers.tanh(in_acts[:, :self.n_channels, :]) * \ - fluid.layers.sigmoid(in_acts[:, self.n_channels:, :]) - res_skip_acts = self.res_skip_layers[i](out_acts) - - if i < self.n_layers - 1: - audio += res_skip_acts[:, :self.n_channels, :, :] - skip_acts = res_skip_acts[:, self.n_channels:, :, :] - else: - skip_acts = res_skip_acts - - if i == 0: - output = skip_acts - else: - output += skip_acts - - return self.end(output) - - def infer(self, audio, mel, queues): - audio = self.start(audio) - - for i in range(self.n_layers): - dilation_h = self.dilation_h_list[i] - dilation_w = 2**i - - state_size = dilation_h * (self.kernel_h - 1) - queue = queues[i] - - if len(queue) == 0: - for j in range(state_size): - queue.append(fluid.layers.zeros_like(audio)) - - state = queue[0:state_size] - state = fluid.layers.concat(state + [audio], axis=2) - - queue.pop(0) - queue.append(audio) - - # Pad height dim (n_group): causal convolution - # Pad width dim (time): dialated non-causal convolution - pad_top, pad_bottom = 0, 0 - pad_left = 
int((self.kernel_w - 1) * dilation_w / 2) - pad_right = int((self.kernel_w - 1) * dilation_w / 2) - state = fluid.layers.pad2d( - state, paddings=[pad_top, pad_bottom, pad_left, pad_right]) - hidden = self.in_layers[i](state) - cond_hidden = self.cond_layers[i](mel) - in_acts = hidden + cond_hidden - out_acts = fluid.layers.tanh(in_acts[:, :self.n_channels, :]) * \ - fluid.layers.sigmoid(in_acts[:, self.n_channels:, :]) - res_skip_acts = self.res_skip_layers[i](out_acts) - - if i < self.n_layers - 1: - audio += res_skip_acts[:, :self.n_channels, :, :] - skip_acts = res_skip_acts[:, self.n_channels:, :, :] - else: - skip_acts = res_skip_acts - - if i == 0: - output = skip_acts - else: - output += skip_acts - - return self.end(output) - - -class WaveFlowModule(dg.Layer): - """WaveFlow model implementation. - - Args: - config (obj): model configuration parameters. - - Returns: - WaveFlowModule - """ - - def __init__(self, config): - super(WaveFlowModule, self).__init__() - self.n_flows = config.n_flows - self.n_group = config.n_group - self.n_layers = config.n_layers - self.upsample_factors = config.upsample_factors if hasattr( - config, "upsample_factors") else [16, 16] - assert self.n_group % 2 == 0 - assert self.n_flows % 2 == 0 - - self.dtype = "float16" if config.use_fp16 else "float32" - self.conditioner = Conditioner(self.dtype, self.upsample_factors) - self.flows = [] - for i in range(self.n_flows): - flow = Flow(config) - self.flows.append(flow) - self.add_sublayer("flow_{}".format(i), flow) - - self.perms = [] - half = self.n_group // 2 - for i in range(self.n_flows): - perm = list(range(self.n_group)) - if i < self.n_flows // 2: - perm = perm[::-1] - else: - perm[:half] = reversed(perm[:half]) - perm[half:] = reversed(perm[half:]) - self.perms.append(perm) - - def forward(self, audio, mel): - """Training forward pass. - - Use a conditioner to upsample mel spectrograms into hidden states. 
- These hidden states along with the audio are passed to a stack of Flow - modules to obtain the final latent variable z and a list of log scaling - variables, which are then passed to the WaveFlowLoss module to calculate - the negative log likelihood. - - Args: - audio (obj): audio samples. - mel (obj): mel spectrograms. - - Returns: - z (obj): latent variable. - log_s_list(list): list of log scaling variables. - """ - mel = self.conditioner(mel) - assert mel.shape[2] >= audio.shape[1] - # Prune out the tail of audio/mel so that time/n_group == 0. - pruned_len = int(audio.shape[1] // self.n_group * self.n_group) - - if audio.shape[1] > pruned_len: - audio = audio[:, :pruned_len] - if mel.shape[2] > pruned_len: - mel = mel[:, :, :pruned_len] - - # From [bs, mel_bands, time] to [bs, mel_bands, n_group, time/n_group] - mel = fluid.layers.transpose(unfold(mel, self.n_group), [0, 1, 3, 2]) - # From [bs, time] to [bs, n_group, time/n_group] - audio = fluid.layers.transpose(unfold(audio, self.n_group), [0, 2, 1]) - # [bs, 1, n_group, time/n_group] - audio = fluid.layers.unsqueeze(audio, 1) - log_s_list = [] - for i in range(self.n_flows): - inputs = audio[:, :, :-1, :] - conds = mel[:, :, 1:, :] - outputs = self.flows[i](inputs, conds) - log_s = outputs[:, :1, :, :] - b = outputs[:, 1:, :, :] - log_s_list.append(log_s) - - audio_0 = audio[:, :, :1, :] - audio_out = audio[:, :, 1:, :] * fluid.layers.exp(log_s) + b - audio = fluid.layers.concat([audio_0, audio_out], axis=2) - - # Permute over the height dim. - audio_slices = [audio[:, :, j, :] for j in self.perms[i]] - audio = fluid.layers.stack(audio_slices, axis=2) - mel_slices = [mel[:, :, j, :] for j in self.perms[i]] - mel = fluid.layers.stack(mel_slices, axis=2) - - z = fluid.layers.squeeze(audio, [1]) - return z, log_s_list - - def synthesize(self, mel, sigma=1.0): - """Use model to synthesize waveform. - - Use a conditioner to upsample mel spectrograms into hidden states. 
- These hidden states along with initial random gaussian latent variable - are passed to a stack of Flow modules to obtain the audio output. - - Note that we use convolutional queue (https://arxiv.org/abs/1611.09482) - to cache the intermediate hidden states, which will speed up the - autoregressive inference over the height dimension. Current - implementation only supports height dimension (self.n_group) equals - 8 or 16, i.e., where there is no dilation on the height dimension. - - Args: - mel (obj): mel spectrograms. - sigma (float, optional): standard deviation of the guassian latent - variable. Defaults to 1.0. - - Returns: - audio (obj): synthesized audio. - """ - if self.dtype == "float16": - mel = fluid.layers.cast(mel, self.dtype) - mel = self.conditioner.infer(mel) - # Prune out the tail of mel so that time/n_group == 0. - pruned_len = int(mel.shape[2] // self.n_group * self.n_group) - if mel.shape[2] > pruned_len: - mel = mel[:, :, :pruned_len] - # From [bs, mel_bands, time] to [bs, mel_bands, n_group, time/n_group] - mel = fluid.layers.transpose(unfold(mel, self.n_group), [0, 1, 3, 2]) - - audio = fluid.layers.gaussian_random( - shape=[mel.shape[0], 1, mel.shape[2], mel.shape[3]], std=sigma) - if self.dtype == "float16": - audio = fluid.layers.cast(audio, self.dtype) - for i in reversed(range(self.n_flows)): - # Permute over the height dimension. 
- audio_slices = [audio[:, :, j, :] for j in self.perms[i]] - audio = fluid.layers.stack(audio_slices, axis=2) - mel_slices = [mel[:, :, j, :] for j in self.perms[i]] - mel = fluid.layers.stack(mel_slices, axis=2) - - audio_list = [] - audio_0 = audio[:, :, 0:1, :] - audio_list.append(audio_0) - audio_h = audio_0 - queues = [[] for _ in range(self.n_layers)] - - for h in range(1, self.n_group): - inputs = audio_h - conds = mel[:, :, h:(h + 1), :] - outputs = self.flows[i].infer(inputs, conds, queues) - - log_s = outputs[:, 0:1, :, :] - b = outputs[:, 1:, :, :] - audio_h = (audio[:, :, h:(h+1), :] - b) / \ - fluid.layers.exp(log_s) - audio_list.append(audio_h) - - audio = fluid.layers.concat(audio_list, axis=2) - - # audio: [bs, n_group, time/n_group] - audio = fluid.layers.squeeze(audio, [1]) - # audio: [bs, time] - audio = fluid.layers.reshape( - fluid.layers.transpose(audio, [0, 2, 1]), [audio.shape[0], -1]) - return audio