add a CLI for cloning examples

2020-11-19 18:08:11 +08:00 · 2020-11-19 18:08:11 +08:00 · a01200e437
parent c7e5aaa540
commit a01200e437
13 changed files with 36 additions and 1605 deletions
--- a/parakeet/main.py
+++ b/parakeet/main.py
@ -0,0 +1,36 @@
+"""CLI for listing and cloning the examples bundled with the package."""
+import parakeet
+
+if __name__ == '__main__':
+    import argparse
+    import os
+    import shutil
+    import sys
+    from pathlib import Path
+
+    # Examples are looked up in the "examples" directory next to this file.
+    examples_path = Path(__file__).parent / "examples"
+
+    parser = argparse.ArgumentParser()
+    subparser = parser.add_subparsers(dest="cmd")
+    subparser.add_parser("list-examples")
+    clone = subparser.add_parser("clone-example")
+    clone.add_argument("experiment_name", type=str, help="experiment name")
+    args = parser.parse_args()
+
+    if args.cmd == "list-examples":
+        print(sorted(os.listdir(str(examples_path))))  # sorted: stable output
+        sys.exit(0)
+    if args.cmd == "clone-example":
+        # Copy the chosen example into the current working directory.
+        source = examples_path / args.experiment_name
+        target = Path(os.getcwd()) / args.experiment_name
+        if not os.path.exists(str(source)):
+            raise ValueError("{} does not exist".format(str(source)))
+        if os.path.exists(str(target)):
+            raise FileExistsError("{} already exists".format(str(target)))
+        shutil.copytree(str(source), str(target))
+        print("{} copied!".format(args.experiment_name))
+        sys.exit(0)
+    parser.print_help()  # no subcommand: show usage instead of exiting silently
+    sys.exit(1)
--- a/parakeet/models/transformer_tts_deprecated/init.py
+++ b/parakeet/models/transformer_tts_deprecated/init.py
@ -1,15 +0,0 @@
-# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-from .transformer_tts import TransformerTTS
-from .vocoder import Vocoder
--- a/parakeet/models/transformer_tts_deprecated/cbhg.py
+++ b/parakeet/models/transformer_tts_deprecated/cbhg.py
@ -1,287 +0,0 @@
-# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import math
-from parakeet.g2p.text.symbols import symbols
-import paddle.fluid.dygraph as dg
-import paddle.fluid as fluid
-import paddle.fluid.layers as layers
-from parakeet.modules.customized import Pool1D, Conv1D
-from parakeet.modules.dynamic_gru import DynamicGRU
-import numpy as np
-
-
-class CBHG(dg.Layer):
-    def __init__(self,
-                 hidden_size,
-                 batch_size,
-                 K=16,
-                 projection_size=256,
-                 num_gru_layers=2,
-                 max_pool_kernel_size=2,
-                 is_post=False):
-        """CBHG Module
-
-        Args:
-            hidden_size (int): dimension of hidden unit.
-            batch_size (int): batch size of input.
-            K (int, optional): number of convolution banks. Defaults to 16.
-            projection_size (int, optional): dimension of projection unit. Defaults to 256.
-            num_gru_layers (int, optional): number of layers of GRUcell. Defaults to 2.
-            max_pool_kernel_size (int, optional): max pooling kernel size. Defaults to 2
-            is_post (bool, optional): whether post processing or not. Defaults to False.
-        """
-        super(CBHG, self).__init__()
-
-        self.hidden_size = hidden_size
-        self.projection_size = projection_size
-        self.conv_list = []
-        k = math.sqrt(1.0 / projection_size)
-        self.conv_list.append(
-            Conv1D(
-                num_channels=projection_size,
-                num_filters=hidden_size,
-                filter_size=1,
-                padding=int(np.floor(1 / 2)),
-                param_attr=fluid.ParamAttr(
-                    initializer=fluid.initializer.XavierInitializer()),
-                bias_attr=fluid.ParamAttr(
-                    initializer=fluid.initializer.Uniform(
-                        low=-k, high=k))))
-        k = math.sqrt(1.0 / hidden_size)
-        for i in range(2, K + 1):
-            self.conv_list.append(
-                Conv1D(
-                    num_channels=hidden_size,
-                    num_filters=hidden_size,
-                    filter_size=i,
-                    padding=int(np.floor(i / 2)),
-                    param_attr=fluid.ParamAttr(
-                        initializer=fluid.initializer.XavierInitializer()),
-                    bias_attr=fluid.ParamAttr(
-                        initializer=fluid.initializer.Uniform(
-                            low=-k, high=k))))
-
-        for i, layer in enumerate(self.conv_list):
-            self.add_sublayer("conv_list_{}".format(i), layer)
-
-        self.batchnorm_list = []
-        for i in range(K):
-            self.batchnorm_list.append(
-                dg.BatchNorm(
-                    hidden_size, data_layout='NCHW'))
-
-        for i, layer in enumerate(self.batchnorm_list):
-            self.add_sublayer("batchnorm_list_{}".format(i), layer)
-
-        conv_outdim = hidden_size * K
-
-        k = math.sqrt(1.0 / conv_outdim)
-        self.conv_projection_1 = Conv1D(
-            num_channels=conv_outdim,
-            num_filters=hidden_size,
-            filter_size=3,
-            padding=int(np.floor(3 / 2)),
-            param_attr=fluid.ParamAttr(
-                initializer=fluid.initializer.XavierInitializer()),
-            bias_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform(
-                low=-k, high=k)))
-
-        k = math.sqrt(1.0 / hidden_size)
-        self.conv_projection_2 = Conv1D(
-            num_channels=hidden_size,
-            num_filters=projection_size,
-            filter_size=3,
-            padding=int(np.floor(3 / 2)),
-            param_attr=fluid.ParamAttr(
-                initializer=fluid.initializer.XavierInitializer()),
-            bias_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform(
-                low=-k, high=k)))
-
-        self.batchnorm_proj_1 = dg.BatchNorm(hidden_size, data_layout='NCHW')
-        self.batchnorm_proj_2 = dg.BatchNorm(
-            projection_size, data_layout='NCHW')
-        self.max_pool = Pool1D(
-            pool_size=max_pool_kernel_size,
-            pool_type='max',
-            pool_stride=1,
-            pool_padding=1,
-            data_format="NCT")
-        self.highway = Highwaynet(self.projection_size)
-
-        h_0 = np.zeros((batch_size, hidden_size // 2), dtype="float32")
-        h_0 = dg.to_variable(h_0)
-        k = math.sqrt(1.0 / hidden_size)
-        self.fc_forward1 = dg.Linear(
-            hidden_size,
-            hidden_size // 2 * 3,
-            param_attr=fluid.ParamAttr(
-                initializer=fluid.initializer.XavierInitializer()),
-            bias_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform(
-                low=-k, high=k)))
-        self.fc_reverse1 = dg.Linear(
-            hidden_size,
-            hidden_size // 2 * 3,
-            param_attr=fluid.ParamAttr(
-                initializer=fluid.initializer.XavierInitializer()),
-            bias_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform(
-                low=-k, high=k)))
-        self.gru_forward1 = DynamicGRU(
-            size=self.hidden_size // 2,
-            is_reverse=False,
-            origin_mode=True,
-            h_0=h_0)
-        self.gru_reverse1 = DynamicGRU(
-            size=self.hidden_size // 2,
-            is_reverse=True,
-            origin_mode=True,
-            h_0=h_0)
-
-        self.fc_forward2 = dg.Linear(
-            hidden_size,
-            hidden_size // 2 * 3,
-            param_attr=fluid.ParamAttr(
-                initializer=fluid.initializer.XavierInitializer()),
-            bias_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform(
-                low=-k, high=k)))
-        self.fc_reverse2 = dg.Linear(
-            hidden_size,
-            hidden_size // 2 * 3,
-            param_attr=fluid.ParamAttr(
-                initializer=fluid.initializer.XavierInitializer()),
-            bias_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform(
-                low=-k, high=k)))
-        self.gru_forward2 = DynamicGRU(
-            size=self.hidden_size // 2,
-            is_reverse=False,
-            origin_mode=True,
-            h_0=h_0)
-        self.gru_reverse2 = DynamicGRU(
-            size=self.hidden_size // 2,
-            is_reverse=True,
-            origin_mode=True,
-            h_0=h_0)
-
-    def _conv_fit_dim(self, x, filter_size=3):
-        if filter_size % 2 == 0:
-            return x[:, :, :-1]
-        else:
-            return x
-
-    def forward(self, input_):
-        """
-        Convert linear spectrum to Mel spectrum.
-
-        Args:
-            input_ (Variable): shape(B, C, T), dtype float32, the sequentially input.  
-
-        Returns:
-            out (Variable): shape(B, C, T), the CBHG output.
-        """
-
-        conv_list = []
-        conv_input = input_
-
-        for i, (conv, batchnorm
-                ) in enumerate(zip(self.conv_list, self.batchnorm_list)):
-            conv_input = self._conv_fit_dim(conv(conv_input), i + 1)
-            conv_input = layers.relu(batchnorm(conv_input))
-            conv_list.append(conv_input)
-
-        conv_cat = layers.concat(conv_list, axis=1)
-        conv_pool = self.max_pool(conv_cat)[:, :, :-1]
-
-        conv_proj = layers.relu(
-            self.batchnorm_proj_1(
-                self._conv_fit_dim(self.conv_projection_1(conv_pool))))
-        conv_proj = self.batchnorm_proj_2(
-            self._conv_fit_dim(self.conv_projection_2(conv_proj))) + input_
-
-        # conv_proj.shape = [N, C, T]
-        highway = layers.transpose(conv_proj, [0, 2, 1])
-        highway = self.highway(highway)
-
-        # highway.shape = [N, T, C]
-        fc_forward = self.fc_forward1(highway)
-        fc_reverse = self.fc_reverse1(highway)
-        out_forward = self.gru_forward1(fc_forward)
-        out_reverse = self.gru_reverse1(fc_reverse)
-        out = layers.concat([out_forward, out_reverse], axis=-1)
-        fc_forward = self.fc_forward2(out)
-        fc_reverse = self.fc_reverse2(out)
-        out_forward = self.gru_forward2(fc_forward)
-        out_reverse = self.gru_reverse2(fc_reverse)
-        out = layers.concat([out_forward, out_reverse], axis=-1)
-        out = layers.transpose(out, [0, 2, 1])
-        return out
-
-
-class Highwaynet(dg.Layer):
-    def __init__(self, num_units, num_layers=4):
-        """Highway network
-
-        Args:
-            num_units (int): dimension of hidden unit.
-            num_layers (int, optional): number of highway layers. Defaults to 4.
-        """
-        super(Highwaynet, self).__init__()
-        self.num_units = num_units
-        self.num_layers = num_layers
-
-        self.gates = []
-        self.linears = []
-        k = math.sqrt(1.0 / num_units)
-        for i in range(num_layers):
-            self.linears.append(
-                dg.Linear(
-                    num_units,
-                    num_units,
-                    param_attr=fluid.ParamAttr(
-                        initializer=fluid.initializer.XavierInitializer()),
-                    bias_attr=fluid.ParamAttr(
-                        initializer=fluid.initializer.Uniform(
-                            low=-k, high=k))))
-            self.gates.append(
-                dg.Linear(
-                    num_units,
-                    num_units,
-                    param_attr=fluid.ParamAttr(
-                        initializer=fluid.initializer.XavierInitializer()),
-                    bias_attr=fluid.ParamAttr(
-                        initializer=fluid.initializer.Uniform(
-                            low=-k, high=k))))
-
-        for i, (linear, gate) in enumerate(zip(self.linears, self.gates)):
-            self.add_sublayer("linears_{}".format(i), linear)
-            self.add_sublayer("gates_{}".format(i), gate)
-
-    def forward(self, input_):
-        """
-        Compute result of Highway network.
-
-        Args:
-            input_(Variable): shape(B, T, C), dtype float32, the sequentially input.
-            
-        Returns:
-            out(Variable): the Highway output.
-        """
-        out = input_
-
-        for linear, gate in zip(self.linears, self.gates):
-            h = fluid.layers.relu(linear(out))
-            t_ = fluid.layers.sigmoid(gate(out))
-
-            c = 1 - t_
-            out = h * t_ + out * c
-
-        return out
--- a/parakeet/models/transformer_tts_deprecated/decoder.py
+++ b/parakeet/models/transformer_tts_deprecated/decoder.py
@ -1,193 +0,0 @@
-# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import math
-import paddle.fluid.dygraph as dg
-import paddle.fluid as fluid
-from parakeet.models.transformer_tts.utils import *
-from parakeet.modules.multihead_attention import MultiheadAttention
-from parakeet.modules.ffn import PositionwiseFeedForward
-from parakeet.models.transformer_tts.prenet import PreNet
-from parakeet.models.transformer_tts.post_convnet import PostConvNet
-
-
-class Decoder(dg.Layer):
-    def __init__(self,
-                 num_hidden,
-                 num_mels=80,
-                 outputs_per_step=1,
-                 num_head=4,
-                 n_layers=3):
-        """Decoder layer of TransformerTTS.
-
-        Args:
-            num_hidden (int): the number of source vocabulary.
-            n_mels (int, optional): the number of mel bands when calculating mel spectrograms. Defaults to 80.
-            outputs_per_step (int, optional): the num of output frames per step . Defaults to 1.
-            num_head (int, optional): the head number of multihead attention. Defaults to 4.
-            n_layers (int, optional): the layers number of multihead attention. Defaults to 3.  
-        """
-        super(Decoder, self).__init__()
-        self.num_hidden = num_hidden
-        self.num_head = num_head
-        param = fluid.ParamAttr()
-        self.alpha = self.create_parameter(
-            shape=(1, ),
-            attr=param,
-            dtype='float32',
-            default_initializer=fluid.initializer.ConstantInitializer(
-                value=1.0))
-        self.pos_inp = get_sinusoid_encoding_table(
-            1024, self.num_hidden, padding_idx=0)
-        self.pos_emb = dg.Embedding(
-            size=[1024, num_hidden],
-            padding_idx=0,
-            param_attr=fluid.ParamAttr(
-                initializer=fluid.initializer.NumpyArrayInitializer(
-                    self.pos_inp),
-                trainable=False))
-        self.decoder_prenet = PreNet(
-            input_size=num_mels,
-            hidden_size=num_hidden * 2,
-            output_size=num_hidden,
-            dropout_rate=0.2)
-        k = math.sqrt(1.0 / num_hidden)
-        self.linear = dg.Linear(
-            num_hidden,
-            num_hidden,
-            param_attr=fluid.ParamAttr(
-                initializer=fluid.initializer.XavierInitializer()),
-            bias_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform(
-                low=-k, high=k)))
-
-        self.selfattn_layers = [
-            MultiheadAttention(num_hidden, num_hidden // num_head,
-                               num_hidden // num_head) for _ in range(n_layers)
-        ]
-        for i, layer in enumerate(self.selfattn_layers):
-            self.add_sublayer("self_attn_{}".format(i), layer)
-        self.attn_layers = [
-            MultiheadAttention(num_hidden, num_hidden // num_head,
-                               num_hidden // num_head) for _ in range(n_layers)
-        ]
-        for i, layer in enumerate(self.attn_layers):
-            self.add_sublayer("attn_{}".format(i), layer)
-        self.ffns = [
-            PositionwiseFeedForward(
-                num_hidden, num_hidden * num_head, filter_size=1)
-            for _ in range(n_layers)
-        ]
-        for i, layer in enumerate(self.ffns):
-            self.add_sublayer("ffns_{}".format(i), layer)
-        self.mel_linear = dg.Linear(
-            num_hidden,
-            num_mels * outputs_per_step,
-            param_attr=fluid.ParamAttr(
-                initializer=fluid.initializer.XavierInitializer()),
-            bias_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform(
-                low=-k, high=k)))
-        self.stop_linear = dg.Linear(
-            num_hidden,
-            1,
-            param_attr=fluid.ParamAttr(
-                initializer=fluid.initializer.XavierInitializer()),
-            bias_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform(
-                low=-k, high=k)))
-
-        self.postconvnet = PostConvNet(
-            num_mels,
-            num_hidden,
-            filter_size=5,
-            padding=4,
-            num_conv=5,
-            outputs_per_step=outputs_per_step,
-            use_cudnn=True)
-
-    def forward(self, key, value, query, positional, c_mask):
-        """
-        Compute decoder outputs.
-        
-        Args:
-            key (Variable): shape(B, T_text, C), dtype float32, the input key of decoder,
-                where T_text means the timesteps of input text,
-            value (Variable): shape(B, T_text, C), dtype float32, the input value of decoder.
-            query (Variable): shape(B, T_mel, C), dtype float32, the input query of decoder,
-                where T_mel means the timesteps of input spectrum,
-            positional (Variable): shape(B, T_mel), dtype int64, the spectrum position. 
-            c_mask (Variable): shape(B, T_text, 1), dtype float32, query mask returned from encoder.
-        Returns:
-            mel_out (Variable): shape(B, T_mel, C), the decoder output after mel linear projection.
-            out (Variable): shape(B, T_mel, C), the decoder output after post mel network.
-            stop_tokens (Variable): shape(B, T_mel, 1), the stop tokens of output.
-            attn_list (list[Variable]): len(n_layers), the encoder-decoder attention list.
-            selfattn_list (list[Variable]): len(n_layers), the decoder self attention list.
-        """
-
-        # get decoder mask with triangular matrix
-
-        if fluid.framework._dygraph_tracer()._train_mode:
-            mask = get_dec_attn_key_pad_mask(positional, self.num_head,
-                                             query.dtype)
-            m_mask = get_non_pad_mask(positional, self.num_head, query.dtype)
-            zero_mask = layers.cast(c_mask == 0, dtype=query.dtype) * -1e30
-            zero_mask = layers.transpose(zero_mask, perm=[0, 2, 1])
-
-        else:
-            len_q = query.shape[1]
-            mask = layers.triu(
-                layers.ones(
-                    shape=[len_q, len_q], dtype=query.dtype),
-                diagonal=1)
-            mask = layers.cast(mask != 0, dtype=query.dtype) * -1e30
-            m_mask, zero_mask = None, None
-
-        # Decoder pre-network
-        query = self.decoder_prenet(query)
-
-        # Centered position
-        query = self.linear(query)
-
-        # Get position embedding
-        positional = self.pos_emb(positional)
-        query = positional * self.alpha + query
-
-        #positional dropout
-        query = fluid.layers.dropout(
-            query, 0.1, dropout_implementation='upscale_in_train')
-
-        # Attention decoder-decoder, encoder-decoder
-        selfattn_list = list()
-        attn_list = list()
-
-        for selfattn, attn, ffn in zip(self.selfattn_layers, self.attn_layers,
-                                       self.ffns):
-            query, attn_dec = selfattn(
-                query, query, query, mask=mask, query_mask=m_mask)
-            query, attn_dot = attn(
-                key, value, query, mask=zero_mask, query_mask=m_mask)
-            query = ffn(query)
-            selfattn_list.append(attn_dec)
-            attn_list.append(attn_dot)
-
-        # Mel linear projection
-        mel_out = self.mel_linear(query)
-        # Post Mel Network
-        out = self.postconvnet(mel_out)
-        out = mel_out + out
-
-        # Stop tokens
-        stop_tokens = self.stop_linear(query)
-        stop_tokens = layers.squeeze(stop_tokens, [-1])
-        stop_tokens = layers.sigmoid(stop_tokens)
-
-        return mel_out, out, attn_list, stop_tokens, selfattn_list
--- a/parakeet/models/transformer_tts_deprecated/encoder.py
+++ b/parakeet/models/transformer_tts_deprecated/encoder.py
@ -1,106 +0,0 @@
-# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import paddle.fluid.dygraph as dg
-import paddle.fluid as fluid
-from parakeet.models.transformer_tts.utils import *
-from parakeet.modules.multihead_attention import MultiheadAttention
-from parakeet.modules.ffn import PositionwiseFeedForward
-from parakeet.models.transformer_tts.encoderprenet import EncoderPrenet
-
-
-class Encoder(dg.Layer):
-    def __init__(self, embedding_size, num_hidden, num_head=4, n_layers=3):
-        """Encoder layer of TransformerTTS.
-
-        Args:
-            embedding_size (int): the size of position embedding.
-            num_hidden (int): the size of hidden layer in network.
-            num_head (int, optional): the head number of multihead attention. Defaults to 4.
-            n_layers (int, optional): the layers number of multihead attention. Defaults to 3.
-        """
-        super(Encoder, self).__init__()
-        self.num_hidden = num_hidden
-        self.num_head = num_head
-        param = fluid.ParamAttr(initializer=fluid.initializer.Constant(
-            value=1.0))
-        self.alpha = self.create_parameter(
-            shape=(1, ), attr=param, dtype='float32')
-        self.pos_inp = get_sinusoid_encoding_table(
-            1024, self.num_hidden, padding_idx=0)
-        self.pos_emb = dg.Embedding(
-            size=[1024, num_hidden],
-            param_attr=fluid.ParamAttr(
-                initializer=fluid.initializer.NumpyArrayInitializer(
-                    self.pos_inp),
-                trainable=False))
-        self.encoder_prenet = EncoderPrenet(
-            embedding_size=embedding_size,
-            num_hidden=num_hidden,
-            use_cudnn=True)
-        self.layers = [
-            MultiheadAttention(num_hidden, num_hidden // num_head,
-                               num_hidden // num_head) for _ in range(n_layers)
-        ]
-        for i, layer in enumerate(self.layers):
-            self.add_sublayer("self_attn_{}".format(i), layer)
-        self.ffns = [
-            PositionwiseFeedForward(
-                num_hidden,
-                num_hidden * num_head,
-                filter_size=1,
-                use_cudnn=True) for _ in range(n_layers)
-        ]
-        for i, layer in enumerate(self.ffns):
-            self.add_sublayer("ffns_{}".format(i), layer)
-
-    def forward(self, x, positional):
-        """
-        Encode text sequence.
-        
-        Args:
-            x (Variable): shape(B, T_text), dtype float32, the input character,
-                where T_text means the timesteps of input text,
-            positional (Variable): shape(B, T_text), dtype int64, the characters position. 
-                
-        Returns:
-            x (Variable): shape(B, T_text, C), the encoder output.
-            attentions (list[Variable]): len(n_layers), the encoder self attention list.
-        """
-
-        # Encoder pre_network
-        x = self.encoder_prenet(x)
-
-        if fluid.framework._dygraph_tracer()._train_mode:
-            mask = get_attn_key_pad_mask(positional, self.num_head, x.dtype)
-            query_mask = get_non_pad_mask(positional, self.num_head, x.dtype)
-
-        else:
-            query_mask, mask = None, None
-
-        # Get positional encoding
-        positional = self.pos_emb(positional)
-
-        x = positional * self.alpha + x
-
-        # Positional dropout
-        x = layers.dropout(x, 0.1, dropout_implementation='upscale_in_train')
-
-        # Self attention encoder
-        attentions = list()
-        for layer, ffn in zip(self.layers, self.ffns):
-            x, attention = layer(x, x, x, mask=mask, query_mask=query_mask)
-            x = ffn(x)
-            attentions.append(attention)
-
-        return x, attentions, query_mask
--- a/parakeet/models/transformer_tts_deprecated/encoderprenet.py
+++ b/parakeet/models/transformer_tts_deprecated/encoderprenet.py
@ -1,111 +0,0 @@
-# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import math
-from parakeet.g2p.text.symbols import symbols
-import paddle.fluid.dygraph as dg
-import paddle.fluid as fluid
-import paddle.fluid.layers as layers
-from parakeet.modules.customized import Conv1D
-import numpy as np
-
-
-class EncoderPrenet(dg.Layer):
-    def __init__(self, embedding_size, num_hidden, use_cudnn=True):
-        """ Encoder prenet layer of TransformerTTS.
-
-        Args:
-            embedding_size (int): the size of embedding.
-            num_hidden (int): the size of hidden layer in network.
-            use_cudnn (bool, optional): use cudnn or not. Defaults to True.
-        """
-        super(EncoderPrenet, self).__init__()
-        self.embedding_size = embedding_size
-        self.num_hidden = num_hidden
-        self.use_cudnn = use_cudnn
-        self.embedding = dg.Embedding(
-            size=[len(symbols), embedding_size],
-            padding_idx=0,
-            param_attr=fluid.initializer.Normal(
-                loc=0.0, scale=1.0))
-        self.conv_list = []
-        k = math.sqrt(1.0 / embedding_size)
-        self.conv_list.append(
-            Conv1D(
-                num_channels=embedding_size,
-                num_filters=num_hidden,
-                filter_size=5,
-                padding=int(np.floor(5 / 2)),
-                param_attr=fluid.ParamAttr(
-                    initializer=fluid.initializer.XavierInitializer()),
-                bias_attr=fluid.ParamAttr(
-                    initializer=fluid.initializer.Uniform(
-                        low=-k, high=k)),
-                use_cudnn=use_cudnn))
-        k = math.sqrt(1.0 / num_hidden)
-        for _ in range(2):
-            self.conv_list.append(
-                Conv1D(
-                    num_channels=num_hidden,
-                    num_filters=num_hidden,
-                    filter_size=5,
-                    padding=int(np.floor(5 / 2)),
-                    param_attr=fluid.ParamAttr(
-                        initializer=fluid.initializer.XavierInitializer()),
-                    bias_attr=fluid.ParamAttr(
-                        initializer=fluid.initializer.Uniform(
-                            low=-k, high=k)),
-                    use_cudnn=use_cudnn))
-
-        for i, layer in enumerate(self.conv_list):
-            self.add_sublayer("conv_list_{}".format(i), layer)
-
-        self.batch_norm_list = [
-            dg.BatchNorm(
-                num_hidden, data_layout='NCHW') for _ in range(3)
-        ]
-
-        for i, layer in enumerate(self.batch_norm_list):
-            self.add_sublayer("batch_norm_list_{}".format(i), layer)
-
-        k = math.sqrt(1.0 / num_hidden)
-        self.projection = dg.Linear(
-            num_hidden,
-            num_hidden,
-            param_attr=fluid.ParamAttr(
-                initializer=fluid.initializer.XavierInitializer()),
-            bias_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform(
-                low=-k, high=k)))
-
-    def forward(self, x):
-        """
-        Prepare encoder input.
-        
-        Args:
-            x (Variable): shape(B, T_text), dtype float32, the input character, where T_text means the timesteps of input text.
-                
-        Returns:
-            (Variable): shape(B, T_text, C), the encoder prenet output.
-        """
-
-        x = self.embedding(x)
-        x = layers.transpose(x, [0, 2, 1])
-        for batch_norm, conv in zip(self.batch_norm_list, self.conv_list):
-            x = layers.dropout(
-                layers.relu(batch_norm(conv(x))),
-                0.2,
-                dropout_implementation='upscale_in_train')
-        x = layers.transpose(x, [0, 2, 1])  #(N,T,C)
-        x = self.projection(x)
-
-        return x
--- a/parakeet/models/transformer_tts_deprecated/post_convnet.py
+++ b/parakeet/models/transformer_tts_deprecated/post_convnet.py
@ -1,137 +0,0 @@
-# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import math
-import paddle.fluid.dygraph as dg
-import paddle.fluid as fluid
-import paddle.fluid.layers as layers
-from parakeet.modules.customized import Conv1D
-
-
-class PostConvNet(dg.Layer):
-    def __init__(self,
-                 n_mels=80,
-                 num_hidden=512,
-                 filter_size=5,
-                 padding=0,
-                 num_conv=5,
-                 outputs_per_step=1,
-                 use_cudnn=True,
-                 dropout=0.1,
-                 batchnorm_last=False):
-        """Decocder post conv net of TransformerTTS.
-
-        Args:
-            n_mels (int, optional): the number of mel bands when calculating mel spectrograms. Defaults to 80.
-            num_hidden (int, optional): the size of hidden layer in network. Defaults to 512.
-            filter_size (int, optional): the filter size of Conv.  Defaults to 5.
-            padding (int, optional): the padding size of Conv. Defaults to 0.
-            num_conv (int, optional): the num of Conv layers in network. Defaults to 5.
-            outputs_per_step (int, optional): the num of output frames per step . Defaults to 1.
-            use_cudnn (bool, optional): use cudnn in Conv or not. Defaults to True.
-            dropout (float, optional): dropout probability. Defaults to 0.1.
-            batchnorm_last (bool, optional): if batchnorm at last layer or not. Defaults to False.
-        """
-        super(PostConvNet, self).__init__()
-
-        self.dropout = dropout
-        self.num_conv = num_conv
-        self.batchnorm_last = batchnorm_last
-        self.conv_list = []
-        k = math.sqrt(1.0 / (n_mels * outputs_per_step))
-        self.conv_list.append(
-            Conv1D(
-                num_channels=n_mels * outputs_per_step,
-                num_filters=num_hidden,
-                filter_size=filter_size,
-                padding=padding,
-                param_attr=fluid.ParamAttr(
-                    initializer=fluid.initializer.XavierInitializer()),
-                bias_attr=fluid.ParamAttr(
-                    initializer=fluid.initializer.Uniform(
-                        low=-k, high=k)),
-                use_cudnn=use_cudnn))
-
-        k = math.sqrt(1.0 / num_hidden)
-        for _ in range(1, num_conv - 1):
-            self.conv_list.append(
-                Conv1D(
-                    num_channels=num_hidden,
-                    num_filters=num_hidden,
-                    filter_size=filter_size,
-                    padding=padding,
-                    param_attr=fluid.ParamAttr(
-                        initializer=fluid.initializer.XavierInitializer()),
-                    bias_attr=fluid.ParamAttr(
-                        initializer=fluid.initializer.Uniform(
-                            low=-k, high=k)),
-                    use_cudnn=use_cudnn))
-
-        self.conv_list.append(
-            Conv1D(
-                num_channels=num_hidden,
-                num_filters=n_mels * outputs_per_step,
-                filter_size=filter_size,
-                padding=padding,
-                param_attr=fluid.ParamAttr(
-                    initializer=fluid.initializer.XavierInitializer()),
-                bias_attr=fluid.ParamAttr(
-                    initializer=fluid.initializer.Uniform(
-                        low=-k, high=k)),
-                use_cudnn=use_cudnn))
-
-        for i, layer in enumerate(self.conv_list):
-            self.add_sublayer("conv_list_{}".format(i), layer)
-
-        self.batch_norm_list = [
-            dg.BatchNorm(
-                num_hidden, data_layout='NCHW') for _ in range(num_conv - 1)
-        ]
-        if self.batchnorm_last:
-            self.batch_norm_list.append(
-                dg.BatchNorm(
-                    n_mels * outputs_per_step, data_layout='NCHW'))
-        for i, layer in enumerate(self.batch_norm_list):
-            self.add_sublayer("batch_norm_list_{}".format(i), layer)
-
-    def forward(self, input):
-        """
-        Compute the mel spectrum.
-        
-        Args:
-            input (Variable): shape(B, T, C), dtype float32, the result of mel linear projection. 
-               
-        Returns:
-           output (Variable): shape(B, T, C), the result after postconvnet.
-        """
-
-        input = layers.transpose(input, [0, 2, 1])
-        len = input.shape[-1]
-        for i in range(self.num_conv - 1):
-            batch_norm = self.batch_norm_list[i]
-            conv = self.conv_list[i]
-
-            input = layers.dropout(
-                layers.tanh(batch_norm(conv(input)[:, :, :len])),
-                self.dropout,
-                dropout_implementation='upscale_in_train')
-        conv = self.conv_list[self.num_conv - 1]
-        input = conv(input)[:, :, :len]
-        if self.batchnorm_last:
-            batch_norm = self.batch_norm_list[self.num_conv - 1]
-            input = layers.dropout(
-                batch_norm(input),
-                self.dropout,
-                dropout_implementation='upscale_in_train')
-        output = layers.transpose(input, [0, 2, 1])
-        return output
--- a/parakeet/models/transformer_tts_deprecated/prenet.py
+++ b/parakeet/models/transformer_tts_deprecated/prenet.py
@ -1,71 +0,0 @@
-# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import math
-import paddle.fluid.dygraph as dg
-import paddle.fluid as fluid
-import paddle.fluid.layers as layers
-
-
-class PreNet(dg.Layer):
-    def __init__(self, input_size, hidden_size, output_size, dropout_rate=0.2):
-        """Prenet before passing through the network.
-
-        Args:
-            input_size (int): the input channel size.
-            hidden_size (int): the size of hidden layer in network.
-            output_size (int): the output channel size.
-            dropout_rate (float, optional): dropout probability. Defaults to 0.2.
-        """
-        super(PreNet, self).__init__()
-        self.input_size = input_size
-        self.hidden_size = hidden_size
-        self.output_size = output_size
-        self.dropout_rate = dropout_rate
-
-        k = math.sqrt(1.0 / input_size)
-        self.linear1 = dg.Linear(
-            input_size,
-            hidden_size,
-            param_attr=fluid.ParamAttr(
-                initializer=fluid.initializer.XavierInitializer()),
-            bias_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform(
-                low=-k, high=k)))
-        k = math.sqrt(1.0 / hidden_size)
-        self.linear2 = dg.Linear(
-            hidden_size,
-            output_size,
-            param_attr=fluid.ParamAttr(
-                initializer=fluid.initializer.XavierInitializer()),
-            bias_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform(
-                low=-k, high=k)))
-
-    def forward(self, x):
-        """
-        Prepare network input.
-        
-        Args:
-            x (Variable): shape(B, T, C), dtype float32, the input value.
-                
-        Returns:
-            output (Variable): shape(B, T, C), the result after pernet.
-        """
-        x = layers.dropout(
-            layers.relu(self.linear1(x)),
-            self.dropout_rate,
-            dropout_implementation='upscale_in_train')
-        output = layers.dropout(
-            layers.relu(self.linear2(x)),
-            self.dropout_rate,
-            dropout_implementation='upscale_in_train')
-        return output
--- a/parakeet/models/transformer_tts_deprecated/transformer_tts.py
+++ b/parakeet/models/transformer_tts_deprecated/transformer_tts.py
@ -1,71 +0,0 @@
-# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import paddle.fluid.dygraph as dg
-import paddle.fluid as fluid
-from parakeet.models.transformer_tts.encoder import Encoder
-from parakeet.models.transformer_tts.decoder import Decoder
-
-
-class TransformerTTS(dg.Layer):
-    def __init__(self,
-                 embedding_size,
-                 num_hidden,
-                 encoder_num_head=4,
-                 encoder_n_layers=3,
-                 n_mels=80,
-                 outputs_per_step=1,
-                 decoder_num_head=4,
-                 decoder_n_layers=3):
-        """TransformerTTS model.
-
-        Args:
-            embedding_size (int): the size of position embedding.
-            num_hidden (int): the size of hidden layer in network.
-            encoder_num_head (int, optional): the head number of multihead attention in encoder. Defaults to 4.
-            encoder_n_layers (int, optional): the layers number of multihead attention in encoder. Defaults to 3.
-            n_mels (int, optional): the number of mel bands when calculating mel spectrograms. Defaults to 80.
-            outputs_per_step (int, optional): the num of output frames per step . Defaults to 1.
-            decoder_num_head (int, optional): the head number of multihead attention in decoder. Defaults to 4.
-            decoder_n_layers (int, optional): the layers number of multihead attention in decoder. Defaults to 3.
-        """
-        super(TransformerTTS, self).__init__()
-        self.encoder = Encoder(embedding_size, num_hidden, encoder_num_head,
-                               encoder_n_layers)
-        self.decoder = Decoder(num_hidden, n_mels, outputs_per_step,
-                               decoder_num_head, decoder_n_layers)
-
-    def forward(self, characters, mel_input, pos_text, pos_mel):
-        """
-        TransformerTTS network.
-        
-        Args:
-            characters (Variable): shape(B, T_text), dtype float32, the input character,
-                where T_text means the timesteps of input text,
-            mel_input (Variable): shape(B, T_mel, C), dtype float32, the input query of decoder,
-                where T_mel means the timesteps of input spectrum,
-            pos_text (Variable): shape(B, T_text), dtype int64, the characters position. 
-                
-        Returns:
-            mel_output (Variable): shape(B, T_mel, C), the decoder output after mel linear projection.
-            postnet_output (Variable): shape(B, T_mel, C), the decoder output after post mel network.
-            stop_preds (Variable): shape(B, T_mel, 1), the stop tokens of output.
-            attn_probs (list[Variable]): len(n_layers), the encoder-decoder attention list.
-            attns_enc (list[Variable]): len(n_layers), the encoder self attention list.
-            attns_dec (list[Variable]): len(n_layers), the decoder self attention list.
-        """
-        key, attns_enc, query_mask = self.encoder(characters, pos_text)
-
-        mel_output, postnet_output, attn_probs, stop_preds, attns_dec = self.decoder(
-            key, key, mel_input, pos_mel, query_mask)
-        return mel_output, postnet_output, attn_probs, stop_preds, attns_enc, attns_dec
--- a/parakeet/models/transformer_tts_deprecated/utils.py
+++ b/parakeet/models/transformer_tts_deprecated/utils.py
@ -1,101 +0,0 @@
-# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import numpy as np
-import librosa
-import os, copy
-from scipy import signal
-import paddle.fluid.layers as layers
-
-
-def get_positional_table(d_pos_vec, n_position=1024):
-    position_enc = np.array(
-        [[pos / np.power(10000, 2 * i / d_pos_vec) for i in range(d_pos_vec)]
-         if pos != 0 else np.zeros(d_pos_vec) for pos in range(n_position)])
-
-    position_enc[1:, 0::2] = np.sin(position_enc[1:, 0::2])  # dim 2i
-    position_enc[1:, 1::2] = np.cos(position_enc[1:, 1::2])  # dim 2i+1
-    return position_enc
-
-
-def get_sinusoid_encoding_table(n_position, d_hid, padding_idx=None):
-    ''' Sinusoid position encoding table '''
-
-    def cal_angle(position, hid_idx):
-        return position / np.power(10000, 2 * (hid_idx // 2) / d_hid)
-
-    def get_posi_angle_vec(position):
-        return [cal_angle(position, hid_j) for hid_j in range(d_hid)]
-
-    sinusoid_table = np.array(
-        [get_posi_angle_vec(pos_i) for pos_i in range(n_position)])
-
-    sinusoid_table[:, 0::2] = np.sin(sinusoid_table[:, 0::2])  # dim 2i
-    sinusoid_table[:, 1::2] = np.cos(sinusoid_table[:, 1::2])  # dim 2i+1
-
-    if padding_idx is not None:
-        # zero vector for padding dimension
-        sinusoid_table[padding_idx] = 0.
-
-    return sinusoid_table
-
-
-def get_non_pad_mask(seq, num_head, dtype):
-    mask = layers.cast(seq != 0, dtype=dtype)
-    mask = layers.unsqueeze(mask, axes=[-1])
-    mask = layers.expand(mask, [num_head, 1, 1])
-    return mask
-
-
-def get_attn_key_pad_mask(seq_k, num_head, dtype):
-    ''' For masking out the padding part of key sequence. '''
-    # Expand to fit the shape of key query attention matrix.
-    padding_mask = layers.cast(seq_k == 0, dtype=dtype) * -1e30
-    padding_mask = layers.unsqueeze(padding_mask, axes=[1])
-    padding_mask = layers.expand(padding_mask, [num_head, 1, 1])
-    return padding_mask
-
-
-def get_dec_attn_key_pad_mask(seq_k, num_head, dtype):
-    ''' For masking out the padding part of key sequence. '''
-
-    # Expand to fit the shape of key query attention matrix.
-    padding_mask = layers.cast(seq_k == 0, dtype=dtype)
-    padding_mask = layers.unsqueeze(padding_mask, axes=[1])
-    len_k = seq_k.shape[1]
-    triu = layers.triu(
-        layers.ones(
-            shape=[len_k, len_k], dtype=dtype), diagonal=1)
-    padding_mask = padding_mask + triu
-    padding_mask = layers.cast(
-        padding_mask != 0, dtype=dtype) * -1e30  #* (-2**32 + 1)
-    padding_mask = layers.expand(padding_mask, [num_head, 1, 1])
-    return padding_mask
-
-
-def guided_attention(N, T, g=0.2):
-    '''Guided attention. Refer to page 3 on the paper.'''
-    W = np.zeros((N, T), dtype=np.float32)
-    for n_pos in range(W.shape[0]):
-        for t_pos in range(W.shape[1]):
-            W[n_pos, t_pos] = 1 - np.exp(-(t_pos / float(T) - n_pos / float(N))
-                                         **2 / (2 * g * g))
-    return W
-
-
-def cross_entropy(input, label, weight=1.0, epsilon=1e-30):
-    output = -1 * label * layers.log(input + epsilon) - (
-        1 - label) * layers.log(1 - input + epsilon)
-    output = output * (label * (weight - 1) + 1)
-
-    return layers.reduce_mean(output, dim=[0, 1])
--- a/parakeet/models/transformer_tts_deprecated/vocoder.py
+++ b/parakeet/models/transformer_tts_deprecated/vocoder.py
@ -1,55 +0,0 @@
-# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import paddle.fluid.dygraph as dg
-import paddle.fluid as fluid
-from parakeet.modules.customized import Conv1D
-from parakeet.models.transformer_tts.utils import *
-from parakeet.models.transformer_tts.cbhg import CBHG
-
-
-class Vocoder(dg.Layer):
-    def __init__(self, batch_size, hidden_size, num_mels=80, n_fft=2048):
-        """CBHG Network (mel -> linear)
-
-        Args:
-            batch_size (int): the batch size of input.
-            hidden_size (int): the size of hidden layer in network.
-            n_mels (int, optional): the number of mel bands when calculating mel spectrograms. Defaults to 80.
-            n_fft (int, optional): length of the windowed signal after padding with zeros. Defaults to 2048.
-        """
-        super(Vocoder, self).__init__()
-        self.pre_proj = Conv1D(
-            num_channels=num_mels, num_filters=hidden_size, filter_size=1)
-        self.cbhg = CBHG(hidden_size, batch_size)
-        self.post_proj = Conv1D(
-            num_channels=hidden_size,
-            num_filters=(n_fft // 2) + 1,
-            filter_size=1)
-
-    def forward(self, mel):
-        """
-        Compute mel spectrum to linear spectrum.
-        
-        Args:
-            mel (Variable): shape(B, C, T), dtype float32, the input mel spectrum.
-                
-        Returns:
-            mag_pred (Variable): shape(B, T, C), the linear output.
-        """
-        mel = layers.transpose(mel, [0, 2, 1])
-        mel = self.pre_proj(mel)
-        mel = self.cbhg(mel)
-        mag_pred = self.post_proj(mel)
-        mag_pred = layers.transpose(mag_pred, [0, 2, 1])
-        return mag_pred
--- a/parakeet/models/waveflow_deprecated/init.py
+++ b/parakeet/models/waveflow_deprecated/init.py
@ -1,15 +0,0 @@
-# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from parakeet.models.waveflow.waveflow_modules import WaveFlowLoss, WaveFlowModule
--- a/parakeet/models/waveflow_deprecated/waveflow_modules.py
+++ b/parakeet/models/waveflow_deprecated/waveflow_modules.py
@ -1,443 +0,0 @@
-# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import itertools
-import numpy as np
-import paddle.fluid.dygraph as dg
-from paddle import fluid
-from parakeet.modules import weight_norm
-
-
-def get_param_attr(layer_type, filter_size, c_in=1):
-    if layer_type == "weight_norm":
-        k = np.sqrt(1.0 / (c_in * np.prod(filter_size)))
-        weight_init = fluid.initializer.UniformInitializer(low=-k, high=k)
-        bias_init = fluid.initializer.UniformInitializer(low=-k, high=k)
-    elif layer_type == "common":
-        weight_init = fluid.initializer.ConstantInitializer(0.0)
-        bias_init = fluid.initializer.ConstantInitializer(0.0)
-    else:
-        raise TypeError("Unsupported layer type.")
-
-    param_attr = fluid.ParamAttr(initializer=weight_init)
-    bias_attr = fluid.ParamAttr(initializer=bias_init)
-    return param_attr, bias_attr
-
-
-def unfold(x, n_group):
-    length = x.shape[-1]
-    new_shape = x.shape[:-1] + [length // n_group, n_group]
-    return fluid.layers.reshape(x, new_shape)
-
-
-class WaveFlowLoss:
-    def __init__(self, sigma=1.0):
-        self.sigma = sigma
-
-    def __call__(self, model_output):
-        z, log_s_list = model_output
-        for i, log_s in enumerate(log_s_list):
-            if i == 0:
-                log_s_total = fluid.layers.reduce_sum(log_s)
-            else:
-                log_s_total = log_s_total + fluid.layers.reduce_sum(log_s)
-
-        loss = fluid.layers.reduce_sum(z * z) / (2 * self.sigma * self.sigma) \
-            - log_s_total
-        loss = loss / np.prod(z.shape)
-        const = 0.5 * np.log(2 * np.pi) + np.log(self.sigma)
-
-        return loss + const
-
-
-class Conditioner(dg.Layer):
-    def __init__(self, dtype, upsample_factors):
-        super(Conditioner, self).__init__()
-
-        self.upsample_conv2d = []
-        for s in upsample_factors:
-            in_channel = 1
-            param_attr, bias_attr = get_param_attr(
-                "weight_norm", (3, 2 * s), c_in=in_channel)
-            conv_trans2d = weight_norm.Conv2DTranspose(
-                num_channels=in_channel,
-                num_filters=1,
-                filter_size=(3, 2 * s),
-                padding=(1, s // 2),
-                stride=(1, s),
-                param_attr=param_attr,
-                bias_attr=bias_attr,
-                dtype=dtype)
-            self.upsample_conv2d.append(conv_trans2d)
-
-        for i, layer in enumerate(self.upsample_conv2d):
-            self.add_sublayer("conv2d_transpose_{}".format(i), layer)
-
-    def forward(self, x):
-        x = fluid.layers.unsqueeze(x, 1)
-        for layer in self.upsample_conv2d:
-            x = layer(x)
-            x = fluid.layers.leaky_relu(x, alpha=0.4)
-
-        return fluid.layers.squeeze(x, [1])
-
-    def infer(self, x):
-        x = fluid.layers.unsqueeze(x, 1)
-        for layer in self.upsample_conv2d:
-            x = layer(x)
-            # Trim conv artifacts.
-            time_cutoff = layer._filter_size[1] - layer._stride[1]
-            x = fluid.layers.leaky_relu(x[:, :, :, :-time_cutoff], alpha=0.4)
-
-        return fluid.layers.squeeze(x, [1])
-
-
-class Flow(dg.Layer):
-    def __init__(self, config):
-        super(Flow, self).__init__()
-        self.n_layers = config.n_layers
-        self.n_channels = config.n_channels
-        self.kernel_h = config.kernel_h
-        self.kernel_w = config.kernel_w
-        self.dtype = "float16" if config.use_fp16 else "float32"
-
-        # Transform audio: [batch, 1, n_group, time/n_group] 
-        # => [batch, n_channels, n_group, time/n_group]
-        param_attr, bias_attr = get_param_attr("weight_norm", (1, 1), c_in=1)
-        self.start = weight_norm.Conv2D(
-            num_channels=1,
-            num_filters=self.n_channels,
-            filter_size=(1, 1),
-            param_attr=param_attr,
-            bias_attr=bias_attr,
-            dtype=self.dtype)
-
-        # Initializing last layer to 0 makes the affine coupling layers
-        # do nothing at first.  This helps with training stability
-        # output shape: [batch, 2, n_group, time/n_group]
-        param_attr, bias_attr = get_param_attr(
-            "common", (1, 1), c_in=self.n_channels)
-        self.end = dg.Conv2D(
-            num_channels=self.n_channels,
-            num_filters=2,
-            filter_size=(1, 1),
-            param_attr=param_attr,
-            bias_attr=bias_attr,
-            dtype=self.dtype)
-
-        # receiptive fileds: (kernel - 1) * sum(dilations) + 1 >= squeeze
-        dilation_dict = {
-            8: [1, 1, 1, 1, 1, 1, 1, 1],
-            16: [1, 1, 1, 1, 1, 1, 1, 1],
-            32: [1, 2, 4, 1, 2, 4, 1, 2],
-            64: [1, 2, 4, 8, 16, 1, 2, 4],
-            128: [1, 2, 4, 8, 16, 32, 64, 1]
-        }
-        self.dilation_h_list = dilation_dict[config.n_group]
-
-        self.in_layers = []
-        self.cond_layers = []
-        self.res_skip_layers = []
-        for i in range(self.n_layers):
-            dilation_h = self.dilation_h_list[i]
-            dilation_w = 2**i
-
-            param_attr, bias_attr = get_param_attr(
-                "weight_norm", (self.kernel_h, self.kernel_w),
-                c_in=self.n_channels)
-            in_layer = weight_norm.Conv2D(
-                num_channels=self.n_channels,
-                num_filters=2 * self.n_channels,
-                filter_size=(self.kernel_h, self.kernel_w),
-                dilation=(dilation_h, dilation_w),
-                param_attr=param_attr,
-                bias_attr=bias_attr,
-                dtype=self.dtype)
-            self.in_layers.append(in_layer)
-
-            param_attr, bias_attr = get_param_attr(
-                "weight_norm", (1, 1), c_in=config.mel_bands)
-            cond_layer = weight_norm.Conv2D(
-                num_channels=config.mel_bands,
-                num_filters=2 * self.n_channels,
-                filter_size=(1, 1),
-                param_attr=param_attr,
-                bias_attr=bias_attr,
-                dtype=self.dtype)
-            self.cond_layers.append(cond_layer)
-
-            if i < self.n_layers - 1:
-                res_skip_channels = 2 * self.n_channels
-            else:
-                res_skip_channels = self.n_channels
-            param_attr, bias_attr = get_param_attr(
-                "weight_norm", (1, 1), c_in=self.n_channels)
-            res_skip_layer = weight_norm.Conv2D(
-                num_channels=self.n_channels,
-                num_filters=res_skip_channels,
-                filter_size=(1, 1),
-                param_attr=param_attr,
-                bias_attr=bias_attr,
-                dtype=self.dtype)
-            self.res_skip_layers.append(res_skip_layer)
-
-            self.add_sublayer("in_layer_{}".format(i), in_layer)
-            self.add_sublayer("cond_layer_{}".format(i), cond_layer)
-            self.add_sublayer("res_skip_layer_{}".format(i), res_skip_layer)
-
-    def forward(self, audio, mel):
-        # audio: [bs, 1, n_group, time/group]
-        # mel: [bs, mel_bands, n_group, time/n_group]
-        audio = self.start(audio)
-
-        for i in range(self.n_layers):
-            dilation_h = self.dilation_h_list[i]
-            dilation_w = 2**i
-
-            # Pad height dim (n_group): causal convolution
-            # Pad width dim (time): dialated non-causal convolution
-            pad_top, pad_bottom = (self.kernel_h - 1) * dilation_h, 0
-            pad_left = pad_right = int((self.kernel_w - 1) * dilation_w / 2)
-            # Using pad2d is a bit faster than using padding in Conv2D directly 
-            audio_pad = fluid.layers.pad2d(
-                audio, paddings=[pad_top, pad_bottom, pad_left, pad_right])
-            hidden = self.in_layers[i](audio_pad)
-            cond_hidden = self.cond_layers[i](mel)
-            in_acts = hidden + cond_hidden
-            out_acts = fluid.layers.tanh(in_acts[:, :self.n_channels, :]) * \
-                fluid.layers.sigmoid(in_acts[:, self.n_channels:, :])
-            res_skip_acts = self.res_skip_layers[i](out_acts)
-
-            if i < self.n_layers - 1:
-                audio += res_skip_acts[:, :self.n_channels, :, :]
-                skip_acts = res_skip_acts[:, self.n_channels:, :, :]
-            else:
-                skip_acts = res_skip_acts
-
-            if i == 0:
-                output = skip_acts
-            else:
-                output += skip_acts
-
-        return self.end(output)
-
-    def infer(self, audio, mel, queues):
-        audio = self.start(audio)
-
-        for i in range(self.n_layers):
-            dilation_h = self.dilation_h_list[i]
-            dilation_w = 2**i
-
-            state_size = dilation_h * (self.kernel_h - 1)
-            queue = queues[i]
-
-            if len(queue) == 0:
-                for j in range(state_size):
-                    queue.append(fluid.layers.zeros_like(audio))
-
-            state = queue[0:state_size]
-            state = fluid.layers.concat(state + [audio], axis=2)
-
-            queue.pop(0)
-            queue.append(audio)
-
-            # Pad height dim (n_group): causal convolution
-            # Pad width dim (time): dialated non-causal convolution
-            pad_top, pad_bottom = 0, 0
-            pad_left = int((self.kernel_w - 1) * dilation_w / 2)
-            pad_right = int((self.kernel_w - 1) * dilation_w / 2)
-            state = fluid.layers.pad2d(
-                state, paddings=[pad_top, pad_bottom, pad_left, pad_right])
-            hidden = self.in_layers[i](state)
-            cond_hidden = self.cond_layers[i](mel)
-            in_acts = hidden + cond_hidden
-            out_acts = fluid.layers.tanh(in_acts[:, :self.n_channels, :]) * \
-                      fluid.layers.sigmoid(in_acts[:, self.n_channels:, :])
-            res_skip_acts = self.res_skip_layers[i](out_acts)
-
-            if i < self.n_layers - 1:
-                audio += res_skip_acts[:, :self.n_channels, :, :]
-                skip_acts = res_skip_acts[:, self.n_channels:, :, :]
-            else:
-                skip_acts = res_skip_acts
-
-            if i == 0:
-                output = skip_acts
-            else:
-                output += skip_acts
-
-        return self.end(output)
-
-
-class WaveFlowModule(dg.Layer):
-    """WaveFlow model implementation.
-
-    Args:
-        config (obj): model configuration parameters.
-
-    Returns:
-        WaveFlowModule
-    """
-
-    def __init__(self, config):
-        super(WaveFlowModule, self).__init__()
-        self.n_flows = config.n_flows
-        self.n_group = config.n_group
-        self.n_layers = config.n_layers
-        self.upsample_factors = config.upsample_factors if hasattr(
-            config, "upsample_factors") else [16, 16]
-        assert self.n_group % 2 == 0
-        assert self.n_flows % 2 == 0
-
-        self.dtype = "float16" if config.use_fp16 else "float32"
-        self.conditioner = Conditioner(self.dtype, self.upsample_factors)
-        self.flows = []
-        for i in range(self.n_flows):
-            flow = Flow(config)
-            self.flows.append(flow)
-            self.add_sublayer("flow_{}".format(i), flow)
-
-        self.perms = []
-        half = self.n_group // 2
-        for i in range(self.n_flows):
-            perm = list(range(self.n_group))
-            if i < self.n_flows // 2:
-                perm = perm[::-1]
-            else:
-                perm[:half] = reversed(perm[:half])
-                perm[half:] = reversed(perm[half:])
-            self.perms.append(perm)
-
-    def forward(self, audio, mel):
-        """Training forward pass.
-
-        Use a conditioner to upsample mel spectrograms into hidden states.
-        These hidden states along with the audio are passed to a stack of Flow
-        modules to obtain the final latent variable z and a list of log scaling
-        variables, which are then passed to the WaveFlowLoss module to calculate
-        the negative log likelihood.
-
-        Args:
-            audio (obj): audio samples.
-            mel (obj): mel spectrograms.
-
-        Returns:
-            z (obj): latent variable.
-            log_s_list(list): list of log scaling variables.
-        """
-        mel = self.conditioner(mel)
-        assert mel.shape[2] >= audio.shape[1]
-        # Prune out the tail of audio/mel so that time/n_group == 0.
-        pruned_len = int(audio.shape[1] // self.n_group * self.n_group)
-
-        if audio.shape[1] > pruned_len:
-            audio = audio[:, :pruned_len]
-        if mel.shape[2] > pruned_len:
-            mel = mel[:, :, :pruned_len]
-
-        # From [bs, mel_bands, time] to [bs, mel_bands, n_group, time/n_group]
-        mel = fluid.layers.transpose(unfold(mel, self.n_group), [0, 1, 3, 2])
-        # From [bs, time] to [bs, n_group, time/n_group]
-        audio = fluid.layers.transpose(unfold(audio, self.n_group), [0, 2, 1])
-        # [bs, 1, n_group, time/n_group] 
-        audio = fluid.layers.unsqueeze(audio, 1)
-        log_s_list = []
-        for i in range(self.n_flows):
-            inputs = audio[:, :, :-1, :]
-            conds = mel[:, :, 1:, :]
-            outputs = self.flows[i](inputs, conds)
-            log_s = outputs[:, :1, :, :]
-            b = outputs[:, 1:, :, :]
-            log_s_list.append(log_s)
-
-            audio_0 = audio[:, :, :1, :]
-            audio_out = audio[:, :, 1:, :] * fluid.layers.exp(log_s) + b
-            audio = fluid.layers.concat([audio_0, audio_out], axis=2)
-
-            # Permute over the height dim.
-            audio_slices = [audio[:, :, j, :] for j in self.perms[i]]
-            audio = fluid.layers.stack(audio_slices, axis=2)
-            mel_slices = [mel[:, :, j, :] for j in self.perms[i]]
-            mel = fluid.layers.stack(mel_slices, axis=2)
-
-        z = fluid.layers.squeeze(audio, [1])
-        return z, log_s_list
-
-    def synthesize(self, mel, sigma=1.0):
-        """Use model to synthesize waveform.
-
-        Use a conditioner to upsample mel spectrograms into hidden states.
-        These hidden states along with initial random gaussian latent variable
-        are passed to a stack of Flow modules to obtain the audio output.
-
-        Note that we use convolutional queue (https://arxiv.org/abs/1611.09482)
-        to cache the intermediate hidden states, which will speed up the
-        autoregressive inference over the height dimension. Current
-        implementation only supports height dimension (self.n_group) equals
-        8 or 16, i.e., where there is no dilation on the height dimension.
-
-        Args:
-            mel (obj): mel spectrograms.
-            sigma (float, optional): standard deviation of the guassian latent
-                variable. Defaults to 1.0.
-
-        Returns:
-            audio (obj): synthesized audio.
-        """
-        if self.dtype == "float16":
-            mel = fluid.layers.cast(mel, self.dtype)
-        mel = self.conditioner.infer(mel)
-        # Prune out the tail of mel so that time/n_group == 0.
-        pruned_len = int(mel.shape[2] // self.n_group * self.n_group)
-        if mel.shape[2] > pruned_len:
-            mel = mel[:, :, :pruned_len]
-        # From [bs, mel_bands, time] to [bs, mel_bands, n_group, time/n_group]
-        mel = fluid.layers.transpose(unfold(mel, self.n_group), [0, 1, 3, 2])
-
-        audio = fluid.layers.gaussian_random(
-            shape=[mel.shape[0], 1, mel.shape[2], mel.shape[3]], std=sigma)
-        if self.dtype == "float16":
-            audio = fluid.layers.cast(audio, self.dtype)
-        for i in reversed(range(self.n_flows)):
-            # Permute over the height dimension.
-            audio_slices = [audio[:, :, j, :] for j in self.perms[i]]
-            audio = fluid.layers.stack(audio_slices, axis=2)
-            mel_slices = [mel[:, :, j, :] for j in self.perms[i]]
-            mel = fluid.layers.stack(mel_slices, axis=2)
-
-            audio_list = []
-            audio_0 = audio[:, :, 0:1, :]
-            audio_list.append(audio_0)
-            audio_h = audio_0
-            queues = [[] for _ in range(self.n_layers)]
-
-            for h in range(1, self.n_group):
-                inputs = audio_h
-                conds = mel[:, :, h:(h + 1), :]
-                outputs = self.flows[i].infer(inputs, conds, queues)
-
-                log_s = outputs[:, 0:1, :, :]
-                b = outputs[:, 1:, :, :]
-                audio_h = (audio[:, :, h:(h+1), :] - b) / \
-                    fluid.layers.exp(log_s)
-                audio_list.append(audio_h)
-
-            audio = fluid.layers.concat(audio_list, axis=2)
-
-        # audio: [bs, n_group, time/n_group]
-        audio = fluid.layers.squeeze(audio, [1])
-        # audio: [bs, time]
-        audio = fluid.layers.reshape(
-            fluid.layers.transpose(audio, [0, 2, 1]), [audio.shape[0], -1])
-        return audio