WIP: refactor

2020-10-10 15:51:54 +08:00 · 2020-10-10 15:51:54 +08:00 · a8192c79cc
parent 1db01ccc90
commit a8192c79cc
61 changed files with 3170 additions and 2168 deletions
--- a/parakeet/audio/spec_normalizer.py
+++ b/parakeet/audio/spec_normalizer.py
@ -0,0 +1,34 @@
+
+"""
+This module contains normalizers for spectrogram magnitude.
+Normalizers are invertible transformations. They can be used to process the
+magnitude of a spectrogram before training, and to recover the magnitude from
+a generated spectrogram so that it can be used with vocoders like Griffin-Lim.
+
+The base class describes the interface. `transform` is used to perform 
+transformation and `inverse` is used to perform the inverse transformation.
+"""
+import numpy as np
+
+class NormalizerBase(object):
+    def transform(self, spec):
+        raise NotImplementedError("transform must be implemented")
+    
+    def inverse(self, normalized):
+        raise NotImplementedError("inverse must be implemented")
+
+class LogMagnitude(NormalizerBase):
+    def __init__(self, min=1e-7):
+        self.min = min
+    
+    def transform(self, x):
+        x = np.maximum(x, self.min)
+        x = np.log(x)
+        return x
+    
+    def inverse(self, x):
+        return np.exp(x)
+    
+class UnitMagnitude(NormalizerBase):
+    # dbscale and (0, 1) normalization
+    pass
--- a/parakeet/data/.vscode/settings.json
+++ b/parakeet/data/.vscode/settings.json
@ -1,3 +0,0 @@
-{
-    "python.pythonPath": "/Users/chenfeiyu/miniconda3/envs/paddle/bin/python"
-}
--- a/parakeet/data/init.py
+++ b/parakeet/data/init.py
@ -13,6 +13,5 @@
 # limitations under the License.

 from .dataset import *
-from .datacargo import *
 from .sampler import *
 from .batch import *
--- a/parakeet/data/datacargo.py
+++ b/parakeet/data/datacargo.py
@ -1,126 +0,0 @@
-# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import six
-from .sampler import SequentialSampler, RandomSampler, BatchSampler
-
-
-class DataCargo(object):
-    def __init__(self,
-                 dataset,
-                 batch_fn=None,
-                 batch_size=1,
-                 sampler=None,
-                 shuffle=False,
-                 batch_sampler=None,
-                 drop_last=False):
-        """An Iterable object of batches. It requires a dataset, a batch function and a sampler. The sampler yields the example ids, then the corresponding examples in the dataset are collected and transformed into a batch with the batch function.
-
-        Args:
-            dataset (Dataset): the dataset used to build a data cargo.
-            batch_fn (callable, optional): a callable that takes a list of examples of `dataset` and return a batch, it can be None if the dataset has a `_batch_examples` method which satisfy the requirement. Defaults to None.
-            batch_size (int, optional): number of examples in a batch. Defaults to 1.
-            sampler (Sampler, optional): an iterable of example ids(intergers), the example ids are used to pick examples. Defaults to None.
-            shuffle (bool, optional): when sampler is not provided, shuffle = True creates a RandomSampler and shuffle=False creates a SequentialSampler internally. Defaults to False.
-            batch_sampler (BatchSampler, optional): an iterable of lists of example ids(intergers), the list is used to pick examples, `batch_sampler` option is mutually exclusive with `batch_size`, `shuffle`, `sampler`, and `drop_last`. Defaults to None.
-            drop_last (bool, optional): whether to drop the last minibatch. Defaults to False.
-        """
-        self.dataset = dataset
-        self.batch_fn = batch_fn or self.dataset._batch_examples
-
-        if batch_sampler is not None:
-            # auto_collation with custom batch_sampler
-            if batch_size != 1 or shuffle or sampler is not None or drop_last:
-                raise ValueError('batch_sampler option is mutually exclusive '
-                                 'with batch_size, shuffle, sampler, and '
-                                 'drop_last')
-            batch_size = None
-            drop_last = False
-            shuffle = False
-        elif batch_size is None:
-            raise ValueError(
-                'batch sampler is none. then batch size must not be none.')
-        elif sampler is None:
-            if shuffle:
-                sampler = RandomSampler(dataset)
-            else:
-                sampler = SequentialSampler(dataset)
-            batch_sampler = BatchSampler(sampler, batch_size, drop_last)
-        else:
-            batch_sampler = BatchSampler(sampler, batch_size, drop_last)
-
-        self.batch_size = batch_size
-        self.drop_last = drop_last
-        self.sampler = sampler
-
-        self.batch_sampler = batch_sampler
-
-    def __iter__(self):
-        return DataIterator(self)
-
-    def __call__(self):
-        # protocol for paddle's DataLoader
-        return DataIterator(self)
-
-    @property
-    def _auto_collation(self):
-        # use auto batching
-        return self.batch_sampler is not None
-
-    @property
-    def _index_sampler(self):
-        if self._auto_collation:
-            return self.batch_sampler
-        else:
-            return self.sampler
-
-    def __len__(self):
-        return len(self._index_sampler)
-
-
-class DataIterator(object):
-    def __init__(self, loader):
-        """Iterator object of DataCargo.
-
-        Args:
-            loader (DataCargo): the data cargo to iterate.
-        """
-        self.loader = loader
-        self._dataset = loader.dataset
-
-        self._batch_fn = loader.batch_fn
-        self._index_sampler = loader._index_sampler
-        self._sampler_iter = iter(self._index_sampler)
-
-    def __iter__(self):
-        return self
-
-    def __next__(self):
-        # TODO(chenfeiyu): use dynamic batch size
-        index = self._next_index()
-        minibatch = [self._dataset[i] for i in index]
-        minibatch = self._batch_fn(minibatch)  # list[Example] -> Batch
-        return minibatch
-
-    next = __next__  # Python 2 compatibility
-
-    def _next_index(self):
-        if six.PY3:
-            return next(self._sampler_iter)
-        else:
-            # six.PY2
-            return self._sampler_iter.next()
-
-    def __len__(self):
-        return len(self._index_sampler)
--- a/parakeet/data/dataset.py
+++ b/parakeet/data/dataset.py
@ -13,62 +13,22 @@
 # limitations under the License.

 import six
-import numpy as np
-from tqdm import tqdm
+import paddle
+from paddle.io import Dataset


-class DatasetMixin(object):
-    """Standard indexing interface for dataset. Inherit this class to 
-    get the indexing interface. Since it is a mixin class which does 
-    not have an `__init__` class, the subclass not need to call  
-    `super().__init__()`.
-    """
+def split(dataset, first_size):
+    """A utility function to split a dataset into two datasets."""
+    first = SliceDataset(dataset, 0, first_size)
+    second = SliceDataset(dataset, first_size, len(dataset))
+    return first, second

-    def __getitem__(self, index):
-        """Standard indexing interface for dataset.
-
-        Args:
-            index (slice, list[int], np.array or int): the index. if can be int, slice, list of integers, or ndarray of integers. It calls `get_example` to pick an example. 
-
-        Returns:
-            Example, or List[Example]:  If `index` is an interger, it returns an 
-                    example. If `index` is a slice, a list of intergers or an array of intergers,
-                    it returns a list of examples.
-        """
-        if isinstance(index, slice):
-            start, stop, step = index.indices(len(self))
-            return [
-                self.get_example(i) for i in six.moves.range(start, stop, step)
-            ]
-        elif isinstance(index, (list, np.ndarray)):
-            return [self.get_example(i) for i in index]
-        else:
-            # assumes it an integer
-            return self.get_example(index)
-
-    def get_example(self, i):
-        """Get an example from the dataset. Custom datasets should have 
-        this method implemented.
-
-        Args:
-            i (int): example index.
-        """
-        raise NotImplementedError
-
-    def __len__(self):
-        raise NotImplementedError
-
-    def __iter__(self):
-        for i in range(len(self)):
-            yield self.get_example(i)
-
-
-class TransformDataset(DatasetMixin):
+class TransformDataset(Dataset):
    def __init__(self, dataset, transform):
        """Dataset which is transformed from another with a transform.

        Args:
-            dataset (DatasetMixin): the base dataset.
+            dataset (Dataset): the base dataset.
            transform (callable): the transform which takes an example of the base dataset as parameter and return a new example.
        """
        self._dataset = dataset
@ -77,17 +37,17 @@ class TransformDataset(DatasetMixin):
    def __len__(self):
        return len(self._dataset)

-    def get_example(self, i):
+    def __getitem__(self, i):
        in_data = self._dataset[i]
        return self._transform(in_data)


-class CacheDataset(DatasetMixin):
+class CacheDataset(Dataset):
    def __init__(self, dataset):
        """A lazy cache of the base dataset.

        Args:
-            dataset (DatasetMixin): the base dataset to cache.
+            dataset (Dataset): the base dataset to cache.
        """
        self._dataset = dataset
        self._cache = dict()
@ -95,24 +55,24 @@ class CacheDataset(DatasetMixin):
    def __len__(self):
        return len(self._dataset)

-    def get_example(self, i):
+    def __getitem__(self, i):
        if not i in self._cache:
            self._cache[i] = self._dataset[i]
        return self._cache[i]


-class TupleDataset(object):
+class TupleDataset(Dataset):
    def __init__(self, *datasets):
        """A compound dataset made from several datasets of the same length. An example of the `TupleDataset` is a tuple of examples from the constituent datasets.

        Args:
-            datasets: tuple[DatasetMixin], the constituent datasets.
+            datasets: tuple[Dataset], the constituent datasets.
        """
        if not datasets:
            raise ValueError("no datasets are given")
        length = len(datasets[0])
        for i, dataset in enumerate(datasets):
-            if len(datasets) != length:
+            if len(dataset) != length:
                raise ValueError(
                    "all the datasets should have the same length."
                    "dataset {} has a different length".format(i))
@ -136,12 +96,20 @@ class TupleDataset(object):
        return self._length


-class DictDataset(object):
+class DictDataset(Dataset):
    def __init__(self, **datasets):
-        """A compound dataset made from several datasets of the same length. An example of the `DictDataset` is a dict of examples from the constituent datasets.
+        """
+        A compound dataset made from several datasets of the same length. An 
+        example of the `DictDataset` is a dict of examples from the constituent 
+        datasets.
+        
+        WARNING: paddle does not have good support for DictDataset, because
+        every batch yielded from a DataLoader is a list and cannot be a dict.
+        So you have to provide a collate function, because you cannot use the
+        default one.

        Args:
-            datasets: Dict[DatasetMixin], the constituent datasets.
+            datasets: Dict[Dataset], the constituent datasets.
        """
        if not datasets:
            raise ValueError("no datasets are given")
@ -149,7 +117,7 @@ class DictDataset(object):
        for key, dataset in six.iteritems(datasets):
            if length is None:
                length = len(dataset)
-            elif len(datasets) != length:
+            elif len(dataset) != length:
                raise ValueError(
                    "all the datasets should have the same length."
                    "dataset {} has a different length".format(key))
@ -168,14 +136,17 @@ class DictDataset(object):
                    for i in six.moves.range(length)]
        else:
            return batches
+    
+    def __len__(self):
+        return self._length


-class SliceDataset(DatasetMixin):
+class SliceDataset(Dataset):
    def __init__(self, dataset, start, finish, order=None):
        """A Dataset which is a slice of the base dataset.

        Args:
-            dataset (DatasetMixin): the base dataset.
+            dataset (Dataset): the base dataset.
            start (int): the start of the slice.
            finish (int): the end of the slice, not inclusive.
            order (List[int], optional): the order, it is a permutation of the valid example ids of the base dataset. If `order` is provided, the slice is taken in `order`. Defaults to None.
@ -197,7 +168,7 @@ class SliceDataset(DatasetMixin):
    def __len__(self):
        return self._size

-    def get_example(self, i):
+    def __getitem__(self, i):
        if i >= 0:
            if i >= self._size:
                raise IndexError('dataset index out of range')
@ -212,12 +183,12 @@ class SliceDataset(DatasetMixin):
        return self._dataset[index]


-class SubsetDataset(DatasetMixin):
+class SubsetDataset(Dataset):
    def __init__(self, dataset, indices):
        """A Dataset which is a subset of the base dataset.

        Args:
-            dataset (DatasetMixin): the base dataset.
+            dataset (Dataset): the base dataset.
            indices (Iterable[int]): the indices of the examples to pick.
        """
        self._dataset = dataset
@ -229,17 +200,17 @@ class SubsetDataset(DatasetMixin):
    def __len__(self):
        return self._size

-    def get_example(self, i):
+    def __getitem__(self, i):
        index = self._indices[i]
        return self._dataset[index]


-class FilterDataset(DatasetMixin):
+class FilterDataset(Dataset):
    def __init__(self, dataset, filter_fn):
        """A filtered dataset.

        Args:
-            dataset (DatasetMixin): the base dataset.
+            dataset (Dataset): the base dataset.
            filter_fn (callable): a callable which takes an example of the base dataset and return a boolean.
        """
        self._dataset = dataset
@ -251,24 +222,24 @@ class FilterDataset(DatasetMixin):
    def __len__(self):
        return self._size

-    def get_example(self, i):
+    def __getitem__(self, i):
        index = self._indices[i]
        return self._dataset[index]


-class ChainDataset(DatasetMixin):
+class ChainDataset(Dataset):
    def __init__(self, *datasets):
        """A concatenation of the several datasets which the same structure.

        Args:
-            datasets (Iterable[DatasetMixin]): datasets to concat.
+            datasets (Iterable[Dataset]): datasets to concat.
        """
        self._datasets = datasets

    def __len__(self):
        return sum(len(dataset) for dataset in self._datasets)

-    def get_example(self, i):
+    def __getitem__(self, i):
        if i < 0:
            raise IndexError("ChainDataset doesnot support negative indexing.")

--- a/parakeet/data/sampler.py
+++ b/parakeet/data/sampler.py
@ -21,95 +21,8 @@ So the sampler is only responsible for generating valid indices.

 import numpy as np
 import random
-
-
-class Sampler(object):
-    def __iter__(self):
-        # return a iterator of indices
-        # or a iterator of list[int], for BatchSampler
-        raise NotImplementedError
-
-
-class SequentialSampler(Sampler):
-    def __init__(self, data_source):
-        """Sequential sampler, the simplest sampler that samples indices from 0 to N - 1, where N is the dataset is length.
-
-        Args:
-            data_source (DatasetMixin): the dataset. This is used to get the dataset's length.
-        """
-        self.data_source = data_source
-
-    def __iter__(self):
-        return iter(range(len(self.data_source)))
-
-    def __len__(self):
-        return len(self.data_source)
-
-
-class RandomSampler(Sampler):
-    def __init__(self, data_source, replacement=False, num_samples=None):
-        """Random sampler.
-
-        Args:
-            data_source (DatasetMixin): the dataset. This is used to get the dataset's length.
-            replacement (bool, optional): whether replacement is enabled in sampling. When `replacement` is True, `num_samples` must be provided. Defaults to False.
-            num_samples (int, optional): numbers of indices to draw. This option should only be provided when replacement is True. Defaults to None.
-        """
-        self.data_source = data_source
-        self.replacement = replacement
-        self._num_samples = num_samples
-
-        if not isinstance(self.replacement, bool):
-            raise ValueError("replacement should be a boolean value, but got "
-                             "replacement={}".format(self.replacement))
-
-        if self._num_samples is not None and not replacement:
-            raise ValueError(
-                "With replacement=False, num_samples should not be specified, "
-                "since a random permutation will be performed.")
-
-        if not isinstance(self.num_samples, int) or self.num_samples <= 0:
-            raise ValueError("num_samples should be a positive integer "
-                             "value, but got num_samples={}".format(
-                                 self.num_samples))
-
-    @property
-    def num_samples(self):
-        if self._num_samples is None:
-            return len(self.data_source)
-        return self._num_samples
-
-    def __iter__(self):
-        n = len(self.data_source)
-        if self.replacement:
-            return iter(
-                np.random.randint(
-                    0, n, size=(self.num_samples, ), dtype=np.int64).tolist())
-        return iter(np.random.permutation(n).tolist())
-
-    def __len__(self):
-        return self.num_samples
-
-
-class SubsetRandomSampler(Sampler):
-    """Samples elements randomly from a given list of indices, without replacement.
-    Arguments:
-        indices (sequence): a sequence of indices
-    """
-
-    def __init__(self, indices):
-        """
-        Args:
-            indices (List[int]): indices to sample from.
-        """
-        self.indices = indices
-
-    def __iter__(self):
-        return (self.indices[i]
-                for i in np.random.permutation(len(self.indices)))
-
-    def __len__(self):
-        return len(self.indices)
+import paddle
+from paddle.io import Sampler


 class PartialyRandomizedSimilarTimeLengthSampler(Sampler):
@ -285,92 +198,3 @@ class WeightedRandomSampler(Sampler):

    def __len__(self):
        return self.num_samples
-
-
-class DistributedSampler(Sampler):
-    def __init__(self, dataset_size, num_trainers, rank, shuffle=True):
-        """Sampler used for data parallel training. Indices are divided into num_trainers parts. Each trainer gets a subset and iter that subset. If the dataset has 16 examples, and there are 4 trainers. 
-
-        Trainer 0 gets [0, 4, 8, 12];
-        Trainer 1 gets [1, 5, 9, 13];
-        Trainer 2 gets [2, 6, 10, 14];
-        trainer 3 gets [3, 7, 11, 15].
-
-        It ensures that trainer get different parts of the dataset. If dataset's length cannot be perfectly devidef by num_trainers, some examples appended to the dataset, to ensures that every trainer gets the same amounts of examples.
-
-        Args:
-            dataset_size (int): the length of the dataset.
-            num_trainers (int): number of trainers(training processes).
-            rank (int): local rank of the trainer.
-            shuffle (bool, optional): whether to shuffle the indices before iteration. Defaults to True.
-        """
-        self.dataset_size = dataset_size
-        self.num_trainers = num_trainers
-        self.rank = rank
-        self.num_samples = int(np.ceil(dataset_size / num_trainers))
-        self.total_size = self.num_samples * num_trainers
-        assert self.total_size >= self.dataset_size
-        self.shuffle = shuffle
-
-    def __iter__(self):
-        indices = list(range(self.dataset_size))
-        if self.shuffle:
-            random.shuffle(indices)
-
-        # Append extra samples to make it evenly distributed on all trainers.
-        indices += indices[:(self.total_size - self.dataset_size)]
-        assert len(indices) == self.total_size
-
-        # Subset samples for each trainer.
-        indices = indices[self.rank:self.total_size:self.num_trainers]
-        assert len(indices) == self.num_samples
-
-        return iter(indices)
-
-    def __len__(self):
-        return self.num_samples
-
-
-class BatchSampler(Sampler):
-    """Wraps another sampler to yield a mini-batch of indices."""
-
-    def __init__(self, sampler, batch_size, drop_last):
-        """
-        Args:
-            sampler (Sampler): Base sampler.
-            batch_size (int): Size of mini-batch.
-            drop_last (bool): If True, the sampler will drop the last batch if its size is less than batch_size.
-        Example:
-            >>> list(BatchSampler(SequentialSampler(range(10)), batch_size=3, drop_last=False))
-            [[0, 1, 2], [3, 4, 5], [6, 7, 8], [9]]
-            >>> list(BatchSampler(SequentialSampler(range(10)), batch_size=3, drop_last=True))
-            [[0, 1, 2], [3, 4, 5], [6, 7, 8]]
-        """
-        if not isinstance(sampler, Sampler):
-            raise ValueError("sampler should be an instance of "
-                             "Sampler, but got sampler={}".format(sampler))
-        if not isinstance(batch_size, int) or batch_size <= 0:
-            raise ValueError("batch_size should be a positive integer value, "
-                             "but got batch_size={}".format(batch_size))
-        if not isinstance(drop_last, bool):
-            raise ValueError("drop_last should be a boolean value, but got "
-                             "drop_last={}".format(drop_last))
-        self.sampler = sampler
-        self.batch_size = batch_size
-        self.drop_last = drop_last
-
-    def __iter__(self):
-        batch = []
-        for idx in self.sampler:
-            batch.append(idx)
-            if len(batch) == self.batch_size:
-                yield batch
-                batch = []
-        if len(batch) > 0 and not self.drop_last:
-            yield batch
-
-    def __len__(self):
-        if self.drop_last:
-            return len(self.sampler) // self.batch_size
-        else:
-            return (len(self.sampler) + self.batch_size - 1) // self.batch_size
--- a/parakeet/datasets/README.md
+++ b/parakeet/datasets/README.md
@ -1,17 +0,0 @@
-# The Design of Dataset in Parakeet
-
-## data & metadata
-A Dataset in Parakeet is basically a list of Records (or examples, instances if you prefer this glossary.) By being a list, we mean it can be indexed by `__getitem__`, and we can get the size of the dataset by `__len__`.
-
-This might mean we should have load the whole dataset before hand. But in practice, we do not do this due to time, computation and memory of storage limits. We actually load some metadata instead, which gives us the size of the dataset, and metadata of each record. In this case, the metadata itself is a small dataset which helps us to load a larger dataset. We made `_load_metadata` a method for all datasets.
-
-In most cases, metadata is provided with the data. So we can load it trivially. But in other cases, we need to scan the whole dataset to get metadata. For example, the length of the the sentences, the vocabuary or the statistics of the dataset, etc. In these cases, we'd betetr save the metadata, so we do not need to generate them again and again. When implementing a dataset, we do these work in `_prepare_metadata`.
-
-In our initial cases, record is implemented as a tuple for simplicity. Actually, it can be implemented as a dict or namespace.
-
-## preprocessing & batching
-One of the reasons we choose to load data lazily (only load metadata before hand, and load data only when needed) is computation overhead. For large dataset with complicated preprocessing, it may take several days to preprocess them. So we choose to preprocess it lazily. In practice, we implement preprocessing in `_get_example` which is called by `__getitem__`. This method preprocess only one record.
-
-For deep learning practice, we typically batch examples. So the dataset should comes with a method to batch examples. Assuming the record is implemented as a tuple with several items. When an item is represented as a fix-sized array, to batch them is trivial, just `np.stack` suffices. But for array with dynamic size, padding is needed. We decide to implement a batching method for each item. Then batching a record can be implemented by these methods. For a dataset, a `_batch_examples` should be implemented. But in most cases, you can choose one from `batching.py`.
-
-That is it!
--- a/parakeet/datasets/init.py
+++ b/parakeet/datasets/init.py
@ -1,13 +0,0 @@
-# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
--- a/parakeet/datasets/ljspeech.py
+++ b/parakeet/datasets/ljspeech.py
@ -1,101 +0,0 @@
-# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import os
-import numpy as np
-import pandas as pd
-import librosa
-from .. import g2p
-
-from ..data.sampler import SequentialSampler, RandomSampler, BatchSampler
-from ..data.dataset import DatasetMixin
-from ..data.datacargo import DataCargo
-from ..data.batch import TextIDBatcher, SpecBatcher
-
-
-class LJSpeech(DatasetMixin):
-    def __init__(self, root):
-        super(LJSpeech, self).__init__()
-        self.root = root
-        self.metadata = self._prepare_metadata()
-
-    def _prepare_metadata(self):
-        csv_path = os.path.join(self.root, "metadata.csv")
-        metadata = pd.read_csv(
-            csv_path,
-            sep="|",
-            header=None,
-            quoting=3,
-            names=["fname", "raw_text", "normalized_text"])
-        return metadata
-
-    def _get_example(self, metadatum):
-        """All the code for generating an Example from a metadatum. If you want a 
-        different preprocessing pipeline, you can override this method. 
-        This method may require several processor, each of which has a lot of options.
-        In this case, you'd better pass a composed transform and pass it to the init
-        method.
-        """
-
-        fname, raw_text, normalized_text = metadatum
-        wav_path = os.path.join(self.root, "wavs", fname + ".wav")
-
-        # load -> trim -> preemphasis -> stft -> magnitude -> mel_scale -> logscale -> normalize
-        wav, sample_rate = librosa.load(
-            wav_path,
-            sr=None)  # we would rather use functor to hold its parameters
-        trimed, _ = librosa.effects.trim(wav)
-        preemphasized = librosa.effects.preemphasis(trimed)
-        D = librosa.stft(preemphasized)
-        mag, phase = librosa.magphase(D)
-        mel = librosa.feature.melspectrogram(S=mag)
-
-        mag = librosa.amplitude_to_db(S=mag)
-        mel = librosa.amplitude_to_db(S=mel)
-
-        ref_db = 20
-        max_db = 100
-        mel = np.clip((mel - ref_db + max_db) / max_db, 1e-8, 1)
-        mel = np.clip((mag - ref_db + max_db) / max_db, 1e-8, 1)
-
-        phonemes = np.array(
-            g2p.en.text_to_sequence(normalized_text), dtype=np.int64)
-        return (mag, mel, phonemes
-                )  # maybe we need to implement it as a map in the future
-
-    def _batch_examples(self, minibatch):
-        mag_batch = []
-        mel_batch = []
-        phoneme_batch = []
-        for example in minibatch:
-            mag, mel, phoneme = example
-            mag_batch.append(mag)
-            mel_batch.append(mel)
-            phoneme_batch.append(phoneme)
-        mag_batch = SpecBatcher(pad_value=0.)(mag_batch)
-        mel_batch = SpecBatcher(pad_value=0.)(mel_batch)
-        phoneme_batch = TextIDBatcher(pad_id=0)(phoneme_batch)
-        return (mag_batch, mel_batch, phoneme_batch)
-
-    def __getitem__(self, index):
-        metadatum = self.metadata.iloc[index]
-        example = self._get_example(metadatum)
-        return example
-
-    def __iter__(self):
-        for i in range(len(self)):
-            yield self[i]
-
-    def __len__(self):
-        return len(self.metadata)
--- a/parakeet/datasets/vctk.py
+++ b/parakeet/datasets/vctk.py
@ -1,99 +0,0 @@
-# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from pathlib import Path
-import pandas as pd
-from ruamel.yaml import YAML
-import io
-
-import librosa
-import numpy as np
-
-from parakeet.g2p.en import text_to_sequence
-from parakeet.data.dataset import Dataset
-from parakeet.data.datacargo import DataCargo
-from parakeet.data.batch import TextIDBatcher, WavBatcher
-
-
-class VCTK(Dataset):
-    def __init__(self, root):
-        assert isinstance(root, (
-            str, Path)), "root should be a string or Path object"
-        self.root = root if isinstance(root, Path) else Path(root)
-        self.text_root = self.root.joinpath("txt")
-        self.wav_root = self.root.joinpath("wav48")
-
-        if not (self.root.joinpath("metadata.csv").exists() and
-                self.root.joinpath("speaker_indices.yaml").exists()):
-            self._prepare_metadata()
-        self.speaker_indices, self.metadata = self._load_metadata()
-
-    def _load_metadata(self):
-        yaml = YAML(typ='safe')
-        speaker_indices = yaml.load(self.root.joinpath("speaker_indices.yaml"))
-        metadata = pd.read_csv(
-            self.root.joinpath("metadata.csv"), sep="|", quoting=3, header=1)
-        return speaker_indices, metadata
-
-    def _prepare_metadata(self):
-        metadata = []
-        speaker_to_index = {}
-        for i, speaker_folder in enumerate(self.text_root.iterdir()):
-            if speaker_folder.is_dir():
-                speaker_to_index[speaker_folder.name] = i
-                for text_file in speaker_folder.iterdir():
-                    if text_file.is_file():
-                        with io.open(str(text_file)) as f:
-                            transcription = f.read().strip()
-                    wav_file = text_file.with_suffix(".wav")
-                    metadata.append(
-                        (wav_file.name, speaker_folder.name, transcription))
-        metadata = pd.DataFrame.from_records(
-            metadata, columns=["wave_file", "speaker", "text"])
-
-        # save them
-        yaml = YAML(typ='safe')
-        yaml.dump(speaker_to_index, self.root.joinpath("speaker_indices.yaml"))
-        metadata.to_csv(
-            self.root.joinpath("metadata.csv"),
-            sep="|",
-            quoting=3,
-            index=False)
-
-    def _get_example(self, metadatum):
-        wave_file, speaker, text = metadatum
-        wav_path = self.wav_root.joinpath(speaker, wave_file)
-        wav, sr = librosa.load(str(wav_path), sr=None)
-        phoneme_seq = np.array(text_to_sequence(text))
-        return wav, self.speaker_indices[speaker], phoneme_seq
-
-    def __getitem__(self, index):
-        metadatum = self.metadata.iloc[index]
-        example = self._get_example(metadatum)
-        return example
-
-    def __len__(self):
-        return len(self.metadata)
-
-    def _batch_examples(self, minibatch):
-        wav_batch, speaker_batch, phoneme_batch = [], [], []
-        for example in minibatch:
-            wav, speaker_id, phoneme_seq = example
-            wav_batch.append(wav)
-            speaker_batch.append(speaker_id)
-            phoneme_batch.append(phoneme_seq)
-        wav_batch = WavBatcher(pad_value=0.)(wav_batch)
-        speaker_batch = np.array(speaker_batch)
-        phoneme_batch = TextIDBatcher(pad_id=0)(phoneme_batch)
-        return wav_batch, speaker_batch, phoneme_batch
--- a/parakeet/models/clarinet.py
+++ b/parakeet/models/clarinet.py
@ -0,0 +1,156 @@
+import paddle
+from paddle import nn
+from paddle.nn import functional as F
+from paddle import distribution as D
+
+from parakeet.models.wavenet import WaveNet, UpsampleNet, crop
+
+class ParallelWaveNet(nn.LayerList):
+    def __init__(self, n_loops, n_layers, residual_channels, condition_dim,
+                 filter_size):
+        """ParallelWaveNet, an inverse autoregressive flow model, it contains several flows(WaveNets).
+
+        Args:
+            n_loops (List[int]): `n_loop` for each flow.
+            n_layers (List[int]): `n_layer` for each flow.
+            residual_channels (int): `residual_channels` for every flow.
+            condition_dim (int): `condition_dim` for every flow.
+            filter_size (int): `filter_size` for every flow.
+        """
+        super(ParallelWaveNet, self).__init__()
+        for n_loop, n_layer in zip(n_loops, n_layers):
+            # teacher's log_scale_min does not matter here; -100 is a dummy value
+            self.append(
+                WaveNet(n_loop, n_layer, residual_channels, 3, condition_dim,
+                        filter_size, "mog", -100.0))
+
+    def forward(self, z, condition=None):
+        """Transform a random noise sampled from a standard Gaussian distribution into sample from the target distribution. And output the mean and log standard deviation of the output distribution.
+
+        Args:
+            z (Variable): shape(B, T), random noise sampled from a standard Gaussian distribution.
+            condition (Variable, optional): shape(B, F, T), dtype float, the upsampled condition. Defaults to None.
+
+        Returns:
+            (z, out_mu, out_log_std)
+            z (Variable): shape(B, T), dtype float, transformed noise, it is the synthesized waveform.
+            out_mu (Variable): shape(B, T), dtype float, means of the output distributions.
+            out_log_std (Variable): shape(B, T), dtype float, log standard deviations of the output distributions.
+        """
+        for i, flow in enumerate(self):
+            theta = flow(z, condition)  # w, mu, log_std [0: T]
+            w, mu, log_std = paddle.chunk(theta, 3, axis=-1)  # (B, T, 1) for each
+            mu = paddle.squeeze(mu, -1)  #[0: T]
+            log_std = paddle.squeeze(log_std, -1)  #[0: T]
+            z = z * paddle.exp(log_std) + mu  #[0: T]
+
+            if i == 0:
+                out_mu = mu
+                out_log_std = log_std
+            else:
+                out_mu = out_mu * paddle.exp(log_std) + mu
+                out_log_std += log_std
+
+        return z, out_mu, out_log_std
+
+
+# Gaussian IAF model
+class Clarinet(nn.Layer):
+    def __init__(self, encoder, teacher, student, stft,
+                 min_log_scale=-6.0, lmd=4.0):
+        """Clarinet model. Conditional Parallel WaveNet.
+
+        Args:
+            encoder (UpsampleNet): an UpsampleNet to upsample mel spectrogram.
+            teacher (WaveNet): a WaveNet, the teacher.
+            student (ParallelWaveNet): a ParallelWaveNet model, the student.
+            stft (STFT): a STFT model to perform differentiable stft transform.
+            min_log_scale (float, optional): used only for computing loss, the minimal value of log standard deviation of the output distribution of both the teacher and the student. Defaults to -6.0.
+            lmd (float, optional): weight for the regularization term of the loss. Defaults to 4.0.
+        """
+        super(Clarinet, self).__init__()
+        self.encoder = encoder
+        self.teacher = teacher
+        self.student = student
+        self.stft = stft
+
+        self.lmd = lmd
+        self.min_log_scale = min_log_scale
+
+    def forward(self, audio, mel, audio_start, clip_kl=True):
+        """Compute loss of Clarinet model.
+
+        Args:
+            audio (Variable): shape(B, T_audio), dtype float32, ground truth waveform.
+            mel (Variable): shape(B, F, T_mel), dtype float32, condition(mel spectrogram here).
+            audio_start (Variable): shape(B, ), dtype int64, audio start positions.
+            clip_kl (bool, optional): whether to clip the element-wise kl divergence into [-100, 10] before averaging. Defaults to True.
+
+        Returns:
+            Dict(str, Variable)
+            loss (Variable): shape(1, ), dtype float32, total loss.
+            kl_divergence (Variable): shape(1, ), dtype float32, kl divergence of the student's output distribution from the teacher's output distribution, i.e. KL(student || teacher).
+            regularization (Variable): shape(1, ), dtype float32, a regularization term of the KL divergence.
+            stft_loss (Variable): shape(1, ), dtype float, stft loss, the mean squared error between the magnitudes of the spectrograms of the ground truth waveform and synthesized waveform.
+        """
+        batch_size, audio_length = audio.shape  # audio clip's length
+
+        z = paddle.randn(audio.shape)
+        condition = self.encoder(mel)  # (B, C, T)
+        condition_slice = crop(condition, audio_start, audio_length)
+
+        x, s_means, s_scales = self.student(z, condition_slice)  # all [0: T]
+        s_means = s_means[:, 1:]  # (B, T-1), time steps [1: T]
+        s_scales = s_scales[:, 1:]  # (B, T-1), time steps [1: T]
+        s_clipped_scales = paddle.clip(s_scales, self.min_log_scale, 100.)
+
+        # teacher outputs single gaussian
+        y = self.teacher(x[:, :-1], condition_slice[:, :, 1:])
+        _, t_means, t_scales = paddle.chunk(y, 3, axis=-1)  # time steps [1: T]
+        t_means = paddle.squeeze(t_means, [-1])  # (B, T-1), time steps [1: T]
+        t_scales = paddle.squeeze(t_scales, [-1])  # (B, T-1), time steps [1: T]
+        t_clipped_scales = paddle.clip(t_scales, self.min_log_scale, 100.)
+
+        s_distribution = D.Normal(s_means, paddle.exp(s_clipped_scales))
+        t_distribution = D.Normal(t_means, paddle.exp(t_clipped_scales))
+
+        # KL divergence between the two Normals is computed analytically, so no Monte Carlo sampling is needed
+        kl = s_distribution.kl_divergence(t_distribution)
+        if clip_kl:
+            kl = paddle.clip(kl, -100., 10.)
+        # context size dropped
+        kl = paddle.reduce_mean(kl[:, self.teacher.context_size:])
+        # major diff here
+        regularization = F.mse_loss(t_scales[:, self.teacher.context_size:],
+                                    s_scales[:, self.teacher.context_size:])
+
+        # introduce information from real target
+        spectrogram_frame_loss = F.mse_loss(
+            self.stft.magnitude(audio), self.stft.magnitude(x))
+        loss = kl + self.lmd * regularization + spectrogram_frame_loss
+        loss_dict = {
+            "loss": loss,
+            "kl_divergence": kl,
+            "regularization": regularization,
+            "stft_loss": spectrogram_frame_loss
+        }
+        return loss_dict
+
+    @paddle.no_grad()
+    def synthesis(self, mel):
+        """Synthesize waveform using the encoder and the student network.
+
+        Args:
+            mel (Variable): shape(B, F, T_mel), the condition(mel spectrogram here).
+
+        Returns:
+            Variable: shape(B, T_audio), the synthesized waveform. (T_audio = T_mel * upscale_factor, where upscale_factor is the `upscale_factor` of the encoder.)
+        """
+        condition = self.encoder(mel)
+        samples_shape = (condition.shape[0], condition.shape[-1])
+        z = paddle.randn(samples_shape)
+        x, s_means, s_scales = self.student(z, condition)
+        return x
+    
+
+# TODO(chenfeiyu): ClariNetLoss
--- a/parakeet/models/clarinet/init.py
+++ b/parakeet/models/clarinet/init.py
@ -1,16 +0,0 @@
-# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from .net import *
-from .parallel_wavenet import *
--- a/parakeet/models/clarinet/net.py
+++ b/parakeet/models/clarinet/net.py
@ -1,221 +0,0 @@
-# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import division
-import itertools
-import numpy as np
-from scipy import signal
-from tqdm import trange
-
-import paddle.fluid.layers as F
-import paddle.fluid.dygraph as dg
-import paddle.fluid.initializer as I
-import paddle.fluid.layers.distributions as D
-
-from parakeet.modules.weight_norm import Conv2DTranspose
-from parakeet.models.wavenet import crop, WaveNet, UpsampleNet
-from parakeet.models.clarinet.parallel_wavenet import ParallelWaveNet
-from parakeet.models.clarinet.utils import conv2d
-
-
-# Gaussian IAF model
-class Clarinet(dg.Layer):
-    def __init__(self,
-                 encoder,
-                 teacher,
-                 student,
-                 stft,
-                 min_log_scale=-6.0,
-                 lmd=4.0):
-        """Clarinet model.
-
-        Args:
-            encoder (UpsampleNet): an UpsampleNet to upsample mel spectrogram.
-            teacher (WaveNet): a WaveNet, the teacher.
-            student (ParallelWaveNet): a ParallelWaveNet model, the student.
-            stft (STFT): a STFT model to perform differentiable stft transform.
-            min_log_scale (float, optional): used only for computing loss, the minimal value of log standard deviation of the output distribution of both the teacher and the student . Defaults to -6.0.
-            lmd (float, optional): weight for stft loss. Defaults to 4.0.
-        """
-        super(Clarinet, self).__init__()
-        self.encoder = encoder
-        self.teacher = teacher
-        self.student = student
-        self.stft = stft
-
-        self.lmd = lmd
-        self.min_log_scale = min_log_scale
-
-    def forward(self, audio, mel, audio_start, clip_kl=True):
-        """Compute loss of Clarinet model.
-
-        Args:
-            audio (Variable): shape(B, T_audio), dtype flaot32, ground truth waveform.
-            mel (Variable): shape(B, F, T_mel), dtype flaot32, condition(mel spectrogram here).
-            audio_start (Variable): shape(B, ), dtype int64, audio starts positions.
-            clip_kl (bool, optional): whether to clip kl_loss by maximum=100. Defaults to True.
-
-        Returns:
-            Dict(str, Variable)
-            loss (Variable): shape(1, ), dtype flaot32, total loss.
-            kl (Variable): shape(1, ), dtype flaot32, kl divergence between the teacher's output distribution and student's output distribution.
-            regularization (Variable): shape(1, ), dtype flaot32, a regularization term of the KL divergence.
-            spectrogram_frame_loss (Variable): shape(1, ), dytpe: float, stft loss, the L1-distance of the magnitudes of the spectrograms of the ground truth waveform and synthesized waveform.
-        """
-        batch_size, audio_length = audio.shape  # audio clip's length
-
-        z = F.gaussian_random(audio.shape)
-        condition = self.encoder(mel)  # (B, C, T)
-        condition_slice = crop(condition, audio_start, audio_length)
-
-        x, s_means, s_scales = self.student(z, condition_slice)  # all [0: T]
-        s_means = s_means[:, 1:]  # (B, T-1), time steps [1: T]
-        s_scales = s_scales[:, 1:]  # (B, T-1), time steps [1: T]
-        s_clipped_scales = F.clip(s_scales, self.min_log_scale, 100.)
-
-        # teacher outputs single gaussian
-        y = self.teacher(x[:, :-1], condition_slice[:, :, 1:])
-        _, t_means, t_scales = F.split(y, 3, -1)  # time steps [1: T]
-        t_means = F.squeeze(t_means, [-1])  # (B, T-1), time steps [1: T]
-        t_scales = F.squeeze(t_scales, [-1])  # (B, T-1), time steps [1: T]
-        t_clipped_scales = F.clip(t_scales, self.min_log_scale, 100.)
-
-        s_distribution = D.Normal(s_means, F.exp(s_clipped_scales))
-        t_distribution = D.Normal(t_means, F.exp(t_clipped_scales))
-
-        # kl divergence loss, so we only need to sample once? no MC
-        kl = s_distribution.kl_divergence(t_distribution)
-        if clip_kl:
-            kl = F.clip(kl, -100., 10.)
-        # context size dropped
-        kl = F.reduce_mean(kl[:, self.teacher.context_size:])
-        # major diff here
-        regularization = F.mse_loss(t_scales[:, self.teacher.context_size:],
-                                    s_scales[:, self.teacher.context_size:])
-
-        # introduce information from real target
-        spectrogram_frame_loss = F.mse_loss(
-            self.stft.magnitude(audio), self.stft.magnitude(x))
-        loss = kl + self.lmd * regularization + spectrogram_frame_loss
-        loss_dict = {
-            "loss": loss,
-            "kl_divergence": kl,
-            "regularization": regularization,
-            "stft_loss": spectrogram_frame_loss
-        }
-        return loss_dict
-
-    @dg.no_grad
-    def synthesis(self, mel):
-        """Synthesize waveform using the encoder and the student network.
-
-        Args:
-            mel (Variable): shape(B, F, T_mel), the condition(mel spectrogram here).
-
-        Returns:
-            Variable: shape(B, T_audio), the synthesized waveform. (T_audio = T_mel * upscale_factor, where upscale_factor is the `upscale_factor` of the encoder.)
-        """
-        condition = self.encoder(mel)
-        samples_shape = (condition.shape[0], condition.shape[-1])
-        z = F.gaussian_random(samples_shape)
-        x, s_means, s_scales = self.student(z, condition)
-        return x
-
-
-class STFT(dg.Layer):
-    def __init__(self, n_fft, hop_length, win_length, window="hanning"):
-        """A module for computing differentiable stft transform. See `librosa.stft` for more details.
-
-        Args:
-            n_fft (int): number of samples in a frame.
-            hop_length (int): number of samples shifted between adjacent frames.
-            win_length (int): length of the window function.
-            window (str, optional): name of window function, see `scipy.signal.get_window` for more details. Defaults to "hanning".
-        """
-        super(STFT, self).__init__()
-        self.hop_length = hop_length
-        self.n_bin = 1 + n_fft // 2
-        self.n_fft = n_fft
-
-        # calculate window
-        window = signal.get_window(window, win_length)
-        if n_fft != win_length:
-            pad = (n_fft - win_length) // 2
-            window = np.pad(window, ((pad, pad), ), 'constant')
-
-        # calculate weights
-        r = np.arange(0, n_fft)
-        M = np.expand_dims(r, -1) * np.expand_dims(r, 0)
-        w_real = np.reshape(window *
-                            np.cos(2 * np.pi * M / n_fft)[:self.n_bin],
-                            (self.n_bin, 1, 1, self.n_fft)).astype("float32")
-        w_imag = np.reshape(window *
-                            np.sin(-2 * np.pi * M / n_fft)[:self.n_bin],
-                            (self.n_bin, 1, 1, self.n_fft)).astype("float32")
-
-        w = np.concatenate([w_real, w_imag], axis=0)
-        self.weight = dg.to_variable(w)
-
-    def forward(self, x):
-        """Compute the stft transform.
-
-        Args:
-            x (Variable): shape(B, T), dtype flaot32, the input waveform.
-
-        Returns:
-            (real, imag)
-            real (Variable): shape(B, C, 1, T), dtype flaot32, the real part of the spectrogram. (C = 1 + n_fft // 2)
-            imag (Variable): shape(B, C, 1, T), dtype flaot32, the image part of the spectrogram. (C = 1 + n_fft // 2) 
-        """
-        # x(batch_size, time_steps)
-        # pad it first with reflect mode
-        pad_start = F.reverse(x[:, 1:1 + self.n_fft // 2], axis=1)
-        pad_stop = F.reverse(x[:, -(1 + self.n_fft // 2):-1], axis=1)
-        x = F.concat([pad_start, x, pad_stop], axis=-1)
-
-        # to BC1T, C=1
-        x = F.unsqueeze(x, axes=[1, 2])
-        out = conv2d(x, self.weight, stride=(1, self.hop_length))
-        real, imag = F.split(out, 2, dim=1)  # BC1T
-        return real, imag
-
-    def power(self, x):
-        """Compute the power spectrogram.
-
-        Args:
-            (real, imag)
-            real (Variable): shape(B, C, 1, T), dtype flaot32, the real part of the spectrogram.
-            imag (Variable): shape(B, C, 1, T), dtype flaot32, the image part of the spectrogram.
-
-        Returns:
-            Variable: shape(B, C, 1, T), dtype flaot32, the power spectrogram.
-        """
-        real, imag = self(x)
-        power = real**2 + imag**2
-        return power
-
-    def magnitude(self, x):
-        """Compute the magnitude spectrogram.
-
-        Args:
-            (real, imag)
-            real (Variable): shape(B, C, 1, T), dtype flaot32, the real part of the spectrogram.
-            imag (Variable): shape(B, C, 1, T), dtype flaot32, the image part of the spectrogram.
-
-        Returns:
-            Variable: shape(B, C, 1, T), dtype flaot32, the magnitude spectrogram. It is the square root of the power spectrogram.
-        """
-        power = self.power(x)
-        magnitude = F.sqrt(power)
-        return magnitude
--- a/parakeet/models/clarinet/parallel_wavenet.py
+++ b/parakeet/models/clarinet/parallel_wavenet.py
@ -1,77 +0,0 @@
-# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import division
-import math
-import time
-import itertools
-import numpy as np
-
-import paddle.fluid.layers as F
-import paddle.fluid.dygraph as dg
-import paddle.fluid.initializer as I
-import paddle.fluid.layers.distributions as D
-
-from parakeet.modules.weight_norm import Linear, Conv1D, Conv1DCell, Conv2DTranspose
-from parakeet.models.wavenet import WaveNet
-
-
-class ParallelWaveNet(dg.Layer):
-    def __init__(self, n_loops, n_layers, residual_channels, condition_dim,
-                 filter_size):
-        """ParallelWaveNet, an inverse autoregressive flow model, it contains several flows(WaveNets).
-
-        Args:
-            n_loops (List[int]): `n_loop` for each flow.
-            n_layers (List[int]): `n_layer` for each flow.
-            residual_channels (int): `residual_channels` for every flow.
-            condition_dim (int): `condition_dim` for every flow.
-            filter_size (int): `filter_size` for every flow.
-        """
-        super(ParallelWaveNet, self).__init__()
-        self.flows = dg.LayerList()
-        for n_loop, n_layer in zip(n_loops, n_layers):
-            # teacher's log_scale_min does not matter herem, -100 is a dummy value
-            self.flows.append(
-                WaveNet(n_loop, n_layer, residual_channels, 3, condition_dim,
-                        filter_size, "mog", -100.0))
-
-    def forward(self, z, condition=None):
-        """Transform a random noise sampled from a standard Gaussian distribution into sample from the target distribution. And output the mean and log standard deviation of the output distribution.
-
-        Args:
-            z (Variable): shape(B, T), random noise sampled from a standard gaussian disribution.
-            condition (Variable, optional): shape(B, F, T), dtype float, the upsampled condition. Defaults to None.
-
-        Returns:
-            (z, out_mu, out_log_std)
-            z (Variable): shape(B, T), dtype float, transformed noise, it is the synthesized waveform.
-            out_mu (Variable): shape(B, T), dtype float, means of the output distributions.
-            out_log_std (Variable): shape(B, T), dtype float, log standard deviations of the output distributions.
-        """
-        for i, flow in enumerate(self.flows):
-            theta = flow(z, condition)  # w, mu, log_std [0: T]
-            w, mu, log_std = F.split(theta, 3, dim=-1)  # (B, T, 1) for each
-            mu = F.squeeze(mu, [-1])  #[0: T]
-            log_std = F.squeeze(log_std, [-1])  #[0: T]
-            z = z * F.exp(log_std) + mu  #[0: T]
-
-            if i == 0:
-                out_mu = mu
-                out_log_std = log_std
-            else:
-                out_mu = out_mu * F.exp(log_std) + mu
-                out_log_std += log_std
-
-        return z, out_mu, out_log_std
--- a/parakeet/models/clarinet/utils.py
+++ b/parakeet/models/clarinet/utils.py
@ -1,38 +0,0 @@
-# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import division
-
-from paddle import fluid
-from paddle.fluid.core import ops
-
-
-@fluid.framework.dygraph_only
-def conv2d(input,
-           weight,
-           stride=(1, 1),
-           padding=((0, 0), (0, 0)),
-           dilation=(1, 1),
-           groups=1,
-           use_cudnn=True,
-           data_format="NCHW"):
-    padding = tuple(pad for pad_dim in padding for pad in pad_dim)
-
-    attrs = ('strides', stride, 'paddings', padding, 'dilations', dilation,
-             'groups', groups, 'use_cudnn', use_cudnn, 'use_mkldnn', False,
-             'fuse_relu_before_depthwise_conv', False, "padding_algorithm",
-             "EXPLICIT", "data_format", data_format)
-
-    out = ops.conv2d(input, weight, *attrs)
-    return out
--- a/parakeet/models/deepvoice3/model.py
+++ b/parakeet/models/deepvoice3/model.py
@ -1,35 +1,14 @@
-import numpy as np
 import math
+import numpy as np
+
 import paddle
-from paddle import fluid
-from paddle.fluid import layers as F
-from paddle.fluid import initializer as I
-from paddle.fluid import dygraph as dg
+from paddle import nn
+from paddle.nn import functional as F
+from paddle.nn import initializer as I

-from .conv import Conv1D
-from .weight_norm_hook import weight_norm, remove_weight_norm
+from parakeet.modules import positional_encoding as pe

-def positional_encoding(tensor, start_index, omega):
-    """
-    tensor: a reference tensor we use to get shape. actually only T and C are needed. Shape(B, T, C)
-    start_index: int, we can actually use start and length to specify them.
-    omega (B,): speaker position rates
-
-    return (B, T, C), position embedding
-    """
-    dtype = omega.dtype
-    _, length, dimension = tensor.shape
-    index = F.range(start_index, start_index + length, 1, dtype=dtype)
-    channel = F.range(0, dimension, 2, dtype=dtype)
-
-    p = F.unsqueeze(omega, [1, 2]) \
-      * F.unsqueeze(index, [1]) \
-      / (10000 ** (channel / float(dimension)))
-
-    encodings = F.concat([F.sin(p), F.cos(p)], axis=2)
-    return encodings
-
-class ConvBlock(dg.Layer):
+class ConvBlock(nn.Layer):
    def __init__(self, in_channel, kernel_size, causal=False, has_bias=False, 
                 bias_dim=None, keep_prob=1.):
        super(ConvBlock, self).__init__()
@ -38,55 +17,56 @@ class ConvBlock(dg.Layer):
        self.in_channel = in_channel
        self.has_bias = has_bias

-        std = np.sqrt(4 * keep_prob / (kernel_size * in_channel))
+        std = math.sqrt(4 * keep_prob / (kernel_size * in_channel))
        padding = "valid" if causal else "same"
-        conv =  Conv1D(in_channel, 2 * in_channel, (kernel_size, ),
-                       padding=padding, 
-                       data_format="NTC",
-                       param_attr=I.Normal(scale=std))
-        self.conv = weight_norm(conv)
+        conv = nn.Conv1d(in_channel, 2 * in_channel, (kernel_size, ),
+                         padding=padding, 
+                         data_format="NLC",
+                         weight_attr=I.Normal(scale=std))
+        self.conv = nn.utils.weight_norm(conv)
        if has_bias:
-            std = np.sqrt(1 / bias_dim)
-            self.bias_affine = dg.Linear(bias_dim, 2 * in_channel, param_attr=I.Normal(scale=std))
+            std = math.sqrt(1 / bias_dim)
+            self.bias_affine = nn.Linear(bias_dim, 2 * in_channel, 
+                                         weight_attr=I.Normal(scale=std))

    def forward(self, input, bias=None, padding=None):
        """
        input: input feature (B, T, C)
        padding: only used when using causal conv, we pad mannually
        """
-        input_dropped = F.dropout(input, 1. - self.keep_prob,
-                                  dropout_implementation="upscale_in_train")
+        input_dropped = F.dropout(input, 1. - self.keep_prob, training=self.training)
        if self.causal:
            assert padding is not None
-            input_dropped = F.concat([padding, input_dropped], axis=1)
+            input_dropped = paddle.concat([padding, input_dropped], axis=1)
        hidden = self.conv(input_dropped)

        if self.has_bias:
            assert bias is not None
            transformed_bias = F.softsign(self.bias_affine(bias))
-            hidden_embedded = hidden + F.unsqueeze(transformed_bias, [1])
+            hidden_embedded = hidden + paddle.unsqueeze(transformed_bias, 1)
        else:
            hidden_embedded = hidden

        # glu
-        content, gate = F.split(hidden, num_or_sections=2, dim=-1)
+        content, gate = paddle.chunk(hidden, 2, axis=-1)
        content = hidden_embedded[:, :, :self.in_channel]
        hidden = F.sigmoid(gate) * content

        # # residual
-        hidden = F.scale(input + hidden, math.sqrt(0.5))
+        hidden = paddle.scale(input + hidden, math.sqrt(0.5))
        return hidden


-class AffineBlock1(dg.Layer):
+class AffineBlock1(nn.Layer):
    def __init__(self, in_channel, out_channel, has_bias=False, bias_dim=0):
        super(AffineBlock1, self).__init__()
-        std = np.sqrt(1.0 / in_channel)
-        affine = dg.Linear(in_channel, out_channel, param_attr=I.Normal(scale=std))
-        self.affine = weight_norm(affine, dim=-1)
+        std = math.sqrt(1.0 / in_channel)
+        affine = nn.Linear(in_channel, out_channel, weight_attr=I.Normal(scale=std))
+        self.affine = nn.utils.weight_norm(affine, dim=-1)
        if has_bias:
-            std = np.sqrt(1 / bias_dim)
-            self.bias_affine = dg.Linear(bias_dim, out_channel, param_attr=I.Normal(scale=std))
+            std = math.sqrt(1 / bias_dim)
+            self.bias_affine = nn.Linear(bias_dim, out_channel, 
+                                         weight_attr=I.Normal(scale=std))

        self.has_bias = has_bias
        self.bias_dim = bias_dim
@ -101,20 +81,20 @@ class AffineBlock1(dg.Layer):
        if self.has_bias:
            assert bias is not None
            transformed_bias = F.softsign(self.bias_affine(bias))
-            hidden += F.unsqueeze(transformed_bias, [1])
+            hidden += paddle.unsqueeze(transformed_bias, 1)
        return hidden


-class AffineBlock2(dg.Layer):
+class AffineBlock2(nn.Layer):
    def __init__(self, in_channel, out_channel,
                 has_bias=False, bias_dim=0, dropout=False, keep_prob=1.):
        super(AffineBlock2, self).__init__()
        if has_bias:
-            std = np.sqrt(1 / bias_dim)
-            self.bias_affine = dg.Linear(bias_dim, in_channel, param_attr=I.Normal(scale=std))
-        std = np.sqrt(1.0 / in_channel)
-        affine = dg.Linear(in_channel, out_channel, param_attr=I.Normal(scale=std))
-        self.affine = weight_norm(affine, dim=-1)
+            std = math.sqrt(1 / bias_dim)
+            self.bias_affine = nn.Linear(bias_dim, in_channel, weight_attr=I.Normal(scale=std))
+        std = math.sqrt(1.0 / in_channel)
+        affine = nn.Linear(in_channel, out_channel, weight_attr=I.Normal(scale=std))
+        self.affine = nn.utils.weight_norm(affine, dim=-1)

        self.has_bias = has_bias
        self.bias_dim = bias_dim
@ -130,22 +110,21 @@ class AffineBlock2(dg.Layer):
        """
        hidden = input
        if self.dropout:
-            hidden = F.dropout(hidden, 1. - self.keep_prob,
-                               dropout_implementation="upscale_in_train")
+            hidden = F.dropout(hidden, 1. - self.keep_prob, training=self.training)
        if self.has_bias:
            assert bias is not None
            transformed_bias = F.softsign(self.bias_affine(bias))
-            hidden += F.unsqueeze(transformed_bias, [1])
+            hidden += paddle.unsqueeze(transformed_bias, 1)
        hidden = F.relu(self.affine(hidden))
        return hidden


-class Encoder(dg.Layer):
+class Encoder(nn.Layer):
    def __init__(self, layers, in_channels, encoder_dim, kernel_size, 
                 has_bias=False, bias_dim=0, keep_prob=1.):
        super(Encoder, self).__init__()
        self.pre_affine = AffineBlock1(in_channels, encoder_dim, has_bias, bias_dim)
-        self.convs = dg.LayerList([
+        self.convs = nn.LayerList([
            ConvBlock(encoder_dim, kernel_size, False, has_bias, bias_dim, keep_prob) \
                for _ in range(layers)])
        self.post_affine = AffineBlock1(encoder_dim, in_channels, has_bias, bias_dim)
@ -156,11 +135,11 @@ class Encoder(dg.Layer):
            hidden = layer(hidden, speaker_embed)
        hidden = self.post_affine(hidden, speaker_embed)
        keys = hidden
-        values = F.scale(char_embed + hidden, np.sqrt(0.5))
+        values = paddle.scale(char_embed + hidden, math.sqrt(0.5))
        return keys, values


-class AttentionBlock(dg.Layer):
+class AttentionBlock(nn.Layer):
    def __init__(self, attention_dim, input_dim, position_encoding_weight=1., 
                 position_rate=1., reduction_factor=1, has_bias=False, bias_dim=0, 
                 keep_prob=1.):
@ -170,31 +149,37 @@ class AttentionBlock(dg.Layer):
        self.omega_default = omega_default
        # multispeaker case
        if has_bias:
-            std = np.sqrt(1.0 / bias_dim)
-            self.q_pos_affine = dg.Linear(bias_dim, 1, param_attr=I.Normal(scale=std))
-            self.k_pos_affine = dg.Linear(bias_dim, 1, param_attr=I.Normal(scale=std))
+            std = math.sqrt(1.0 / bias_dim)
+            self.q_pos_affine = nn.Linear(bias_dim, 1, weight_attr=I.Normal(scale=std))
+            self.k_pos_affine = nn.Linear(bias_dim, 1, weight_attr=I.Normal(scale=std))
            self.omega_initial = self.create_parameter(shape=[1], 
-                attr=I.ConstantInitializer(value=omega_default))
+                attr=I.Constant(value=omega_default))
        
        # mind the fact that q, k, v have the same feature dimension
        # so we can init k_affine and q_affine's weight as the same matrix
        # to get a better init attention
+        dtype = self.omega_initial.numpy().dtype
        init_weight = np.random.normal(size=(input_dim, attention_dim),
-                                       scale=np.sqrt(1. / input_dim))
-        initializer = I.NumpyArrayInitializer(init_weight.astype(np.float32))
+                                       scale=np.sqrt(1. / input_dim)).astype(dtype)
+        # TODO(chenfeiyu): to report an issue, there is no such initializer
+        #initializer = paddle.fluid.initializer.NumpyArrayInitializer(init_weight)
        # 3 affine transformation to project q, k, v into attention_dim
-        q_affine = dg.Linear(input_dim, attention_dim, param_attr=initializer)
-        self.q_affine = weight_norm(q_affine, dim=-1)
-        k_affine = dg.Linear(input_dim, attention_dim, param_attr=initializer)
-        self.k_affine = weight_norm(k_affine, dim=-1)
+        q_affine = nn.Linear(input_dim, attention_dim)
+        self.q_affine = nn.utils.weight_norm(q_affine, dim=-1)
+        k_affine = nn.Linear(input_dim, attention_dim)
+        self.k_affine = nn.utils.weight_norm(k_affine, dim=-1)
+        
+        # set the weights directly instead, since NumpyArrayInitializer does not support float64
+        self.q_affine.weight.set_value(init_weight)
+        self.k_affine.weight.set_value(init_weight)

        std = np.sqrt(1.0 / input_dim)
-        v_affine = dg.Linear(input_dim, attention_dim, param_attr=I.Normal(scale=std))
-        self.v_affine = weight_norm(v_affine, dim=-1)
+        v_affine = nn.Linear(input_dim, attention_dim, weight_attr=I.Normal(scale=std))
+        self.v_affine = nn.utils.weight_norm(v_affine, dim=-1)

        std = np.sqrt(1.0 / attention_dim)
-        out_affine = dg.Linear(attention_dim, input_dim, param_attr=I.Normal(scale=std))
-        self.out_affine = weight_norm(out_affine, dim=-1)
+        out_affine = nn.Linear(attention_dim, input_dim, weight_attr=I.Normal(scale=std))
+        self.out_affine = nn.utils.weight_norm(out_affine, dim=-1)

        self.keep_prob = keep_prob
        self.has_bias = has_bias
@ -204,28 +189,30 @@ class AttentionBlock(dg.Layer):

    def forward(self, q, k, v, lengths, speaker_embed, start_index, 
                force_monotonic=False, prev_coeffs=None, window=None):
+        dtype = self.omega_initial.dtype
        # add position encoding as an inductive bias 
        if self.has_bias: # multi-speaker model
            omega_q = 2 * F.sigmoid(
-                F.squeeze(self.q_pos_affine(speaker_embed), axes=[-1]))
-            omega_k = 2 * self.omega_initial * F.sigmoid(F.squeeze(
-                self.k_pos_affine(speaker_embed), axes=[-1]))
+                paddle.squeeze(self.q_pos_affine(speaker_embed), -1))
+            omega_k = 2 * self.omega_initial * F.sigmoid(paddle.squeeze(
+                self.k_pos_affine(speaker_embed), -1))
        else: # single-speaker case
            batch_size = q.shape[0]
-            omega_q = F.ones((batch_size, ), dtype="float32")
-            omega_k = F.ones((batch_size, ), dtype="float32") * self.omega_default
-        q += self.position_encoding_weight * positional_encoding(q, start_index, omega_q)
-        k += self.position_encoding_weight * positional_encoding(k, 0, omega_k)
+            omega_q = paddle.ones((batch_size, ), dtype=dtype)
+            omega_k = paddle.ones((batch_size, ), dtype=dtype) * self.omega_default
+        q += self.position_encoding_weight * pe.scalable_positional_encoding(start_index, q.shape[1], q.shape[-1], omega_q)
+        k += self.position_encoding_weight * pe.scalable_positional_encoding(0, k.shape[1], k.shape[-1], omega_k)
+        

        q, k, v = self.q_affine(q), self.k_affine(k), self.v_affine(v)
-        activations = F.matmul(q, k, transpose_y=True)
-        activations /= np.sqrt(self.attention_dim)
+        activations = paddle.matmul(q, k, transpose_y=True)
+        activations /= math.sqrt(self.attention_dim)

        if self.training:
            # mask the <pad> parts from the encoder
-            mask = F.sequence_mask(lengths, dtype="float32")
-            attn_bias = F.scale(1. - mask, -1000)
-            activations += F.unsqueeze(attn_bias, [1])
+            mask = paddle.fluid.layers.sequence_mask(lengths, dtype=dtype)
+            attn_bias = paddle.scale(1. - mask, -1000)
+            activations += paddle.unsqueeze(attn_bias, 1)
        elif force_monotonic:
            assert window is not None
            backward_step, forward_step = window
@ -233,31 +220,30 @@ class AttentionBlock(dg.Layer):
            batch_size, T_dec, _ = q.shape

            # actually T_dec = 1 here
-            alpha = F.fill_constant((batch_size, T_dec), value=0, dtype="int64") \
+            alpha = paddle.fill_constant((batch_size, T_dec), value=0, dtype="int64") \
                   if prev_coeffs is None \
-                   else F.argmax(prev_coeffs, axis=-1)
-            backward = F.sequence_mask(alpha - backward_step, maxlen=T_enc, dtype="bool")
-            forward = F.sequence_mask(alpha + forward_step, maxlen=T_enc, dtype="bool")
-            mask = F.cast(F.logical_xor(backward, forward), "float32")
+                   else paddle.argmax(prev_coeffs, axis=-1)
+            backward = paddle.fluid.layers.sequence_mask(alpha - backward_step, maxlen=T_enc, dtype="bool")
+            forward = paddle.fluid.layers.sequence_mask(alpha + forward_step, maxlen=T_enc, dtype="bool")
+            mask = paddle.cast(paddle.logical_xor(backward, forward), activations.dtype)
            # print("mask's shape:", mask.shape)
-            attn_bias = F.scale(1. - mask, -1000)
+            attn_bias = paddle.scale(1. - mask, -1000)
            activations += attn_bias

        # softmax
        coefficients = F.softmax(activations, axis=-1)
        # context vector
-        coefficients = F.dropout(coefficients, 1. - self.keep_prob,
-                                 dropout_implementation='upscale_in_train')
-        contexts = F.matmul(coefficients, v)
+        coefficients = F.dropout(coefficients, 1. - self.keep_prob, training=self.training)
+        contexts = paddle.matmul(coefficients, v)
        # context normalization
-        enc_lengths = F.cast(F.unsqueeze(lengths, axes=[1, 2]), "float32")
-        contexts *= F.sqrt(enc_lengths)
+        enc_lengths = paddle.cast(paddle.unsqueeze(lengths, axis=[1, 2]), contexts.dtype)
+        contexts *= paddle.sqrt(enc_lengths)
        # out affine
        contexts = self.out_affine(contexts)
        return contexts, coefficients
+    

-
-class Decoder(dg.Layer):
+class Decoder(nn.Layer):
    def __init__(self, in_channels, reduction_factor, prenet_sizes, 
                layers, kernel_size, attention_dim,
                position_encoding_weight=1., omega=1., 
@ -265,7 +251,7 @@ class Decoder(dg.Layer):
        super(Decoder, self).__init__()
        # prenet: mind the difference between AffineBlock2 and AffineBlock1
        c_in = in_channels
-        self.prenet = dg.LayerList()
+        self.prenet = nn.LayerList()
        for i, c_out in enumerate(prenet_sizes):
            affine = AffineBlock2(c_in, c_out, has_bias, bias_dim, dropout=(i!=0), keep_prob=keep_prob)
            self.prenet.append(affine)
@ -273,8 +259,8 @@ class Decoder(dg.Layer):
        
        # causal convolutions + multihop attention
        decoder_dim = prenet_sizes[-1]
-        self.causal_convs = dg.LayerList()
-        self.attention_blocks = dg.LayerList()
+        self.causal_convs = nn.LayerList()
+        self.attention_blocks = nn.LayerList()
        for i in range(layers):
            conv = ConvBlock(decoder_dim, kernel_size, True, has_bias, bias_dim, keep_prob)
            attn = AttentionBlock(attention_dim, decoder_dim, position_encoding_weight, omega, reduction_factor, has_bias, bias_dim, keep_prob)
@ -283,12 +269,12 @@ class Decoder(dg.Layer):

        # output mel spectrogram
        output_dim = reduction_factor * in_channels # r * mel_dim
-        std = np.sqrt(1.0 / decoder_dim)
-        out_affine = dg.Linear(decoder_dim, output_dim, param_attr=I.Normal(scale=std))
-        self.out_affine = weight_norm(out_affine, dim=-1)
+        std = math.sqrt(1.0 / decoder_dim)
+        out_affine = nn.Linear(decoder_dim, output_dim, weight_attr=I.Normal(scale=std))
+        self.out_affine = nn.utils.weight_norm(out_affine, dim=-1)
        if has_bias:
-            std = np.sqrt(1 / bias_dim)
-            self.out_sp_affine = dg.Linear(bias_dim, output_dim, param_attr=I.Normal(scale=std))
+            std = math.sqrt(1 / bias_dim)
+            self.out_sp_affine = nn.Linear(bias_dim, output_dim, weight_attr=I.Normal(scale=std))

        self.has_bias = has_bias
        self.kernel_size = kernel_size
@ -311,10 +297,10 @@ class Decoder(dg.Layer):

        for i in range(len(self.causal_convs)):
            if state is None:
-                padding = F.zeros(causal_padding_shape, dtype="float32")
+                padding = paddle.zeros(causal_padding_shape, dtype=inputs.dtype)
            else:
                padding = state[i]
-            new_state = F.concat([padding, hidden], axis=1) # => to be used next step
+            new_state = paddle.concat([padding, hidden], axis=1) # => to be used next step
            # causal conv, (B, T, C)
            hidden = self.causal_convs[i](hidden, speaker_embed, padding=padding)
            # attn
@ -324,7 +310,7 @@ class Decoder(dg.Layer):
                hidden, keys, values, lengths, speaker_embed, 
                start_index, force_monotonic, prev_coeffs, window)
            # residual connection (B, T_dec, C_dec)
-            hidden = F.scale(hidden + context, np.sqrt(0.5))
+            hidden = paddle.scale(hidden + context, math.sqrt(0.5))

            attentions.append(attention) # layers * (B, T_dec, T_enc)
            # new state: shift a step, layers * (B, T, C)
@ -334,34 +320,35 @@ class Decoder(dg.Layer):
        # predict mel spectrogram (B, T_dec, r * C_in)
        decoded = self.out_affine(hidden)
        if self.has_bias:
-            decoded *= F.sigmoid(F.unsqueeze(self.out_sp_affine(speaker_embed), [1]))
+            decoded *= F.sigmoid(paddle.unsqueeze(self.out_sp_affine(speaker_embed), 1))
        return decoded, hidden, attentions, final_state


-class PostNet(dg.Layer):
+class PostNet(nn.Layer):
    def __init__(self, layers, in_channels, postnet_dim, kernel_size, out_channels, upsample_factor, has_bias=False, bias_dim=0, keep_prob=1.):
        super(PostNet, self).__init__()
        self.pre_affine = AffineBlock1(in_channels, postnet_dim, has_bias, bias_dim)
-        self.convs = dg.LayerList([
+        self.convs = nn.LayerList([
            ConvBlock(postnet_dim, kernel_size, False, has_bias, bias_dim, keep_prob) for _ in range(layers)
        ])
-        std = np.sqrt(1.0 / postnet_dim)
-        post_affine = dg.Linear(postnet_dim, out_channels, param_attr=I.Normal(scale=std))
-        self.post_affine = weight_norm(post_affine, dim=-1)
+        std = math.sqrt(1.0 / postnet_dim)
+        post_affine = nn.Linear(postnet_dim, out_channels, weight_attr=I.Normal(scale=std))
+        self.post_affine = nn.utils.weight_norm(post_affine, dim=-1)
        self.upsample_factor = upsample_factor

    def forward(self, hidden, speaker_embed=None):
        hidden = self.pre_affine(hidden, speaker_embed)
        batch_size, time_steps, channels = hidden.shape # pylint: disable=unused-variable
-        hidden = F.expand(hidden, [1, 1, self.upsample_factor])
-        hidden = F.reshape(hidden, [batch_size, -1, channels])
+        # NOTE: paddle.expand can only expand dimension whose size is 1
+        hidden = paddle.expand(paddle.unsqueeze(hidden, 2), [-1, -1, self.upsample_factor, -1])
+        hidden = paddle.reshape(hidden, [batch_size, -1, channels])
        for layer in self.convs:
            hidden = layer(hidden, speaker_embed)
        spec = self.post_affine(hidden)
        return spec


-class SpectraNet(dg.Layer):
+class SpectraNet(nn.Layer):
    def __init__(self, char_embedding, speaker_embedding, encoder, decoder, postnet):
        super(SpectraNet, self).__init__()
        self.char_embedding = char_embedding
@ -386,33 +373,33 @@ class SpectraNet(dg.Layer):
        # build decoder inputs by shifting over by one frame and add all zero <start> frame
        # the mel input is downsampled by a reduction factor
        batch_size = mel.shape[0]
-        mel_input = F.reshape(mel, (batch_size, -1, self.decoder.reduction_factor, self.decoder.in_channels))
-        zero_frame = F.zeros((batch_size, 1, self.decoder.in_channels), dtype="float32")
+        mel_input = paddle.reshape(mel, (batch_size, -1, self.decoder.reduction_factor, self.decoder.in_channels))
+        zero_frame = paddle.zeros((batch_size, 1, self.decoder.in_channels), dtype=mel.dtype)
        # downsample mel input as a regularization
-        mel_input = F.concat([zero_frame, mel_input[:, :-1, -1, :]], axis=1)
+        mel_input = paddle.concat([zero_frame, mel_input[:, :-1, -1, :]], axis=1)

        # decoder
        decoded, hidden, attentions, final_state = self.decoder(mel_input, keys, values, text_lengths, 0, speaker_embed)
-        attentions = F.stack(attentions) # (N, B, T_dec, T_encs)
+        attentions = paddle.stack(attentions) # (N, B, T_dec, T_enc)
        # unfold frames
-        decoded = F.reshape(decoded, (batch_size, -1, self.decoder.in_channels))
+        decoded = paddle.reshape(decoded, (batch_size, -1, self.decoder.in_channels))
        # postnet
        refined = self.postnet(hidden, speaker_embed)
        return decoded, refined, attentions, final_state

    def spec_loss(self, decoded, input, num_frames=None):
        if num_frames is None:
-            l1_loss = F.reduce_mean(F.abs(decoded - input))
+            l1_loss = paddle.mean(paddle.abs(decoded - input))
        else:
            # mask the <pad> part of the decoder
            num_channels = decoded.shape[-1]
-            l1_loss = F.abs(decoded - input)
-            mask = F.sequence_mask(num_frames, dtype="float32")
-            l1_loss *= F.unsqueeze(mask, axes=[-1])
-            l1_loss = F.reduce_sum(l1_loss) / F.scale(F.reduce_sum(mask), num_channels)
+            l1_loss = paddle.abs(decoded - input)
+            mask = paddle.fluid.layers.sequence_mask(num_frames, dtype=decoded.dtype)
+            l1_loss *= paddle.unsqueeze(mask, axis=-1)
+            l1_loss = paddle.sum(l1_loss) / paddle.scale(paddle.sum(mask), num_channels)
        return l1_loss

-    @dg.no_grad
+    @paddle.no_grad()
    def inference(self, keys, values, text_lengths, speaker_embed, 
                  force_monotonic_attention, window):
        MAX_STEP = 500
@ -430,17 +417,17 @@ class SpectraNet(dg.Layer):
        # so we only support batch_size == 1 in inference
        def should_continue(i, mel_input, outputs, hidden, attention, state, coeffs):
            T_enc = coeffs.shape[-1]
-            attn_peak = F.argmax(coeffs[first_mono_attention_layer, 0, 0]) \
+            attn_peak = paddle.argmax(coeffs[first_mono_attention_layer, 0, 0]) \
                if num_monotonic_attention_layers > 0 \
-                else F.fill_constant([1], "int64", value=0)
-            return i < MAX_STEP and F.reshape(attn_peak, [1]) < T_enc - 1
+                else paddle.fill_constant([1], "int64", value=0)
+            return i < MAX_STEP and paddle.reshape(attn_peak, [1]) < T_enc - 1
        
        def loop_body(i, mel_input, outputs, hiddens, attentions, state=None, coeffs=None):
            # state and coeffs are both None for the first step
            decoded, hidden, new_coeffs, new_state = self.decoder(
                mel_input, keys, values, text_lengths, i, speaker_embed, 
                state, force_monotonic_attention, coeffs, window)
-            new_coeffs = F.stack(new_coeffs) # (N, B, T_dec=1, T_enc)
+            new_coeffs = paddle.stack(new_coeffs) # (N, B, T_dec=1, T_enc)

            attentions.append(new_coeffs) # (N, B, T_dec=1, T_enc)
            outputs.append(decoded) # (B, T_dec=1, rC_mel)
@ -448,13 +435,13 @@ class SpectraNet(dg.Layer):

            # slice the last frame out of r generated frames to be used as the input for the next step
            batch_size = mel_input.shape[0]
-            frames = F.reshape(decoded, [batch_size, -1, self.decoder.reduction_factor, self.decoder.in_channels])
+            frames = paddle.reshape(decoded, [batch_size, -1, self.decoder.reduction_factor, self.decoder.in_channels])
            input_frame = frames[:, :, -1, :]
            return (i + 1, input_frame, outputs, hiddens, attentions, new_state, new_coeffs)

        i = 0
        batch_size = keys.shape[0]
-        input_frame = F.zeros((batch_size, 1, self.decoder.in_channels), dtype="float32")
+        input_frame = paddle.zeros((batch_size, 1, self.decoder.in_channels), dtype=keys.dtype)
        outputs = []
        hiddens = []
        attentions = []
@ -465,12 +452,12 @@ class SpectraNet(dg.Layer):
    
        outputs, hiddens, attention = loop_state[2], loop_state[3], loop_state[4]
        # concat decoder timesteps
-        outputs = F.concat(outputs, axis=1)
-        hiddens = F.concat(hiddens, axis=1)
-        attention = F.concat(attention, axis=2)
+        outputs = paddle.concat(outputs, axis=1)
+        hiddens = paddle.concat(hiddens, axis=1)
+        attention = paddle.concat(attention, axis=2)

        # unfold frames
-        outputs = F.reshape(outputs, (batch_size, -1, self.decoder.in_channels))
+        outputs = paddle.reshape(outputs, (batch_size, -1, self.decoder.in_channels))

        refined = self.postnet(hiddens, speaker_embed)
        return outputs, refined, attention
--- a/parakeet/models/deepvoice3/init.py
+++ b/parakeet/models/deepvoice3/init.py
@ -1 +0,0 @@
-from .model import *
--- a/parakeet/models/deepvoice3/conv.py
+++ b/parakeet/models/deepvoice3/conv.py
@ -1,245 +0,0 @@
-import numpy as np
-from paddle.fluid import layers as F
-from paddle.fluid.framework import Variable, in_dygraph_mode
-from paddle.fluid import core, dygraph_utils
-from paddle.fluid.layers import nn, utils
-from paddle.fluid.data_feeder import check_variable_and_dtype
-from paddle.fluid.param_attr import ParamAttr
-from paddle.fluid.layer_helper import LayerHelper
-from paddle.fluid.dygraph import layers
-from paddle.fluid.initializer import Normal
-
-
-def _is_list_or_tuple(input):
-    return isinstance(input, (list, tuple))
-
-
-def _zero_padding_in_batch_and_channel(padding, channel_last):
-    if channel_last:
-        return list(padding[0]) == [0, 0] and list(padding[-1]) == [0, 0]
-    else:
-        return list(padding[0]) == [0, 0] and list(padding[1]) == [0, 0]
-
-
-def _exclude_padding_in_batch_and_channel(padding, channel_last):
-    padding_ = padding[1:-1] if channel_last else padding[2:]
-    padding_ = [elem for pad_a_dim in padding_ for elem in pad_a_dim]
-    return padding_
-
-
-def _update_padding_nd(padding, channel_last, num_dims):
-    if isinstance(padding, str):
-        padding = padding.upper()
-        if padding not in ["SAME", "VALID"]:
-            raise ValueError(
-                "Unknown padding: '{}'. It can only be 'SAME' or 'VALID'.".
-                format(padding))
-        if padding == "VALID":
-            padding_algorithm = "VALID"
-            padding = [0] * num_dims
-        else:
-            padding_algorithm = "SAME"
-            padding = [0] * num_dims
-    elif _is_list_or_tuple(padding):
-        # for padding like
-        # [(pad_before, pad_after), (pad_before, pad_after), ...]
-        # padding for batch_dim and channel_dim included
-        if len(padding) == 2 + num_dims and _is_list_or_tuple(padding[0]):
-            if not _zero_padding_in_batch_and_channel(padding, channel_last):
-                raise ValueError(
-                    "Non-zero padding({}) in the batch or channel dimensions "
-                    "is not supported.".format(padding))
-            padding_algorithm = "EXPLICIT"
-            padding = _exclude_padding_in_batch_and_channel(padding,
-                                                            channel_last)
-            if utils._is_symmetric_padding(padding, num_dims):
-                padding = padding[0::2]
-        # for padding like [pad_before, pad_after, pad_before, pad_after, ...]
-        elif len(padding) == 2 * num_dims and isinstance(padding[0], int):
-            padding_algorithm = "EXPLICIT"
-            padding = utils.convert_to_list(padding, 2 * num_dims, 'padding')
-            if utils._is_symmetric_padding(padding, num_dims):
-                padding = padding[0::2]
-        # for padding like [pad_d1, pad_d2, ...]
-        elif len(padding) == num_dims and isinstance(padding[0], int):
-            padding_algorithm = "EXPLICIT"
-            padding = utils.convert_to_list(padding, num_dims, 'padding')
-        else:
-            raise ValueError("In valid padding: {}".format(padding))
-    # for integer padding
-    else:
-        padding_algorithm = "EXPLICIT"
-        padding = utils.convert_to_list(padding, num_dims, 'padding')
-    return padding, padding_algorithm
-
-def _get_default_param_initializer(num_channels, filter_size):
-    filter_elem_num = num_channels * np.prod(filter_size)
-    std = (2.0 / filter_elem_num)**0.5
-    return Normal(0.0, std, 0)
-
-def conv1d(input,
-           weight,
-           bias=None,
-           padding=0,
-           stride=1,
-           dilation=1,
-           groups=1,
-           use_cudnn=True,
-           act=None,
-           data_format="NCT",
-           name=None):
-    # entry checks
-    if not isinstance(use_cudnn, bool):
-        raise ValueError("Attr(use_cudnn) should be True or False. "
-                         "Received Attr(use_cudnn): {}.".format(use_cudnn))
-    if data_format not in ["NCT", "NTC"]:
-        raise ValueError("Attr(data_format) should be 'NCT' or 'NTC'. "
-                         "Received Attr(data_format): {}.".format(data_format))
-
-    channel_last = (data_format == "NTC")
-    channel_dim = -1 if channel_last else 1
-    num_channels = input.shape[channel_dim]
-    num_filters = weight.shape[0]
-    if num_channels < 0:
-        raise ValueError("The channel dimmention of the input({}) "
-                         "should be defined. Received: {}.".format(
-                             input.shape, num_channels))
-    if num_channels % groups != 0:
-        raise ValueError(
-            "the channel of input must be divisible by groups,"
-            "received: the channel of input is {}, the shape of input is {}"
-            ", the groups is {}".format(num_channels, input.shape, groups))
-    if num_filters % groups != 0:
-        raise ValueError(
-            "the number of filters must be divisible by groups,"
-            "received: the number of filters is {}, the shape of weight is {}"
-            ", the groups is {}".format(num_filters, weight.shape, groups))
-
-    # update attrs
-    padding, padding_algorithm = _update_padding_nd(padding, channel_last, 1)
-    if len(padding) == 1: # symmetric padding
-        padding = [0,] + padding
-    else:
-        # len(padding) == 2
-        padding = [0, 0] + padding
-    stride = [1,] + utils.convert_to_list(stride, 1, 'stride')
-    dilation = [1,] + utils.convert_to_list(dilation, 1, 'dilation')
-    data_format = "NHWC" if channel_last else "NCHW"
-
-    l_type = "conv2d"
-
-    if (num_channels == groups and num_filters % num_channels == 0 and
-            not use_cudnn):
-        l_type = 'depthwise_conv2d'
-    weight = F.unsqueeze(weight, [2])
-    input = F.unsqueeze(input, [1]) if channel_last else F.unsqueeze(input, [2])
-
-    if in_dygraph_mode():
-        attrs = ('strides', stride, 'paddings', padding, 'dilations', dilation,
-                 'groups', groups, 'use_cudnn', use_cudnn, 'use_mkldnn', False,
-                 'fuse_relu_before_depthwise_conv', False, "padding_algorithm",
-                 padding_algorithm, "data_format", data_format)
-        pre_bias = getattr(core.ops, l_type)(input, weight, *attrs)
-        if bias is not None:
-            pre_act = nn.elementwise_add(pre_bias, bias, axis=channel_dim)
-        else:
-            pre_act = pre_bias
-        out = dygraph_utils._append_activation_in_dygraph(
-            pre_act, act, use_cudnn=use_cudnn)
-    else:
-        inputs = {'Input': [input], 'Filter': [weight]}
-        attrs = {
-            'strides': stride,
-            'paddings': padding,
-            'dilations': dilation,
-            'groups': groups,
-            'use_cudnn': use_cudnn,
-            'use_mkldnn': False,
-            'fuse_relu_before_depthwise_conv': False,
-            "padding_algorithm": padding_algorithm,
-            "data_format": data_format
-        }
-        check_variable_and_dtype(input, 'input',
-                                 ['float16', 'float32', 'float64'], 'conv2d')
-        helper = LayerHelper(l_type, **locals())
-        dtype = helper.input_dtype()
-        pre_bias = helper.create_variable_for_type_inference(dtype)
-        outputs = {"Output": [pre_bias]}
-        helper.append_op(
-            type=l_type, inputs=inputs, outputs=outputs, attrs=attrs)
-        if bias is not None:
-            pre_act = nn.elementwise_add(pre_bias, bias, axis=channel_dim)
-        else:
-            pre_act = pre_bias
-        out = helper.append_activation(pre_act)
-    out = F.squeeze(out, [1]) if channel_last else F.squeeze(out, [2])
-    return out
-
-class Conv1D(layers.Layer):
-    def __init__(self,
-                 num_channels,
-                 num_filters,
-                 filter_size,
-                 padding=0,
-                 stride=1,
-                 dilation=1,
-                 groups=1,
-                 param_attr=None,
-                 bias_attr=None,
-                 use_cudnn=True,
-                 act=None,
-                 data_format="NCT",
-                 dtype='float32'):
-        super(Conv1D, self).__init__()
-        assert param_attr is not False, "param_attr should not be False here."
-        self._num_channels = num_channels
-        self._num_filters = num_filters
-        self._groups = groups
-        if num_channels % groups != 0:
-            raise ValueError("num_channels must be divisible by groups.")
-        self._act = act
-        self._data_format = data_format
-        self._dtype = dtype
-        if not isinstance(use_cudnn, bool):
-            raise ValueError("use_cudnn should be True or False")
-        self._use_cudnn = use_cudnn
-
-        self._filter_size = utils.convert_to_list(filter_size, 1, 'filter_size')
-        self._stride = utils.convert_to_list(stride, 1, 'stride')
-        self._dilation = utils.convert_to_list(dilation, 1, 'dilation')
-        channel_last = (data_format == "NTC")
-        self._padding = padding  # leave it to F.conv1d
-
-        self._param_attr = param_attr
-        self._bias_attr = bias_attr
-
-        num_filter_channels = num_channels // groups
-        filter_shape = [self._num_filters, num_filter_channels
-                        ] + self._filter_size
-
-        self.weight = self.create_parameter(
-            attr=self._param_attr,
-            shape=filter_shape,
-            dtype=self._dtype,
-            default_initializer=_get_default_param_initializer(
-                self._num_channels, filter_shape))
-        self.bias = self.create_parameter(
-            attr=self._bias_attr,
-            shape=[self._num_filters],
-            dtype=self._dtype,
-            is_bias=True)
-
-    def forward(self, input):
-        out = conv1d(
-            input,
-            self.weight,
-            bias=self.bias,
-            padding=self._padding,
-            stride=self._stride,
-            dilation=self._dilation,
-            groups=self._groups,
-            use_cudnn=self._use_cudnn,
-            act=self._act,
-            data_format=self._data_format)
-        return out
-
--- a/parakeet/models/deepvoice3/weight_norm_hook.py
+++ b/parakeet/models/deepvoice3/weight_norm_hook.py
@ -1,148 +0,0 @@
-import paddle
-import paddle.fluid.dygraph as dg
-
-import numpy as np
-from paddle import fluid
-import paddle.fluid.dygraph as dg
-import paddle.fluid.layers as F
-from paddle.fluid.layer_helper import LayerHelper
-from paddle.fluid.data_feeder import check_variable_and_dtype
-
-
-def l2_norm(x, axis, epsilon=1e-12, name=None):
-    if len(x.shape) == 1:
-        axis = 0
-    check_variable_and_dtype(x, "X", ("float32", "float64"), "norm")
-
-    helper = LayerHelper("l2_normalize", **locals())
-    out = helper.create_variable_for_type_inference(dtype=x.dtype)
-    norm = helper.create_variable_for_type_inference(dtype=x.dtype)
-    helper.append_op(
-        type="norm",
-        inputs={"X": x},
-        outputs={"Out": out,
-                 "Norm": norm},
-        attrs={
-            "axis": 1 if axis is None else axis,
-            "epsilon": epsilon,
-        })
-    return F.squeeze(norm, axes=[axis])
-    
-def norm_except_dim(p, dim):
-    shape = p.shape
-    ndims = len(shape)
-    if dim is None:
-        return F.sqrt(F.reduce_sum(F.square(p)))
-    elif dim == 0:
-        p_matrix = F.reshape(p, (shape[0], -1))
-        return l2_norm(p_matrix, axis=1)
-    elif dim == -1 or dim == ndims - 1:
-        p_matrix = F.reshape(p, (-1, shape[-1]))
-        return l2_norm(p_matrix, axis=0)
-    else:
-        perm = list(range(ndims))
-        perm[0] = dim
-        perm[dim] = 0
-        p_transposed = F.transpose(p, perm)
-        return norm_except_dim(p_transposed, 0)
-
-def _weight_norm(v, g, dim):
-    shape = v.shape
-    ndims = len(shape)
-
-    if dim is None:
-        v_normalized = v / (F.sqrt(F.reduce_sum(F.square(v))) + 1e-12)
-    elif dim == 0:
-        p_matrix = F.reshape(v, (shape[0], -1))
-        v_normalized = F.l2_normalize(p_matrix, axis=1)
-        v_normalized = F.reshape(v_normalized, shape)
-    elif dim == -1 or dim == ndims - 1:
-        p_matrix = F.reshape(v, (-1, shape[-1]))
-        v_normalized = F.l2_normalize(p_matrix, axis=0)
-        v_normalized = F.reshape(v_normalized, shape)
-    else:
-        perm = list(range(ndims))
-        perm[0] = dim
-        perm[dim] = 0
-        p_transposed = F.transpose(v, perm)
-        transposed_shape = p_transposed.shape
-        p_matrix = F.reshape(p_transposed, (p_transposed.shape[0], -1))
-        v_normalized = F.l2_normalize(p_matrix, axis=1)
-        v_normalized = F.reshape(v_normalized, transposed_shape)
-        v_normalized = F.transpose(v_normalized, perm)
-    weight = F.elementwise_mul(v_normalized, g, axis=dim if dim is not None else -1)
-    return weight
-
-
-class WeightNorm(object):
-    def __init__(self, name, dim):
-        if dim is None:
-            dim = -1
-        self.name = name
-        self.dim = dim
-
-    def compute_weight(self, module):
-        g = getattr(module, self.name + '_g')
-        v = getattr(module, self.name + '_v')
-        w = _weight_norm(v, g, self.dim)
-        return w
-
-    @staticmethod
-    def apply(module: dg.Layer, name, dim):
-        for k, hook in module._forward_pre_hooks.items():
-            if isinstance(hook, WeightNorm) and hook.name == name:
-                raise RuntimeError("Cannot register two weight_norm hooks on "
-                                   "the same parameter {}".format(name))
-
-        if dim is None:
-            dim = -1
-
-        fn = WeightNorm(name, dim)
-
-        # remove w from parameter list
-        w = getattr(module, name)
-        del module._parameters[name]
-
-        # add g and v as new parameters and express w as g/||v|| * v
-        g_var = norm_except_dim(w, dim)
-        v = module.create_parameter(w.shape, dtype=w.dtype)
-        module.add_parameter(name + "_v", v)
-        g = module.create_parameter(g_var.shape, dtype=g_var.dtype)
-        module.add_parameter(name + "_g", g)
-        with dg.no_grad():
-            F.assign(w, v)
-            F.assign(g_var, g)
-        setattr(module, name, fn.compute_weight(module))
-
-        # recompute weight before every forward()
-        module.register_forward_pre_hook(fn)
-        return fn
-
-    def remove(self, module):
-        w_var = self.compute_weight(module)
-        delattr(module, self.name)
-        del module._parameters[self.name + '_g']
-        del module._parameters[self.name + '_v']
-        w = module.create_parameter(w_var.shape, dtype=w_var.dtype)
-        module.add_parameter(self.name, w)
-        with dg.no_grad():
-            F.assign(w_var, w)
-
-    def __call__(self, module, inputs):
-        setattr(module, self.name, self.compute_weight(module))
-
-
-def weight_norm(module, name='weight', dim=0):
-    WeightNorm.apply(module, name, dim)
-    return module
-
-
-def remove_weight_norm(module, name='weight'):
-    for k, hook in module._forward_pre_hooks.items():
-        if isinstance(hook, WeightNorm) and hook.name == name:
-            hook.remove(module)
-            del module._forward_pre_hooks[k]
-            return module
-
-    raise ValueError("weight_norm of '{}' not found in {}"
-                     .format(name, module))
--- a/parakeet/models/transformer_tts.py
+++ b/parakeet/models/transformer_tts.py
@ -0,0 +1,258 @@
+import math
+import paddle
+from paddle import nn
+from paddle.nn import functional as F
+
+from parakeet.modules.attention import _split_heads, _concat_heads, drop_head, scaled_dot_product_attention
+from parakeet.modules.transformer import PositionwiseFFN, combine_mask
+from parakeet.modules.cbhg import Conv1dBatchNorm
+
+# Transformer TTS's own implementation of transformer
+class MultiheadAttention(nn.Layer):
+    """
+    Multihead scaled dot product attention with drop head. See 
+    [Scheduled DropHead: A Regularization Method for Transformer Models](https://arxiv.org/abs/2004.13342) 
+    for details.
+    
+    Another deviation from the usual formulation is that the (unprojected) 
+    input query is concatenated with the context vector before the output 
+    projection is applied.
+    """
+    def __init__(self, model_dim, num_heads, k_dim=None, v_dim=None):
+        """
+        Args:
+            model_dim (int): the feature size of query.
+            num_heads (int): the number of attention heads.
+            k_dim (int, optional): feature size of the key of each scaled dot 
+                product attention. If not provided, it is set to 
+                model_dim / num_heads. Defaults to None.
+            v_dim (int, optional): feature size of the value of each scaled 
+                dot product attention. If not provided, it is set to 
+                model_dim / num_heads. Defaults to None.
+
+        Raises:
+            ValueError: if model_dim is not divisible by num_heads
+        """
+        super(MultiheadAttention, self).__init__()
+        if model_dim % num_heads !=0:
+            raise ValueError("model_dim must be divisible by num_heads")
+        depth = model_dim // num_heads
+        # a falsy k_dim / v_dim (None or 0) falls back to the per-head depth
+        k_dim = k_dim or depth
+        v_dim = v_dim or depth
+        self.affine_q = nn.Linear(model_dim, num_heads * k_dim)
+        self.affine_k = nn.Linear(model_dim, num_heads * k_dim)
+        self.affine_v = nn.Linear(model_dim, num_heads * v_dim)
+        # the output projection consumes [query ; context], hence the 
+        # model_dim + num_heads * v_dim input features
+        self.affine_o = nn.Linear(model_dim + num_heads * v_dim, model_dim)
+        
+        self.num_heads = num_heads
+        self.model_dim = model_dim
+    
+    def forward(self, q, k, v, mask, drop_n_heads=0):
+        """
+        Compute context vector and attention weights.
+        
+        Args:
+            q (Tensor): shape(batch_size, time_steps_q, model_dim), the queries.
+            k (Tensor): shape(batch_size, time_steps_k, model_dim), the keys.
+            v (Tensor): shape(batch_size, time_steps_k, model_dim), the values.
+            mask (Tensor): shape(batch_size, times_steps_q, time_steps_k) or 
+                broadcastable shape, dtype: float32 or float64, the mask.
+                A head axis is inserted at position 1 before it is used.
+            drop_n_heads (int, optional): forwarded to `drop_head` together 
+                with `self.training`. Defaults to 0.
+
+        Returns:
+            (out, attention_weights)
+            out (Tensor), shape(batch_size, time_steps_q, model_dim), the 
+                output of the final projection.
+            attention_weights (Tensor): the attention weights as returned by 
+                `scaled_dot_product_attention`.
+                NOTE(review): since q/k/v are split into heads first, this 
+                presumably has shape (batch_size, num_heads, time_steps_q, 
+                time_steps_k) -- confirm against the helper.
+        """
+        # keep the raw query; it is concatenated with the context below
+        q_in = q
+        q = _split_heads(self.affine_q(q), self.num_heads) # (B, h, T, C)
+        k = _split_heads(self.affine_k(k), self.num_heads)
+        v = _split_heads(self.affine_v(v), self.num_heads)
+        mask = paddle.unsqueeze(mask, 1) # unsqueeze for the h dim
+        
+        context_vectors, attention_weights = scaled_dot_product_attention(
+            q, k, v, mask)
+        context_vectors = drop_head(context_vectors, drop_n_heads, self.training)
+        context_vectors = _concat_heads(context_vectors) # (B, T, h*C)
+        
+        # concatenate along the feature axis, then project back to model_dim
+        concat_feature = paddle.concat([q_in, context_vectors], -1)
+        out = self.affine_o(concat_feature)
+        return out, attention_weights
+
+
+class TransformerEncoderLayer(nn.Layer):
+    """
+    Transformer encoder layer: self attention followed by a position-wise 
+    feed forward network. Both sublayers use the pre-norm arrangement, i.e. 
+    layer normalization is applied to the sublayer input and the residual is 
+    taken from the un-normalized input.
+    """
+    def __init__(self, d_model, n_heads, d_ffn, dropout=0.):
+        """
+        Args:
+            d_model (int): the feature size of the input, and the output.
+            n_heads (int): the number of heads in the internal MultiHeadAttention layer.
+            d_ffn (int): the hidden size of the internal PositionwiseFFN.
+            dropout (float, optional): the dropout probability handed to 
+                PositionwiseFFN (it is not passed to the attention layer 
+                here). Defaults to 0.
+        """
+        super(TransformerEncoderLayer, self).__init__()
+        self.self_mha = MultiheadAttention(d_model, n_heads)
+        self.layer_norm1 = nn.LayerNorm([d_model], epsilon=1e-6)
+        
+        self.ffn = PositionwiseFFN(d_model, d_ffn, dropout)
+        self.layer_norm2 = nn.LayerNorm([d_model], epsilon=1e-6)
+    
+    def forward(self, x, mask):
+        """
+        Args:
+            x (Tensor): shape(batch_size, time_steps, d_model), the encoder input.
+            mask (Tensor): shape(batch_size, time_steps), the padding mask. 
+                An axis is inserted at position 1 before it is passed to the 
+                attention layer.
+        
+        Returns:
+            (x, attn_weights)
+            x (Tensor): shape(batch_size, time_steps, d_model), the encoded.
+            attn_weights (Tensor), shape(batch_size, n_heads, time_steps, time_steps), self attention.
+        """
+        # pre norm
+        x_in = x
+        x = self.layer_norm1(x)
+        # the normalized input serves as query, key and value
+        context_vector, attn_weights = self.self_mha(x, x, x, paddle.unsqueeze(mask, 1))
+        x = x_in + context_vector # here, the order can be tuned
+        
+        # pre norm
+        x = x + self.ffn(self.layer_norm2(x))
+        return x, attn_weights
+
+
+class TransformerDecoderLayer(nn.Layer):
+    """
+    Transformer decoder layer: masked self attention, encoder-decoder cross 
+    attention and a position-wise feed forward network, each in the pre-norm 
+    arrangement (layer norm on the sublayer input, residual from the 
+    un-normalized input).
+    """
+    def __init__(self, d_model, n_heads, d_ffn, dropout=0.):
+        """
+        Args:
+            d_model (int): the feature size of the input, and the output.
+            n_heads (int): the number of heads in the internal MultiHeadAttention layer.
+            d_ffn (int): the hidden size of the internal PositionwiseFFN.
+            dropout (float, optional): the dropout probability handed to 
+                PositionwiseFFN (it is not passed to the attention layers 
+                here). Defaults to 0.
+        """
+        super(TransformerDecoderLayer, self).__init__()
+        self.self_mha = MultiheadAttention(d_model, n_heads)
+        self.layer_norm1 = nn.LayerNorm([d_model], epsilon=1e-6)
+        
+        self.cross_mha = MultiheadAttention(d_model, n_heads)
+        self.layer_norm2 = nn.LayerNorm([d_model], epsilon=1e-6)
+        
+        self.ffn = PositionwiseFFN(d_model, d_ffn, dropout)
+        self.layer_norm3 = nn.LayerNorm([d_model], epsilon=1e-6)
+    
+    def forward(self, q, k, v, encoder_mask, decoder_mask):
+        """
+        Args:
+            q (Tensor): shape(batch_size, time_steps_q, d_model), the decoder input.
+            k (Tensor): shape(batch_size, time_steps_k, d_model), keys.
+            v (Tensor): shape(batch_size, time_steps_k, d_model), values
+            encoder_mask (Tensor): shape(batch_size, time_steps_k) encoder padding mask.
+            decoder_mask (Tensor): shape(batch_size, time_steps_q) decoder padding mask.
+        
+        Returns:
+            (q, self_attn_weights, cross_attn_weights)
+            q (Tensor): shape(batch_size, time_steps_q, d_model), the decoded.
+            self_attn_weights (Tensor), shape(batch_size, n_heads, time_steps_q, time_steps_q), decoder self attention.
+            cross_attn_weights (Tensor), shape(batch_size, n_heads, time_steps_q, time_steps_k), decoder-encoder cross attention.
+        """
+        tq = q.shape[1]
+        # lower-triangular matrix of ones: position i may attend to positions <= i
+        no_future_mask = paddle.tril(paddle.ones([tq, tq])) #(tq, tq)
+        # NOTE(review): combine_mask is defined elsewhere; presumably it merges 
+        # the padding mask with the causal mask into a (B, tq, tq) mask -- confirm
+        combined_mask = combine_mask(decoder_mask, no_future_mask)
+        
+        # pre norm
+        q_in = q
+        q = self.layer_norm1(q)
+        context_vector, self_attn_weights = self.self_mha(q, q, q, combined_mask)
+        q = q_in + context_vector
+        
+        # pre norm
+        q_in = q
+        q = self.layer_norm2(q)
+        # the encoder mask gets an extra axis at position 1 so that it can 
+        # broadcast over the query time steps
+        context_vector, cross_attn_weights = self.cross_mha(q, k, v, paddle.unsqueeze(encoder_mask, 1))
+        q = q_in + context_vector
+        
+        # pre norm
+        q = q + self.ffn(self.layer_norm3(q))
+        return q, self_attn_weights, cross_attn_weights
+
+
+class TransformerEncoder(nn.LayerList):
+    """A stack of `n_layers` TransformerEncoderLayer applied in sequence."""
+    def __init__(self, d_model, n_heads, d_ffn, n_layers, dropout=0.):
+        """
+        Args:
+            d_model (int): the feature size of the input, and the output.
+            n_heads (int): the number of attention heads of each layer.
+            d_ffn (int): the hidden size of the feed forward network of each layer.
+            n_layers (int): the number of stacked layers.
+            dropout (float, optional): forwarded to every layer. Defaults to 0.
+        """
+        super().__init__()
+        for _ in range(n_layers):
+            encoder_layer = TransformerEncoderLayer(
+                d_model, n_heads, d_ffn, dropout)
+            self.append(encoder_layer)
+
+    def forward(self, x, mask):
+        """
+        Run the input through every layer, feeding each layer's output to the 
+        next one and collecting the attention weights along the way.
+
+        Args:
+            x (Tensor): the encoder input, passed to the first layer.
+            mask (Tensor): the mask, passed unchanged to every layer.
+
+        Returns:
+            (x, attention_weights)
+            x (Tensor): the output of the last layer.
+            attention_weights (list): one attention weight tensor per layer, 
+                in layer order.
+        """
+        per_layer_weights = []
+        hidden = x
+        for encoder_layer in self:
+            hidden, weights = encoder_layer(hidden, mask)
+            per_layer_weights.append(weights)
+        return hidden, per_layer_weights
+
+
+class TransformerDecoder(nn.LayerList):
+    """A stack of `n_layers` TransformerDecoderLayer applied in sequence."""
+    def __init__(self, d_model, n_heads, d_ffn, n_layers, dropout=0.):
+        """
+        Args:
+            d_model (int): the feature size of the input, and the output.
+            n_heads (int): the number of attention heads of each layer.
+            d_ffn (int): the hidden size of the feed forward network of each layer.
+            n_layers (int): the number of stacked layers.
+            dropout (float, optional): forwarded to every layer. Defaults to 0.
+        """
+        super(TransformerDecoder, self).__init__()
+        for _ in range(n_layers):
+            self.append(TransformerDecoderLayer(d_model, n_heads, d_ffn, dropout))
+
+    def forward(self, q, k, v, encoder_mask, decoder_mask):
+        """
+        Run the decoder input through every layer. The previous signature 
+        `forward(x, mask)` could never succeed because each layer's forward 
+        requires `(q, k, v, encoder_mask, decoder_mask)`; the arguments are 
+        now passed through in full.
+
+        Args:
+            q (Tensor): shape(batch_size, time_steps_q, d_model), the decoder input.
+            k (Tensor): shape(batch_size, time_steps_k, d_model), keys.
+            v (Tensor): shape(batch_size, time_steps_k, d_model), values.
+            encoder_mask (Tensor): shape(batch_size, time_steps_k) encoder padding mask.
+            decoder_mask (Tensor): shape(batch_size, time_steps_q) decoder padding mask.
+
+        Returns:
+            (q, self_attention_weights, cross_attention_weights)
+            q (Tensor): the output of the last layer.
+            self_attention_weights (list): self attention weights, one per layer.
+            cross_attention_weights (list): cross attention weights, one per layer.
+        """
+        self_attention_weights = []
+        cross_attention_weights = []
+        for layer in self:
+            # only the query stream is updated; keys, values and masks are 
+            # shared by all layers
+            q, self_attention_weights_i, cross_attention_weights_i = layer(
+                q, k, v, encoder_mask, decoder_mask)
+            self_attention_weights.append(self_attention_weights_i)
+            cross_attention_weights.append(cross_attention_weights_i)
+        return q, self_attention_weights, cross_attention_weights
+    
+    
+class DecoderPreNet(nn.Layer):
+    """Two linear layers, each followed by ReLU and dropout."""
+    def __init__(self, d_model, d_hidden, dropout):
+        """
+        Args:
+            d_model (int): the feature size of the input, and the output.
+            d_hidden (int): the feature size of the hidden layer.
+            dropout (float): the dropout probability used after each layer.
+        """
+        # the base class must be initialized before sublayers are assigned, 
+        # otherwise they cannot be registered
+        super(DecoderPreNet, self).__init__()
+        self.lin1 = nn.Linear(d_model, d_hidden)
+        self.dropout1 = nn.Dropout(dropout)
+        self.lin2 = nn.Linear(d_hidden, d_model)
+        self.dropout2 = nn.Dropout(dropout)
+        
+    def forward(self, x):
+        """
+        Args:
+            x (Tensor): the input, with d_model features on the last axis.
+
+        Returns:
+            Tensor: the output, with d_model features on the last axis.
+        """
+        # the original code said also use dropout in inference
+        # NOTE(review): nn.Dropout presumably becomes a no-op in eval mode; 
+        # confirm whether dropout at inference is actually wanted here
+        return self.dropout2(F.relu(self.lin2(self.dropout1(F.relu(self.lin1(x))))))
+
+
+class PostNet(nn.Layer):
+    """A stack of Conv1dBatchNorm layers with tanh, followed by a batch norm."""
+    def __init__(self, d_input, d_hidden, d_output, kernel_size, n_layers):
+        """
+        Args:
+            d_input (int): the number of input channels of the first layer.
+            d_hidden (int): the number of channels between layers.
+            d_output (int): the number of output channels of the last layer.
+            kernel_size (int or tuple/list of int): the kernel size of the 
+                convolutions; a bare int is wrapped into a 1-tuple.
+            n_layers (int): the number of convolution layers.
+        """
+        # the base class must be initialized before sublayers are assigned
+        super(PostNet, self).__init__()
+        self.convs = nn.LayerList()
+        # isinstance takes (object, type); the arguments were swapped before, 
+        # which raised TypeError for an int kernel_size
+        if isinstance(kernel_size, (tuple, list)):
+            kernel_size = tuple(kernel_size)
+        else:
+            kernel_size = (kernel_size, )
+        # pad only one side by kernel_size - 1
+        # NOTE(review): presumably (left, right), i.e. causal padding -- 
+        # confirm against Conv1dBatchNorm
+        padding = (kernel_size[0] - 1, 0)
+        for i in range(n_layers):
+            c_in = d_input if i == 0 else d_hidden
+            c_out = d_output if i == n_layers - 1 else d_hidden
+            self.convs.append(
+                Conv1dBatchNorm(c_in, c_out, kernel_size, padding=padding))
+        self.last_norm = nn.BatchNorm1d(d_output)
+    
+    def forward(self, x):
+        """
+        Args:
+            x (Tensor): the input, with d_input channels.
+
+        Returns:
+            Tensor: the output, with d_output channels.
+        """
+        for layer in self.convs:
+            x = paddle.tanh(layer(x))
+        x = self.last_norm(x)
+        return x
+
+
+class TransformerTTS(nn.Layer):
+    """
+    Container for the Transformer TTS submodules: embedding encoder prenet, 
+    transformer encoder, decoder prenet, transformer decoder, a linear 
+    projection to mel frames and a convolutional postnet.
+    """
+    def __init__(self, vocab_size, padding_idx, d_model, d_mel, n_heads, d_ffn, 
+                 encoder_layers, decoder_layers, d_prenet, d_postnet, postnet_layers, 
+                 postnet_kernel_size, reduction_factor, dropout):
+        """
+        Args:
+            vocab_size (int): the number of embeddings of the encoder prenet.
+            padding_idx (int): the padding index of the embedding.
+            d_model (int): the feature size used throughout the transformer.
+            d_mel (int): the feature size of a mel frame.
+            n_heads (int): the number of attention heads.
+            d_ffn (int): the hidden size of the feed forward networks.
+            encoder_layers (int): the number of encoder layers.
+            decoder_layers (int): the number of decoder layers.
+            d_prenet (int): the hidden size of the decoder prenet.
+            d_postnet (int): the hidden size of the postnet.
+            postnet_layers (int): the number of postnet layers.
+            postnet_kernel_size (int): the kernel size of the postnet.
+            reduction_factor (int): the output projection emits 
+                reduction_factor * d_mel features per decoder step.
+            dropout (float): the dropout probability.
+        """
+        # the base class must be initialized before sublayers are assigned
+        super(TransformerTTS, self).__init__()
+        self.encoder_prenet = nn.Embedding(vocab_size, d_model, padding_idx)
+        self.encoder = TransformerEncoder(d_model, n_heads, d_ffn, encoder_layers, dropout)
+        self.decoder_prenet = DecoderPreNet(d_model, d_prenet, dropout)
+        self.decoder = TransformerDecoder(d_model, n_heads, d_ffn, decoder_layers, dropout)
+        self.decoder_postnet = nn.Linear(d_model, reduction_factor * d_mel)
+        self.postnet = PostNet(d_mel, d_postnet, d_mel, postnet_kernel_size, postnet_layers)
+    
+    def forward(self):
+        """Not implemented yet; currently does nothing and returns None."""
+        pass
+    
+    def infer(self):
+        """Not implemented yet; currently does nothing and returns None."""
+        pass
--- a/parakeet/models/transformer_tts_deprecated/init.py
+++ b/parakeet/models/transformer_tts_deprecated/init.py
--- a/parakeet/models/transformer_tts_deprecated/cbhg.py
+++ b/parakeet/models/transformer_tts_deprecated/cbhg.py
--- a/parakeet/models/transformer_tts_deprecated/decoder.py
+++ b/parakeet/models/transformer_tts_deprecated/decoder.py
--- a/parakeet/models/transformer_tts_deprecated/encoder.py
+++ b/parakeet/models/transformer_tts_deprecated/encoder.py
--- a/parakeet/models/transformer_tts_deprecated/encoderprenet.py
+++ b/parakeet/models/transformer_tts_deprecated/encoderprenet.py
--- a/parakeet/models/transformer_tts_deprecated/post_convnet.py
+++ b/parakeet/models/transformer_tts_deprecated/post_convnet.py
--- a/parakeet/models/transformer_tts_deprecated/prenet.py
+++ b/parakeet/models/transformer_tts_deprecated/prenet.py
--- a/parakeet/models/transformer_tts_deprecated/transformer_tts.py
+++ b/parakeet/models/transformer_tts_deprecated/transformer_tts.py
--- a/parakeet/models/transformer_tts_deprecated/utils.py
+++ b/parakeet/models/transformer_tts_deprecated/utils.py
--- a/parakeet/models/transformer_tts_deprecated/vocoder.py
+++ b/parakeet/models/transformer_tts_deprecated/vocoder.py
--- a/parakeet/models/waveflow.py
+++ b/parakeet/models/waveflow.py
@ -0,0 +1,229 @@
+import math
+import paddle
+from paddle import nn
+from paddle.nn import functional as F
+from paddle.nn import initializer as I
+
+from typing import Sequence
+from parakeet.modules import geometry as geo
+
+import itertools
+import numpy as np
+import paddle.fluid.dygraph as dg
+from paddle import fluid
+from parakeet.modules import weight_norm
+
+def fold(x, n_group):
+    """Split the last (temporal) dimension of audio or spectrogram into groups.
+
+    Args:
+        x (Tensor): shape(*, time_steps), the input tensor
+        n_group (int): the size of a group.
+
+    Returns:
+        Tensor: shape(*, time_steps // n_group, group), folded tensor.
+    """
+    *leading_dims, n_steps = x.shape
+    n_rows = n_steps // n_group
+    folded_shape = leading_dims + [n_rows, n_group]
+    return paddle.reshape(x, folded_shape)
+
+class UpsampleNet(nn.LayerList):
+    """
+    A stack of weight-normalized 2D transposed convolutions that upsample a 
+    spectrogram along the time axis, one layer per upsample factor.
+    """
+    def __init__(self, upsample_factors: Sequence[int]):
+        """
+        Args:
+            upsample_factors (Sequence[int]): the time upsampling factor of 
+                each layer. Each layer uses a (3, 2 * factor) kernel with 
+                stride (1, factor).
+        """
+        super(UpsampleNet, self).__init__()
+        for factor in upsample_factors:
+            # uniform bound from fan-in = 1 channel * 3 * (2 * factor) kernel
+            std = math.sqrt(1 / (3 * 2 * factor))
+            init = I.Uniform(-std, std)
+            self.append(
+                nn.utils.weight_norm(
+                    nn.ConvTranspose2d(1, 1, (3, 2 * factor), 
+                        padding=(1, factor // 2),
+                        stride=(1, factor),
+                        weight_attr=init,
+                        bias_attr=init)))
+            
+        # upsample factors
+        self.upsample_factor = np.prod(upsample_factors)
+        self.upsample_factors = upsample_factors
+    
+    def forward(self, x, trim_conv_artifact=False):
+        """
+        Args:
+            x (Tensor): shape(batch_size, input_channels, time_steps), the input 
+                spectrogram.
+            trim_conv_artifact (bool, optional): trim deconvolution artifact at 
+                each layer. Defaults to False.
+
+        Returns:
+            Tensor: shape(batch_size, input_channels, time_steps * upsample_factors).
+                If trim_conv_artifact is True, the output time steps is less 
+                than time_steps * upsample_factors.
+        """
+        # add a singleton channel axis for the 2D transposed convolutions
+        x = paddle.unsqueeze(x, 1)
+        for layer in self:
+            x = layer(x)
+            if trim_conv_artifact:
+                time_cutoff = layer._kernel_size[1] - layer._stride[1]
+                # drop the trailing `time_cutoff` steps; this must be a slice 
+                # (`:-n`), a plain index `-n` would select a single step and 
+                # remove the time axis altogether
+                x = x[:, :, :, :-time_cutoff]
+            x = F.leaky_relu(x, 0.4)
+        x = paddle.squeeze(x, 1)
+        return x
+
+
+class ResidualBlock(nn.Layer):
+    """
+    Gated 2D convolution block with a conditioning input. The input is padded 
+    and convolved, the projected condition is added, a tanh/sigmoid gate is 
+    applied and the result is projected into a residual and a skip output.
+    """
+    def __init__(self, channels, cond_channels, kernel_size, dilations):
+        """
+        Args:
+            channels (int): the number of channels of the input and of each 
+                of the two outputs.
+            cond_channels (int): the number of channels of the condition.
+            kernel_size (int or tuple of int): the kernel size of the input 
+                convolution.
+            dilations (int or tuple of int): the dilation of the input 
+                convolution.
+        """
+        super(ResidualBlock, self).__init__()
+        # input conv
+        # uniform bound from fan-in = channels * prod(kernel_size). The 
+        # parentheses matter: `1 / channels * prod` would evaluate as 
+        # `(1 / channels) * prod` and grow with the kernel size.
+        std = math.sqrt(1 / (channels * np.prod(kernel_size)))
+        init = I.Uniform(-std, std)
+        conv = nn.Conv2d(channels, 2 * channels, kernel_size, dilation=dilations, 
+                         weight_attr=init, bias_attr=init)
+        self.conv = nn.utils.weight_norm(conv)
+        
+        # condition projection
+        std = math.sqrt(1 / cond_channels)
+        init = I.Uniform(-std, std)
+        condition_proj = nn.Conv2d(cond_channels, 2 * channels, (1, 1),
+                                   weight_attr=init, bias_attr=init)
+        self.condition_proj = nn.utils.weight_norm(condition_proj)
+        
+        # parametric residual & skip connection
+        std = math.sqrt(1 / channels)
+        init = I.Uniform(-std, std)
+        out_proj = nn.Conv2d(channels, 2 * channels, (1, 1),
+                             weight_attr=init, bias_attr=init)
+        self.out_proj = nn.utils.weight_norm(out_proj)
+        
+        # specs
+        self.kernel_size = self.conv._kernel_size
+        self.dilations = self.conv._dilation
+        
+    def forward(self, x, condition):
+        """
+        Args:
+            x (Tensor): the input, with `channels` channels on axis 1.
+            condition (Tensor): the condition, with `cond_channels` channels 
+                on axis 1. Its projection is added to the convolved input, 
+                so the remaining axes must match.
+
+        Returns:
+            (res, skip)
+            res (Tensor): the first half of the output projection.
+            skip (Tensor): the second half of the output projection.
+            NOTE(review): `res` is returned without adding the block input; 
+            confirm whether the residual sum is meant to happen here.
+        """
+        # effective extent of the dilated kernel along each axis
+        receptive_field = tuple(
+            [1 + (k -1) * d for (k, d) in zip(self.kernel_size, self.dilations)])
+        rh, rw = receptive_field
+        # all height padding on one side, width padding split evenly
+        # NOTE(review): pad2d presumably takes (top, bottom, left, right), 
+        # making the convolution causal along the height axis -- confirm
+        paddings = (rh - 1, 0, (rw - 1) // 2, (rw - 1) // 2)
+        x = self.conv(F.pad2d(x, paddings))
+        x += self.condition_proj(condition)
+        
+        # gated activation: half of the channels is content, half is gate
+        content, gate = paddle.chunk(x, 2, axis=1)
+        x = paddle.tanh(content) * F.sigmoid(gate)
+        
+        x = self.out_proj(x)
+        res, skip = paddle.chunk(x, 2, axis=1)
+        return res, skip
+        
+        
+class ResidualNet(nn.LayerList):
+    """A stack of ResidualBlock whose skip outputs are summed."""
+    def __init__(self, n_layer, residual_channels, condition_channels, kernel_size, dilations_h):
+        """
+        Args:
+            n_layer (int): the number of residual blocks.
+            residual_channels (int): the number of channels of each block.
+            condition_channels (int): the number of channels of the condition.
+            kernel_size (int or tuple of int): the kernel size of each block.
+            dilations_h (Sequence[int]): the height dilation of each block; 
+                the width dilation of block i is 2 ** i.
+
+        Raises:
+            ValueError: if len(dilations_h) differs from n_layer.
+        """
+        n_dilations = len(dilations_h)
+        if n_dilations != n_layer:
+            raise ValueError("number of dilations_h should equals num of layers")
+        super().__init__()
+        for idx, dilation_h in enumerate(dilations_h):
+            block = ResidualBlock(
+                residual_channels,
+                condition_channels,
+                kernel_size,
+                (dilation_h, 2 ** idx))
+            self.append(block)
+            
+    def forward(self, x, condition):
+        """
+        Args:
+            x (Tensor): the input of the first block.
+            condition (Tensor): the condition, passed unchanged to every block.
+
+        Returns:
+            Tensor: the sum of the skip outputs of all blocks.
+        """
+        skips = []
+        hidden = x
+        for block in self:
+            hidden, skip_i = block(hidden, condition)
+            skips.append(skip_i)
+        stacked = paddle.stack(skips, 0)
+        return paddle.sum(stacked, 0)
+    
+
+class Flow(nn.Layer):
+    """
+    One flow of WaveFlow: a 1x1 input projection, a ResidualNet and a 1x1 
+    output projection that emits 2 channels.
+    """
+    # height dilations of the residual blocks, keyed by n_group. Every entry 
+    # has 8 values, and ResidualNet requires one dilation per layer.
+    dilations_dict = {
+            8: [1, 1, 1, 1, 1, 1, 1, 1],
+            16: [1, 1, 1, 1, 1, 1, 1, 1],
+            32: [1, 2, 4, 1, 2, 4, 1, 2],
+            64: [1, 2, 4, 8, 16, 1, 2, 4],
+            128: [1, 2, 4, 8, 16, 32, 64, 1]
+    }
+    
+    def __init__(self, n_layers, channels, mel_bands, kernel_size, n_group):
+        """
+        Args:
+            n_layers (int): the number of residual blocks.
+            channels (int): the number of residual channels.
+            mel_bands (int): the number of channels of the condition.
+            kernel_size (int or tuple of int): the kernel size of the blocks.
+            n_group (int): the group size; must be a key of `dilations_dict`, 
+                otherwise the lookup raises KeyError.
+        """
+        super(Flow, self).__init__()
+        # input projection
+        self.first_conv = nn.utils.weight_norm(
+            nn.Conv2d(1, channels, (1, 1), 
+                      weight_attr=I.Uniform(-1., 1.), 
+                      bias_attr=I.Uniform(-1., 1.)))
+        
+        # residual net
+        self.resnet = ResidualNet(n_layers, channels, mel_bands, kernel_size, 
+                                  self.dilations_dict[n_group])
+        
+        # output projection
+        # weight and bias start at zero, so the initial output is all zeros
+        self.last_conv = nn.utils.weight_norm(
+            nn.Conv2d(channels, 2, (1, 1),
+                      weight_attr=I.Constant(0.),
+                      bias_attr=I.Constant(0.)))
+    
+    def forward(self, x, condition):
+        """
+        Args:
+            x (Tensor): the input, with 1 channel on axis 1.
+            condition (Tensor): the condition, with `mel_bands` channels on axis 1.
+
+        Returns:
+            Tensor: the output of the last projection, with 2 channels on axis 1.
+        """
+        return self.last_conv(self.resnet(self.first_conv(x), condition))
+
+
+class WaveFlow(nn.LayerList):
+    """
+    A sequence of `n_flows` Flow layers. The input is folded into groups of 
+    size `n_group`, and after each flow the group axis is permuted.
+    """
+    def __init__(self, n_flows, n_layers, n_group, channels, mel_bands, kernel_size):
+        """
+        Args:
+            n_flows (int): the number of flows; must be even.
+            n_layers (int): the number of residual blocks of each flow.
+            n_group (int): the group size; must be even.
+            channels (int): the number of residual channels of each flow.
+            mel_bands (int): the number of channels of the condition.
+            kernel_size (int or tuple of int): the kernel size of each flow.
+
+        Raises:
+            ValueError: if n_group or n_flows is odd.
+        """
+        if n_group % 2 or n_flows % 2:
+            raise ValueError("number of flows and number of group must be even "
+                             "since a permutation along group among flows is used.")
+        super(WaveFlow, self).__init__()
+        for i in range(n_flows):
+            self.append(Flow(n_layers, channels, mel_bands, kernel_size, n_group))
+        
+        # permutations in h
+        # the first half of the flows reverse the whole group axis, the 
+        # remaining flows reverse each half of the group axis separately
+        indices = list(range(n_group))
+        half = n_group // 2
+        self.perms = []
+        for i in range(n_flows):
+            if i < n_flows // 2:
+                self.perms.append(indices[::-1])
+            else:
+                perm = list(reversed(indices[:half])) + list(reversed(indices[half:]))
+                self.perms.append(perm)
+                
+        self.n_group = n_group
+        
+    def trim(self, x, condition):
+        """
+        Cut x and condition to the largest multiple of n_group that is not 
+        longer than x.
+
+        Args:
+            x (Tensor): indexed as (B, T) here.
+            condition (Tensor): indexed as (B, C, T) here; its last dimension 
+                must be at least as long as that of x.
+
+        Returns:
+            (x, condition): the trimmed tensors.
+        """
+        assert condition.shape[-1] >= x.shape[-1]
+        pruned_len = int(x.shape[-1] // self.n_group * self.n_group)
+        
+        if x.shape[-1] > pruned_len:
+            x = x[:, :pruned_len]
+        if condition.shape[-1] > pruned_len:
+            condition = condition[:, :, :pruned_len]
+        return x, condition
+    
+    def forward(self, x, condition):
+        """
+        Args:
+            x (Tensor): shape(B, T), the input.
+            condition (Tensor): shape(B, C, T), the upsampled condition.
+
+        Returns:
+            (z, logs_list)
+            z (Tensor): the output of the last flow with the channel axis 
+                squeezed.
+            logs_list (list): the `logs` tensor of every flow, in flow order.
+        """
+        # x: (B, T)
+        # condition: (B, C, T) upsampled condition
+        x, condition = self.trim(x, condition)
+        
+        # transpose to (B, C, h, T //h) layout
+        x = paddle.unsqueeze(paddle.transpose(fold(x, self.n_group), [0, 2, 1]), 1)
+        condition = paddle.transpose(fold(condition, self.n_group), [0, 1, 3, 2])
+        
+        # flows
+        logs_list = []
+        for i, layer in enumerate(self):
+            # shifting: z[i, j] depends only on x[<i, :]
+            # the flow sees rows 0..h-2 of x together with rows 1..h-1 of the 
+            # condition, and produces the affine parameters for rows 1..h-1
+            input = x[:, :, :-1, :]
+            cond = condition[:, :, 1:, :]
+            output = layer(input, cond)
+            logs, b = paddle.chunk(output, 2, axis=1)
+            logs_list.append(logs)
+
+            x_0 = x[:, :, :1, :] # the first row, just  copy
+            # affine transform of the remaining rows: scale exp(logs), shift b
+            x_out = x[:, :, 1:, :] * paddle.exp(logs) + b            
+            x = paddle.concat([x_0, x_out], axis=2)
+            
+            # permute paddle has no shuffle dim
+            # x and condition are permuted identically along the group axis
+            x = geo.shuffle_dim(x, 2, perm=self.perms[i])
+            condition = geo.shuffle_dim(condition, 2, perm=self.perms[i])
+        
+        z = paddle.squeeze(x, 1)
+        return z, logs_list
+
+
+# TODO(chenfeiyu): WaveFlowLoss
--- a/parakeet/models/waveflow_deprecated/init.py
+++ b/parakeet/models/waveflow_deprecated/init.py
--- a/parakeet/models/waveflow_deprecated/waveflow_modules.py
+++ b/parakeet/models/waveflow_deprecated/waveflow_modules.py
--- a/parakeet/models/wavenet.py
+++ b/parakeet/models/wavenet.py
@ -0,0 +1,715 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import division
+import math
+import time
+from tqdm import trange
+import numpy as np
+
+import paddle 
+from paddle import nn
+from paddle.nn import functional as F
+import paddle.fluid.initializer as I
+import paddle.fluid.layers.distributions as D
+
+from parakeet.modules.conv import Conv1dCell
+
+def quantize(values, n_bands):
+    """Linearly quantize a float Tensor in [-1, 1) to an integer Tensor in [0, n_bands).
+
+    Args:
+        values (Variable): dtype: float32 or float64. The floating point values.
+        n_bands (int): the number of bands. The output integer Tensor's values are in the range [0, n_bands).
+
+    Returns:
+        Variable: the quantized tensor, dtype: int64.
+    """
+    # shift [-1, 1) to [0, 1), stretch to [0, n_bands), then cast to int64
+    unit_interval = (values + 1.0) / 2.0
+    return paddle.cast(unit_interval * n_bands, "int64")
+
+
+def dequantize(quantized, n_bands, dtype=None):
+    """Linearly dequantize an integer Tensor into a float Tensor in the range [-1, 1).
+
+    Args:
+        quantized (Variable): dtype: int64. The quantized values in the range [0, n_bands).
+        n_bands (int): number of bands. The input integer Tensor's values are in the range [0, n_bands).
+        dtype (str, optional): dtype of the result. When it is not given, 
+            paddle's default dtype is used.
+
+    Returns:
+        Variable: the dequantized tensor, dtype is specified by dtype.
+    """
+    if not dtype:
+        dtype = paddle.get_default_dtype()
+    as_float = paddle.cast(quantized, dtype)
+    bin_width = 2.0 / n_bands
+    # the + 0.5 places each value at the center of its band
+    return (as_float + 0.5) * bin_width - 1.0
+
+
+def crop(x, audio_start, audio_length):
+    """Crop the upsampled condition to match audio_length. The upsampled condition has the same time steps as the whole audio does. But since audios are sliced to 0.5 seconds randomly while conditions are not, upsampled conditions should also be sliced to exactly match the time steps of the audio slice.
+
+    Args:
+        x (Variable): shape(B, C, T), dtype float32, the upsampled condition.
+        audio_start (Variable): shape(B, ), dtype: int64, the index of the starting point.
+        audio_length (int): the length of the audio (number of samples it contains).
+
+    Returns:
+        Variable: shape(B, C, audio_length), cropped condition.
+    """
+    # paddle now supports Tensor of shape [1] in slice, so the start indices 
+    # are used directly instead of being converted with `.numpy()` first
+    cropped = []  # one cropped condition per example
+    for idx in range(x.shape[0]):
+        begin = audio_start[idx]
+        finish = begin + audio_length
+        cropped.append(
+            paddle.slice(x[idx], axes=[1], starts=[begin], ends=[finish]))
+    return paddle.stack(cropped)
+
+
+class ResidualBlock(nn.Layer):
+    def __init__(self, residual_channels, condition_dim, filter_size,
+                 dilation):
+        """A Residual block in wavenet. It does not have parametric residual or skip connection. It consists of a dilated Conv1dCell and a Conv1dCell(filter_size = 1) to integrate the condition.
+
+        Args:
+            residual_channels (int): the channels of the input, residual and skip.
+            condition_dim (int): the channels of the condition.
+            filter_size (int): filter size of the internal convolution cell.
+            dilation (int): dilation of the internal convolution cell.
+        """
+        super(ResidualBlock, self).__init__()
+        # twice the residual channels: one half is the content and the other 
+        # half is the gate of the gated tanh unit
+        dilated_channels = 2 * residual_channels
+        # following clarinet's implementation, we do not have parametric residual
+        # & skip connection.
+
+        # filter_size may be given as an int or as a list/tuple
+        _filter_size = filter_size[0] if isinstance(filter_size, (list, tuple)) else filter_size
+        std = math.sqrt(1 / (_filter_size * residual_channels))
+        conv = Conv1dCell(residual_channels, 
+                          dilated_channels, 
+                          filter_size, 
+                          dilation=dilation, 
+                          weight_attr=I.Normal(scale=std))
+        self.conv = nn.utils.weight_norm(conv)
+
+        std = math.sqrt(1 / condition_dim)
+        condition_proj = Conv1dCell(condition_dim, dilated_channels, (1,), 
+                                   weight_attr=I.Normal(scale=std))
+        self.condition_proj = nn.utils.weight_norm(condition_proj)
+
+        self.filter_size = filter_size
+        self.dilation = dilation
+        self.dilated_channels = dilated_channels
+        self.residual_channels = residual_channels
+        self.condition_dim = condition_dim
+
+    def forward(self, x, condition=None):
+        """Conv1D gated-tanh Block.
+
+        Args:
+            x (Tensor): shape(B, C_res, T), the input. (B stands for batch_size, 
+                C_res stands for residual channels, T stands for time steps.) 
+                dtype float32.
+            condition (Tensor, optional): shape(B, C_cond, T), the condition, 
+                it has been upsampled in time steps, so it has the same time 
+                steps as the input does. (C_cond stands for the condition's 
+                channels). Defaults to None.
+
+        Returns:
+            (residual, skip_connection)
+            residual (Tensor): shape(B, C_res, T), the residual, which is used 
+                as the input to the next layer of ResidualBlock.
+            skip_connection (Tensor): shape(B, C_res, T), the skip connection. 
+                This output is accumulated with that of other ResidualBlocks. 
+        """
+        h = x
+
+        # dilated conv
+        h = self.conv(h)
+
+        # condition
+        if condition is not None:
+            h += self.condition_proj(condition)
+
+        # gated tanh
+        content, gate = paddle.split(h, 2, axis=1)
+        z = F.sigmoid(gate) * paddle.tanh(content)
+
+        # projection: the sum of two terms is rescaled by sqrt(0.5)
+        residual = paddle.scale(z + x, math.sqrt(.5))
+        skip_connection = z
+        return residual, skip_connection
+
+    def start_sequence(self):
+        """
+        Prepare the ResidualBlock to generate a new sequence. This method 
+        should be called before starting calling `add_input` multiple times.
+        """
+        self.conv.start_sequence()
+        self.condition_proj.start_sequence()
+
+    def add_input(self, x, condition=None):
+        """
+        Add a step input. This method works similarly to `forward` but 
+        in a `step-in-step-out` fashion.
+
+        Args:
+            x (Variable): shape(B, C_res), input for a step, dtype float32.
+            condition (Variable, optional): shape(B, C_cond). condition for a 
+                step, dtype float32. Defaults to None.
+
+        Returns:
+            (residual, skip_connection)
+            residual (Variable): shape(B, C_res), the residual for a step, 
+                which is used as the input to the next layer of ResidualBlock.
+            skip_connection (Variable): shape(B, C_res), the skip connection 
+                for a step. This output is accumulated with that of other 
+                ResidualBlocks. 
+        """
+        h = x
+
+        # dilated conv
+        h = self.conv.add_input(h)
+
+        # condition
+        if condition is not None:
+            h += self.condition_proj.add_input(condition)
+
+        # gated tanh
+        content, gate = paddle.split(h, 2, axis=1)
+        z = F.sigmoid(gate) * paddle.tanh(content)
+
+        # projection: the sum of two terms is rescaled by sqrt(0.5)
+        residual = paddle.scale(z + x, math.sqrt(0.5))
+        skip_connection = z
+        return residual, skip_connection
+
+
+class ResidualNet(nn.LayerList):
+    def __init__(self, n_loop, n_layer, residual_channels, condition_dim,
+                 filter_size):
+        """The residual network in wavenet. It is made of `n_layer` stacks, 
+            and every stack is made of `n_loop` ResidualBlocks.
+
+        Args:
+            n_loop (int): number of ResidualBlocks in a stack.
+            n_layer (int): number of stacks in the `ResidualNet`.
+            residual_channels (int): channels of each `ResidualBlock`'s input.
+            condition_dim (int): channels of the condition.
+            filter_size (int): filter size of the internal Conv1dCell of each 
+                `ResidualBlock`.
+        """
+        super(ResidualNet, self).__init__()
+        # the dilation doubles from block to block inside a stack and starts 
+        # over at 1 for the next stack
+        stack_dilations = [2**i for i in range(n_loop)]
+        dilations = stack_dilations * n_layer
+        self.context_size = 1 + sum(dilations)
+        for d in dilations:
+            block = ResidualBlock(residual_channels, condition_dim, filter_size, d)
+            self.append(block)
+
+    def forward(self, x, condition=None):
+        """Run the whole sequence through every block and merge the skips.
+
+        Args:
+            x (Tensor): shape(B, C_res, T), dtype float32, the input. 
+                (B stands for batch_size, C_res stands for residual channels, 
+                T stands for time steps.)
+            condition (Tensor, optional): shape(B, C_cond, T), dtype float32, 
+                the condition. It has been upsampled in time steps, so it has 
+                the same time steps as the input does. (C_cond stands for the 
+                condition's channels) Defaults to None.
+
+        Returns:
+            skip_connection (Tensor): shape(B, C_res, T), dtype float32, the output.
+        """
+        for idx, block in enumerate(self):
+            x, skip = block(x, condition)
+            if idx > 0:
+                # running sum of the skips, rescaled by sqrt(0.5) at each step
+                skip_connections = paddle.scale(skip_connections + skip,
+                                                math.sqrt(0.5))
+            else:
+                skip_connections = skip
+        return skip_connections
+
+    def start_sequence(self):
+        """Prepare the ResidualNet to generate a new sequence. Call this 
+            before the first of a series of `add_input` calls.
+        """
+        for layer in self:
+            layer.start_sequence()
+
+    def add_input(self, x, condition=None):
+        """Add a step input. This method works similarly to `forward` but 
+            in a `step-in-step-out` fashion.
+
+        Args:
+            x (Tensor): shape(B, C_res), dtype float32, input for a step.
+            condition (Tensor, optional): shape(B, C_cond), dtype float32, 
+                condition for a step. Defaults to None.
+
+        Returns:
+            skip_connection (Tensor): shape(B, C_res), dtype float32, the 
+                output for a step.
+        """
+        for idx, block in enumerate(self):
+            x, skip = block.add_input(x, condition)
+            if idx > 0:
+                # running sum of the skips, rescaled by sqrt(0.5) at each step
+                skip_connections = paddle.scale(skip_connections + skip,
+                                                math.sqrt(0.5))
+            else:
+                skip_connections = skip
+        return skip_connections
+
+
+class WaveNet(nn.Layer):
+    def __init__(self, n_loop, n_layer, residual_channels, output_dim,
+                 condition_dim, filter_size, loss_type, log_scale_min):
+        """Wavenet that transforms upsampled mel spectrogram into waveform.
+
+        Args:
+            n_loop (int): n_loop for the internal ResidualNet.
+            n_layer (int): n_layer for the internal ResidualNet.
+            residual_channels (int): the channel of the input.
+            output_dim (int): the channel of the output distribution. 
+            condition_dim (int): the channel of the condition.
+            filter_size (int): the filter size of the internal ResidualNet.
+            loss_type (str): loss type of the wavenet. Possible values are 
+                'softmax' and 'mog'. 
+                If `loss_type` is 'softmax', the output is the logits of the 
+                categorical(multinomial) distribution, `output_dim` means the 
+                number of classes of the categorical distribution. 
+                If `loss_type` is mog(mixture of gaussians), the output is the 
+                parameters of a mixture of gaussians, which consists of weight
+                (in the form of logit) of each gaussian distribution and its 
+                mean and log standard deviation. So when `loss_type` is 'mog', 
+                `output_dim` must be divisible by 3.
+            log_scale_min (int or float): the minimum value of log standard 
+                deviation of the output gaussian distributions. Note that this 
+                value is only used for computing loss if `loss_type` is 'mog', 
+                values less than `log_scale_min` are clipped when computing loss.
+
+        Raises:
+            ValueError: if `loss_type` is neither 'softmax' nor 'mog', or if 
+                `loss_type` is 'mog' and `output_dim` is not divisible by 3.
+        """
+        super(WaveNet, self).__init__()
+        if loss_type not in ["softmax", "mog"]:
+            raise ValueError("loss_type {} is not supported".format(loss_type))
+        if loss_type == "softmax":
+            # the input waveform is quantized, so it is embedded by lookup
+            self.embed = nn.Embedding(output_dim, residual_channels)
+        else:
+            if (output_dim % 3 != 0):
+                raise ValueError(
+                    "with Mixture of Gaussians(mog) output, the output dim must be divisible by 3, but get {}".format(output_dim))
+            # the input waveform is a real value, so it is linearly projected
+            self.embed = nn.utils.weight_norm(nn.Linear(1, residual_channels), dim=-1)
+
+        self.resnet = ResidualNet(n_loop, n_layer, residual_channels,
+                                  condition_dim, filter_size)
+        self.context_size = self.resnet.context_size
+
+        skip_channels = residual_channels  # assume the same channel
+        self.proj1 = nn.utils.weight_norm(nn.Linear(skip_channels, skip_channels), dim=-1)
+        self.proj2 = nn.utils.weight_norm(nn.Linear(skip_channels, skip_channels), dim=-1)
+        # if loss_type is softmax, output_dim is n_vocab of waveform magnitude.
+        # if loss_type is mog, output_dim is 3 * gaussian, (weight, mean and stddev)
+        self.proj3 = nn.utils.weight_norm(nn.Linear(skip_channels, output_dim), dim=-1)
+
+        self.loss_type = loss_type
+        self.output_dim = output_dim
+        self.input_dim = 1
+        self.skip_channels = skip_channels
+        self.log_scale_min = log_scale_min
+
+    def forward(self, x, condition=None):
+        """compute the output distribution (represented by its parameters).
+
+        Args:
+            x (Tensor): shape(B, T), dtype float32, the input waveform.
+            condition (Tensor, optional): shape(B, C_cond, T), dtype float32, 
+                the upsampled condition. Defaults to None.
+
+        Returns:
+            Tensor: shape(B, T, C_output), dtype float32, the parameter of 
+            the output distributions.
+        """
+
+        # Causal Conv
+        if self.loss_type == "softmax":
+            # keep the value strictly below 1 so that it quantizes into 
+            # [0, output_dim)
+            x = paddle.clip(x, min=-1., max=0.99999)
+            x = quantize(x, self.output_dim)
+            x = self.embed(x)  # (B, T, C)
+        else:
+            x = paddle.unsqueeze(x, -1)  # (B, T, 1)
+            x = self.embed(x)  # (B, T, C)
+        x = paddle.transpose(x, perm=[0, 2, 1])  # (B, C, T)
+
+        # Residual & Skip-connection & linears
+        z = self.resnet(x, condition)
+
+        z = paddle.transpose(z, [0, 2, 1])
+        z = F.relu(self.proj2(F.relu(self.proj1(z))))
+
+        y = self.proj3(z)
+        return y
+
+    def start_sequence(self):
+        """Prepare the WaveNet to generate a new sequence. This method should 
+            be called before starting calling `add_input` multiple times.
+        """
+        self.resnet.start_sequence()
+
+    def add_input(self, x, condition=None):
+        """compute the output distribution (represented by its parameters) for 
+            a step. It works similarly to the `forward` method but in a 
+            `step-in-step-out` fashion.
+
+        Args:
+            x (Tensor): shape(B,), dtype float32, a step of the input waveform.
+            condition (Tensor, optional): shape(B, C_cond, ), dtype float32, a 
+                step of the upsampled condition. Defaults to None.
+
+        Returns:
+            Tensor: shape(B, C_output), dtype float32, the parameter of the 
+                output distributions.
+        """
+        # Causal Conv
+        if self.loss_type == "softmax":
+            x = paddle.clip(x, min=-1., max=0.99999)
+            x = quantize(x, self.output_dim)
+            x = self.embed(x)  # (B, C)
+        else:
+            x = paddle.unsqueeze(x, -1)  # (B, 1)
+            x = self.embed(x)  # (B, C)
+
+        # Residual & Skip-connection & linears
+        z = self.resnet.add_input(x, condition)
+        z = F.relu(self.proj2(F.relu(self.proj1(z))))  # (B, C)
+
+        # Output
+        y = self.proj3(z)
+        return y
+
+    def compute_softmax_loss(self, y, t):
+        """compute the loss where output distribution is a categorical distribution.
+
+        Args:
+            y (Tensor): shape(B, T, C_output), dtype float32, the logits of the 
+                output distribution.
+            t (Tensor): shape(B, T), dtype float32, the target audio. Note that 
+                the target's corresponding time index is one step ahead of the 
+                output distribution. And output distribution whose input contains 
+                padding is neglected in loss computation.
+
+        Returns:
+            Tensor: shape(1, ), dtype float32, the loss.
+        """
+        # context size is not taken into account
+        y = y[:, self.context_size:, :]
+        t = t[:, self.context_size:]
+        t = paddle.clip(t, min=-1.0, max=0.99999)
+        quantized = quantize(t, n_bands=self.output_dim)
+        label = paddle.unsqueeze(quantized, -1)
+
+        loss = F.softmax_with_cross_entropy(y, label)
+        reduced_loss = paddle.reduce_mean(loss)
+        return reduced_loss
+
+    def sample_from_softmax(self, y):
+        """Sample from the output distribution where the output distribution is 
+            a categorical distribution.
+
+        Args:
+            y (Tensor): shape(B, T, C_output), the logits of the output distribution.
+
+        Returns:
+            Tensor: shape(B, T), waveform sampled from the output distribution.
+        """
+        # dequantize
+        batch_size, time_steps, output_dim, = y.shape
+        # flatten batch and time so that one class id is drawn per row
+        y = paddle.reshape(y, (batch_size * time_steps, output_dim))
+        prob = F.softmax(y)
+        quantized = paddle.fluid.layers.sampling_id(prob)
+        samples = dequantize(quantized, n_bands=self.output_dim)
+        samples = paddle.reshape(samples, (batch_size, -1))
+        return samples
+
+    def compute_mog_loss(self, y, t):
+        """compute the loss where output distribution is a mixture of Gaussians.
+
+        Args:
+            y (Tensor): shape(B, T, C_output), dtype float32, the parameters of 
+                the output distribution. It is the concatenation of 3 parts, 
+                the logits of every distribution, the mean of each distribution 
+                and the log standard deviation of each distribution. Each part's 
+                shape is (B, T, n_mixture), where `n_mixture` means the number 
+                of Gaussians in the mixture.
+            t (Tensor): shape(B, T), dtype float32, the target audio. Note that 
+                the target's corresponding time index is one step ahead of the 
+                output distribution. And output distribution whose input contains 
+                padding is neglected in loss computation.
+
+        Returns:
+            Tensor: shape(1, ), dtype float32, the loss.
+        """
+        n_mixture = self.output_dim // 3
+
+        # context size is not taken into account
+        y = y[:, self.context_size:, :]
+        t = t[:, self.context_size:]
+
+        w, mu, log_std = paddle.split(y, 3, axis=2)
+        # 100.0 is just a large float
+        log_std = paddle.clip(log_std, min=self.log_scale_min, max=100.)
+        inv_std = paddle.exp(-log_std)
+        p_mixture = F.softmax(w, -1)
+
+        t = paddle.unsqueeze(t, -1)
+        if n_mixture > 1:
+            # t = F.expand_as(t, log_std)
+            t = paddle.expand(t, [-1, -1, n_mixture])
+
+        # Gaussian density of the target under every component
+        x_std = inv_std * (t - mu)
+        exponent = paddle.exp(-0.5 * x_std * x_std)
+        pdf_x = 1.0 / math.sqrt(2.0 * math.pi) * inv_std * exponent
+
+        pdf_x = p_mixture * pdf_x
+        # pdf_x: [bs, len]
+        pdf_x = paddle.reduce_sum(pdf_x, -1)
+        # 1e-9 keeps the log finite when the density is 0
+        per_sample_loss = -paddle.log(pdf_x + 1e-9)
+
+        loss = paddle.reduce_mean(per_sample_loss)
+        return loss
+
+    def sample_from_mog(self, y):
+        """Sample from the output distribution where the output distribution is 
+            a mixture of Gaussians.
+        Args:
+            y (Tensor): shape(B, T, C_output), dtype float32, the parameters of 
+            the output distribution. It is the concatenation of 3 parts, the 
+            logits of every distribution, the mean of each distribution and the 
+            log standard deviation of each distribution. Each part's shape is 
+            (B, T, n_mixture), where `n_mixture` means the number of Gaussians 
+            in the mixture.
+
+        Returns:
+            Tensor: shape(B, T), waveform sampled from the output distribution.
+        """
+        batch_size, time_steps, output_dim = y.shape
+        n_mixture = output_dim // 3
+
+        w, mu, log_std = paddle.split(y, 3, -1)
+
+        # draw one mixture component per (batch, time) position
+        reshaped_w = paddle.reshape(w, (batch_size * time_steps, n_mixture))
+        prob_ids = paddle.fluid.layers.sampling_id(F.softmax(reshaped_w))
+        prob_ids = paddle.reshape(prob_ids, (batch_size, time_steps))
+        prob_ids = prob_ids.numpy()
+
+        # build (b, t, component) indices that select the parameters of the 
+        # drawn component at every position
+        index = np.array([[[b, t, prob_ids[b, t]] for t in range(time_steps)]
+                          for b in range(batch_size)]).astype("int32")
+        index_var = paddle.to_tensor(index)
+
+        mu_ = paddle.gather_nd(mu, index_var)
+        log_std_ = paddle.gather_nd(log_std, index_var)
+
+        dist = D.Normal(mu_, paddle.exp(log_std_))
+        samples = dist.sample(shape=[])
+        samples = paddle.clip(samples, min=-1., max=1.)
+        return samples
+
+    def sample(self, y):
+        """Sample from the output distribution.
+        Args:
+            y (Tensor): shape(B, T, C_output), dtype float32, the parameters of 
+                the output distribution.
+
+        Returns:
+            Tensor: shape(B, T), waveform sampled from the output distribution.
+        """
+        if self.loss_type == "softmax":
+            return self.sample_from_softmax(y)
+        else:
+            return self.sample_from_mog(y)
+
+    def loss(self, y, t):
+        """compute the loss, using the softmax loss or the mixture of 
+            Gaussians loss according to `loss_type`.
+
+        Args:
+            y (Tensor): shape(B, T, C_output), dtype float32, the parameters of 
+                the output distribution.
+            t (Tensor): shape(B, T), dtype float32, the target audio. Note that 
+                the target's corresponding time index is one step ahead of the 
+                output distribution. And output distribution whose input contains 
+                padding is neglected in loss computation.
+
+        Returns:
+            Tensor: shape(1, ), dtype float32, the loss.
+        """
+        if self.loss_type == "softmax":
+            return self.compute_softmax_loss(y, t)
+        else:
+            return self.compute_mog_loss(y, t)
+
+
+class UpsampleNet(nn.LayerList):
+    def __init__(self, upscale_factors=(16, 16)):
+        """UpsamplingNet.
+        It consists of several layers of ConvTranspose2d. Each ConvTranspose2d 
+            layer upsamples the time dimension by its `stride` times. And each 
+            ConvTranspose2d's filter_size at frequency dimension is 3.
+
+        Args:
+            upscale_factors (iterable of int, optional): time upsampling 
+                factors for each ConvTranspose2d Layer. The `UpsampleNet` 
+                contains len(upscale_factors) ConvTranspose2d Layers. Each 
+                upscale_factor is used as the `stride` for the corresponding 
+                ConvTranspose2d. Defaults to (16, 16).
+        Note:
+            np.prod(upscale_factors) should equal the `hop_length` of the stft 
+                transformation used to extract spectrogram features from audios. 
+                For example, 16 * 16 = 256, then the spectrogram is extracted 
+                using a stft transformation whose `hop_length` is 256. See 
+                `librosa.stft` for more details.
+        """
+        super(UpsampleNet, self).__init__()
+        # materialize once, so that any iterable (even a one-shot generator) 
+        # can be stored and then iterated again below
+        self.upscale_factors = list(upscale_factors)
+        self.upscale_factor = 1
+        for item in self.upscale_factors:
+            self.upscale_factor *= item
+
+        for factor in self.upscale_factors:
+            # kernel 2 * factor, stride factor and padding factor // 2 make 
+            # the time length exactly `factor` times longer when `factor` is 
+            # even; kernel 3 with padding 1 keeps the frequency size
+            self.append(
+                nn.utils.weight_norm(
+                    nn.ConvTranspose2d(1, 1, 
+                        kernel_size=(3, 2 * factor), 
+                        stride=(1, factor), 
+                        padding=(1, factor // 2))))
+
+    def forward(self, x):
+        """Compute the upsampled condition.
+
+        Args:
+            x (Tensor): shape(B, F, T), dtype float32, the condition 
+                (mel spectrogram here.) (F means the frequency bands). In the 
+                internal ConvTranspose2ds, the frequency dimension is treated 
+                as `height` dimension instead of `in_channels`.
+
+        Returns:
+            Tensor: shape(B, F, T * upscale_factor), dtype float32, the 
+                upsampled condition.
+        """
+        # add a single channel axis: (B, 1, F, T)
+        x = paddle.unsqueeze(x, 1)
+        for sublayer in self:
+            x = F.leaky_relu(sublayer(x), 0.4)
+        x = paddle.squeeze(x, 1)
+        return x
+
+
+class ConditionalWavenet(nn.Layer):
+    def __init__(self, encoder, decoder):
+        """Conditional Wavenet, which contains an UpsampleNet as the encoder 
+            and a WaveNet as the decoder. It is an autoregressive model.
+
+        Args:
+            encoder (UpsampleNet): the UpsampleNet as the encoder.
+            decoder (WaveNet): the WaveNet as the decoder.
+        """
+        super(ConditionalWavenet, self).__init__()
+        self.encoder = encoder
+        self.decoder = decoder
+
+    def forward(self, audio, mel, audio_start):
+        """Compute the output distribution given the mel spectrogram and the 
+            input(for teacher force training).
+
+        Args:
+            audio (Tensor): shape(B, T_audio), dtype float32, ground truth 
+                waveform, used for teacher force training.
+            mel (Tensor): shape(B, F, T_mel), dtype float32, mel spectrogram. 
+                Note that it is the spectrogram for the whole utterance.
+            audio_start (Tensor): shape(B, ), dtype: int, audio slices' start 
+                positions for each utterance.
+
+        Returns:
+            Tensor: shape(B, T_audio - 1, C_output), parameters for the output 
+                distribution.(C_output is the `output_dim` of the decoder.)
+        """
+        audio_length = audio.shape[1]  # audio clip's length
+        condition = self.encoder(mel)
+        condition_slice = crop(condition, audio_start, audio_length)
+
+        # shifting 1 step: the sample at step t is paired with the condition 
+        # at step t + 1, i.e. the step whose distribution is predicted
+        audio = audio[:, :-1]
+        condition_slice = condition_slice[:, :, 1:]
+
+        y = self.decoder(audio, condition_slice)
+        return y
+
+    def loss(self, y, t):
+        """compute loss with respect to the output distribution and the target 
+            audio.
+
+        Args:
+            y (Tensor): shape(B, T - 1, C_output), dtype float32, parameters of 
+                the output distribution.
+            t (Tensor): shape(B, T), dtype float32, target waveform.
+
+        Returns:
+            Tensor: shape(1, ), dtype float32, the loss.
+        """
+        # drop the first sample: it is only ever an input, never a target
+        t = t[:, 1:]
+        loss = self.decoder.loss(y, t)
+        return loss
+
+    def sample(self, y):
+        """Sample from the output distribution.
+
+        Args:
+            y (Tensor): shape(B, T, C_output), dtype float32, parameters of the 
+                output distribution.
+
+        Returns:
+            Tensor: shape(B, T), dtype float32, sampled waveform from the output 
+                distribution.
+        """
+        samples = self.decoder.sample(y)
+        return samples
+
+    @paddle.no_grad()
+    def synthesis(self, mel):
+        """Synthesize waveform from mel spectrogram.
+
+        Args:
+            mel (Tensor): shape(B, F, T), condition(mel spectrogram here).
+
+        Returns:
+            Tensor: shape(B, T * upscale_factor), synthesized waveform.
+                (`upscale_factor` is the `upscale_factor` of the encoder 
+                `UpsampleNet`)
+        """
+        condition = self.encoder(mel)
+        batch_size, _, time_steps = condition.shape
+        samples = []
+
+        self.decoder.start_sequence()
+        # generation starts from a zero sample for every batch item
+        x_t = paddle.zeros((batch_size, ), dtype=mel.dtype)
+        for i in trange(time_steps):
+            c_t = condition[:, :, i]  # (B, C_cond)
+            y_t = self.decoder.add_input(x_t, c_t)  # (B, C_output)
+            y_t = paddle.unsqueeze(y_t, 1)  # (B, 1, C_output)
+            x_t = self.sample(y_t)  # (B, 1)
+            x_t = paddle.squeeze(x_t, 1)  # (B, )
+            samples.append(x_t)
+
+        # every step is of shape (B, ), so the steps are stacked along a new 
+        # time axis to get (B, T); concatenating them would give a flat 
+        # (B * T, ) tensor with the batch items mixed together
+        samples = paddle.stack(samples, -1)
+        return samples
+
+
+# TODO WaveNetLoss
--- a/parakeet/models/wavenet/init.py
+++ b/parakeet/models/wavenet/init.py
@ -1,16 +0,0 @@
-# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from .net import *
-from .wavenet import *
--- a/parakeet/models/wavenet/net.py
+++ b/parakeet/models/wavenet/net.py
@ -1,179 +0,0 @@
-# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import division
-import itertools
-import numpy as np
-from scipy import signal
-from tqdm import trange
-
-import paddle.fluid.layers as F
-import paddle.fluid.dygraph as dg
-import paddle.fluid.initializer as I
-import paddle.fluid.layers.distributions as D
-
-from parakeet.modules.weight_norm import Conv2DTranspose
-from parakeet.models.wavenet.wavenet import WaveNet
-
-
-def crop(x, audio_start, audio_length):
-    """Crop the upsampled condition to match audio_length. The upsampled condition has the same time steps as the whole audio does. But since audios are sliced to 0.5 seconds randomly while conditions are not, upsampled conditions should also be sliced to extaclt match the time steps of the audio slice.
-
-    Args:
-        x (Variable): shape(B, C, T), dtype float32, the upsample condition.
-        audio_start (Variable): shape(B, ), dtype: int64, the index the starting point.
-        audio_length (int): the length of the audio (number of samples it contaions).
-
-    Returns:
-        Variable: shape(B, C, audio_length), cropped condition.
-    """
-    # crop audio
-    slices = []  # for each example
-    starts = audio_start.numpy()
-    for i in range(x.shape[0]):
-        start = starts[i]
-        end = start + audio_length
-        slice = F.slice(x[i], axes=[1], starts=[start], ends=[end])
-        slices.append(slice)
-    out = F.stack(slices)
-    return out
-
-
-class UpsampleNet(dg.Layer):
-    def __init__(self, upscale_factors=[16, 16]):
-        """UpsamplingNet.
-        It consists of several layers of Conv2DTranspose. Each Conv2DTranspose layer upsamples the time dimension by its `stride` times. And each Conv2DTranspose's filter_size at frequency dimension is 3.
-
-        Args:
-            upscale_factors (list[int], optional): time upsampling factors for each Conv2DTranspose Layer. The `UpsampleNet` contains len(upscale_factor) Conv2DTranspose Layers. Each upscale_factor is used as the `stride` for the corresponding Conv2DTranspose. Defaults to [16, 16].
-        Note:
-            np.prod(upscale_factors) should equals the `hop_length` of the stft transformation used to extract spectrogram features from audios. For example, 16 * 16 = 256, then the spectram extracted using a stft transformation whose `hop_length` is 256. See `librosa.stft` for more details.
-        """
-        super(UpsampleNet, self).__init__()
-        self.upscale_factors = list(upscale_factors)
-        self.upsample_convs = dg.LayerList()
-        for i, factor in enumerate(upscale_factors):
-            self.upsample_convs.append(
-                Conv2DTranspose(
-                    1,
-                    1,
-                    filter_size=(3, 2 * factor),
-                    stride=(1, factor),
-                    padding=(1, factor // 2)))
-
-    @property
-    def upscale_factor(self):
-        return np.prod(self.upscale_factors)
-
-    def forward(self, x):
-        """Compute the upsampled condition.
-
-        Args:
-            x (Variable): shape(B, F, T), dtype float32, the condition (mel spectrogram here.) (F means the frequency bands). In the internal Conv2DTransposes, the frequency dimension is treated as `height` dimension instead of `in_channels`.
-
-        Returns:
-            Variable: shape(B, F, T * upscale_factor), dtype float32, the upsampled condition.
-        """
-        x = F.unsqueeze(x, axes=[1])
-        for sublayer in self.upsample_convs:
-            x = F.leaky_relu(sublayer(x), alpha=.4)
-        x = F.squeeze(x, [1])
-        return x
-
-
-# AutoRegressive Model
-class ConditionalWavenet(dg.Layer):
-    def __init__(self, encoder, decoder):
-        """Conditional Wavenet, which contains an UpsampleNet as the encoder and a WaveNet as the decoder. It is an autoregressive model.
-
-        Args:
-            encoder (UpsampleNet): the UpsampleNet as the encoder.
-            decoder (WaveNet): the WaveNet as the decoder.
-        """
-        super(ConditionalWavenet, self).__init__()
-        self.encoder = encoder
-        self.decoder = decoder
-
-    def forward(self, audio, mel, audio_start):
-        """Compute the output distribution given the mel spectrogram and the input(for teacher force training).
-
-        Args:
-            audio (Variable): shape(B, T_audio), dtype float32, ground truth waveform, used for teacher force training.
-            mel ([Variable): shape(B, F, T_mel), dtype float32, mel spectrogram. Note that it is the spectrogram for the whole utterance.
-            audio_start (Variable): shape(B, ), dtype: int, audio slices' start positions for each utterance.
-
-        Returns:
-            Variable: shape(B, T_audio - 1, C_putput), parameters for the output distribution.(C_output is the `output_dim` of the decoder.)
-        """
-        audio_length = audio.shape[1]  # audio clip's length
-        condition = self.encoder(mel)
-        condition_slice = crop(condition, audio_start, audio_length)
-
-        # shifting 1 step
-        audio = audio[:, :-1]
-        condition_slice = condition_slice[:, :, 1:]
-
-        y = self.decoder(audio, condition_slice)
-        return y
-
-    def loss(self, y, t):
-        """compute loss with respect to the output distribution and the targer audio.
-
-        Args:
-            y (Variable): shape(B, T - 1, C_output), dtype float32, parameters of the output distribution.
-            t (Variable): shape(B, T), dtype float32, target waveform.
-
-        Returns:
-            Variable: shape(1, ), dtype float32, the loss.
-        """
-        t = t[:, 1:]
-        loss = self.decoder.loss(y, t)
-        return loss
-
-    def sample(self, y):
-        """Sample from the output distribution.
-
-        Args:
-            y (Variable): shape(B, T, C_output), dtype float32, parameters of the output distribution.
-
-        Returns:
-            Variable: shape(B, T), dtype float32, sampled waveform from the output distribution.
-        """
-        samples = self.decoder.sample(y)
-        return samples
-
-    @dg.no_grad
-    def synthesis(self, mel):
-        """Synthesize waveform from mel spectrogram.
-
-        Args:
-            mel (Variable): shape(B, F, T), condition(mel spectrogram here).
-
-        Returns:
-            Variable: shape(B, T * upsacle_factor), synthesized waveform.(`upscale_factor` is the `upscale_factor` of the encoder `UpsampleNet`)
-        """
-        condition = self.encoder(mel)
-        batch_size, _, time_steps = condition.shape
-        samples = []
-
-        self.decoder.start_sequence()
-        x_t = F.zeros((batch_size, 1), dtype="float32")
-        for i in trange(time_steps):
-            c_t = condition[:, :, i:i + 1]
-            y_t = self.decoder.add_input(x_t, c_t)
-            x_t = self.sample(y_t)
-            samples.append(x_t)
-
-        samples = F.concat(samples, axis=-1)
-        return samples
--- a/parakeet/models/wavenet/wavenet.py
+++ b/parakeet/models/wavenet/wavenet.py
@ -1,467 +0,0 @@
-# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import division
-import math
-import time
-import itertools
-import numpy as np
-
-import paddle.fluid.layers as F
-import paddle.fluid.dygraph as dg
-import paddle.fluid.initializer as I
-import paddle.fluid.layers.distributions as D
-
-from parakeet.modules.weight_norm import Linear, Conv1D, Conv1DCell, Conv2DTranspose
-
-
-# for wavenet with softmax loss
-def quantize(values, n_bands):
-    """Linearlly quantize a float Tensor in [-1, 1) to an interger Tensor in [0, n_bands).
-
-    Args:
-        values (Variable): dtype: flaot32 or float64. the floating point value.
-        n_bands (int): the number of bands. The output integer Tensor's value is in the range [0, n_bans).
-
-    Returns:
-        Variable: the quantized tensor, dtype: int64.
-    """
-    quantized = F.cast((values + 1.0) / 2.0 * n_bands, "int64")
-    return quantized
-
-
-def dequantize(quantized, n_bands):
-    """Linearlly dequantize an integer Tensor into a float Tensor in the range [-1, 1).
-
-    Args:
-        quantized (Variable): dtype: int64. The quantized value in the range [0, n_bands).
-        n_bands (int): number of bands. The input integer Tensor's value is in the range [0, n_bans).
-
-    Returns:
-        Variable: the dequantized tensor, dtype float3232.
-    """
-    value = (F.cast(quantized, "float32") + 0.5) * (2.0 / n_bands) - 1.0
-    return value
-
-
-class ResidualBlock(dg.Layer):
-    def __init__(self, residual_channels, condition_dim, filter_size,
-                 dilation):
-        """A Residual block in wavenet. It does not have parametric residual or skip connection. It consists of a Conv1DCell and an Conv1D(filter_size = 1) to integrate the condition.
-
-        Args:
-            residual_channels (int): the channels of the input, residual and skip.
-            condition_dim (int): the channels of the condition.
-            filter_size (int): filter size of the internal convolution cell.
-            dilation (int): dilation of the internal convolution cell.
-        """
-        super(ResidualBlock, self).__init__()
-        dilated_channels = 2 * residual_channels
-        # following clarinet's implementation, we do not have parametric residual
-        # & skip connection.
-
-        std = np.sqrt(1 / (filter_size * residual_channels))
-        self.conv = Conv1DCell(
-            residual_channels,
-            dilated_channels,
-            filter_size,
-            dilation=dilation,
-            causal=True,
-            param_attr=I.Normal(scale=std))
-
-        std = np.sqrt(1 / condition_dim)
-        self.condition_proj = Conv1D(
-            condition_dim, dilated_channels, 1, param_attr=I.Normal(scale=std))
-
-        self.filter_size = filter_size
-        self.dilation = dilation
-        self.dilated_channels = dilated_channels
-        self.residual_channels = residual_channels
-        self.condition_dim = condition_dim
-
-    def forward(self, x, condition=None):
-        """Conv1D gated-tanh Block.
-
-        Args:
-            x (Variable): shape(B, C_res, T), the input. (B stands for batch_size, C_res stands for residual channels, T stands for time steps.) dtype float32.
-            condition (Variable, optional): shape(B, C_cond, T), the condition, it has been upsampled in time steps, so it has the same time steps as the input does.(C_cond stands for the condition's channels). Defaults to None.
-
-        Returns:
-            (residual, skip_connection)
-            residual (Variable): shape(B, C_res, T), the residual, which is used as the input to the next layer of ResidualBlock.
-            skip_connection (Variable): shape(B, C_res, T), the skip connection. This output is accumulated with that of other ResidualBlocks. 
-        """
-        time_steps = x.shape[-1]
-        h = x
-
-        # dilated conv
-        h = self.conv(h)
-        if h.shape[-1] != time_steps:
-            h = h[:, :, :time_steps]
-
-        # condition
-        if condition is not None:
-            h += self.condition_proj(condition)
-
-        # gated tanh
-        content, gate = F.split(h, 2, dim=1)
-        z = F.sigmoid(gate) * F.tanh(content)
-
-        # projection
-        residual = F.scale(z + x, math.sqrt(.5))
-        skip_connection = z
-        return residual, skip_connection
-
-    def start_sequence(self):
-        """Prepare the ResidualBlock to generate a new sequence. This method should be called before starting calling `add_input` multiple times.
-        """
-        self.conv.start_sequence()
-
-    def add_input(self, x, condition=None):
-        """Add a step input. This method works similarily with `forward` but in a `step-in-step-out` fashion.
-
-        Args:
-            x (Variable): shape(B, C_res, T=1), input for a step, dtype float32.
-            condition (Variable, optional): shape(B, C_cond, T=1). condition for a step, dtype float32. Defaults to None.
-
-        Returns:
-            (residual, skip_connection)
-            residual (Variable): shape(B, C_res, T=1), the residual for a step, which is used as the input to the next layer of ResidualBlock.
-            skip_connection (Variable): shape(B, C_res, T=1), the skip connection for a step. This output is accumulated with that of other ResidualBlocks. 
-        """
-        h = x
-
-        # dilated conv
-        h = self.conv.add_input(h)
-
-        # condition
-        if condition is not None:
-            h += self.condition_proj(condition)
-
-        # gated tanh
-        content, gate = F.split(h, 2, dim=1)
-        z = F.sigmoid(gate) * F.tanh(content)
-
-        # projection
-        residual = F.scale(z + x, np.sqrt(0.5))
-        skip_connection = z
-        return residual, skip_connection
-
-
-class ResidualNet(dg.Layer):
-    def __init__(self, n_loop, n_layer, residual_channels, condition_dim,
-                 filter_size):
-        """The residual network in wavenet. It consists of `n_layer` stacks, each of which consists of `n_loop` ResidualBlocks.
-
-        Args:
-            n_loop (int): number of ResidualBlocks in a stack.
-            n_layer (int): number of stacks in the `ResidualNet`.
-            residual_channels (int): channels of each `ResidualBlock`'s input.
-            condition_dim (int): channels of the condition.
-            filter_size (int): filter size of the internal Conv1DCell of each `ResidualBlock`.
-        """
-        super(ResidualNet, self).__init__()
-        # double the dilation at each layer in a loop(n_loop layers)
-        dilations = [2**i for i in range(n_loop)] * n_layer
-        self.context_size = 1 + sum(dilations)
-        self.residual_blocks = dg.LayerList([
-            ResidualBlock(residual_channels, condition_dim, filter_size,
-                          dilation) for dilation in dilations
-        ])
-
-    def forward(self, x, condition=None):
-        """
-        Args:
-            x (Variable): shape(B, C_res, T), dtype float32, the input. (B stands for batch_size, C_res stands for residual channels, T stands for time steps.)
-            condition (Variable, optional): shape(B, C_cond, T), dtype float32, the condition, it has been upsampled in time steps, so it has the same time steps as the input does.(C_cond stands for the condition's channels) Defaults to None.
-
-        Returns:
-            skip_connection (Variable): shape(B, C_res, T), dtype float32, the output.
-        """
-        for i, func in enumerate(self.residual_blocks):
-            x, skip = func(x, condition)
-            if i == 0:
-                skip_connections = skip
-            else:
-                skip_connections = F.scale(skip_connections + skip,
-                                           np.sqrt(0.5))
-        return skip_connections
-
-    def start_sequence(self):
-        """Prepare the ResidualNet to generate a new sequence. This method should be called before starting calling `add_input` multiple times.
-        """
-        for block in self.residual_blocks:
-            block.start_sequence()
-
-    def add_input(self, x, condition=None):
-        """Add a step input. This method works similarily with `forward` but in a `step-in-step-out` fashion.
-
-        Args:
-            x (Variable): shape(B, C_res, T=1), dtype float32, input for a step.
-            condition (Variable, optional): shape(B, C_cond, T=1), dtype float32, condition for a step. Defaults to None.
-
-        Returns:
-            skip_connection (Variable): shape(B, C_res, T=1), dtype float32, the output for a step.
-        """
-
-        for i, func in enumerate(self.residual_blocks):
-            x, skip = func.add_input(x, condition)
-            if i == 0:
-                skip_connections = skip
-            else:
-                skip_connections = F.scale(skip_connections + skip,
-                                           np.sqrt(0.5))
-        return skip_connections
-
-
-class WaveNet(dg.Layer):
-    def __init__(self, n_loop, n_layer, residual_channels, output_dim,
-                 condition_dim, filter_size, loss_type, log_scale_min):
-        """Wavenet that transform upsampled mel spectrogram into waveform.
-
-        Args:
-            n_loop (int): n_loop for the internal ResidualNet.
-            n_layer (int): n_loop for the internal ResidualNet.
-            residual_channels (int): the channel of the input.
-            output_dim (int): the channel of the output distribution. 
-            condition_dim (int): the channel of the condition.
-            filter_size (int): the filter size of the internal ResidualNet.
-            loss_type (str): loss type of the wavenet. Possible values are 'softmax' and 'mog'. If `loss_type` is 'softmax', the output is the logits of the catrgotical(multinomial) distribution, `output_dim` means the number of classes of the categorical distribution. If `loss_type` is mog(mixture of gaussians), the output is the parameters of a mixture of gaussians, which consists of weight(in the form of logit) of each gaussian distribution and its mean and log standard deviaton. So when `loss_type` is 'mog', `output_dim` should be perfectly divided by 3.
-            log_scale_min (int): the minimum value of log standard deviation of the output gaussian distributions. Note that this value is only used for computing loss if `loss_type` is 'mog', values less than `log_scale_min` is clipped when computing loss.
-        """
-        super(WaveNet, self).__init__()
-        if loss_type not in ["softmax", "mog"]:
-            raise ValueError("loss_type {} is not supported".format(loss_type))
-        if loss_type == "softmax":
-            self.embed = dg.Embedding((output_dim, residual_channels))
-        else:
-            assert output_dim % 3 == 0, "with MoG output, the output dim must be divided by 3"
-            self.embed = Linear(1, residual_channels)
-
-        self.resnet = ResidualNet(n_loop, n_layer, residual_channels,
-                                  condition_dim, filter_size)
-        self.context_size = self.resnet.context_size
-
-        skip_channels = residual_channels  # assume the same channel
-        self.proj1 = Linear(skip_channels, skip_channels)
-        self.proj2 = Linear(skip_channels, skip_channels)
-        # if loss_type is softmax, output_dim is n_vocab of waveform magnitude.
-        # if loss_type is mog, output_dim is 3 * gaussian, (weight, mean and stddev)
-        self.proj3 = Linear(skip_channels, output_dim)
-
-        self.loss_type = loss_type
-        self.output_dim = output_dim
-        self.input_dim = 1
-        self.skip_channels = skip_channels
-        self.log_scale_min = log_scale_min
-
-    def forward(self, x, condition=None):
-        """compute the output distribution (represented by its parameters).
-
-        Args:
-            x (Variable): shape(B, T), dtype float32, the input waveform.
-            condition (Variable, optional): shape(B, C_cond, T), dtype float32, the upsampled condition. Defaults to None.
-
-        Returns:
-            Variable: shape(B, T, C_output), dtype float32, the parameter of the output distributions.
-        """
-
-        # Causal Conv
-        if self.loss_type == "softmax":
-            x = F.clip(x, min=-1., max=0.99999)
-            x = quantize(x, self.output_dim)
-            x = self.embed(x)  # (B, T, C)
-        else:
-            x = F.unsqueeze(x, axes=[-1])  # (B, T, 1)
-            x = self.embed(x)  # (B, T, C)
-        x = F.transpose(x, perm=[0, 2, 1])  # (B, C, T)
-
-        # Residual & Skip-conenection & linears
-        z = self.resnet(x, condition)
-
-        z = F.transpose(z, [0, 2, 1])
-        z = F.relu(self.proj2(F.relu(self.proj1(z))))
-
-        y = self.proj3(z)
-        return y
-
-    def start_sequence(self):
-        """Prepare the WaveNet to generate a new sequence. This method should be called before starting calling `add_input` multiple times.
-        """
-        self.resnet.start_sequence()
-
-    def add_input(self, x, condition=None):
-        """compute the output distribution (represented by its parameters) for a step. It works similarily with the `forward` method but in a `step-in-step-out` fashion.
-
-        Args:
-            x (Variable): shape(B, T=1), dtype float32, a step of the input waveform.
-            condition (Variable, optional): shape(B, C_cond, T=1), dtype float32, a step of the upsampled condition. Defaults to None.
-
-        Returns:
-            Variable: shape(B, T=1, C_output), dtype float32, the parameter of the output distributions.
-        """
-        # Causal Conv
-        if self.loss_type == "softmax":
-            x = F.clip(x, min=-1., max=0.99999)
-            x = quantize(x, self.output_dim)
-            x = self.embed(x)  # (B, T, C), T=1
-        else:
-            x = F.unsqueeze(x, axes=[-1])  # (B, T, 1), T=1
-            x = self.embed(x)  # (B, T, C)
-        x = F.transpose(x, perm=[0, 2, 1])
-
-        # Residual & Skip-conenection & linears
-        z = self.resnet.add_input(x, condition)
-        z = F.transpose(z, [0, 2, 1])
-        z = F.relu(self.proj2(F.relu(self.proj1(z))))  # (B, T, C)
-
-        # Output
-        y = self.proj3(z)
-        return y
-
-    def compute_softmax_loss(self, y, t):
-        """compute the loss where output distribution is a categorial distribution.
-
-        Args:
-            y (Variable): shape(B, T, C_output), dtype float32, the logits of the output distribution.
-            t (Variable): shape(B, T), dtype float32, the target audio. Note that the target's corresponding time index is one step ahead of the output distribution. And output distribution whose input contains padding is neglected in loss computation.
-
-        Returns:
-            Variable: shape(1, ), dtype float32, the loss.
-        """
-        # context size is not taken into account
-        y = y[:, self.context_size:, :]
-        t = t[:, self.context_size:]
-        t = F.clip(t, min=-1.0, max=0.99999)
-        quantized = quantize(t, n_bands=self.output_dim)
-        label = F.unsqueeze(quantized, axes=[-1])
-
-        loss = F.softmax_with_cross_entropy(y, label)
-        reduced_loss = F.reduce_mean(loss)
-        return reduced_loss
-
-    def sample_from_softmax(self, y):
-        """Sample from the output distribution where the output distribution is a categorical distriobution.
-
-        Args:
-            y (Variable): shape(B, T, C_output), the logits of the output distribution
-
-        Returns:
-            Variable: shape(B, T), waveform sampled from the output distribution.
-        """
-        # dequantize
-        batch_size, time_steps, output_dim, = y.shape
-        y = F.reshape(y, (batch_size * time_steps, output_dim))
-        prob = F.softmax(y)
-        quantized = F.sampling_id(prob)
-        samples = dequantize(quantized, n_bands=self.output_dim)
-        samples = F.reshape(samples, (batch_size, -1))
-        return samples
-
-    def compute_mog_loss(self, y, t):
-        """compute the loss where output distribution is a mixture of Gaussians.
-
-        Args:
-            y (Variable): shape(B, T, C_output), dtype float32, the parameterd of the output distribution. It is the concatenation of 3 parts, the logits of every distribution, the mean of each distribution and the log standard deviation of each distribution. Each part's shape is (B, T, n_mixture), where `n_mixture` means the number of Gaussians in the mixture.
-            t (Variable): shape(B, T), dtype float32, the target audio. Note that the target's corresponding time index is one step ahead of the output distribution. And output distribution whose input contains padding is neglected in loss computation.
-
-        Returns:
-            Variable: shape(1, ), dtype float32, the loss.
-        """
-        n_mixture = self.output_dim // 3
-
-        # context size is not taken in to account
-        y = y[:, self.context_size:, :]
-        t = t[:, self.context_size:]
-
-        w, mu, log_std = F.split(y, 3, dim=2)
-        # 100.0 is just a large float
-        log_std = F.clip(log_std, min=self.log_scale_min, max=100.)
-        inv_std = F.exp(-log_std)
-        p_mixture = F.softmax(w, axis=-1)
-
-        t = F.unsqueeze(t, axes=[-1])
-        if n_mixture > 1:
-            # t = F.expand_as(t, log_std)
-            t = F.expand(t, [1, 1, n_mixture])
-
-        x_std = inv_std * (t - mu)
-        exponent = F.exp(-0.5 * x_std * x_std)
-        pdf_x = 1.0 / math.sqrt(2.0 * math.pi) * inv_std * exponent
-
-        pdf_x = p_mixture * pdf_x
-        # pdf_x: [bs, len]
-        pdf_x = F.reduce_sum(pdf_x, dim=-1)
-        per_sample_loss = -F.log(pdf_x + 1e-9)
-
-        loss = F.reduce_mean(per_sample_loss)
-        return loss
-
-    def sample_from_mog(self, y):
-        """Sample from the output distribution where the output distribution is a mixture of Gaussians.
-        Args:
-            y (Variable): shape(B, T, C_output), dtype float32, the parameterd of the output distribution. It is the concatenation of 3 parts, the logits of every distribution, the mean of each distribution and the log standard deviation of each distribution. Each part's shape is (B, T, n_mixture), where `n_mixture` means the number of Gaussians in the mixture.
-
-        Returns:
-            Variable: shape(B, T), waveform sampled from the output distribution.
-        """
-        batch_size, time_steps, output_dim = y.shape
-        n_mixture = output_dim // 3
-
-        w, mu, log_std = F.split(y, 3, dim=-1)
-
-        reshaped_w = F.reshape(w, (batch_size * time_steps, n_mixture))
-        prob_ids = F.sampling_id(F.softmax(reshaped_w))
-        prob_ids = F.reshape(prob_ids, (batch_size, time_steps))
-        prob_ids = prob_ids.numpy()
-
-        index = np.array([[[b, t, prob_ids[b, t]] for t in range(time_steps)]
-                          for b in range(batch_size)]).astype("int32")
-        index_var = dg.to_variable(index)
-
-        mu_ = F.gather_nd(mu, index_var)
-        log_std_ = F.gather_nd(log_std, index_var)
-
-        dist = D.Normal(mu_, F.exp(log_std_))
-        samples = dist.sample(shape=[])
-        samples = F.clip(samples, min=-1., max=1.)
-        return samples
-
-    def sample(self, y):
-        """Sample from the output distribution.
-        Args:
-            y (Variable): shape(B, T, C_output), dtype float32, the parameterd of the output distribution.
-
-        Returns:
-            Variable: shape(B, T), waveform sampled from the output distribution.
-        """
-        if self.loss_type == "softmax":
-            return self.sample_from_softmax(y)
-        else:
-            return self.sample_from_mog(y)
-
-    def loss(self, y, t):
-        """compute the loss where output distribution is a mixture of Gaussians.
-
-        Args:
-            y (Variable): shape(B, T, C_output), dtype float32, the parameterd of the output distribution.
-            t (Variable): shape(B, T), dtype float32, the target audio. Note that the target's corresponding time index is one step ahead of the output distribution. And output distribution whose input contains padding is neglected in loss computation.
-
-        Returns:
-            Variable: shape(1, ), dtype float32, the loss.
-        """
-        if self.loss_type == "softmax":
-            return self.compute_softmax_loss(y, t)
-        else:
-            return self.compute_mog_loss(y, t)
--- a/parakeet/modules/init.py
+++ b/parakeet/modules/init.py
@ -12,5 +12,3 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-from . import weight_norm
-from .customized import *
--- a/parakeet/modules/attention.py
+++ b/parakeet/modules/attention.py
@ -0,0 +1,197 @@
+import math
+import numpy as np
+import paddle
+from paddle import nn
+from paddle.nn import functional as F
+
+def scaled_dot_product_attention(q, k, v, mask=None, dropout=0.0, training=True):
+    """
+    Masked scaled dot-product attention.
+
+    q, k and v share the same leading dimensions (written as * below). The
+    logits ``q @ k^T`` are scaled by ``1 / sqrt(d)``, positions where the mask
+    is 0 receive a large negative offset, and dropout is applied to the
+    softmax output before it is used to average the values.
+
+    Args:
+        q (Tensor): shape(*, T_q, d), the query tensor.
+        k (Tensor): shape(*, T_k, d), the key tensor.
+        v (Tensor): shape(*, T_k, d_v), the value tensor.
+        mask (Tensor, optional): shape(*, T_q, T_k) or broadcastable shape,
+            the mask tensor, 0 corresponds to padding. Defaults to None.
+        dropout (float, optional): dropout probability applied to the
+            attention weights. Defaults to 0.0.
+        training (bool, optional): forwarded to ``F.dropout``. Defaults to
+            True.
+
+    Returns:
+        (out, attn_weights)
+        out (Tensor): shape(*, T_q, d_v), the context vector.
+        attn_weights (Tensor): shape(*, T_q, T_k), the attention weights
+            (after dropout).
+    """
+    # the feature size is read from the concrete shape, so only imperative
+    # execution is supported
+    feature_size = q.shape[-1]
+    logits = paddle.scale(
+        paddle.matmul(q, k, transpose_y=True), 1.0 / math.sqrt(feature_size))
+
+    if mask is not None:
+        # hard coded offset that pushes masked logits towards -inf
+        logits += paddle.scale((1.0 - mask), -1e12)
+
+    weights = F.dropout(F.softmax(logits, axis=-1), dropout, training=training)
+    return paddle.matmul(weights, v), weights
+
+def drop_head(x, drop_n_heads, training):
+    """
+    Drop n heads from multiple context vectors.
+
+    For every example, ``drop_n_heads`` randomly chosen heads are zeroed and
+    the remaining heads are rescaled by
+    ``num_heads / (num_heads - drop_n_heads)``.
+
+    Args:
+        x (Tensor): shape(batch_size, num_heads, time_steps, channels), the input.
+        drop_n_heads (int): the number of heads to drop for each example.
+        training (bool): heads are only dropped when this is True.
+
+    Raises:
+        ValueError: if drop_n_heads is not in [0, num_heads].
+
+    Returns:
+        Tensor: shape(batch_size, num_heads, time_steps, channels), the input
+            with heads dropped; ``x`` itself when nothing is dropped.
+    """
+    if not training or (drop_n_heads == 0):
+        return x
+
+    batch_size, num_heads, _, _ = x.shape
+    if drop_n_heads < 0 or drop_n_heads > num_heads:
+        raise ValueError(
+            "drop_n_heads should be in [0, {}], but received {}".format(
+                num_heads, drop_n_heads))
+    # drop all heads
+    if num_heads == drop_n_heads:
+        return paddle.zeros_like(x)
+
+    mask = np.ones([batch_size, num_heads])
+    mask[:, :drop_n_heads] = 0
+    # shuffle every row on its own so each example drops different heads
+    for subarray in mask:
+        np.random.shuffle(subarray)
+    scale = float(num_heads) / (num_heads - drop_n_heads)
+    mask = scale * np.reshape(mask, [batch_size, num_heads, 1, 1])
+    # numpy builds a float64 mask; create the tensor with x's dtype so the
+    # product does not mix dtypes
+    out = x * paddle.to_tensor(mask, dtype=x.dtype)
+    return out
+
+def _split_heads(x, num_heads):
+    """Reshape (B, T, h * C) into the per-head layout (B, h, T, C)."""
+    n_batch, n_steps, _ = x.shape
+    per_head = paddle.reshape(x, [n_batch, n_steps, num_heads, -1])
+    return paddle.transpose(per_head, [0, 2, 1, 3])
+
+def _concat_heads(x):
+    """Merge the per-head layout (B, h, T, C) back into (B, T, h * C)."""
+    n_batch, _, n_steps, _ = x.shape
+    time_major = paddle.transpose(x, [0, 2, 1, 3])
+    return paddle.reshape(time_major, [n_batch, n_steps, -1])
+
+# Standard implementations of Monohead Attention & Multihead Attention
+class MonoheadAttention(nn.Layer):
+    """
+    Single head scaled dot product attention with affine projections for the
+    queries, keys, values and the output.
+    """
+    def __init__(self, model_dim, dropout=0.0, k_dim=None, v_dim=None):
+        """
+        Monohead Attention module.
+
+        Args:
+            model_dim (int): the feature size of query.
+            dropout (float, optional): dropout probability applied to the
+                attention weights of the scaled dot product attention.
+                Defaults to 0.0.
+            k_dim (int, optional): feature size of the projected queries and
+                keys. If not provided, it is set to model_dim. Defaults to
+                None.
+            v_dim (int, optional): feature size of the projected values. If
+                not provided, it is set to model_dim. Defaults to None.
+        """
+        super(MonoheadAttention, self).__init__()
+        k_dim = k_dim or model_dim
+        v_dim = v_dim or model_dim
+        self.affine_q = nn.Linear(model_dim, k_dim)
+        self.affine_k = nn.Linear(model_dim, k_dim)
+        self.affine_v = nn.Linear(model_dim, v_dim)
+        self.affine_o = nn.Linear(v_dim, model_dim)
+        
+        self.model_dim = model_dim
+        self.dropout = dropout
+    
+    def forward(self, q, k, v, mask):
+        """
+        Compute context vector and attention weights.
+        
+        Args:
+            q (Tensor): shape(batch_size, time_steps_q, model_dim), the queries.
+            k (Tensor): shape(batch_size, time_steps_k, model_dim), the keys.
+            v (Tensor): shape(batch_size, time_steps_k, model_dim), the values.
+            mask (Tensor): shape(batch_size, times_steps_q, time_steps_k) or 
+                broadcastable shape, dtype: float32 or float64, the mask. It
+                is passed unchanged to `scaled_dot_product_attention`.
+
+        Returns:
+            (out, attention_weights)
+            out (Tensor), shape(batch_size, time_steps_q, model_dim), the context vector.
+            attention_weights (Tensor): shape(batch_size, times_steps_q, time_steps_k), the attention weights.
+        """
+        # project the inputs before attention
+        q = self.affine_q(q) # (B, T, C)
+        k = self.affine_k(k)
+        v = self.affine_v(v)
+        
+        context_vectors, attention_weights = scaled_dot_product_attention(
+            q, k, v, mask, self.dropout, self.training)
+        
+        # project the context vectors back to model_dim
+        out = self.affine_o(context_vectors)
+        return out, attention_weights
+
+        
+class MultiheadAttention(nn.Layer):
+    """
+    Multihead scaled dot product attention.
+    """
+    def __init__(self, model_dim, num_heads, dropout=0.0, k_dim=None, v_dim=None):
+        """
+        Multihead Attention module.
+
+        Args:
+            model_dim (int): the feature size of query.
+            num_heads (int): the number of attention heads.
+            dropout (float, optional): dropout probability applied to the
+                attention weights of the scaled dot product attention.
+                Defaults to 0.0.
+            k_dim (int, optional): feature size of the key of each scaled dot
+                product attention. If not provided, it is set to
+                model_dim / num_heads. Defaults to None.
+            v_dim (int, optional): feature size of the value of each scaled
+                dot product attention. If not provided, it is set to
+                model_dim / num_heads. Defaults to None.
+
+        Raises:
+            ValueError: if model_dim is not divisible by num_heads
+        """
+        super(MultiheadAttention, self).__init__()
+        if model_dim % num_heads != 0:
+            raise ValueError("model_dim must be divisible by num_heads")
+        depth = model_dim // num_heads
+        k_dim = k_dim or depth
+        v_dim = v_dim or depth
+        self.affine_q = nn.Linear(model_dim, num_heads * k_dim)
+        self.affine_k = nn.Linear(model_dim, num_heads * k_dim)
+        self.affine_v = nn.Linear(model_dim, num_heads * v_dim)
+        self.affine_o = nn.Linear(num_heads * v_dim, model_dim)
+
+        self.num_heads = num_heads
+        self.model_dim = model_dim
+        self.dropout = dropout
+
+    def forward(self, q, k, v, mask):
+        """
+        Compute context vector and attention weights.
+
+        Args:
+            q (Tensor): shape(batch_size, time_steps_q, model_dim), the queries.
+            k (Tensor): shape(batch_size, time_steps_k, model_dim), the keys.
+            v (Tensor): shape(batch_size, time_steps_k, model_dim), the values.
+            mask (Tensor or None): shape(batch_size, times_steps_q, time_steps_k)
+                or broadcastable shape, dtype: float32 or float64, the mask.
+                None disables masking.
+
+        Returns:
+            (out, attention_weights)
+            out (Tensor), shape(batch_size, time_steps_q, model_dim), the context vector.
+            attention_weights (Tensor): shape(batch_size, num_heads, times_steps_q, time_steps_k), the attention weights.
+        """
+        q = _split_heads(self.affine_q(q), self.num_heads) # (B, h, T, C)
+        k = _split_heads(self.affine_k(k), self.num_heads)
+        v = _split_heads(self.affine_v(v), self.num_heads)
+        if mask is not None:
+            # unsqueeze for the h dim; a None mask is passed through untouched
+            # because scaled_dot_product_attention accepts it
+            mask = paddle.unsqueeze(mask, 1)
+
+        context_vectors, attention_weights = scaled_dot_product_attention(
+            q, k, v, mask, self.dropout, self.training)
+        # NOTE: there is more sophisticated implementation: Scheduled DropHead
+        context_vectors = _concat_heads(context_vectors) # (B, T, h*C)
+        out = self.affine_o(context_vectors)
+        return out, attention_weights
--- a/parakeet/modules/cbhg.py
+++ b/parakeet/modules/cbhg.py
@ -0,0 +1,104 @@
+import math
+import paddle
+from paddle import nn
+from paddle.nn import functional as F
+from paddle.nn import initializer as I
+
+
+class Conv1dBatchNorm(nn.Layer):
+    """A Conv1d layer followed by BatchNorm1d over its output channels."""
+    def __init__(self, in_channels, out_channels, kernel_size, stride=1, padding=0,
+                 weight_attr=None, bias_attr=None):
+        super().__init__()
+        # TODO(chenfeiyu): carefully initialize Conv1d's weight
+        self.conv = nn.Conv1d(
+            in_channels,
+            out_channels,
+            kernel_size,
+            stride,
+            padding=padding,
+            weight_attr=weight_attr,
+            bias_attr=bias_attr)
+        # TODO: channel last, but BatchNorm1d does not support channel last layout
+        self.bn = nn.BatchNorm1d(out_channels)
+
+    def forward(self, x):
+        """Apply the convolution, then normalize its output."""
+        convolved = self.conv(x)
+        return self.bn(convolved)
+
+
+class Highway(nn.Layer):
+    """
+    Highway layer: ``H(x) * T(x) + x * (1 - T(x))``, where H is a linear layer
+    with relu and T is a linear layer with sigmoid (the gate). The bias of the
+    gate is initialized to -1.
+    """
+    def __init__(self, num_features):
+        super().__init__()
+        self.H = nn.Linear(num_features, num_features)
+        gate_bias_init = I.Constant(-1.)
+        self.T = nn.Linear(num_features, num_features,
+                           bias_attr=gate_bias_init)
+
+        self.num_features = num_features
+
+    def forward(self, x):
+        """Mix the transformed input and the input itself with the gate."""
+        transformed = F.relu(self.H(x))
+        gate = F.sigmoid(self.T(x))
+        carried = x * (1.0 - gate)
+        return transformed * gate + carried
+
+
+class CBHG(nn.Layer):
+    """
+    CBHG module: a bank of 1-D convolutions, max pooling, 1-D convolution
+    projections with a residual connection, highway layers and a
+    bidirectional GRU.
+    """
+    def __init__(self, in_channels, out_channels_per_conv, max_kernel_size,
+                 projection_channels,
+                 num_highways, highway_features,
+                 gru_features):
+        """
+        Args:
+            in_channels (int): the number of channels of the input.
+            out_channels_per_conv (int): output channels of every convolution
+                in the bank.
+            max_kernel_size (int): the bank holds one convolution for each
+                kernel size in [1, max_kernel_size].
+            projection_channels (iterable of int): output channels of the
+                projection convolutions; one more projection back to
+                in_channels is appended so the residual connection is valid.
+            num_highways (int): the number of highway layers.
+            highway_features (int): the feature size of the highway layers.
+            gru_features (int): the hidden size of the GRU.
+        """
+        super(CBHG, self).__init__()
+        self.conv1d_banks = nn.LayerList(
+            [Conv1dBatchNorm(in_channels, out_channels_per_conv, (k,),
+                             padding=((k - 1) // 2, k // 2))
+             for k in range(1, 1 + max_kernel_size)])
+
+        self.projections = nn.LayerList()
+        projection_channels = list(projection_channels)
+        proj_in_channels = [max_kernel_size *
+                            out_channels_per_conv] + projection_channels
+        proj_out_channels = projection_channels + \
+            [in_channels]  # ensure residual connection
+        for c_in, c_out in zip(proj_in_channels, proj_out_channels):
+            conv = nn.Conv1d(c_in, c_out, (3,), padding=(1, 1))
+            self.projections.append(conv)
+
+        if in_channels != highway_features:
+            self.pre_highway = nn.Linear(in_channels, highway_features)
+
+        self.highways = nn.LayerList(
+            [Highway(highway_features) for _ in range(num_highways)])
+
+        self.gru = nn.GRU(highway_features, gru_features,
+                          direction="bidirectional")
+
+        self.in_channels = in_channels
+        self.out_channels_per_conv = out_channels_per_conv
+        self.max_kernel_size = max_kernel_size
+        self.num_projections = 1 + len(projection_channels)
+        self.num_highways = num_highways
+        self.highway_features = highway_features
+        self.gru_features = gru_features
+
+    def forward(self, x):
+        """
+        Args:
+            x (Tensor): the input; the convolutions and the concatenation
+                along axis 1 treat it as (batch_size, in_channels,
+                time_steps).
+
+        Returns:
+            Tensor: the output sequence of the bidirectional GRU.
+        """
+        residual = x
+
+        # conv banks, concatenated along the channel axis
+        x = F.relu(paddle.concat([conv(x) for conv in self.conv1d_banks], 1))
+
+        # max pool
+        x = F.max_pool1d(x, 2, stride=1, padding=(0, 1))
+
+        # conv1d projections: relu between projections, the last one is linear
+        last_projection = len(self.projections) - 1
+        for i, conv in enumerate(self.projections):
+            x = conv(x)
+            if i != last_projection:
+                x = F.relu(x)
+        x = x + residual  # residual connection
+
+        # highway, applied on the feature-last layout
+        x = paddle.transpose(x, [0, 2, 1])
+        if hasattr(self, "pre_highway"):
+            x = self.pre_highway(x)
+        for highway in self.highways:
+            x = highway(x)
+
+        # gru
+        x, _ = self.gru(x)
+        return x
--- a/parakeet/modules/connections.py
+++ b/parakeet/modules/connections.py
@ -0,0 +1,62 @@
+import paddle
+from paddle import nn
+from paddle.nn import functional as F
+
+def residual_connection(input, layer):
+    """Add a layer's output to its own input: y = x + F(x).
+
+    Only meant for layers with a single input and a single output.
+
+    Args:
+        input (Tensor): the input tensor.
+        layer (callable): a callable that preserves tensor shape.
+
+    Returns:
+        Tensor: the sum of the input and the layer's output.
+    """
+    transformed = layer(input)
+    return input + transformed
+
+class ResidualWrapper(nn.Layer):
+    """Wrap a layer so that its output is added to its input."""
+    def __init__(self, layer):
+        super().__init__()
+        self.layer = layer
+
+    def forward(self, x):
+        """Return ``x + layer(x)``."""
+        return x + self.layer(x)
+
+
+class PreLayerNormWrapper(nn.Layer):
+    """Residual wrapper that layer-normalizes the input of the wrapped layer."""
+    def __init__(self, layer, d_model):
+        super().__init__()
+        self.layer = layer
+        self.layer_norm = nn.LayerNorm([d_model], epsilon=1e-6)
+
+    def forward(self, x):
+        """Return ``x + layer(layer_norm(x))``."""
+        normalized = self.layer_norm(x)
+        return x + self.layer(normalized)
+
+
+class PostLayerNormWrapper(nn.Layer):
+    """Residual wrapper that layer-normalizes the sum of input and output."""
+    def __init__(self, layer, d_model):
+        super().__init__()
+        self.layer = layer
+        self.layer_norm = nn.LayerNorm([d_model], epsilon=1e-6)
+
+    def forward(self, x):
+        """Return ``layer_norm(x + layer(x))``."""
+        summed = x + self.layer(x)
+        return self.layer_norm(summed)
+
+
+def context_gate(input, axis):
+    """Gate one half of the input with the sigmoid of the other half.
+
+    The input is chunked into two equal parts along ``axis``: the first part
+    is the content and the second part is the gate.
+
+    Args:
+        input (Tensor): shape(*, d_axis, *), the input, treated as content & gate.
+        axis (int): the axis to chunk content and gate.
+
+    Raises:
+        ValueError: if input.shape[axis] is not even.
+
+    Returns:
+        Tensor: shape(*, d_axis / 2 , *), the gated content.
+    """
+    size = input.shape[axis]
+    if size % 2:
+        raise ValueError("the size of the {}-th dimension of input should "
+                         "be even, but received {}".format(axis, size))
+    content, gate = paddle.chunk(input, 2, axis)
+    gate_values = F.sigmoid(gate)
+    return gate_values * content
--- a/parakeet/modules/conv.py
+++ b/parakeet/modules/conv.py
@ -0,0 +1,81 @@
+import paddle
+from paddle import nn
+
+class Conv1dCell(nn.Conv1d):
+    """
+    A subclass of Conv1d layer, which can be used like an RNN cell. It can take
+    step input and return step output. It is done by keeping an internal buffer,
+    when adding a step input, we shift the buffer and return a step output. For
+    single step case, convolution devolves to a linear transformation.
+
+    That it can be used as a cell depends on several restrictions:
+    1. stride must be 1;
+    2. padding must be an asymmetric padding (receptive_field - 1, 0).
+
+    As a result, these arguments are removed from the initializer.
+    """
+    def __init__(self,
+                 in_channels,
+                 out_channels,
+                 kernel_size,
+                 dilation=1,
+                 weight_attr=None,
+                 bias_attr=None):
+        """
+        Args:
+            in_channels (int): the number of input channels.
+            out_channels (int): the number of output channels.
+            kernel_size (int, tuple or list): the kernel size.
+            dilation (int, tuple or list, optional): the dilation. Defaults
+                to 1.
+            weight_attr (optional): forwarded to Conv1d. Defaults to None.
+            bias_attr (optional): forwarded to Conv1d. Defaults to None.
+        """
+        _dilation = dilation[0] if isinstance(dilation, (tuple, list)) else dilation
+        _kernel_size = kernel_size[0] if isinstance(kernel_size, (tuple, list)) else kernel_size
+        # receptive field of a dilated convolution
+        self._r = 1 + (_kernel_size - 1) * _dilation
+        super(Conv1dCell, self).__init__(
+            in_channels,
+            out_channels,
+            kernel_size,
+            padding=(self._r - 1, 0),
+            dilation=dilation,
+            weight_attr=weight_attr,
+            bias_attr=bias_attr,
+            data_format="NCL")
+
+    @property
+    def receptive_field(self):
+        """The number of input steps that contribute to one output step."""
+        return self._r
+
+    def start_sequence(self):
+        """Reset the buffer and cache the flattened weight for step-wise use.
+
+        Raises:
+            Exception: if the layer is in training mode.
+        """
+        if self.training:
+            raise Exception("only use start_sequence in evaluation")
+        self._buffer = None
+        # flatten everything but the output channels so that a step is a
+        # single matmul
+        self._reshaped_weight = paddle.reshape(
+            self.weight, (self._out_channels, -1))
+
+    def initialize_buffer(self, x_t):
+        """Create a zero buffer of shape (batch_size, in_channels,
+        receptive_field) with the dtype of x_t."""
+        batch_size, _ = x_t.shape
+        self._buffer = paddle.zeros(
+            (batch_size, self._in_channels, self.receptive_field),
+            dtype=x_t.dtype)
+
+    def update_buffer(self, x_t):
+        """Drop the oldest step of the buffer and append x_t as the newest."""
+        self._buffer = paddle.concat(
+            [self._buffer[:, :, 1:], paddle.unsqueeze(x_t, -1)], -1)
+
+    def add_input(self, x_t):
+        """
+        Feed one step and compute the matching output step. `start_sequence`
+        has to be called before the first step of a sequence.
+
+        Arguments:
+            x_t (Tensor): shape (batch_size, in_channels), step input.
+        Returns:
+            y_t (Tensor): shape (batch_size, out_channels), step output.
+        """
+        batch_size = x_t.shape[0]
+        if self.receptive_field > 1:
+            if self._buffer is None:
+                self.initialize_buffer(x_t)
+
+            # update buffer
+            self.update_buffer(x_t)
+            if self._dilation[0] > 1:
+                # keep only the steps the dilated kernel looks at
+                step_input = self._buffer[:, :, ::self._dilation[0]]
+            else:
+                step_input = self._buffer
+            step_input = paddle.reshape(step_input, (batch_size, -1))
+        else:
+            step_input = x_t
+        y_t = paddle.matmul(step_input, self._reshaped_weight, transpose_y=True)
+        # the bias is None when the layer is created with bias_attr=False
+        if self.bias is not None:
+            y_t = y_t + self.bias
+        return y_t
--- a/parakeet/modules/geometry.py
+++ b/parakeet/modules/geometry.py
@ -0,0 +1,29 @@
+import numpy as np
+import paddle
+
+def shuffle_dim(x, axis, perm=None):
+    """Reorder a tensor along one axis, with a given or a random permutation.
+
+    Args:
+        x (Tensor): shape(*, d_{axis}, *), the input tensor.
+        axis (int): the axis to shuffle.
+        perm (list[int], ndarray, optional): a permutation of [0, d_{axis}),
+            the order in which entries along the `axis`-th dimension are
+            gathered. If not provided, a random permutation is drawn with
+            numpy. Defaults to None.
+
+    Raises:
+        ValueError: if perm is given and its length differs from d_{axis}.
+
+    Returns:
+        Tensor: the shuffled tensor, it has the same shape as x does.
+    """
+    size = x.shape[axis]
+    if perm is None:
+        order = np.random.permutation(size)
+    else:
+        if len(perm) != size:
+            raise ValueError("length of permutation should equals the input "
+                             "tensor's axis-th dimension's size")
+        order = np.array(perm)
+
+    return paddle.gather(x, paddle.to_tensor(order), axis)
--- a/parakeet/modules/positional_encoding.py
+++ b/parakeet/modules/positional_encoding.py
@ -0,0 +1,61 @@
+import math
+import paddle
+from paddle.nn import functional as F
+
+def positional_encoding(start_index, length, size, dtype="float32"):
+    """
+    Generate sinusoidal positional encoding.
+
+    With ``p(pos, i) = pos / 10000 ** (2i / size)`` the result is
+    ``concat([sin(p), cos(p)], axis=-1)``: the first ``size / 2`` channels are
+    the sines and the last ``size / 2`` channels are the cosines, i.e. the
+    sin/cos channels are not interleaved as in the standard formulation.
+
+    Args:
+        start_index (int): the start index.
+        length (int): the length of the positional encoding.
+        size (int): positional encoding dimension.
+        dtype (str, optional): data type of the encoding. Defaults to
+            "float32".
+
+    Raises:
+        ValueError: if size is odd.
+
+    Returns:
+        encodings (Tensor): shape(length, size), the positional encoding.
+    """
+    if size % 2:
+        raise ValueError("size should be divisible by 2")
+    channels = paddle.arange(0, size, 2, dtype=dtype)
+    positions = paddle.arange(start_index, start_index + length, 1, dtype=dtype)
+    timescales = 10000 ** (channels / float(size))
+    angles = paddle.unsqueeze(positions, -1) / timescales
+    return paddle.concat([paddle.sin(angles), paddle.cos(angles)], axis=-1)
+
+def scalable_positional_encoding(start_index, length, size, omega):
+    """
+    A scalable positional encoding, which extends the standard positional
+    encoding by adding positioning rate (denoted as omega).
+
+    With ``p(pos, i) = omega * pos / 10000 ** (2i / size)`` the result is
+    ``concat([sin(p), cos(p)], axis=-1)``. This implementation deviates from
+    the standard implementation in that the sin/cos channels are not
+    interleaved.
+
+    Args:
+        start_index (int): the start index.
+        length (int): the length of the positional encoding.
+        size (int): positional encoding dimension, must be even.
+        omega (Tensor): shape(batch_size, ), positional rates.
+
+    Raises:
+        ValueError: if size is odd.
+
+    Returns:
+        encodings: shape(batch_size, length, size), position embedding, the
+        data type is the same as omega.
+    """
+    # same contract as positional_encoding: an odd size would produce
+    # size + 1 channels after the concatenation
+    if size % 2 != 0:
+        raise ValueError("size should be divisible by 2")
+    dtype = omega.dtype
+    index = paddle.arange(start_index, start_index + length, 1, dtype=dtype)
+    channel = paddle.arange(0, size, 2, dtype=dtype)
+
+    # (B, 1, 1) * (length, 1) / (size / 2, ) -> (B, length, size / 2)
+    p = paddle.unsqueeze(omega, [1, 2]) \
+      * paddle.unsqueeze(index, [1]) \
+      / (10000 ** (channel / float(size)))
+
+    encodings = paddle.concat([paddle.sin(p), paddle.cos(p)], axis=-1)
+    return encodings
--- a/parakeet/modules/stft.py
+++ b/parakeet/modules/stft.py
@ -0,0 +1,93 @@
+import paddle
+from paddle import nn
+from paddle.nn import functional as F
+from scipy import signal
+import numpy as np 
+
+class STFT(nn.Layer):
+    def __init__(self, n_fft, hop_length, win_length, window="hanning"):
+        """A module for computing differentiable stft transform. See `librosa.stft` for more details.
+
+        The transform is implemented as a convolution whose kernels are the
+        windowed cosine and sine bases.
+
+        Args:
+            n_fft (int): number of samples in a frame.
+            hop_length (int): number of samples shifted between adjacent frames.
+            win_length (int): length of the window function, must not be larger than n_fft.
+            window (str, optional): name of window function, see `scipy.signal.get_window` for more details. Defaults to "hanning".
+
+        Raises:
+            ValueError: if win_length is larger than n_fft.
+        """
+        super(STFT, self).__init__()
+        if win_length > n_fft:
+            raise ValueError("win_length should not be larger than n_fft")
+        self.hop_length = hop_length
+        self.n_bin = 1 + n_fft // 2
+        self.n_fft = n_fft
+
+        # calculate window, zero padded to n_fft samples and centered
+        window = signal.get_window(window, win_length)
+        pad_total = n_fft - win_length
+        if pad_total > 0:
+            # when pad_total is odd the extra sample goes to the right so
+            # that the window always has exactly n_fft samples
+            pad_left = pad_total // 2
+            pad_right = pad_total - pad_left
+            window = np.pad(window, ((pad_left, pad_right), ), 'constant')
+
+        # calculate weights
+        r = np.arange(0, n_fft)
+        M = np.expand_dims(r, -1) * np.expand_dims(r, 0)
+        w_real = np.reshape(window *
+                            np.cos(2 * np.pi * M / n_fft)[:self.n_bin],
+                            (self.n_bin, 1, 1, self.n_fft))
+        w_imag = np.reshape(window *
+                            np.sin(-2 * np.pi * M / n_fft)[:self.n_bin],
+                            (self.n_bin, 1, 1, self.n_fft))
+
+        # real kernels first, imaginary kernels second; forward() splits the
+        # convolution output accordingly
+        w = np.concatenate([w_real, w_imag], axis=0)
+        self.weight = paddle.cast(paddle.to_tensor(w), paddle.get_default_dtype())
+
+    def forward(self, x):
+        """Compute the stft transform.
+
+        Args:
+            x (Variable): shape(B, T), dtype float32, the input waveform.
+
+        Returns:
+            (real, imag)
+            real (Variable): shape(B, C, 1, T), dtype float32, the real part of the spectrogram. (C = 1 + n_fft // 2, T is the number of frames)
+            imag (Variable): shape(B, C, 1, T), dtype float32, the imaginary part of the spectrogram. (C = 1 + n_fft // 2, T is the number of frames)
+        """
+        # x(batch_size, time_steps)
+        # pad it first with reflect mode
+        # TODO(chenfeiyu): report an issue on paddle.flip
+        pad_start = paddle.reverse(x[:, 1:1 + self.n_fft // 2], axis=[1])
+        pad_stop = paddle.reverse(x[:, -(1 + self.n_fft // 2):-1], axis=[1])
+        x = paddle.concat([pad_start, x, pad_stop], axis=-1)
+
+        # to BC1T, C=1
+        x = paddle.unsqueeze(x, axis=[1, 2])
+        out = F.conv2d(x, self.weight, stride=(1, self.hop_length))
+        real, imag = paddle.chunk(out, 2, axis=1)  # BC1T
+        return real, imag
+
+    def power(self, x):
+        """Compute the power spectrogram.
+
+        Args:
+            x (Variable): shape(B, T), dtype float32, the input waveform.
+
+        Returns:
+            Variable: shape(B, C, 1, T), dtype float32, the power spectrogram.
+        """
+        real, imag = self(x)
+        power = real**2 + imag**2
+        return power
+
+    def magnitude(self, x):
+        """Compute the magnitude spectrogram.
+
+        Args:
+            x (Variable): shape(B, T), dtype float32, the input waveform.
+
+        Returns:
+            Variable: shape(B, C, 1, T), dtype float32, the magnitude spectrogram. It is the square root of the power spectrogram.
+        """
+        power = self.power(x)
+        magnitude = paddle.sqrt(power)
+        return magnitude
--- a/parakeet/modules/transformer.py
+++ b/parakeet/modules/transformer.py
@ -0,0 +1,146 @@
+import math
+import paddle
+from paddle import nn
+from paddle.nn import functional as F
+
+from parakeet.modules import attention as attn
+
+class PositionwiseFFN(nn.Layer):
+    """
+    A faithful implementation of Position-wise Feed-Forward Network
+    in `Attention is All You Need <https://arxiv.org/abs/1706.03762>`_.
+    It is basically two linear layers, with relu activation and dropout in
+    between.
+    """
+    def __init__(self,
+                 input_size: int,
+                 hidden_size: int,
+                 dropout=0.0):
+        """
+        Args:
+            input_size (int): the input feature size.
+            hidden_size (int): the hidden layer's feature size.
+            dropout (float, optional): probability of dropout applied to the
+                output of the first fully connected layer. Defaults to 0.0.
+        """
+        super(PositionwiseFFN, self).__init__()
+        self.linear1 = nn.Linear(input_size, hidden_size)
+        self.linear2 = nn.Linear(hidden_size, input_size)
+        self.dropout = nn.Dropout(dropout)
+
+        self.input_size = input_size
+        self.hidden_size = hidden_size
+        # misspelled name of hidden_size, kept so existing readers of the
+        # attribute do not break
+        self.hidden_szie = hidden_size
+
+    def forward(self, x):
+        """positionwise feed forward network.
+
+        Args:
+            x (Tensor): shape(*, input_size), the input tensor.
+
+        Returns:
+            Tensor: shape(*, input_size), the output tensor.
+        """
+        return self.linear2(self.dropout(F.relu(self.linear1(x))))
+
+def combine_mask(padding_mask, no_future_mask):
+    """
+    Merge the decoder padding mask with the no future mask.
+
+    The padding mask hides padding positions, the no future mask keeps the
+    decoder from seeing future information. The padding mask is unsqueezed at
+    axis 1 and multiplied with the no future mask.
+
+    Args:
+        padding_mask (Tensor): shape(batch_size, time_steps), dtype: float32 or float64, decoder padding mask.
+        no_future_mask (Tensor): shape(time_steps, time_steps), dtype: float32 or float64, no future mask.
+
+    Returns:
+        Tensor: shape(batch_size, time_steps, time_steps), combined mask.
+    """
+    # TODO: to support boolean mask by using logical_and?
+    expanded_padding_mask = paddle.unsqueeze(padding_mask, 1)
+    return expanded_padding_mask * no_future_mask
+
+class TransformerEncoderLayer(nn.Layer):
+    """
+    Transformer encoder layer: self attention and a position-wise feed
+    forward network, each followed by a residual connection and layer
+    normalization.
+    """
+    def __init__(self, d_model, n_heads, d_ffn, dropout=0.):
+        """
+        Args:
+            d_model (int): the feature size of the input, and the output.
+            n_heads (int): the number of heads in the internal MultiHeadAttention layer.
+            d_ffn (int): the hidden size of the internal PositionwiseFFN.
+            dropout (float, optional): the probability of the dropout in
+                MultiHeadAttention and PositionwiseFFN. Defaults to 0.
+        """
+        super().__init__()
+        self.self_mha = attn.MultiheadAttention(d_model, n_heads, dropout)
+        self.layer_norm1 = nn.LayerNorm([d_model], epsilon=1e-6)
+
+        self.ffn = PositionwiseFFN(d_model, d_ffn, dropout)
+        self.layer_norm2 = nn.LayerNorm([d_model], epsilon=1e-6)
+
+    def forward(self, x, mask):
+        """
+        Args:
+            x (Tensor): shape(batch_size, time_steps, d_model), the encoder input.
+            mask (Tensor): shape(batch_size, time_steps), the padding mask.
+
+        Returns:
+            (x, attn_weights)
+            x (Tensor): shape(batch_size, time_steps, d_model), the encoded.
+            attn_weights (Tensor), shape(batch_size, n_heads, time_steps, time_steps), self attention.
+        """
+        # unsqueeze so that the padding mask broadcasts over the query steps
+        attention_mask = paddle.unsqueeze(mask, 1)
+        context_vector, attn_weights = self.self_mha(x, x, x, attention_mask)
+        attended = self.layer_norm1(x + context_vector)
+
+        encoded = self.layer_norm2(attended + self.ffn(attended))
+        return encoded, attn_weights
+
+
+class TransformerDecoderLayer(nn.Layer):
+    """
+    Transformer decoder layer: masked self attention, encoder-decoder cross
+    attention and a position-wise feed forward network, each followed by a
+    residual connection and layer normalization.
+    """
+    def __init__(self, d_model, n_heads, d_ffn, dropout=0.):
+        """
+        Args:
+            d_model (int): the feature size of the input, and the output.
+            n_heads (int): the number of heads in the internal MultiHeadAttention layer.
+            d_ffn (int): the hidden size of the internal PositionwiseFFN.
+            dropout (float, optional): the probability of the dropout in
+                MultiHeadAttention and PositionwiseFFN. Defaults to 0.
+        """
+        super(TransformerDecoderLayer, self).__init__()
+        self.self_mha = attn.MultiheadAttention(d_model, n_heads, dropout)
+        self.layer_norm1 = nn.LayerNorm([d_model], epsilon=1e-6)
+
+        self.cross_mha = attn.MultiheadAttention(d_model, n_heads, dropout)
+        self.layer_norm2 = nn.LayerNorm([d_model], epsilon=1e-6)
+
+        self.ffn = PositionwiseFFN(d_model, d_ffn, dropout)
+        self.layer_norm3 = nn.LayerNorm([d_model], epsilon=1e-6)
+
+    def forward(self, q, k, v, encoder_mask, decoder_mask):
+        """
+        Args:
+            q (Tensor): shape(batch_size, time_steps_q, d_model), the decoder input.
+            k (Tensor): shape(batch_size, time_steps_k, d_model), keys.
+            v (Tensor): shape(batch_size, time_steps_k, d_model), values
+            encoder_mask (Tensor): shape(batch_size, time_steps_k) encoder padding mask.
+            decoder_mask (Tensor): shape(batch_size, time_steps_q) decoder padding mask.
+
+        Returns:
+            (q, self_attn_weights, cross_attn_weights)
+            q (Tensor): shape(batch_size, time_steps_q, d_model), the decoded.
+            self_attn_weights (Tensor), shape(batch_size, n_heads, time_steps_q, time_steps_q), decoder self attention.
+            cross_attn_weights (Tensor), shape(batch_size, n_heads, time_steps_q, time_steps_k), decoder-encoder cross attention.
+        """
+        tq = q.shape[1]
+        # lower triangular mask, built with the padding mask's dtype so that
+        # combining the two does not mix dtypes
+        no_future_mask = paddle.tril(
+            paddle.ones([tq, tq], dtype=decoder_mask.dtype))  # (tq, tq)
+        combined_mask = combine_mask(decoder_mask, no_future_mask)
+        context_vector, self_attn_weights = self.self_mha(q, q, q, combined_mask)
+        q = self.layer_norm1(q + context_vector)
+
+        # unsqueeze so that the encoder padding mask broadcasts over the
+        # decoder steps
+        context_vector, cross_attn_weights = self.cross_mha(q, k, v, paddle.unsqueeze(encoder_mask, 1))
+        q = self.layer_norm2(q + context_vector)
+
+        q = self.layer_norm3(q + self.ffn(q))
+        return q, self_attn_weights, cross_attn_weights
--- a/parakeet/utils/internals.py
+++ b/parakeet/utils/internals.py
@ -0,0 +1,36 @@
+import numpy as np
+from paddle.framework import core
+
+def convert_dtype_to_np_dtype_(dtype):
+    """
+    Convert paddle's data type to corresponding numpy data type.
+
+    Args:
+        dtype(core.VarDesc.VarType): the data type in paddle.
+
+    Returns:
+        type: the data type in numpy. BF16 is mapped to np.uint16.
+
+    Raises:
+        ValueError: if the data type is not supported.
+    """
+    if dtype is core.VarDesc.VarType.FP32:
+        return np.float32
+    if dtype is core.VarDesc.VarType.FP64:
+        return np.float64
+    if dtype is core.VarDesc.VarType.FP16:
+        return np.float16
+    if dtype is core.VarDesc.VarType.BOOL:
+        # np.bool was only a deprecated alias of the builtin bool and is
+        # missing in NumPy 1.24+; np.bool_ exists in every version
+        return np.bool_
+    if dtype is core.VarDesc.VarType.INT32:
+        return np.int32
+    if dtype is core.VarDesc.VarType.INT64:
+        return np.int64
+    if dtype is core.VarDesc.VarType.INT16:
+        return np.int16
+    if dtype is core.VarDesc.VarType.INT8:
+        return np.int8
+    if dtype is core.VarDesc.VarType.UINT8:
+        return np.uint8
+    if dtype is core.VarDesc.VarType.BF16:
+        return np.uint16
+    raise ValueError("Not supported dtype %s" % dtype)
--- a/parakeet/utils/layer_tools.py
+++ b/parakeet/utils/layer_tools.py
@ -13,10 +13,10 @@
 # limitations under the License.

 import numpy as np
-import paddle.fluid.dygraph as dg
+from paddle import nn


-def summary(layer):
+def summary(layer: nn.Layer):
    num_params = num_elements = 0
    print("layer summary:")
    for name, param in layer.state_dict().items():
@ -26,12 +26,10 @@ def summary(layer):
    print("layer has {} parameters, {} elements.".format(num_params,
                                                         num_elements))

-
-def freeze(layer):
+def freeze(layer: nn.Layer):
    for param in layer.parameters():
        param.trainable = False

-
-def unfreeze(layer):
+def unfreeze(layer: nn.Layer):
    for param in layer.parameters():
        param.trainable = True
--- a/tests/test_attention.py
+++ b/tests/test_attention.py
@ -0,0 +1,104 @@
+import unittest
+import numpy as np
+import paddle
+paddle.set_default_dtype("float64")
+paddle.disable_static(paddle.CPUPlace())
+
+from parakeet.modules import attention as attn
+
+class TestScaledDotProductAttention(unittest.TestCase):
+    """Shape checks for ``attn.scaled_dot_product_attention``.
+
+    unittest assertions are used instead of bare ``assert`` so the checks
+    still run under ``python -O`` and report both shapes on failure.
+    """
+
+    def test_without_mask(self):
+        # q, k and v are the same (4, 16, 8) tensor; no mask is passed
+        x = paddle.randn([4, 16, 8])
+        context_vector, attention_weights = attn.scaled_dot_product_attention(x, x, x)
+        self.assertListEqual(list(context_vector.shape), [4, 16, 8])
+        self.assertListEqual(list(attention_weights.shape), [4, 16, 16])
+
+    def test_with_mask(self):
+        x = paddle.randn([4, 16, 8])
+        mask = paddle.fluid.layers.sequence_mask(
+            paddle.to_tensor([16, 15, 13, 14]), dtype=x.dtype)
+        mask = mask.unsqueeze(1) # unsqueeze for the decoder time steps
+        context_vector, attention_weights = attn.scaled_dot_product_attention(x, x, x, mask)
+        self.assertListEqual(list(context_vector.shape), [4, 16, 8])
+        self.assertListEqual(list(attention_weights.shape), [4, 16, 16])
+
+    def test_4d(self):
+        # same check with one extra leading axis of size 6
+        x = paddle.randn([4, 6, 16, 8])
+        context_vector, attention_weights = attn.scaled_dot_product_attention(x, x, x)
+        self.assertListEqual(list(context_vector.shape), [4, 6, 16, 8])
+        self.assertListEqual(list(attention_weights.shape), [4, 6, 16, 16])
+
+
+class TestMonoheadAttention(unittest.TestCase):
+    """Input/output shape test for ``attn.MonoheadAttention``."""
+
+    def test_io(self):
+        layer = attn.MonoheadAttention(6, 0.1)
+        query = paddle.randn([4, 18, 6])
+        key = paddle.randn([4, 12, 6])
+        value = paddle.randn([4, 12, 6])
+        key_lengths = paddle.to_tensor([12, 10, 8, 9])
+        padding_mask = paddle.fluid.layers.sequence_mask(
+            key_lengths, dtype=query.dtype)
+        # insert an axis at position 1 for time_steps_q
+        padding_mask = paddle.unsqueeze(padding_mask, 1)
+        context, weights = layer(query, key, value, padding_mask)
+        self.assertTupleEqual(context.numpy().shape, (4, 18, 6))
+        self.assertTupleEqual(weights.numpy().shape, (4, 18, 12))
+
+
+class TestDropHead(unittest.TestCase):
+    """Checks for ``attn.drop_head`` on a (4, 6, 16, 8) input."""
+
+    def test_drop(self):
+        inputs = paddle.randn([4, 6, 16, 8])
+        dropped = attn.drop_head(inputs, 2, training=True)
+        # count exact zeros along axis 1 (size 6): 2 expected at all positions
+        zero_counts = np.sum(dropped.numpy() == 0., axis=1)
+        np.testing.assert_allclose(zero_counts, 2)
+
+    def test_drop_all(self):
+        inputs = paddle.randn([4, 6, 16, 8])
+        dropped = attn.drop_head(inputs, 6, training=True)
+        # dropping 6 out of 6 must leave a total of zero
+        total = np.sum(dropped.numpy())
+        np.testing.assert_allclose(total, 0)
+
+    def test_eval(self):
+        inputs = paddle.randn([4, 6, 16, 8])
+        result = attn.drop_head(inputs, 6, training=False)
+        # with training=False the same tensor object is expected back
+        self.assertIs(inputs, result)
+
+
+class TestMultiheadAttention(unittest.TestCase):
+    """Shape test for ``attn.MultiheadAttention``.
+
+    ``same_qk`` selects which of the two layer configurations ``setUp``
+    builds for the test.
+    """
+
+    def __init__(self, methodName="test_io", same_qk=True):
+        super().__init__(methodName)
+        self.same_qk = same_qk
+
+    def setUp(self):
+        if not self.same_qk:
+            self.net = attn.MultiheadAttention(64, 8, k_dim=12, v_dim=6)
+        else:
+            self.net = attn.MultiheadAttention(64, 8, dropout=0.3)
+
+    def test_io(self):
+        query = paddle.randn([4, 12, 64])
+        lengths = paddle.to_tensor([12, 10, 8, 9])
+        padding_mask = paddle.fluid.layers.sequence_mask(
+            lengths, dtype=query.dtype)
+        # insert an axis at position 1 for time_steps_q
+        padding_mask = paddle.unsqueeze(padding_mask, 1)
+        context, weights = self.net(query, query, query, padding_mask)
+        self.assertTupleEqual(context.numpy().shape, (4, 12, 64))
+        self.assertTupleEqual(weights.numpy().shape, (4, 8, 12, 12))
+
+
+def load_tests(loader, standard_tests, pattern):
+    """Assemble the test suite by hand (unittest ``load_tests`` protocol).
+
+    The suite is built explicitly because ``TestMultiheadAttention`` takes an
+    extra ``same_qk`` constructor argument and is run once per configuration.
+    Only test cases defined in this module are added; referencing a class
+    that does not exist here would raise ``NameError`` while loading and
+    prevent every test in the module from running.
+
+    Args:
+        loader, standard_tests, pattern: supplied by unittest; unused.
+
+    Returns:
+        unittest.TestSuite: the tests to run for this module.
+    """
+    suite = unittest.TestSuite()
+    suite.addTest(TestScaledDotProductAttention("test_without_mask"))
+    suite.addTest(TestScaledDotProductAttention("test_with_mask"))
+    suite.addTest(TestScaledDotProductAttention("test_4d"))
+
+    suite.addTest(TestDropHead("test_drop"))
+    suite.addTest(TestDropHead("test_drop_all"))
+    suite.addTest(TestDropHead("test_eval"))
+
+    suite.addTest(TestMonoheadAttention("test_io"))
+
+    suite.addTest(TestMultiheadAttention("test_io", same_qk=True))
+    suite.addTest(TestMultiheadAttention("test_io", same_qk=False))
+
+    return suite
--- a/tests/test_cbhg.py
+++ b/tests/test_cbhg.py
@ -0,0 +1,55 @@
+import unittest
+import paddle
+paddle.set_default_dtype("float64")
+paddle.disable_static(paddle.CPUPlace())
+from parakeet.modules import cbhg
+
+class TestConv1dBatchNorm(unittest.TestCase):
+    """Shape test for ``cbhg.Conv1dBatchNorm``; ``causal`` picks the padding."""
+
+    def __init__(self, methodName="runTest", causal=False):
+        super().__init__(methodName)
+        self.causal = causal
+
+    def setUp(self):
+        kernel_size = 5
+        if self.causal:
+            # all of the padding goes in the first slot of the pair
+            padding = (kernel_size - 1, 0)
+        else:
+            padding = ((kernel_size - 1) // 2, kernel_size // 2)
+        self.net = cbhg.Conv1dBatchNorm(4, 6, (kernel_size,), 1, padding=padding)
+
+    def test_input_output(self):
+        inputs = paddle.randn([4, 4, 16])
+        outputs = self.net(inputs).numpy()
+        # channels go 4 -> 6, the last axis keeps its length
+        self.assertTupleEqual(outputs.shape, (4, 6, 16))
+
+    def runTest(self):
+        self.test_input_output()
+
+
+class TestHighway(unittest.TestCase):
+    """``cbhg.Highway(4)`` must preserve the shape of its input."""
+
+    def test_io(self):
+        layer = cbhg.Highway(4)
+        inputs = paddle.randn([2, 12, 4])
+        outputs = layer(inputs)
+        self.assertTupleEqual(outputs.numpy().shape, (2, 12, 4))
+
+
+class TestCBHG(unittest.TestCase):
+    """Input/output shape test for ``cbhg.CBHG``."""
+
+    def __init__(self, methodName="runTest"):
+        super().__init__(methodName)
+
+    def test_io(self):
+        self.net = cbhg.CBHG(
+            64, 32, 16,
+            projection_channels=[64, 128],
+            num_highways=4,
+            highway_features=128,
+            gru_features=64)
+        inputs = paddle.randn([4, 64, 32])
+        outputs = self.net(inputs)
+        self.assertTupleEqual(outputs.numpy().shape, (4, 32, 128))
+
+def load_tests(loader, standard_tests, pattern):
+    """Build the suite by hand (unittest ``load_tests`` protocol).
+
+    ``TestConv1dBatchNorm`` is added twice, first with ``causal=True`` and
+    then with ``causal=False``.
+    """
+    suite = unittest.TestSuite()
+    for causal in (True, False):
+        suite.addTest(TestConv1dBatchNorm("runTest", causal))
+
+    for case in (TestHighway, TestCBHG):
+        suite.addTest(case("test_io"))
+    return suite
--- a/tests/test_clarinet.py
+++ b/tests/test_clarinet.py
@ -0,0 +1,43 @@
+import unittest
+import numpy as np
+
+import paddle
+paddle.set_default_dtype("float64")
+paddle.disable_static(paddle.CPUPlace())
+
+from parakeet.models import clarinet
+from parakeet.modules import stft
+
+class TestParallelWaveNet(unittest.TestCase):
+    """Shape test for ``clarinet.ParallelWaveNet``."""
+
+    def test_io(self):
+        net = clarinet.ParallelWaveNet([8, 8, 8], [1, 1, 1], 16, 12, 2)
+        audio = paddle.randn([4, 6073])
+        condition = paddle.randn([4, 12, 6073])
+        z, out_mu, out_log_std = net(audio, condition)
+        # all three outputs are expected to have the shape of the audio input
+        for output in (z, out_mu, out_log_std):
+            self.assertTupleEqual(output.numpy().shape, (4, 6073))
+        
+
+class TestClariNet(unittest.TestCase):
+    """Smoke tests for the loss computation and synthesis of ``clarinet.Clarinet``."""
+
+    def setUp(self):
+        encoder = clarinet.UpsampleNet([2, 2])
+        teacher = clarinet.WaveNet(8, 3, 16, 3, 12, 2, "mog", -9.0)
+        student = clarinet.ParallelWaveNet(
+            [8, 8, 8, 8, 8, 8], [1, 1, 1, 1, 1, 1], 16, 12, 2)
+        stft_module = stft.STFT(16, 4, 8)
+        self.net = clarinet.Clarinet(
+            encoder, teacher, student, stft_module, -6.0, lmd=4)
+        print("context size is: ", teacher.context_size)
+
+    def test_io(self):
+        audio = paddle.randn([4, 1366])
+        mel = paddle.randn([4, 12, 512]) # 512 * 4 = 2048
+        audio_start = paddle.zeros([4], dtype="int64")
+        losses = self.net(audio, mel, audio_start, clip_kl=True)
+        # only checks that the loss can be computed and fetched
+        losses["loss"].numpy()
+
+    def test_synthesis(self):
+        mel = paddle.randn([4, 12, 512]) # 512 frames -> 2048 samples expected
+        audio = self.net.synthesis(mel)
+        self.assertTupleEqual(audio.numpy().shape, (4, 2048))
+
+        
--- a/tests/test_connections.py
+++ b/tests/test_connections.py
@ -0,0 +1,33 @@
+import unittest
+import paddle
+from paddle import nn
+paddle.disable_static(paddle.CPUPlace())
+paddle.set_default_dtype("float64")
+
+from parakeet.modules import connections as conn
+
+class TestPreLayerNormWrapper(unittest.TestCase):
+    """A wrapped ``nn.Linear(8, 8)`` must keep the input shape."""
+
+    def test_io(self):
+        wrapped = conn.PreLayerNormWrapper(nn.Linear(8, 8), 8)
+        inputs = paddle.randn([4, 8])
+        outputs = wrapped(inputs)
+        self.assertTupleEqual(inputs.numpy().shape, outputs.numpy().shape)
+        
+
+class TestPostLayerNormWrapper(unittest.TestCase):
+    """A wrapped ``nn.Linear(8, 8)`` must keep the input shape."""
+
+    def test_io(self):
+        wrapped = conn.PostLayerNormWrapper(nn.Linear(8, 8), 8)
+        inputs = paddle.randn([4, 8])
+        outputs = wrapped(inputs)
+        self.assertTupleEqual(inputs.numpy().shape, outputs.numpy().shape)
+        
+        
+class TestResidualWrapper(unittest.TestCase):
+    """A wrapped ``nn.Linear(8, 8)`` must keep the input shape."""
+
+    def test_io(self):
+        wrapped = conn.ResidualWrapper(nn.Linear(8, 8))
+        inputs = paddle.randn([4, 8])
+        outputs = wrapped(inputs)
+        self.assertTupleEqual(inputs.numpy().shape, outputs.numpy().shape)
--- a/tests/test_conv.py
+++ b/tests/test_conv.py
@ -0,0 +1,32 @@
+import paddle
+paddle.set_default_dtype("float64")
+paddle.disable_static(paddle.CPUPlace())
+import unittest
+import numpy as np
+
+from parakeet.modules import conv
+
+class TestConv1dCell(unittest.TestCase):
+    """Compares the whole-sequence and step-by-step outputs of ``conv.Conv1dCell``."""
+
+    def setUp(self):
+        self.net = conv.Conv1dCell(4, 6, 5, dilation=2)
+
+    def forward_incremental(self, x):
+        """Feed ``x`` one slice of the last axis at a time and stack the outputs."""
+        self.net.start_sequence()
+        with paddle.no_grad():
+            steps = [
+                self.net.add_input(x[:, :, t]) for t in range(x.shape[-1])
+            ]
+            stacked = paddle.stack(steps, axis=-1)
+        return stacked
+
+    def test_equality(self):
+        x = paddle.randn([2, 4, 16])
+        full_output = self.net(x)
+
+        # switch to eval mode before the incremental pass
+        self.net.eval()
+        incremental_output = self.forward_incremental(x)
+
+        np.testing.assert_allclose(
+            incremental_output.numpy(), full_output.numpy())
+
+        
--- a/tests/test_dataset.py
+++ b/tests/test_dataset.py
@ -0,0 +1,122 @@
+import unittest
+import numpy as np
+import paddle
+from paddle import io
+from parakeet import data
+
+class MyDataset(io.Dataset):
+    """Toy dataset of ``size`` random examples, each a vector of 6 values."""
+
+    def __init__(self, size):
+        self._data = np.random.randn(size, 6)
+
+    def __len__(self):
+        return self._data.shape[0]
+
+    def __getitem__(self, i):
+        example = self._data[i]
+        return example
+
+
+class TestTransformDataset(unittest.TestCase):
+    """Iterates a ``data.TransformDataset`` through a DataLoader and prints batch info."""
+
+    def test(self):
+        transformed = data.TransformDataset(MyDataset(20), lambda x: np.abs(x))
+        loader = io.DataLoader(
+            transformed, batch_size=4, shuffle=True, num_workers=1)
+        print("TransformDataset")
+        for (batch,) in loader:
+            print(type(batch), batch.dtype, batch.shape)
+
+
+class TestChainDataset(unittest.TestCase):
+    """Iterates a ``data.ChainDataset`` of two datasets and prints batch info."""
+
+    def test(self):
+        first = MyDataset(20)
+        second = MyDataset(40)
+        chained = data.ChainDataset(first, second)
+        loader = io.DataLoader(
+            chained, batch_size=4, shuffle=True, num_workers=1)
+        print("ChainDataset")
+        for (batch,) in loader:
+            print(type(batch), batch.dtype, batch.shape)
+
+
+class TestTupleDataset(unittest.TestCase):
+    """Iterates a ``data.TupleDataset`` of two datasets and prints both fields."""
+
+    def test(self):
+        first = MyDataset(20)
+        second = MyDataset(20)
+        paired = data.TupleDataset(first, second)
+        loader = io.DataLoader(
+            paired, batch_size=4, shuffle=True, num_workers=1)
+        print("TupleDataset")
+        for field1, field2 in loader:
+            for field in (field1, field2):
+                print(type(field), field.dtype, field.shape)
+
+
+class TestDictDataset(unittest.TestCase):
+    """Iterates a ``data.DictDataset`` using a collate function that drops the keys."""
+
+    def test(self):
+        first = MyDataset(20)
+        second = MyDataset(20)
+        dataset = data.DictDataset(field1=first, field2=second)
+
+        def collate_fn(examples):
+            # keep only the values of each example, then defer to paddle's
+            # default collate function
+            value_views = [example.values() for example in examples]
+            return paddle.fluid.dataloader.dataloader_iter.default_collate_fn(value_views)
+
+        loader = io.DataLoader(
+            dataset, batch_size=4, shuffle=True, num_workers=1,
+            collate_fn=collate_fn)
+        print("DictDataset")
+        for field1, field2 in loader:
+            for field in (field1, field2):
+                print(type(field), field.dtype, field.shape)
+
+
+class TestSliceDataset(unittest.TestCase):
+    """Iterates ``data.SliceDataset(dataset, 0, 20)`` and prints batch info."""
+
+    def test(self):
+        sliced = data.SliceDataset(MyDataset(40), 0, 20)
+        loader = io.DataLoader(
+            sliced, batch_size=4, shuffle=True, num_workers=1)
+        print("SliceDataset")
+        for (batch,) in loader:
+            print(type(batch), batch.dtype, batch.shape)
+
+
+class TestSplit(unittest.TestCase):
+    """Iterates both parts returned by ``data.split`` and prints batch info."""
+
+    def test(self):
+        train, valid = data.split(MyDataset(40), 10)
+        train_loader = io.DataLoader(
+            train, batch_size=4, shuffle=True, num_workers=1)
+        valid_loader = io.DataLoader(
+            valid, batch_size=4, shuffle=True, num_workers=1)
+
+        print("First Dataset")
+        for (batch,) in train_loader:
+            print(type(batch), batch.dtype, batch.shape)
+
+        print("Second Dataset")
+        for (batch,) in valid_loader:
+            print(type(batch), batch.dtype, batch.shape)
+
+
+class TestSubsetDataset(unittest.TestCase):
+    """Iterates a ``data.SubsetDataset`` built from 20 random indices."""
+
+    def test(self):
+        full = MyDataset(40)
+        # 20 distinct indices drawn from 0..39
+        chosen = np.random.choice(np.arange(40), [20], replace=False).tolist()
+        subset = data.SubsetDataset(full, chosen)
+        loader = io.DataLoader(
+            subset, batch_size=4, shuffle=True, num_workers=1)
+        print("SubsetDataset")
+        for (batch,) in loader:
+            print(type(batch), batch.dtype, batch.shape)
+
+
+class TestFilterDataset(unittest.TestCase):
+    """Iterates a ``data.FilterDataset`` with a mean-above-0.3 predicate."""
+
+    def test(self):
+        filtered = data.FilterDataset(MyDataset(40), lambda x: np.mean(x)> 0.3)
+        loader = io.DataLoader(
+            filtered, batch_size=4, shuffle=True, num_workers=1)
+        print("FilterDataset")
+        for (batch,) in loader:
+            print(type(batch), batch.dtype, batch.shape)
+
+
+class TestCacheDataset(unittest.TestCase):
+    """Iterates a ``data.CacheDataset`` through a DataLoader and prints batch info."""
+
+    def test(self):
+        cached = data.CacheDataset(MyDataset(40))
+        loader = io.DataLoader(
+            cached, batch_size=4, shuffle=True, num_workers=1)
+        print("CacheDataset")
+        for (batch,) in loader:
+            print(type(batch), batch.dtype, batch.shape)
--- a/tests/test_deepvoice3.py
+++ b/tests/test_deepvoice3.py
@ -0,0 +1,107 @@
+import numpy as np
+import unittest
+import paddle
+paddle.set_default_dtype("float64")
+paddle.disable_static(paddle.CPUPlace())
+
+from parakeet.models import deepvoice3 as dv3
+
+class TestConvBlock(unittest.TestCase):
+    """Shape tests for ``dv3.ConvBlock`` in both of its configurations."""
+
+    def test_io_causal(self):
+        block = dv3.ConvBlock(6, 5, True, True, 8, 0.9)
+        inputs = paddle.randn([4, 32, 6])
+        condition = paddle.randn([4, 8])
+        # TODO(chenfeiyu): to report an issue on default data type
+        padding = paddle.zeros([4, 4, 6], dtype=inputs.dtype)
+        outputs = block.forward(inputs, condition, padding)
+        self.assertTupleEqual(outputs.numpy().shape, (4, 32, 6))
+
+    def test_io_non_causal(self):
+        block = dv3.ConvBlock(6, 5, False, True, 8, 0.9)
+        inputs = paddle.randn([4, 32, 6])
+        condition = paddle.randn([4, 8])
+        # no explicit padding tensor is passed in this configuration
+        outputs = block.forward(inputs, condition)
+        self.assertTupleEqual(outputs.numpy().shape, (4, 32, 6))
+
+
+        
+        
+class TestAffineBlock1(unittest.TestCase):
+    """``dv3.AffineBlock1(6, 16, True, 8)`` maps the last axis from 6 to 16."""
+
+    def test_io(self):
+        block = dv3.AffineBlock1(6, 16, True, 8)
+        inputs = paddle.randn([4, 32, 6])
+        condition = paddle.randn([4, 8])
+        outputs = block(inputs, condition)
+        self.assertTupleEqual(outputs.numpy().shape, (4, 32, 16))
+
+        
+
+class TestAffineBlock2(unittest.TestCase):
+    """``dv3.AffineBlock2(6, 16, True, 8)`` maps the last axis from 6 to 16."""
+
+    def test_io(self):
+        block = dv3.AffineBlock2(6, 16, True, 8)
+        inputs = paddle.randn([4, 32, 6])
+        condition = paddle.randn([4, 8])
+        outputs = block(inputs, condition)
+        self.assertTupleEqual(outputs.numpy().shape, (4, 32, 16))
+
+        
+
+class TestEncoder(unittest.TestCase):
+    """``dv3.Encoder`` must return keys and values shaped like its input."""
+
+    def test_io(self):
+        encoder = dv3.Encoder(5, 8, 16, 5, True, 6)
+        inputs = paddle.randn([4, 32, 8])
+        condition = paddle.randn([4, 6])
+        keys, values = encoder(inputs, condition)
+        for output in (keys, values):
+            self.assertTupleEqual(output.numpy().shape, (4, 32, 8))
+
+        
+        
+class TestAttentionBlock(unittest.TestCase):
+    """Shape tests for ``dv3.AttentionBlock``, with and without previous attention."""
+
+    def test_io(self):
+        block = dv3.AttentionBlock(16, 6, has_bias=True, bias_dim=8)
+        query = paddle.randn([4, 32, 6])
+        key = paddle.randn([4, 24, 6])
+        value = paddle.randn([4, 24, 6])
+        lengths = paddle.to_tensor([24, 20, 19, 23], dtype="int64")
+        condition = paddle.randn([4, 8])
+        context, weights = block(query, key, value, lengths, condition, 0)
+        self.assertTupleEqual(context.numpy().shape, (4, 32, 6))
+        self.assertTupleEqual(weights.numpy().shape, (4, 32, 24))
+
+    def test_io_with_previous_attn(self):
+        block = dv3.AttentionBlock(16, 6, has_bias=True, bias_dim=8)
+        query = paddle.randn([4, 32, 6])
+        key = paddle.randn([4, 24, 6])
+        value = paddle.randn([4, 24, 6])
+        lengths = paddle.to_tensor([24, 20, 19, 23], dtype="int64")
+        condition = paddle.randn([4, 8])
+        previous = paddle.randn([4, 32, 16])
+
+        # same call as above plus the monotonic-attention keyword arguments
+        context, weights = block(
+            query, key, value, lengths, condition, 0,
+            force_monotonic=True, prev_coeffs=previous, window=(0, 4))
+        self.assertTupleEqual(context.numpy().shape, (4, 32, 6))
+        self.assertTupleEqual(weights.numpy().shape, (4, 32, 24))
+
+
+        
+        
+class TestDecoder(unittest.TestCase):
+    """Checks the shapes and lengths of the four outputs of ``dv3.Decoder``."""
+
+    def test_io(self):
+        decoder = dv3.Decoder(8, 4, [4, 12], 5, 3, 16, 1.0, 1.45, True, 6)
+        inputs = paddle.randn([4, 32, 8])
+        # prenet's last size should equal the feature size of the keys
+        keys = paddle.randn([4, 24, 12])
+        values = paddle.randn([4, 24, 12])
+        lengths = paddle.to_tensor([24, 18, 19, 22])
+        condition = paddle.randn([4, 6])
+        decoded, hidden, attentions, final_state = decoder(
+            inputs, keys, values, lengths, 0, condition)
+
+        self.assertTupleEqual(decoded.numpy().shape, (4, 32, 4 * 8))
+        self.assertTupleEqual(hidden.numpy().shape, (4, 32, 12))
+        # five attention maps and five final states are expected
+        self.assertEqual(len(attentions), 5)
+        self.assertTupleEqual(attentions[0].numpy().shape, (4, 32, 24))
+        self.assertEqual(len(final_state), 5)
+        self.assertTupleEqual(final_state[0].numpy().shape, (4, 2, 12))
+
+
+        
+        
+class TestPostNet(unittest.TestCase):
+    """``dv3.PostNet`` is expected to turn (4, 32, 8) into (4, 32 * 4, 12)."""
+
+    def test_io(self):
+        postnet = dv3.PostNet(3, 8, 16, 3, 12, 4, True, 6)
+        inputs = paddle.randn([4, 32, 8])
+        condition = paddle.randn([4, 6])
+        outputs = postnet(inputs, condition)
+        self.assertTupleEqual(outputs.numpy().shape, (4, 32 * 4, 12))
+
+        
--- a/tests/test_geometry.py
+++ b/tests/test_geometry.py
@ -0,0 +1,19 @@
+import unittest
+import numpy as np
+
+import paddle
+paddle.set_default_dtype("float64")
+paddle.disable_static(paddle.CPUPlace())
+
+from parakeet.modules import geometry as geo
+
+class TestShuffleDim(unittest.TestCase):
+    """Tests ``geo.shuffle_dim`` along axis 2 of a (2, 3, 4, 6) tensor."""
+
+    def test_perm(self):
+        x = paddle.randn([2, 3, 4, 6])
+        y = geo.shuffle_dim(x, 2, [3, 2, 1, 0])
+        # the explicit permutation reverses axis 2
+        original = x.numpy()[0, 0, :, 0]
+        shuffled = y.numpy()[0, 0, ::-1, 0]
+        np.testing.assert_allclose(original, shuffled)
+
+    def test_random_perm(self):
+        x = paddle.randn([2, 3, 4, 6])
+        y = geo.shuffle_dim(x, 2)
+        # the sum over the shuffled axis must stay the same
+        np.testing.assert_allclose(x.numpy().sum(2), y.numpy().sum(2))
--- a/tests/test_position_encoding.py
+++ b/tests/test_position_encoding.py
@ -0,0 +1,64 @@
+import unittest
+import numpy as np
+import paddle
+
+from parakeet.modules import positional_encoding as pe
+
+def positional_encoding(start_index, length, size, dtype="float32"):
+    """Numpy reference for the sinusoidal positional encoding.
+
+    Returns an array of shape (length, size): the first ``size // 2`` columns
+    hold the sines and the remaining columns the cosines, for the positions
+    ``start_index .. start_index + length - 1``.
+
+    Raises:
+        ValueError: if ``size`` is odd.
+    """
+    if size % 2 != 0:
+        raise ValueError("size should be divisible by 2")
+    positions = np.arange(start_index, start_index + length, 1, dtype=dtype)
+    channels = np.arange(0, size, 2, dtype=dtype)
+    timescales = 10000 ** (channels / float(size))
+    # (length, 1) / (size // 2,) broadcasts to (length, size // 2)
+    phase = np.expand_dims(positions, -1) / timescales
+    return np.concatenate([np.sin(phase), np.cos(phase)], axis=-1)
+
+def scalable_positional_encoding(start_index, length, size, omega):
+    """Numpy reference for the positional encoding scaled per example by ``omega``.
+
+    ``omega`` is a numpy array; two trailing axes are appended to it so that
+    it broadcasts against the (length, size // 2) phase table. The result has
+    shape ``omega.shape + (length, size)``, sines first and cosines last, and
+    uses the dtype of ``omega``.
+    """
+    dtype = omega.dtype
+    positions = np.arange(start_index, start_index + length, 1, dtype=dtype)
+    channels = np.arange(0, size, 2, dtype=dtype)
+
+    rate = np.reshape(omega, omega.shape + (1, 1))
+    timescales = 10000 ** (channels / float(size))
+    phase = rate * np.expand_dims(positions, -1) / timescales
+
+    return np.concatenate([np.sin(phase), np.cos(phase)], axis=-1)
+
+class TestPositionEncoding(unittest.TestCase):
+    """Compares ``pe.positional_encoding`` with the numpy reference above.
+
+    The constructor takes the encoding spec instead of a method name; the
+    test always runs through ``runTest``.
+    """
+
+    def __init__(self, start=0, length=20, size=16, dtype="float64"):
+        super().__init__("runTest")
+        self.spec = (start, length, size, dtype)
+
+    def test_equality(self):
+        expected = positional_encoding(*self.spec)
+        actual = pe.positional_encoding(*self.spec)
+        np.testing.assert_allclose(actual.numpy(), expected)
+
+    def runTest(self):
+        # dygraph mode is enabled here because the module does not do it on import
+        paddle.disable_static(paddle.CPUPlace())
+        self.test_equality()
+
+
+class TestScalablePositionEncoding(unittest.TestCase):
+    """Compares ``pe.scalable_positional_encoding`` with the numpy reference above.
+
+    The constructor takes the encoding spec instead of a method name; the
+    test always runs through ``runTest``.
+    """
+
+    def __init__(self, start=0, length=20, size=16, dtype="float64"):
+        super().__init__("runTest")
+        self.spec = (start, length, size, dtype)
+
+    def test_equality(self):
+        start, length, size, dtype = self.spec
+        # one random scale in [1, 2) for each of 4 examples
+        omega = np.random.uniform(1, 2, size=(4,)).astype(dtype)
+        expected = scalable_positional_encoding(start, length, size, omega)
+        actual = pe.scalable_positional_encoding(
+            start, length, size, paddle.to_tensor(omega))
+        np.testing.assert_allclose(actual.numpy(), expected)
+
+    def runTest(self):
+        # dygraph mode is enabled here because the module does not do it on import
+        paddle.disable_static(paddle.CPUPlace())
+        self.test_equality()
+
+
+
+
+def load_tests(loader, standard_tests, pattern):
+    """Build the suite by hand (unittest ``load_tests`` protocol).
+
+    Needed because both test cases take an encoding spec, not a method name,
+    as constructor arguments.
+    """
+    cases = [
+        TestPositionEncoding(0, 20, 16, "float64"),
+        TestScalablePositionEncoding(0, 20, 16),
+    ]
+    suite = unittest.TestSuite()
+    for case in cases:
+        suite.addTest(case)
+    return suite
--- a/tests/test_stft.py
+++ b/tests/test_stft.py
@ -0,0 +1,27 @@
+import unittest
+import numpy as np
+import librosa
+import paddle
+paddle.set_default_dtype("float64")
+paddle.disable_static(paddle.CPUPlace())
+
+from parakeet.modules import stft
+
+class TestSTFT(unittest.TestCase):
+    """Prints the librosa magnitude next to ``stft.STFT`` for the same audio.
+
+    NOTE: nothing is asserted yet; the comparison is disabled (see the TODO).
+    """
+
+    def test(self):
+        example_path = librosa.util.example("choice")
+        wav, sr = librosa.load(example_path, duration=5)
+        wav = wav.astype("float64")
+
+        # reference magnitude computed with librosa
+        mag1 = np.abs(
+            librosa.stft(wav, n_fft=2048, hop_length=256, win_length=1024))
+
+        # same parameters through the paddle module, on a batch of one
+        batch = paddle.unsqueeze(paddle.to_tensor(wav), 0)
+        mag2 = stft.STFT(2048, 256, 1024).magnitude(batch)
+        mag2 = paddle.squeeze(mag2, [0, 2]).numpy()
+
+        print("mag1", mag1)
+        print("mag2", mag2)
+        # TODO(chenfeiyu): Is there something wrong? there is some elements that
+        # does not match
+        # np.testing.assert_allclose(mag2, mag1)
--- a/tests/test_transformer.py
+++ b/tests/test_transformer.py
@ -0,0 +1,65 @@
+import unittest
+import numpy as np
+import paddle
+paddle.set_default_dtype("float64")
+paddle.disable_static(paddle.CPUPlace())
+
+from parakeet.modules import transformer
+
+def sequence_mask(lengths, max_length=None, dtype="bool"):
+    """Numpy padding mask: entry [i, t] is set when ``t < lengths[i]``.
+
+    ``max_length`` falls back to the largest length when it is not given
+    (or is falsy); the result is cast to ``dtype``.
+    """
+    width = max_length or np.max(lengths)
+    steps = np.arange(width)
+    valid = steps < np.expand_dims(lengths, -1)
+    return valid.astype(dtype)
+
+def future_mask(lengths, max_length=None, dtype="bool"):
+    """Numpy no-future mask: a square lower-triangular matrix of ones.
+
+    Args:
+        lengths: sequence lengths; only their maximum is used, and only when
+            ``max_length`` is not given (or is falsy).
+        max_length: side length of the square mask.
+        dtype: dtype the mask is cast to.
+
+    Returns:
+        np.ndarray: shape (max_length, max_length); entry [i, j] is set when
+        ``j <= i``.
+    """
+    max_length = max_length or np.max(lengths)
+    # Build the square matrix explicitly (rather than relying on np.tril
+    # broadcasting a 1-D vector) and honor the requested dtype.
+    return np.tril(np.ones((max_length, max_length))).astype(dtype)
+
+class TestPositionwiseFFN(unittest.TestCase):
+    """``transformer.PositionwiseFFN(8, 12)`` must preserve the input shape."""
+
+    def test_io(self):
+        ffn = transformer.PositionwiseFFN(8, 12)
+        inputs = paddle.randn([2, 3, 4, 8])
+        outputs = ffn(inputs)
+        self.assertTupleEqual(outputs.numpy().shape, (2, 3, 4, 8))
+
+
+class TestCombineMask(unittest.TestCase):
+    """Compares ``transformer.combine_mask`` against a numpy product of the two masks."""
+
+    def test_equality(self):
+        lengths = np.array([12, 8, 9, 10])
+        padding_mask = sequence_mask(lengths, dtype="float64")
+        no_future_mask = future_mask(lengths, dtype="float64")
+        # reference: the padding mask gets an extra axis at position 1 and is
+        # multiplied elementwise with the no-future mask
+        expected = np.expand_dims(padding_mask, 1) * no_future_mask
+
+        actual = transformer.combine_mask(
+            paddle.to_tensor(padding_mask), paddle.to_tensor(no_future_mask))
+        np.testing.assert_allclose(actual.numpy(), expected)
+
+
+class TestTransformerEncoderLayer(unittest.TestCase):
+    """Shape test for ``transformer.TransformerEncoderLayer``."""
+
+    def test_io(self):
+        layer = transformer.TransformerEncoderLayer(64, 8, 128, 0.5)
+        inputs = paddle.randn([4, 12, 64])
+        lengths = paddle.to_tensor([12, 8, 9, 10])
+        padding_mask = paddle.fluid.layers.sequence_mask(
+            lengths, dtype=inputs.dtype)
+        outputs, attn_weights = layer(inputs, padding_mask)
+
+        self.assertTupleEqual(outputs.numpy().shape, (4, 12, 64))
+        self.assertTupleEqual(attn_weights.numpy().shape, (4, 8, 12, 12))
+
+
+class TestTransformerDecoderLayer(unittest.TestCase):
+    """Shape test for ``transformer.TransformerDecoderLayer``."""
+
+    def test_io(self):
+        layer = transformer.TransformerDecoderLayer(64, 8, 128, 0.5)
+        query = paddle.randn([4, 32, 64])
+        key = paddle.randn([4, 24, 64])
+        value = paddle.randn([4, 24, 64])
+        enc_lengths = paddle.to_tensor([24, 18, 20, 22])
+        dec_lengths = paddle.to_tensor([32, 28, 30, 31])
+        enc_mask = paddle.fluid.layers.sequence_mask(
+            enc_lengths, dtype=key.dtype)
+        dec_mask = paddle.fluid.layers.sequence_mask(
+            dec_lengths, dtype=query.dtype)
+        outputs, self_attn, cross_attn = layer(
+            query, key, value, enc_mask, dec_mask)
+
+        self.assertTupleEqual(outputs.numpy().shape, (4, 32, 64))
+        self.assertTupleEqual(self_attn.numpy().shape, (4, 8, 32, 32))
+        self.assertTupleEqual(cross_attn.numpy().shape, (4, 8, 32, 24))
--- a/tests/test_transformer_tts.py
+++ b/tests/test_transformer_tts.py
@ -0,0 +1,56 @@
+import unittest
+import numpy as np
+import paddle
+paddle.set_default_dtype("float64")
+paddle.disable_static(paddle.CPUPlace())
+
+from parakeet.models import transformer_tts as tts
+
+class TestMultiheadAttention(unittest.TestCase):
+    """Shape tests for ``tts.MultiheadAttention`` called with ``drop_n_heads=2``."""
+
+    def _check_io(self, net):
+        # shared body of both tests: self-attention over a (4, 12, 64) input
+        query = paddle.randn([4, 12, 64])
+        lengths = paddle.to_tensor([12, 10, 8, 9])
+        padding_mask = paddle.fluid.layers.sequence_mask(
+            lengths, dtype=query.dtype)
+        # insert an axis at position 1 for time_steps_q
+        padding_mask = paddle.unsqueeze(padding_mask, 1)
+        context, weights = net(
+            query, query, query, padding_mask, drop_n_heads=2)
+        self.assertTupleEqual(context.numpy().shape, (4, 12, 64))
+        self.assertTupleEqual(weights.numpy().shape, (4, 8, 12, 12))
+
+    def test_io_same_qk(self):
+        self._check_io(tts.MultiheadAttention(64, 8))
+
+    def test_io(self):
+        self._check_io(tts.MultiheadAttention(64, 8, k_dim=12, v_dim=6))
+
+
+        
+        
+class TestTransformerEncoderLayer(unittest.TestCase):
+    """Shape test for ``tts.TransformerEncoderLayer``."""
+
+    def test_io(self):
+        layer = tts.TransformerEncoderLayer(64, 8, 128)
+        inputs = paddle.randn([4, 12, 64])
+        lengths = paddle.to_tensor([12, 10, 8, 9])
+        padding_mask = paddle.fluid.layers.sequence_mask(
+            lengths, dtype=inputs.dtype)
+        outputs, weights = layer(inputs, padding_mask)
+        self.assertTupleEqual(outputs.numpy().shape, (4, 12, 64))
+        self.assertTupleEqual(weights.numpy().shape, (4, 8, 12, 12))
+
+
+        
+        
+class TestTransformerDecoderLayer(unittest.TestCase):
+    """Shape test for ``tts.TransformerDecoderLayer``."""
+
+    def test_io(self):
+        layer = tts.TransformerDecoderLayer(64, 8, 128, 0.5)
+        query = paddle.randn([4, 32, 64])
+        key = paddle.randn([4, 24, 64])
+        value = paddle.randn([4, 24, 64])
+        enc_lengths = paddle.to_tensor([24, 18, 20, 22])
+        dec_lengths = paddle.to_tensor([32, 28, 30, 31])
+        enc_mask = paddle.fluid.layers.sequence_mask(
+            enc_lengths, dtype=key.dtype)
+        dec_mask = paddle.fluid.layers.sequence_mask(
+            dec_lengths, dtype=query.dtype)
+        outputs, self_attn, cross_attn = layer(
+            query, key, value, enc_mask, dec_mask)
+
+        self.assertTupleEqual(outputs.numpy().shape, (4, 32, 64))
+        self.assertTupleEqual(self_attn.numpy().shape, (4, 8, 32, 32))
+        self.assertTupleEqual(cross_attn.numpy().shape, (4, 8, 32, 24))
--- a/tests/test_waveflow.py
+++ b/tests/test_waveflow.py
@ -0,0 +1,64 @@
+import numpy as np
+import unittest
+
+import paddle
+# Global test setup: make float64 the default floating-point dtype.
+paddle.set_default_dtype("float64")
+# Switch paddle to dynamic-graph (imperative) mode, placed on the CPU.
+paddle.disable_static(paddle.CPUPlace())
+
+from parakeet.models import waveflow
+
+class TestFold(unittest.TestCase):
+    """Shape checks for ``waveflow.fold`` on 2-D and 3-D inputs."""
+
+    def test_audio(self):
+        # A (4, 32 * 8) input folded with 8 must come back as (4, 32, 8).
+        n_group = 8
+        audio = paddle.randn([4, 32 * n_group])
+        folded = waveflow.fold(audio, n_group)
+        self.assertTupleEqual(folded.numpy().shape, (4, 32, n_group))
+
+    def test_spec(self):
+        # A (4, 80, 32 * 8) input folded with 8 must come back as
+        # (4, 80, 32, 8): only the last axis is split.
+        n_group = 8
+        spec = paddle.randn([4, 80, 32 * n_group])
+        folded = waveflow.fold(spec, n_group)
+        self.assertTupleEqual(folded.numpy().shape, (4, 80, 32, n_group))
+
+
+class TestUpsampleNet(unittest.TestCase):
+    """Output-shape check for ``waveflow.UpsampleNet``."""
+
+    def test_io(self):
+        # With [2, 2] passed to the constructor, the last axis of the output
+        # is expected to be 2 * 2 times the input's last axis.
+        upsampler = waveflow.UpsampleNet([2, 2])
+        last_dim = 6
+        inputs = paddle.randn([4, 8, last_dim])
+        upsampled = upsampler(inputs)
+        expected_shape = (4, 8, 2 * 2 * last_dim)
+        self.assertTupleEqual(upsampled.numpy().shape, expected_shape)
+        
+
+class TestResidualBlock(unittest.TestCase):
+    """Output-shape check for ``waveflow.ResidualBlock``."""
+
+    def test_io(self):
+        # Both returned tensors are expected to keep the shape of the input.
+        block = waveflow.ResidualBlock(4, 6, (3, 3), (2, 2))
+        input_shape = [4, 4, 16, 32]
+        inputs = paddle.randn(input_shape)
+        condition = paddle.randn([4, 6, 16, 32])
+        outputs = block(inputs, condition)
+        res, skip = outputs
+
+        expected_shape = tuple(input_shape)
+        self.assertTupleEqual(res.numpy().shape, expected_shape)
+        self.assertTupleEqual(skip.numpy().shape, expected_shape)
+        
+        
+class TestResidualNet(unittest.TestCase):
+    """Output-shape check for ``waveflow.ResidualNet``."""
+
+    def test_io(self):
+        # The output is expected to have the same shape as the input.
+        all_ones = [1] * 8
+        residual_net = waveflow.ResidualNet(8, 6, 8, (3, 3), all_ones)
+        input_shape = [4, 6, 8, 32]
+        inputs = paddle.randn(input_shape)
+        condition = paddle.randn([4, 8, 8, 32])
+        output = residual_net(inputs, condition)
+        self.assertTupleEqual(output.numpy().shape, tuple(input_shape))
+        
+        
+class TestFlow(unittest.TestCase):
+    """Output-shape check for ``waveflow.Flow``."""
+
+    def test_io(self):
+        # A single-channel (4, 1, 8, 32) input is expected to produce a
+        # two-channel (4, 2, 8, 32) output.
+        inputs = paddle.randn([4, 1, 8, 32])
+        condition = paddle.randn([4, 7, 8, 32])
+        flow = waveflow.Flow(8, 16, 7, (3, 3), 8)
+        output = flow(inputs, condition)
+        expected_shape = (4, 2, 8, 32)
+        self.assertTupleEqual(output.numpy().shape, expected_shape)
+        
+        
+class TestWaveflow(unittest.TestCase):
+    """Smoke test for the forward pass of ``waveflow.WaveFlow``."""
+
+    def test_io(self):
+        # NOTE(review): there are no assertions here; the test passes as long
+        # as the forward call does not raise and its result unpacks into
+        # exactly two values. Consider adding shape checks once the output
+        # contract is settled.
+        n_samples = 32 * 8
+        audio = paddle.randn([4, n_samples])
+        condition = paddle.randn([4, 7, n_samples])
+        model = waveflow.WaveFlow(2, 8, 8, 16, 7, (3, 3))
+        outputs = model(audio, condition)
+        z, logs = outputs
+