Merge pull request #66 from iclementine/reborn

format code and discard opencc
Feiyu Chan 2020-12-20 13:53:31 +08:00 committed by GitHub
commit fe7ddc2aaf
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
72 changed files with 1258 additions and 1571 deletions

View File

@@ -1,3 +1,17 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 # Configuration file for the Sphinx documentation builder.
 #
 # This file only contains a selection of the most common options. For a full
@@ -14,7 +28,6 @@
 # import sys
 # sys.path.insert(0, os.path.abspath('.'))

-
 # -- Project information -----------------------------------------------------

 project = 'parakeet'
@@ -24,7 +37,6 @@ author = 'parakeet-developers'
 # The full version, including alpha/beta/rc tags
 release = '0.2'

-
 # -- General configuration ---------------------------------------------------

 # Add any Sphinx extension module names here, as strings. They can be
@@ -46,7 +58,6 @@ templates_path = ['_templates']
 # This pattern also affects html_static_path and html_extra_path.
 exclude_patterns = []

-
 # -- Options for HTML output -------------------------------------------------

 # The theme to use for HTML and HTML Help pages. See the documentation for

View File

@@ -102,11 +102,3 @@ optional arguments:
   --opts ...            options to overwrite --config file and the default
                         config, passing in KEY VALUE pairs
 ```

View File

@@ -72,5 +72,3 @@ Dataset --(transform)--> Dataset --+
 ```
 This repository contains several examples, which can be found in [Parakeet/examples](../examples). These experiments are provided as samples that users can run directly. Users are also welcome to add new models and experiments, and to contribute code to `Parakeet`.

View File

@@ -9,10 +9,3 @@ Parakeet provides users and developers with:
 1. Reusable models and common modules;
 2. Complete experiments covering the whole pipeline, from data processing and model training to inference;
 3. High-quality models that work out of the box.

View File

@@ -1,3 +1,17 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 from yacs.config import CfgNode as CN

 _C = CN()
@@ -14,8 +28,7 @@ _C.data = CN(
         padding_idx=0,  # text embedding's padding index
         mel_start_value=0.5,  # value for starting frame
         mel_end_value=-0.5,  # # value for ending frame
-    )
-)
+    ))

 _C.model = CN(
     dict(
@@ -33,8 +46,7 @@ _C.model = CN(
         dropout=0.1,  # global droput probability
         stop_loss_scale=8.0,  # scaler for stop _loss
         decoder_prenet_dropout=0.5,  # decoder prenet dropout probability
-    )
-)
+    ))

 _C.training = CN(
     dict(
@@ -45,8 +57,8 @@ _C.training = CN(
         valid_interval=1000,  # validation
         save_interval=10000,  # checkpoint
         max_iteration=900000,  # max iteration to train
-    )
-)
+    ))

+
 def get_cfg_defaults():
     """Get a yacs CfgNode object with default values for my_project."""

View File

@@ -1,3 +1,17 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 import os
 from pathlib import Path
 import pickle
@@ -7,8 +21,10 @@ from paddle.io import Dataset, DataLoader
 from parakeet.data.batch import batch_spec, batch_text_id
 from parakeet.data import dataset

+
 class LJSpeech(Dataset):
     """A simple dataset adaptor for the processed ljspeech dataset."""
+
     def __init__(self, root):
         self.root = Path(root).expanduser()
         records = []
@@ -38,8 +54,8 @@ class Transform(object):
         ids, mel = example  # ids already have <s> and </s>
         ids = np.array(ids, dtype=np.int64)
         # add start and end frame
-        mel = np.pad(mel,
-                     [(0, 0), (1, 1)],
+        mel = np.pad(
+            mel, [(0, 0), (1, 1)],
             mode='constant',
             constant_values=[(0, 0), (self.start_value, self.end_value)])
         stop_labels = np.ones([mel.shape[1]], dtype=np.int64)
@@ -50,6 +66,7 @@
 class LJSpeechCollector(object):
     """A simple callable to batch LJSpeech examples."""
+
     def __init__(self, padding_idx=0, padding_value=0.):
         self.padding_idx = padding_idx
         self.padding_value = padding_value
@@ -67,7 +84,8 @@ class LJSpeechCollector(object):
 def create_dataloader(config, source_path):
     lj = LJSpeech(source_path)
-    transform = Transform(config.data.mel_start_value, config.data.mel_end_value)
+    transform = Transform(config.data.mel_start_value,
+                          config.data.mel_end_value)
     lj = dataset.TransformDataset(lj, transform)
     valid_set, train_set = dataset.split(lj, config.data.valid_size)
@@ -85,4 +103,3 @@ def create_dataloader(config, source_path):
             drop_last=False,
             collate_fn=data_collator)
     return train_loader, valid_loader
-
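The `np.pad` call reformatted above is what `Transform` uses to prepend a start frame and append an end frame: the mel axis gets no padding, the time axis gets one frame on each side, filled with the start/end sentinel values. A standalone sketch on a toy array (0.5 and -0.5 mirror the config defaults):

```
import numpy as np

mel = np.zeros((2, 3), dtype=np.float32)  # toy (n_mels, T) spectrogram

padded = np.pad(
    mel, [(0, 0), (1, 1)],  # no padding on mels, one frame on each side
    mode='constant',
    constant_values=[(0, 0), (0.5, -0.5)])  # start / end sentinels

print(padded.shape)  # (2, 5)
print(padded[0])     # [ 0.5  0.   0.   0.  -0.5]
```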

View File

@@ -1,3 +1,17 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 import os
 import tqdm
 import pickle
@@ -11,6 +25,7 @@ from parakeet.frontend import English
 from config import get_cfg_defaults

+
 def create_dataset(config, source_path, target_path, verbose=False):
     # create output dir
     target_path = Path(target_path).expanduser()
@@ -47,7 +62,8 @@ def create_dataset(config, source_path, target_path, verbose=False):
     with open(target_path / "metadata.pkl", 'wb') as f:
         pickle.dump(records, f)
     if verbose:
-        print("saved metadata into {}".format(target_path / "metadata.pkl"))
+        print("saved metadata into {}".format(target_path /
+                                              "metadata.pkl"))

     # also save meta data into text format for inspection
     with open(target_path / "metadata.txt", 'wt') as f:
@@ -55,20 +71,30 @@ def create_dataset(config, source_path, target_path, verbose=False):
             phoneme_str = "|".join(phonemes)
             f.write("{}\t{}\t{}\n".format(mel_name, text, phoneme_str))
     if verbose:
-        print("saved metadata into {}".format(target_path / "metadata.txt"))
+        print("saved metadata into {}".format(target_path /
+                                              "metadata.txt"))
     print("Done.")

 if __name__ == "__main__":
     parser = argparse.ArgumentParser(description="create dataset")
-    parser.add_argument("--config", type=str, metavar="FILE", help="extra config to overwrite the default config")
-    parser.add_argument("--input", type=str, help="path of the ljspeech dataset")
-    parser.add_argument("--output", type=str, help="path to save output dataset")
-    parser.add_argument("--opts", nargs=argparse.REMAINDER,
+    parser.add_argument(
+        "--config",
+        type=str,
+        metavar="FILE",
+        help="extra config to overwrite the default config")
+    parser.add_argument(
+        "--input", type=str, help="path of the ljspeech dataset")
+    parser.add_argument(
+        "--output", type=str, help="path to save output dataset")
+    parser.add_argument(
+        "--opts",
+        nargs=argparse.REMAINDER,
         help="options to overwrite --config file and the default config, passing in KEY VALUE pairs"
     )
-    parser.add_argument("-v", "--verbose", action="store_true", help="print msg")
+    parser.add_argument(
+        "-v", "--verbose", action="store_true", help="print msg")

     config = get_cfg_defaults()
     args = parser.parse_args()
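`nargs=argparse.REMAINDER` is what makes the `--opts` flag work: everything after it is collected verbatim, in exactly the flat KEY VALUE form that `CfgNode.merge_from_list` expects. A small sketch of that behaviour (a stripped-down parser, not the full one above):

```
import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--input", type=str)
parser.add_argument("--opts", nargs=argparse.REMAINDER)

# Everything after --opts is swallowed as-is.
args = parser.parse_args(
    ["--input", "ljspeech", "--opts", "data.batch_size", "32"])
print(args.opts)  # ['data.batch_size', '32']
```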

View File

@@ -1,3 +1,17 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 import argparse
 import time
 from pathlib import Path
@@ -13,14 +27,15 @@ from parakeet.utils.display import add_attention_plots
 from config import get_cfg_defaults

+
 @paddle.fluid.dygraph.no_grad
 def main(config, args):
     paddle.set_device(args.device)

     # model
     frontend = English()
-    model = TransformerTTS.from_pretrained(
-        frontend, config, args.checkpoint_path)
+    model = TransformerTTS.from_pretrained(frontend, config,
                                            args.checkpoint_path)
     model.eval()

     # inputs
@@ -38,19 +53,33 @@ def main(config, args):
         mel_output = mel_output.T  #(C, T)
         np.save(str(output_dir / f"sentence_{i}"), mel_output)
         if args.verbose:
-            print("spectrogram saved at {}".format(output_dir / f"sentence_{i}.npy"))
+            print("spectrogram saved at {}".format(output_dir /
+                                                   f"sentence_{i}.npy"))

 if __name__ == "__main__":
     config = get_cfg_defaults()
-    parser = argparse.ArgumentParser(description="generate mel spectrogram with TransformerTTS.")
-    parser.add_argument("--config", type=str, metavar="FILE", help="extra config to overwrite the default config")
-    parser.add_argument("--checkpoint_path", type=str, help="path of the checkpoint to load.")
+    parser = argparse.ArgumentParser(
+        description="generate mel spectrogram with TransformerTTS.")
+    parser.add_argument(
+        "--config",
+        type=str,
+        metavar="FILE",
+        help="extra config to overwrite the default config")
+    parser.add_argument(
+        "--checkpoint_path", type=str, help="path of the checkpoint to load.")
     parser.add_argument("--input", type=str, help="path of the text sentences")
     parser.add_argument("--output", type=str, help="path to save outputs")
-    parser.add_argument("--device", type=str, default="cpu", help="device type to use.")
-    parser.add_argument("--opts", nargs=argparse.REMAINDER, help="options to overwrite --config file and the default config, passing in KEY VALUE pairs")
-    parser.add_argument("-v", "--verbose", action="store_true", help="print msg")
+    parser.add_argument(
+        "--device", type=str, default="cpu", help="device type to use.")
+    parser.add_argument(
+        "--opts",
+        nargs=argparse.REMAINDER,
+        help="options to overwrite --config file and the default config, passing in KEY VALUE pairs"
+    )
+    parser.add_argument(
+        "-v", "--verbose", action="store_true", help="print msg")

     args = parser.parse_args()
     if args.config:

View File

@@ -1,3 +1,17 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 import time
 import logging
 from pathlib import Path
@@ -19,6 +33,7 @@ from parakeet.training.experiment import ExperimentBase
 from config import get_cfg_defaults
 from ljspeech import LJSpeech, LJSpeechCollector, Transform

+
 class Experiment(ExperimentBase):
     def setup_model(self):
         config = self.config
@@ -46,8 +61,7 @@ class Experiment(ExperimentBase):
             beta1=0.9,
             beta2=0.98,
             epsilon=1e-9,
-            parameters=model.parameters()
-        )
+            parameters=model.parameters())
         criterion = TransformerTTSLoss(config.model.stop_loss_scale)
         drop_n_heads = scheduler.StepWise(config.training.drop_n_heads)
         reduction_factor = scheduler.StepWise(config.training.reduction_factor)
@@ -63,9 +77,12 @@ class Experiment(ExperimentBase):
         config = self.config
         ljspeech_dataset = LJSpeech(args.data)
-        transform = Transform(config.data.mel_start_value, config.data.mel_end_value)
-        ljspeech_dataset = dataset.TransformDataset(ljspeech_dataset, transform)
-        valid_set, train_set = dataset.split(ljspeech_dataset, config.data.valid_size)
+        transform = Transform(config.data.mel_start_value,
+                              config.data.mel_end_value)
+        ljspeech_dataset = dataset.TransformDataset(ljspeech_dataset,
+                                                    transform)
+        valid_set, train_set = dataset.split(ljspeech_dataset,
+                                             config.data.valid_size)
         batch_fn = LJSpeechCollector(padding_idx=config.data.padding_idx)

         if not self.parallel:
@@ -99,7 +116,7 @@ class Experiment(ExperimentBase):
                           self.drop_n_heads(self.iteration))
         # TODO(chenfeiyu): we can combine these 2 slices
-        mel_input = mel[:,:-1, :]
+        mel_input = mel[:, :-1, :]
         reduced_mel_input = mel_input[:, ::model_core.r, :]
         outputs = self.model(text, reduced_mel_input)
         return outputs
@@ -115,11 +132,8 @@ class Experiment(ExperimentBase):
         time_steps = mel_target.shape[1]
         losses = self.criterion(
-            mel_output[:,:time_steps, :],
-            mel_intermediate[:,:time_steps, :],
-            mel_target,
-            stop_logits[:,:time_steps, :],
-            stop_label_target)
+            mel_output[:, :time_steps, :], mel_intermediate[:, :time_steps, :],
+            mel_target, stop_logits[:, :time_steps, :], stop_label_target)
         return losses

     def train_batch(self):
@@ -141,13 +155,16 @@ class Experiment(ExperimentBase):
         # logging
         msg = "Rank: {}, ".format(dist.get_rank())
         msg += "step: {}, ".format(self.iteration)
-        msg += "time: {:>.3f}s/{:>.3f}s, ".format(data_loader_time, iteration_time)
-        msg += ', '.join('{}: {:>.6f}'.format(k, v) for k, v in losses_np.items())
+        msg += "time: {:>.3f}s/{:>.3f}s, ".format(data_loader_time,
+                                                  iteration_time)
+        msg += ', '.join('{}: {:>.6f}'.format(k, v)
+                         for k, v in losses_np.items())
         self.logger.info(msg)

         if dist.get_rank() == 0:
             for k, v in losses_np.items():
-                self.visualizer.add_scalar(f"train_loss/{k}", v, self.iteration)
+                self.visualizer.add_scalar(f"train_loss/{k}", v,
+                                           self.iteration)

     @mp_tools.rank_zero_only
     @paddle.no_grad()
@@ -165,8 +182,7 @@ class Experiment(ExperimentBase):
                 display.add_multi_attention_plots(
                     self.visualizer,
                     f"valid_sentence_{i}_cross_attention_weights",
-                    attention_weights,
-                    self.iteration)
+                    attention_weights, self.iteration)

         # write visual log
         valid_losses = {k: np.mean(v) for k, v in valid_losses.items()}
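The two slices mentioned in the TODO implement teacher forcing with a reduction factor `r`: dropping the last frame shifts the decoder input by one step, and `::r` then keeps every r-th frame. A toy illustration of the indexing, with numpy standing in for paddle tensors:

```
import numpy as np

r = 2                                 # hypothetical reduction factor
mel = np.arange(8).reshape(1, 8, 1)   # (batch, time, n_mels)

mel_input = mel[:, :-1, :]            # shifted input: frames 0..6
reduced_mel_input = mel_input[:, ::r, :]  # keep every r-th frame
print(reduced_mel_input[0, :, 0])     # [0 2 4 6]
```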

View File

@@ -1,3 +1,17 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 from yacs.config import CfgNode as CN

 _C = CN()
@@ -12,8 +26,7 @@ _C.data = CN(
         f_max=8000,  # Hz, max frequency when converting to mel
         n_mels=80,  # mel bands
         clip_frames=65,  # mel clip frames
-    )
-)
+    ))

 _C.model = CN(
     dict(
@@ -24,8 +37,7 @@ _C.model = CN(
         channels=128,  # resiaudal channel in each flow
         kernel_size=[3, 3],  # kernel size in each conv block
         sigma=1.0,  # stddev of the random noise
-    )
-)
+    ))

 _C.training = CN(
     dict(
@@ -33,8 +45,8 @@ _C.training = CN(
         valid_interval=1000,  # validation
         save_interval=10000,  # checkpoint
         max_iteration=3000000,  # max iteration to train
-    )
-)
+    ))

+
 def get_cfg_defaults():
     """Get a yacs CfgNode object with default values for my_project."""

View File

@@ -1,3 +1,17 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 import os
 from pathlib import Path
 import pickle
@@ -9,19 +23,20 @@ from parakeet.data.batch import batch_spec, batch_wav
 from parakeet.data import dataset
 from parakeet.audio import AudioProcessor

+
 class LJSpeech(Dataset):
     """A simple dataset adaptor for the processed ljspeech dataset."""
+
     def __init__(self, root):
         self.root = Path(root).expanduser()
         meta_data = pandas.read_csv(
             str(self.root / "metadata.csv"),
             sep="\t",
             header=None,
-            names=["fname", "frames", "samples"]
-        )
+            names=["fname", "frames", "samples"])

         records = []
-        for row in meta_data.itertuples() :
+        for row in meta_data.itertuples():
             mel_path = str(self.root / "mel" / (row.fname + ".npy"))
             wav_path = str(self.root / "wav" / (row.fname + ".npy"))
             records.append((mel_path, wav_path))
@@ -39,6 +54,7 @@ class LJSpeech(Dataset):
 class LJSpeechCollector(object):
     """A simple callable to batch LJSpeech examples."""
+
     def __init__(self, padding_value=0.):
         self.padding_value = padding_value
@@ -70,9 +86,7 @@ class LJSpeechClipCollector(object):
         mel, wav = example
         frames = mel.shape[-1]
         start = np.random.randint(0, frames - self.clip_frames)
-        mel_clip = mel[:, start: start + self.clip_frames]
-        wav_clip = wav[start * self.hop_length: (start + self.clip_frames) * self.hop_length]
+        mel_clip = mel[:, start:start + self.clip_frames]
+        wav_clip = wav[start * self.hop_length:(start + self.clip_frames) *
+                       self.hop_length]
         return mel_clip, wav_clip
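The clip collector keeps the two modalities aligned: one spectrogram frame covers `hop_length` waveform samples, so the frame index `start` is scaled by `hop_length` on the audio side. A numpy sketch with made-up sizes:

```
import numpy as np

hop_length = 256    # samples per frame (the usual LJSpeech setup)
clip_frames = 65    # from the config above
start = 10          # chosen randomly in the real collector

mel = np.zeros((80, 200))         # (n_mels, frames)
wav = np.zeros(200 * hop_length)  # the matching waveform

mel_clip = mel[:, start:start + clip_frames]
wav_clip = wav[start * hop_length:(start + clip_frames) * hop_length]

print(mel_clip.shape[1])                # 65
print(wav_clip.shape[0] // hop_length)  # 65
```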

View File

@@ -1,3 +1,17 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 import os
 import tqdm
 import csv
@@ -86,12 +100,8 @@ def create_dataset(config, input_dir, output_dir, verbose=True):
     output_dir = Path(output_dir).expanduser()
     output_dir.mkdir(exist_ok=True)

-    transform = Transform(
-        config.sample_rate,
-        config.n_fft,
-        config.win_length,
-        config.hop_length,
-        config.n_mels)
+    transform = Transform(config.sample_rate, config.n_fft, config.win_length,
+                          config.hop_length, config.n_mels)

     file_names = []
     for example in tqdm.tqdm(dataset):
@@ -109,20 +119,32 @@ def create_dataset(config, input_dir, output_dir, verbose=True):
         file_names.append((base_name, mel.shape[-1], audio.shape[-1]))

     meta_data = pd.DataFrame.from_records(file_names)
-    meta_data.to_csv(str(output_dir / "metadata.csv"), sep="\t", index=None, header=None)
-    print("saved meta data in to {}".format(os.path.join(output_dir, "metadata.csv")))
+    meta_data.to_csv(
+        str(output_dir / "metadata.csv"), sep="\t", index=None, header=None)
+    print("saved meta data in to {}".format(
+        os.path.join(output_dir, "metadata.csv")))
     print("Done!")

 if __name__ == "__main__":
     parser = argparse.ArgumentParser(description="create dataset")
-    parser.add_argument("--config", type=str, metavar="FILE", help="extra config to overwrite the default config")
-    parser.add_argument("--input", type=str, help="path of the ljspeech dataset")
-    parser.add_argument("--output", type=str, help="path to save output dataset")
-    parser.add_argument("--opts", nargs=argparse.REMAINDER,
+    parser.add_argument(
+        "--config",
+        type=str,
+        metavar="FILE",
+        help="extra config to overwrite the default config")
+    parser.add_argument(
+        "--input", type=str, help="path of the ljspeech dataset")
+    parser.add_argument(
+        "--output", type=str, help="path to save output dataset")
+    parser.add_argument(
+        "--opts",
+        nargs=argparse.REMAINDER,
         help="options to overwrite --config file and the default config, passing in KEY VALUE pairs"
     )
-    parser.add_argument("-v", "--verbose", action="store_true", help="print msg")
+    parser.add_argument(
+        "-v", "--verbose", action="store_true", help="print msg")

     config = get_cfg_defaults()
     args = parser.parse_args()

View File

@@ -1,3 +1,17 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 import argparse
 import numpy as np
 import soundfile as sf
@@ -8,9 +22,9 @@ import parakeet
 from parakeet.models.waveflow import UpsampleNet, WaveFlow, ConditionalWaveFlow
 from parakeet.utils import layer_tools, checkpoint
 from config import get_cfg_defaults

 def main(config, args):
     paddle.set_device(args.device)
     model = ConditionalWaveFlow.from_pretrained(config, args.checkpoint_path)
@@ -23,7 +37,8 @@ def main(config, args):
     for file_path in mel_dir.iterdir():
         mel = np.load(str(file_path))
         audio = model.predict(mel)
-        audio_path = output_dir / (os.path.splitext(file_path.name)[0] + ".wav")
+        audio_path = output_dir / (
+            os.path.splitext(file_path.name)[0] + ".wav")
         sf.write(audio_path, audio, config.data.sample_rate)
         print("[synthesize] {} -> {}".format(file_path, audio_path))

@@ -31,14 +46,29 @@ def main(config, args):
 if __name__ == "__main__":
     config = get_cfg_defaults()
-    parser = argparse.ArgumentParser(description="generate mel spectrogram with TransformerTTS.")
-    parser.add_argument("--config", type=str, metavar="FILE", help="extra config to overwrite the default config")
-    parser.add_argument("--checkpoint_path", type=str, help="path of the checkpoint to load.")
-    parser.add_argument("--input", type=str, help="path of directory containing mel spectrogram (in .npy format)")
+    parser = argparse.ArgumentParser(
+        description="generate mel spectrogram with TransformerTTS.")
+    parser.add_argument(
+        "--config",
+        type=str,
+        metavar="FILE",
+        help="extra config to overwrite the default config")
+    parser.add_argument(
+        "--checkpoint_path", type=str, help="path of the checkpoint to load.")
+    parser.add_argument(
+        "--input",
+        type=str,
+        help="path of directory containing mel spectrogram (in .npy format)")
     parser.add_argument("--output", type=str, help="path to save outputs")
-    parser.add_argument("--device", type=str, default="cpu", help="device type to use.")
-    parser.add_argument("--opts", nargs=argparse.REMAINDER, help="options to overwrite --config file and the default config, passing in KEY VALUE pairs")
-    parser.add_argument("-v", "--verbose", action="store_true", help="print msg")
+    parser.add_argument(
+        "--device", type=str, default="cpu", help="device type to use.")
+    parser.add_argument(
+        "--opts",
+        nargs=argparse.REMAINDER,
+        help="options to overwrite --config file and the default config, passing in KEY VALUE pairs"
    )
+    parser.add_argument(
+        "-v", "--verbose", action="store_true", help="print msg")

     args = parser.parse_args()
     if args.config:

View File

@@ -1,3 +1,17 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 import time
 from pathlib import Path
 import numpy as np
@@ -34,7 +48,8 @@ class Experiment(ExperimentBase):
         if self.parallel > 1:
             model = paddle.DataParallel(model)
-        optimizer = paddle.optimizer.Adam(config.training.lr, parameters=model.parameters())
+        optimizer = paddle.optimizer.Adam(
+            config.training.lr, parameters=model.parameters())
         criterion = WaveFlowLoss(sigma=config.model.sigma)

         self.model = model
@@ -46,9 +61,11 @@ class Experiment(ExperimentBase):
         args = self.args

         ljspeech_dataset = LJSpeech(args.data)
-        valid_set, train_set = dataset.split(ljspeech_dataset, config.data.valid_size)
+        valid_set, train_set = dataset.split(ljspeech_dataset,
+                                             config.data.valid_size)

-        batch_fn = LJSpeechClipCollector(config.data.clip_frames, config.data.hop_length)
+        batch_fn = LJSpeechClipCollector(config.data.clip_frames,
+                                         config.data.hop_length)

         if not self.parallel:
             train_loader = DataLoader(
@@ -97,10 +114,12 @@ class Experiment(ExperimentBase):
         loss_value = float(loss)
         msg = "Rank: {}, ".format(dist.get_rank())
         msg += "step: {}, ".format(self.iteration)
-        msg += "time: {:>.3f}s/{:>.3f}s, ".format(data_loader_time, iteration_time)
+        msg += "time: {:>.3f}s/{:>.3f}s, ".format(data_loader_time,
+                                                  iteration_time)
         msg += "loss: {:>.6f}".format(loss_value)
         self.logger.info(msg)
-        self.visualizer.add_scalar("train/loss", loss_value, global_step=self.iteration)
+        self.visualizer.add_scalar(
+            "train/loss", loss_value, global_step=self.iteration)

     @mp_tools.rank_zero_only
     @paddle.no_grad()
@@ -112,7 +131,8 @@ class Experiment(ExperimentBase):
             loss = self.criterion(z, log_det_jocobian)
             valid_losses.append(float(loss))
         valid_loss = np.mean(valid_losses)
-        self.visualizer.add_scalar("valid/loss", valid_loss, global_step=self.iteration)
+        self.visualizer.add_scalar(
+            "valid/loss", valid_loss, global_step=self.iteration)

 def main_sp(config, args):

View File

@@ -1,3 +1,17 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 from yacs.config import CfgNode as CN

 _C = CN()
@@ -12,8 +26,7 @@ _C.data = CN(
         # f_max=8000,  # Hz, max frequency when converting to mel
         n_mels=80,  # mel bands
         train_clip_seconds=0.5,  # audio clip length(in seconds)
-    )
-)
+    ))

 _C.model = CN(
     dict(
@@ -24,9 +37,7 @@ _C.model = CN(
         residual_channels=128,  # resiaudal channel in each flow
         loss_type="mog",
         output_dim=3,  # single gaussian
-        log_scale_min=-9.0,
-    )
-)
+        log_scale_min=-9.0, ))

 _C.training = CN(
     dict(
@@ -37,8 +48,8 @@ _C.training = CN(
         valid_interval=1000,  # validation
         save_interval=10000,  # checkpoint
         max_iteration=3000000,  # max iteration to train
         gradient_max_norm=100.0  # global norm of gradients
-    )
-)
+    ))

+
 def get_cfg_defaults():
     """Get a yacs CfgNode object with default values for my_project."""

View File

@@ -1,3 +1,17 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 import os
 from pathlib import Path
 import pickle
@@ -9,19 +23,20 @@ from parakeet.data.batch import batch_spec, batch_wav
 from parakeet.data import dataset
 from parakeet.audio import AudioProcessor

+
 class LJSpeech(Dataset):
     """A simple dataset adaptor for the processed ljspeech dataset."""
+
     def __init__(self, root):
         self.root = Path(root).expanduser()
         meta_data = pandas.read_csv(
             str(self.root / "metadata.csv"),
             sep="\t",
             header=None,
-            names=["fname", "frames", "samples"]
-        )
+            names=["fname", "frames", "samples"])

         records = []
-        for row in meta_data.itertuples() :
+        for row in meta_data.itertuples():
             mel_path = str(self.root / "mel" / (row.fname + ".npy"))
             wav_path = str(self.root / "wav" / (row.fname + ".npy"))
             records.append((mel_path, wav_path))
@@ -39,6 +54,7 @@ class LJSpeech(Dataset):
 class LJSpeechCollector(object):
     """A simple callable to batch LJSpeech examples."""
+
     def __init__(self, padding_value=0.):
         self.padding_value = padding_value
@@ -48,7 +64,7 @@ class LJSpeechCollector(object):
         wavs = [example[1] for example in examples]
         mels = batch_spec(mels, pad_value=self.padding_value)
         wavs = batch_wav(wavs, pad_value=self.padding_value)
-        audio_starts = np.zeros((batch_size,), dtype=np.int64)
+        audio_starts = np.zeros((batch_size, ), dtype=np.int64)
         return mels, wavs, audio_starts

@@ -75,7 +91,8 @@ class LJSpeechClipCollector(object):
         mel, wav = example
         frames = mel.shape[-1]
         start = np.random.randint(0, frames - self.clip_frames)
-        wav_clip = wav[start * self.hop_length: (start + self.clip_frames) * self.hop_length]
+        wav_clip = wav[start * self.hop_length:(start + self.clip_frames) *
+                       self.hop_length]
         return mel, wav_clip, start

@@ -132,7 +149,3 @@ class DataCollector(object):
         audios = np.array(audios, dtype=np.float32)
         audio_starts = np.array(audio_starts, dtype=np.int64)
         return audios, mels, audio_starts

View File

@@ -1,3 +1,17 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 import os
 import tqdm
 import csv
@@ -87,12 +101,8 @@ def create_dataset(config, input_dir, output_dir, verbose=True):
     output_dir = Path(output_dir).expanduser()
     output_dir.mkdir(exist_ok=True)

-    transform = Transform(
-        config.sample_rate,
-        config.n_fft,
-        config.win_length,
-        config.hop_length,
-        config.n_mels)
+    transform = Transform(config.sample_rate, config.n_fft, config.win_length,
+                          config.hop_length, config.n_mels)

     file_names = []
     for example in tqdm.tqdm(dataset):
@@ -110,20 +120,32 @@ def create_dataset(config, input_dir, output_dir, verbose=True):
         file_names.append((base_name, mel.shape[-1], audio.shape[-1]))

     meta_data = pd.DataFrame.from_records(file_names)
-    meta_data.to_csv(str(output_dir / "metadata.csv"), sep="\t", index=None, header=None)
-    print("saved meta data in to {}".format(os.path.join(output_dir, "metadata.csv")))
+    meta_data.to_csv(
+        str(output_dir / "metadata.csv"), sep="\t", index=None, header=None)
+    print("saved meta data in to {}".format(
+        os.path.join(output_dir, "metadata.csv")))
     print("Done!")

 if __name__ == "__main__":
     parser = argparse.ArgumentParser(description="create dataset")
-    parser.add_argument("--config", type=str, metavar="FILE", help="extra config to overwrite the default config")
-    parser.add_argument("--input", type=str, help="path of the ljspeech dataset")
-    parser.add_argument("--output", type=str, help="path to save output dataset")
-    parser.add_argument("--opts", nargs=argparse.REMAINDER,
+    parser.add_argument(
+        "--config",
+        type=str,
+        metavar="FILE",
+        help="extra config to overwrite the default config")
+    parser.add_argument(
+        "--input", type=str, help="path of the ljspeech dataset")
+    parser.add_argument(
+        "--output", type=str, help="path to save output dataset")
+    parser.add_argument(
+        "--opts",
+        nargs=argparse.REMAINDER,
         help="options to overwrite --config file and the default config, passing in KEY VALUE pairs"
     )
-    parser.add_argument("-v", "--verbose", action="store_true", help="print msg")
+    parser.add_argument(
+        "-v", "--verbose", action="store_true", help="print msg")

     config = get_cfg_defaults()
     args = parser.parse_args()

View File

@@ -1,3 +1,17 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 import argparse
 import numpy as np
 import soundfile as sf
@@ -10,6 +24,7 @@ from parakeet.utils import layer_tools, checkpoint
 from config import get_cfg_defaults

+
 def main(config, args):
     paddle.set_device(args.device)
     model = ConditionalWaveNet.from_pretrained(config, args.checkpoint_path)
@@ -22,7 +37,8 @@ def main(config, args):
     for file_path in mel_dir.iterdir():
         mel = np.load(str(file_path))
         audio = model.predict(mel)
-        audio_path = output_dir / (os.path.splitext(file_path.name)[0] + ".wav")
+        audio_path = output_dir / (
+            os.path.splitext(file_path.name)[0] + ".wav")
         sf.write(audio_path, audio, config.data.sample_rate)
         print("[synthesize] {} -> {}".format(file_path, audio_path))

@@ -30,14 +46,29 @@ def main(config, args):
 if __name__ == "__main__":
     config = get_cfg_defaults()
-    parser = argparse.ArgumentParser(description="generate mel spectrogram with TransformerTTS.")
-    parser.add_argument("--config", type=str, metavar="FILE", help="extra config to overwrite the default config")
-    parser.add_argument("--checkpoint_path", type=str, help="path of the checkpoint to load.")
-    parser.add_argument("--input", type=str, help="path of directory containing mel spectrogram (in .npy format)")
+    parser = argparse.ArgumentParser(
+        description="generate mel spectrogram with TransformerTTS.")
+    parser.add_argument(
+        "--config",
+        type=str,
+        metavar="FILE",
+        help="extra config to overwrite the default config")
+    parser.add_argument(
+        "--checkpoint_path", type=str, help="path of the checkpoint to load.")
+    parser.add_argument(
+        "--input",
+        type=str,
+        help="path of directory containing mel spectrogram (in .npy format)")
     parser.add_argument("--output", type=str, help="path to save outputs")
-    parser.add_argument("--device", type=str, default="cpu", help="device type to use.")
-    parser.add_argument("--opts", nargs=argparse.REMAINDER, help="options to overwrite --config file and the default config, passing in KEY VALUE pairs")
-    parser.add_argument("-v", "--verbose", action="store_true", help="print msg")
+    parser.add_argument(
+        "--device", type=str, default="cpu", help="device type to use.")
+    parser.add_argument(
+        "--opts",
+        nargs=argparse.REMAINDER,
+        help="options to overwrite --config file and the default config, passing in KEY VALUE pairs"
    )
+    parser.add_argument(
+        "-v", "--verbose", action="store_true", help="print msg")

     args = parser.parse_args()
     if args.config:

View File

@@ -1,3 +1,17 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 import time
 from pathlib import Path
 import math
@@ -39,13 +53,13 @@ class Experiment(ExperimentBase):
             model = paddle.DataParallel(model)

         lr_scheduler = paddle.optimizer.lr.StepDecay(
-            config.training.lr,
-            config.training.anneal_interval,
+            config.training.lr, config.training.anneal_interval,
             config.training.anneal_rate)
         optimizer = paddle.optimizer.Adam(
             lr_scheduler,
             parameters=model.parameters(),
-            grad_clip=paddle.nn.ClipGradByGlobalNorm(config.training.gradient_max_norm))
+            grad_clip=paddle.nn.ClipGradByGlobalNorm(
+                config.training.gradient_max_norm))

         self.model = model
         self.model_core = model._layer if self.parallel else model
@@ -56,7 +70,8 @@ class Experiment(ExperimentBase):
         args = self.args

         ljspeech_dataset = LJSpeech(args.data)
-        valid_set, train_set = dataset.split(ljspeech_dataset, config.data.valid_size)
+        valid_set, train_set = dataset.split(ljspeech_dataset,
+                                             config.data.valid_size)

         # convolutional net's causal padding size
         context_size = config.model.n_stack \
@@ -66,7 +81,8 @@ class Experiment(ExperimentBase):

         # frames used to compute loss
         frames_per_second = config.data.sample_rate // config.data.hop_length
-        train_clip_frames = math.ceil(config.data.train_clip_seconds * frames_per_second)
+        train_clip_frames = math.ceil(config.data.train_clip_seconds *
+                                      frames_per_second)

         num_frames = train_clip_frames + context_frames
         batch_fn = LJSpeechClipCollector(num_frames, config.data.hop_length)
@@ -111,10 +127,12 @@ class Experiment(ExperimentBase):
         loss_value = float(loss)
         msg = "Rank: {}, ".format(dist.get_rank())
         msg += "step: {}, ".format(self.iteration)
-        msg += "time: {:>.3f}s/{:>.3f}s, ".format(data_loader_time, iteration_time)
+        msg += "time: {:>.3f}s/{:>.3f}s, ".format(data_loader_time,
+                                                  iteration_time)
         msg += "loss: {:>.6f}".format(loss_value)
         self.logger.info(msg)
-        self.visualizer.add_scalar("train/loss", loss_value, global_step=self.iteration)
+        self.visualizer.add_scalar(
+            "train/loss", loss_value, global_step=self.iteration)

     @mp_tools.rank_zero_only
     @paddle.no_grad()
@@ -126,7 +144,8 @@ class Experiment(ExperimentBase):
             loss = self.model.loss(y, wav)
             valid_losses.append(float(loss))
         valid_loss = np.mean(valid_losses)
-        self.visualizer.add_scalar("valid/loss", valid_loss, global_step=self.iteration)
+        self.visualizer.add_scalar(
+            "valid/loss", valid_loss, global_step=self.iteration)

 def main_sp(config, args):
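The clip-length arithmetic reformatted above is worth a worked example: the causal convolution stack needs `context_size` past samples, so each training clip is the loss-bearing frames plus the context frames. With hypothetical LJSpeech-style values (22050 Hz and a hop of 256 are assumptions; neither is shown in this hunk):

```
import math

sample_rate = 22050       # assumed, not shown in this hunk
hop_length = 256          # assumed, not shown in this hunk
train_clip_seconds = 0.5  # from the wavenet config above

frames_per_second = sample_rate // hop_length  # 86
train_clip_frames = math.ceil(
    train_clip_seconds * frames_per_second)    # ceil(43.0) = 43
print(frames_per_second, train_clip_frames)    # 86 43
```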

View File

@@ -18,15 +18,16 @@ import numpy as np
 __all__ = ["AudioProcessor"]

+
 class AudioProcessor(object):
     def __init__(self,
-                 sample_rate:int,
-                 n_fft:int,
-                 win_length:int,
-                 hop_length:int,
-                 n_mels:int=80,
-                 f_min:int=0,
-                 f_max:int=None,
+                 sample_rate: int,
+                 n_fft: int,
+                 win_length: int,
+                 hop_length: int,
+                 n_mels: int=80,
+                 f_min: int=0,
+                 f_max: int=None,
                  window="hann",
                  center=True,
                  pad_mode="reflect"):
@@ -50,8 +51,7 @@ class AudioProcessor(object):
         self.inv_mel_filter = np.linalg.pinv(self.mel_filter)

     def _create_mel_filter(self):
-        mel_filter = librosa.filters.mel(
-            self.sample_rate,
+        mel_filter = librosa.filters.mel(self.sample_rate,
             self.n_fft,
             n_mels=self.n_mels,
             fmin=self.f_min,
@@ -69,7 +69,7 @@ class AudioProcessor(object):
     def stft(self, wav):
         D = librosa.core.stft(
             wav,
-            n_fft = self.n_fft,
+            n_fft=self.n_fft,
             hop_length=self.hop_length,
             win_length=self.win_length,
             window=self.window,
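`AudioProcessor` precomputes both the mel filter bank and its pseudo-inverse, so a mel spectrogram can be mapped back to an approximate linear spectrogram. A minimal sketch of those two steps (keyword arguments are used so it also runs on newer librosa releases):

```
import numpy as np
import librosa

mel_filter = librosa.filters.mel(
    sr=22050, n_fft=1024, n_mels=80, fmin=0, fmax=8000)  # (80, 513)
inv_mel_filter = np.linalg.pinv(mel_filter)              # (513, 80)

spec = np.abs(np.random.randn(513, 10))    # fake linear magnitude spectrogram
mel = np.dot(mel_filter, spec)             # forward: (80, 10)
approx_spec = np.dot(inv_mel_filter, mel)  # rough inverse of the mel mapping
```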

View File

@@ -1,3 +1,16 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
 """
 This modules contains normalizers for spectrogram magnitude.
@@ -23,10 +36,12 @@ class NormalizerBase(object):
     def inverse(self, normalized):
         raise NotImplementedError("inverse must be implemented")

+
 class LogMagnitude(NormalizerBase):
     """
     This is a simple normalizer used in Waveglow, Waveflow, tacotron2...
     """
+
     def __init__(self, min=1e-7):
         self.min = min
@@ -44,6 +59,7 @@ class UnitMagnitude(NormalizerBase):
     """
     This is the normalizer used in the
     """
+
     def __init__(self, min=1e-5):
         self.min = min
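The diff only shows the class headers, but the usual log-magnitude recipe (an assumption here, not part of this commit) is to clip to a floor before taking the log, which makes `inverse` a plain `exp`. A hedged re-creation:

```
import numpy as np

class LogMagnitudeSketch:
    """Hypothetical re-creation of a log-magnitude normalizer."""

    def __init__(self, min=1e-7):
        self.min = min

    def transform(self, x):
        # clip to a floor so log never sees zeros
        return np.log(np.maximum(x, self.min))

    def inverse(self, normalized):
        return np.exp(normalized)
```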

View File

@@ -18,10 +18,15 @@ Batch functions for text sequences, audio and spectrograms are provided.
 import numpy as np

 __all__ = [
-    "batch_text_id", "batch_wav", "batch_spec",
-    "TextIDBatcher", "WavBatcher", "SpecBatcher",
+    "batch_text_id",
+    "batch_wav",
+    "batch_spec",
+    "TextIDBatcher",
+    "WavBatcher",
+    "SpecBatcher",
 ]

 class TextIDBatcher(object):
     """A wrapper class for `batch_text_id`."""
@@ -113,7 +118,11 @@ class SpecBatcher(object):
         self.time_major = time_major

     def __call__(self, minibatch):
-        out = batch_spec(minibatch, pad_value=self.pad_value, time_major=self.time_major, dtype=self.dtype)
+        out = batch_spec(
+            minibatch,
+            pad_value=self.pad_value,
+            time_major=self.time_major,
+            dtype=self.dtype)
         return out

@@ -130,7 +139,8 @@ def batch_spec(minibatch, pad_value=0., time_major=False, dtype=np.float32):
     """
     # assume (F, T) or (T, F)
     peek_example = minibatch[0]
-    assert len(peek_example.shape) == 2, "we only handles mono channel spectrogram"
+    assert len(
+        peek_example.shape) == 2, "we only handles mono channel spectrogram"

     # assume (F, n_frame) or (n_frame, F)
     time_idx = 0 if time_major else -1
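`batch_spec` pads a minibatch of 2-D spectrograms to a common number of frames, which is why the assertion above rejects anything that is not mono-channel. A hedged numpy sketch of that padding for the default `(F, T)` layout (not the library's actual implementation):

```
import numpy as np

def pad_specs(minibatch, pad_value=0.):
    # pad each (F, T) example on the time axis up to the longest one
    max_frames = max(example.shape[-1] for example in minibatch)
    padded = [
        np.pad(example, [(0, 0), (0, max_frames - example.shape[-1])],
               mode='constant',
               constant_values=pad_value) for example in minibatch
    ]
    return np.stack(padded)  # (B, F, max_frames)

batch = pad_specs([np.ones((80, 3)), np.ones((80, 5))])
print(batch.shape)  # (2, 80, 5)
```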

View File

@@ -17,17 +17,25 @@ import paddle
 from paddle.io import Dataset

 __all__ = [
-    "split", "TransformDataset", "CacheDataset", "TupleDataset",
-    "DictDataset", "SliceDataset", "SubsetDataset", "FilterDataset",
+    "split",
+    "TransformDataset",
+    "CacheDataset",
+    "TupleDataset",
+    "DictDataset",
+    "SliceDataset",
+    "SubsetDataset",
+    "FilterDataset",
     "ChainDataset",
 ]

+
 def split(dataset, first_size):
     """A utility function to split a dataset into two datasets."""
     first = SliceDataset(dataset, 0, first_size)
     second = SliceDataset(dataset, first_size, len(dataset))
     return first, second

+
 class TransformDataset(Dataset):
     def __init__(self, dataset, transform):
         """Dataset which is transformed from another with a transform.

View File

@ -1,2 +1,16 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from parakeet.datasets.common import *
from parakeet.datasets.ljspeech import *
View File
@@ -1,9 +1,24 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from paddle.io import Dataset
import os
import librosa

__all__ = ["AudioFolderDataset"]

class AudioFolderDataset(Dataset):
    def __init__(self, path, sample_rate, extension="wav"):
        self.root = os.path.expanduser(path)
View File
@@ -1,8 +1,23 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from paddle.io import Dataset
from pathlib import Path

__all__ = ["LJSpeechMetaData"]

class LJSpeechMetaData(Dataset):
    def __init__(self, root):
        self.root = Path(root).expanduser()

@@ -22,4 +37,3 @@ class LJSpeechMetaData(Dataset):
    def __len__(self):
        return len(self.records)
View File
@@ -1,3 +1,17 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from parakeet.frontend.vocab import *
from parakeet.frontend.phonectic import *
from parakeet.frontend.punctuation import *
View File
@@ -1,2 +1,16 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from parakeet.frontend.normalizer.normalizer import *
from parakeet.frontend.normalizer.numbers import *
View File
@@ -0,0 +1,14 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
View File
@@ -0,0 +1,14 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
View File
@@ -1,3 +1,17 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
def full2half_width(ustr):
    half = []
    for u in ustr:

@@ -10,6 +24,7 @@ def full2half_width(ustr):
            half.append(u)
    return ''.join(half)

def half2full_width(ustr):
    full = []
    for u in ustr:
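The bodies of these helpers rest on a fixed Unicode offset: the full-width ASCII variants (U+FF01..U+FF5E) sit exactly 0xFEE0 above their half-width counterparts, with U+3000 as the ideographic space. A plausible full body for `full2half_width` (the hunk only shows its frame):

def full2half_width(ustr):
    half = []
    for u in ustr:
        num = ord(u)
        if num == 0x3000:              # ideographic space
            half.append(' ')
        elif 0xFF01 <= num <= 0xFF5E:  # full-width ASCII variants
            half.append(chr(num - 0xFEE0))
        else:                          # already half-width
            half.append(u)
    return ''.join(half)

assert full2half_width('ＡＢＣ１２３') == 'ABC123'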
View File
@@ -17,7 +17,8 @@ from typing import Union
from g2p_en import G2p
from g2pM import G2pM
from parakeet.frontend import Vocab
-from opencc import OpenCC
+# discard opencc until we find an easy solution to install it on windows
+# from opencc import OpenCC
from parakeet.frontend.punctuation import get_punctuations
from parakeet.frontend.normalizer.normalizer import normalize

@@ -211,7 +212,7 @@ class Chinese(Phonetics):
    """
    def __init__(self):
-        self.opencc_backend = OpenCC('t2s.json')
+        # self.opencc_backend = OpenCC('t2s.json')
        self.backend = G2pM()
        self.phonemes = self._get_all_syllables()
        self.punctuations = get_punctuations("cn")

@@ -236,7 +237,8 @@ class Chinese(Phonetics):
        List[str]
            The list of pronunciation sequence.
        """
-        simplified = self.opencc_backend.convert(sentence)
+        # simplified = self.opencc_backend.convert(sentence)
+        simplified = sentence
        phonemes = self.backend(simplified)
        start = self.vocab.start_symbol
        end = self.vocab.end_symbol
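The commit handles the Windows install problem by commenting opencc out and passing sentences through unchanged. A guarded import (not what this commit does, just an alternative sketch) would keep traditional-to-simplified conversion wherever opencc installs cleanly:

try:
    from opencc import OpenCC
    _t2s = OpenCC('t2s.json')

    def to_simplified(sentence):
        return _t2s.convert(sentence)
except ImportError:
    def to_simplified(sentence):
        # mirror the commit's fallback: use the input as-is
        return sentence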
View File
@@ -1,3 +1,17 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import abc
import string

@@ -13,15 +27,8 @@ EN_PUNCT = [
    "!",
]

-CN_PUNCT = [
-    "",
-    "",
-    "",
-    "",
-    "",
-    "",
-    ""
-]
+CN_PUNCT = ["", "", "", "", "", "", ""]

def get_punctuations(lang):
    if lang == "en":

@@ -30,4 +37,3 @@ def get_punctuations(lang):
        return CN_PUNCT
    else:
        raise ValueError(f"language {lang} Not supported")
View File
@@ -575,7 +575,8 @@ class TransformerTTS(nn.Layer):
            decoder_prenet_dropout=config.model.decoder_prenet_dropout,
            dropout=config.model.dropout)
-        iteration = checkpoint.load_parameters(model, checkpoint_path=checkpoint_path)
+        iteration = checkpoint.load_parameters(
+            model, checkpoint_path=checkpoint_path)
        drop_n_heads = scheduler.StepWise(config.training.drop_n_heads)
        reduction_factor = scheduler.StepWise(config.training.reduction_factor)
        model.set_constants(
View File
@@ -1,3 +1,17 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import math
import numpy as np
from typing import List, Union, Tuple

@@ -11,6 +25,7 @@ from parakeet.modules import geometry as geo
__all__ = ["WaveFlow", "ConditionalWaveFlow", "WaveFlowLoss"]

def fold(x, n_group):
    r"""Fold audio or spectrogram's temporal dimension into groups.

@@ -31,6 +46,7 @@ def fold(x, n_group):
    new_shape = spatial_shape + [time_steps // n_group, n_group]
    return paddle.reshape(x, new_shape)

class UpsampleNet(nn.LayerList):
    """Layer to upsample mel spectrogram to the same temporal resolution with
    the corresponding waveform.

@@ -60,6 +76,7 @@ class UpsampleNet(nn.LayerList):
    ---------
    ``librosa.core.stft``
    """

    def __init__(self, upsample_factors):
        super(UpsampleNet, self).__init__()
        for factor in upsample_factors:

@@ -67,7 +84,9 @@ class UpsampleNet(nn.LayerList):
            init = I.Uniform(-std, std)
            self.append(
                nn.utils.weight_norm(
-                    nn.Conv2DTranspose(1, 1, (3, 2 * factor),
+                    nn.Conv2DTranspose(
+                        1,
+                        1, (3, 2 * factor),
                        padding=(1, factor // 2),
                        stride=(1, factor),
                        weight_attr=init,

@@ -131,15 +150,21 @@ class ResidualBlock(nn.Layer):
    dilations : int
        Dilations of the Convolution2d applied to the input.
    """

    def __init__(self, channels, cond_channels, kernel_size, dilations):
        super(ResidualBlock, self).__init__()
        # input conv
        std = math.sqrt(1 / channels * np.prod(kernel_size))
        init = I.Uniform(-std, std)
-        receptive_field = [1 + (k - 1) * d for (k, d) in zip(kernel_size, dilations)]
+        receptive_field = [
+            1 + (k - 1) * d for (k, d) in zip(kernel_size, dilations)
+        ]
        rh, rw = receptive_field
        paddings = [rh - 1, 0, rw // 2, (rw - 1) // 2]  # causal & same
-        conv = nn.Conv2D(channels, 2 * channels, kernel_size,
+        conv = nn.Conv2D(
+            channels,
+            2 * channels,
+            kernel_size,
            padding=paddings,
            dilation=dilations,
            weight_attr=init,

@@ -152,15 +177,18 @@ class ResidualBlock(nn.Layer):
        # condition projection
        std = math.sqrt(1 / cond_channels)
        init = I.Uniform(-std, std)
-        condition_proj = nn.Conv2D(cond_channels, 2 * channels, (1, 1),
-                                   weight_attr=init, bias_attr=init)
+        condition_proj = nn.Conv2D(
+            cond_channels,
+            2 * channels, (1, 1),
+            weight_attr=init,
+            bias_attr=init)
        self.condition_proj = nn.utils.weight_norm(condition_proj)

        # parametric residual & skip connection
        std = math.sqrt(1 / channels)
        init = I.Uniform(-std, std)
-        out_proj = nn.Conv2D(channels, 2 * channels, (1, 1),
-                             weight_attr=init, bias_attr=init)
+        out_proj = nn.Conv2D(
+            channels, 2 * channels, (1, 1), weight_attr=init, bias_attr=init)
        self.out_proj = nn.utils.weight_norm(out_proj)

    def forward(self, x, condition):

@@ -290,6 +318,7 @@ class ResidualNet(nn.LayerList):
    ValueError
        If the length of dilations_h does not equal n_layers.
    """

    def __init__(self,
                 n_layer: int,
                 residual_channels: int,

@@ -297,11 +326,13 @@ class ResidualNet(nn.LayerList):
                 kernel_size: Tuple[int],
                 dilations_h: List[int]):
        if len(dilations_h) != n_layer:
-            raise ValueError("number of dilations_h should equal num of layers")
+            raise ValueError(
+                "number of dilations_h should equal num of layers")
        super(ResidualNet, self).__init__()
        for i in range(n_layer):
-            dilation = (dilations_h[i], 2 ** i)
-            layer = ResidualBlock(residual_channels, condition_channels, kernel_size, dilation)
+            dilation = (dilations_h[i], 2**i)
+            layer = ResidualBlock(residual_channels, condition_channels,
+                                  kernel_size, dilation)
            self.append(layer)

    def forward(self, x, condition):

@@ -397,7 +428,9 @@ class Flow(nn.Layer):
        super(Flow, self).__init__()
        # input projection
        self.input_proj = nn.utils.weight_norm(
-            nn.Conv2D(1, channels, (1, 1),
+            nn.Conv2D(
+                1,
+                channels, (1, 1),
                weight_attr=I.Uniform(-1., 1.),
                bias_attr=I.Uniform(-1., 1.)))

@@ -406,7 +439,9 @@ class Flow(nn.Layer):
                                self.dilations_dict[n_group])

        # output projection
-        self.output_proj = nn.Conv2D(channels, 2, (1, 1),
+        self.output_proj = nn.Conv2D(
+            channels,
+            2, (1, 1),
            weight_attr=I.Constant(0.),
            bias_attr=I.Constant(0.))

@@ -452,8 +487,8 @@ class Flow(nn.Layer):
            transformation from x to z.
        """
        # (B, C, H-1, W)
-        logs, b = self._predict_parameters(
-            x[:, :, :-1, :], condition[:, :, 1:, :])
+        logs, b = self._predict_parameters(x[:, :, :-1, :],
+                                           condition[:, :, 1:, :])
        z = self._transform(x, logs, b)
        return z, (logs, b)

@@ -511,10 +546,11 @@ class Flow(nn.Layer):
        self._start_sequence()
        for i in range(1, self.n_group):
            x_row = x[-1]  # actually i-1:i
-            z_row = z[:, :, i:i+1, :]
-            condition_row = condition[:, :, i:i+1, :]
-            x_next_row, (logs, b) = self._inverse_row(z_row, x_row, condition_row)
+            z_row = z[:, :, i:i + 1, :]
+            condition_row = condition[:, :, i:i + 1, :]
+            x_next_row, (logs, b) = self._inverse_row(z_row, x_row,
+                                                      condition_row)
            x.append(x_next_row)
            logs_list.append(logs)
            b_list.append(b)

@@ -549,13 +585,17 @@ class WaveFlow(nn.LayerList):
    kernel_size : Union[int, List[int]]
        Kernel size of the convolution layer in each ResidualBlock.
    """

-    def __init__(self, n_flows, n_layers, n_group, channels, mel_bands, kernel_size):
+    def __init__(self, n_flows, n_layers, n_group, channels, mel_bands,
+                 kernel_size):
        if n_group % 2 or n_flows % 2:
-            raise ValueError("number of flows and number of group must be even "
+            raise ValueError(
+                "number of flows and number of group must be even "
                "since a permutation along group among flows is used.")
        super(WaveFlow, self).__init__()
        for _ in range(n_flows):
-            self.append(Flow(n_layers, channels, mel_bands, kernel_size, n_group))
+            self.append(
+                Flow(n_layers, channels, mel_bands, kernel_size, n_group))

        # permutations in h
        self.perms = self._create_perm(n_group, n_flows)

@@ -572,7 +612,8 @@ class WaveFlow(nn.LayerList):
            if i < n_flows // 2:
                perms.append(indices[::-1])
            else:
-                perm = list(reversed(indices[:half])) + list(reversed(indices[half:]))
+                perm = list(reversed(indices[:half])) + list(
+                    reversed(indices[half:]))
                perms.append(perm)
        return perms

@@ -612,8 +653,10 @@ class WaveFlow(nn.LayerList):
        x, condition = self._trim(x, condition)

        # to (B, C, h, T//h) layout
-        x = paddle.unsqueeze(paddle.transpose(fold(x, self.n_group), [0, 2, 1]), 1)
-        condition = paddle.transpose(fold(condition, self.n_group), [0, 1, 3, 2])
+        x = paddle.unsqueeze(
+            paddle.transpose(fold(x, self.n_group), [0, 2, 1]), 1)
+        condition = paddle.transpose(
+            fold(condition, self.n_group), [0, 1, 3, 2])

        # flows
        logs_list = []

@@ -654,8 +697,10 @@ class WaveFlow(nn.LayerList):
        z, condition = self._trim(z, condition)

        # to (B, C, h, T//h) layout
-        z = paddle.unsqueeze(paddle.transpose(fold(z, self.n_group), [0, 2, 1]), 1)
-        condition = paddle.transpose(fold(condition, self.n_group), [0, 1, 3, 2])
+        z = paddle.unsqueeze(
+            paddle.transpose(fold(z, self.n_group), [0, 2, 1]), 1)
+        condition = paddle.transpose(
+            fold(condition, self.n_group), [0, 1, 3, 2])

        # reverse it flow by flow
        for i in reversed(range(self.n_flows)):

@@ -695,6 +740,7 @@ class ConditionalWaveFlow(nn.LayerList):
    kernel_size : Union[int, List[int]]
        Kernel size of the convolution layer in each ResidualBlock.
    """

    def __init__(self,
                 upsample_factors: List[int],
                 n_flows: int,

@@ -795,8 +841,7 @@ class ConditionalWaveFlow(nn.LayerList):
        ConditionalWaveFlow
            The model built from pretrained result.
        """
-        model = cls(
-            upsample_factors=config.model.upsample_factors,
+        model = cls(upsample_factors=config.model.upsample_factors,
                    n_flows=config.model.n_flows,
                    n_layers=config.model.n_layers,
                    n_group=config.model.n_group,

@@ -816,6 +861,7 @@ class WaveFlowLoss(nn.Layer):
        The standard deviation of the gaussian noise used in WaveFlow, by
        default 1.0.
    """

    def __init__(self, sigma=1.0):
        super(WaveFlowLoss, self).__init__()
        self.sigma = sigma

@@ -839,6 +885,7 @@ class WaveFlowLoss(nn.Layer):
        Tensor [shape=(1,)]
            The loss.
        """
-        loss = paddle.sum(z * z) / (2 * self.sigma * self.sigma) - log_det_jacobian
+        loss = paddle.sum(z * z) / (2 * self.sigma * self.sigma
+                                    ) - log_det_jacobian
        loss = loss / np.prod(z.shape)
        return loss + self.const
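To make the `(B, C, h, T//h)` layout above concrete, here is `fold` on a tiny batch (the last two lines of the function are copied from the hunk; the shape bookkeeping before them is reconstructed):

import paddle

def fold(x, n_group):
    # split the trailing time axis into (time_steps // n_group, n_group)
    spatial_shape = list(x.shape[:-1])
    time_steps = x.shape[-1]
    new_shape = spatial_shape + [time_steps // n_group, n_group]
    return paddle.reshape(x, new_shape)

x = paddle.arange(12, dtype="float32").reshape([1, 12])  # (B, T)
print(fold(x, 4).shape)  # [1, 3, 4]

The `WaveFlowLoss` expression at the end is the Gaussian negative log-likelihood of z minus the log-determinant term, averaged per element; `self.const` is presumably the 0.5 * log(2 * pi * sigma^2) normalizer, though its definition is not shown in this hunk.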
View File
@@ -30,6 +30,7 @@ from parakeet.utils import checkpoint, layer_tools
__all__ = ["WaveNet", "ConditionalWaveNet"]

def crop(x, audio_start, audio_length):
    """Crop the upsampled condition to match audio_length.

@@ -96,6 +97,7 @@ class UpsampleNet(nn.LayerList):
    ---------
    ``librosa.core.stft``
    """

    def __init__(self, upscale_factors=[16, 16]):
        super(UpsampleNet, self).__init__()
        self.upscale_factors = list(upscale_factors)

@@ -106,7 +108,9 @@ class UpsampleNet(nn.LayerList):
        for factor in self.upscale_factors:
            self.append(
                nn.utils.weight_norm(
-                    nn.Conv2DTranspose(1, 1,
+                    nn.Conv2DTranspose(
+                        1,
+                        1,
                        kernel_size=(3, 2 * factor),
                        stride=(1, factor),
                        padding=(1, factor // 2))))

@@ -159,6 +163,7 @@ class ResidualBlock(nn.Layer):
    dilation : int
        Dilation of the internal convolution cells.
    """

    def __init__(self,
                 residual_channels: int,
                 condition_dim: int,

@@ -170,9 +175,11 @@ class ResidualBlock(nn.Layer):
        # following clarinet's implementation, we do not have parametric residual
        # & skip connection.

-        _filter_size = filter_size[0] if isinstance(filter_size, (list, tuple)) else filter_size
+        _filter_size = filter_size[0] if isinstance(filter_size, (
+            list, tuple)) else filter_size
        std = math.sqrt(1 / (_filter_size * residual_channels))
-        conv = Conv1dCell(residual_channels,
+        conv = Conv1dCell(
+            residual_channels,
            dilated_channels,
            filter_size,
            dilation=dilation,

@@ -180,7 +187,9 @@ class ResidualBlock(nn.Layer):
        self.conv = nn.utils.weight_norm(conv)

        std = math.sqrt(1 / condition_dim)
-        condition_proj = Conv1dCell(condition_dim, dilated_channels, (1,),
-                                    weight_attr=I.Normal(scale=std))
+        condition_proj = Conv1dCell(
+            condition_dim,
+            dilated_channels, (1, ),
+            weight_attr=I.Normal(scale=std))
        self.condition_proj = nn.utils.weight_norm(condition_proj)

@@ -309,6 +318,7 @@ class ResidualNet(nn.LayerList):
        Kernel size of the internal ``Conv1dCell`` of each ``ResidualBlock``.
    """

    def __init__(self,
                 n_stack: int,
                 n_loop: int,

@@ -320,7 +330,9 @@ class ResidualNet(nn.LayerList):
        dilations = [2**i for i in range(n_loop)] * n_stack
        self.context_size = 1 + sum(dilations)
        for dilation in dilations:
-            self.append(ResidualBlock(residual_channels, condition_dim, filter_size, dilation))
+            self.append(
+                ResidualBlock(residual_channels, condition_dim, filter_size,
+                              dilation))

    def forward(self, x, condition=None):
        """Forward pass of ``ResidualNet``.

@@ -426,6 +438,7 @@ class WaveNet(nn.Layer):
        This is only used for computing loss when ``loss_type`` is "mog". If
        the predicted log scale is less than -9.0, it is clipped at -9.0.
    """

    def __init__(self, n_stack, n_loop, residual_channels, output_dim,
                 condition_dim, filter_size, loss_type, log_scale_min):

@@ -437,19 +450,24 @@ class WaveNet(nn.Layer):
        else:
            if (output_dim % 3 != 0):
                raise ValueError(
-                    "with Mixture of Gaussians(mog) output, the output dim must be divisible by 3, but get {}".format(output_dim))
+                    "with Mixture of Gaussians(mog) output, the output dim must be divisible by 3, but get {}".
+                    format(output_dim))

-        self.embed = nn.utils.weight_norm(nn.Linear(1, residual_channels), dim=1)
+        self.embed = nn.utils.weight_norm(
+            nn.Linear(1, residual_channels), dim=1)
        self.resnet = ResidualNet(n_stack, n_loop, residual_channels,
                                  condition_dim, filter_size)
        self.context_size = self.resnet.context_size

        skip_channels = residual_channels  # assume the same channel
-        self.proj1 = nn.utils.weight_norm(nn.Linear(skip_channels, skip_channels), dim=1)
-        self.proj2 = nn.utils.weight_norm(nn.Linear(skip_channels, skip_channels), dim=1)
+        self.proj1 = nn.utils.weight_norm(
+            nn.Linear(skip_channels, skip_channels), dim=1)
+        self.proj2 = nn.utils.weight_norm(
+            nn.Linear(skip_channels, skip_channels), dim=1)
        # if loss_type is softmax, output_dim is n_vocab of waveform magnitude.
        # if loss_type is mog, output_dim is 3 * gaussian, (weight, mean and stddev)
-        self.proj3 = nn.utils.weight_norm(nn.Linear(skip_channels, output_dim), dim=1)
+        self.proj3 = nn.utils.weight_norm(
+            nn.Linear(skip_channels, output_dim), dim=1)

        self.loss_type = loss_type
        self.output_dim = output_dim

@@ -781,6 +799,7 @@ class ConditionalWaveNet(nn.Layer):
        This is only used for computing loss when ``loss_type`` is "mog". If
        the predicted log scale is less than -9.0, it is clipped at -9.0.
    """

    def __init__(self,
                 upsample_factors: List[int],
                 n_stack: int,

@@ -793,7 +812,8 @@ class ConditionalWaveNet(nn.Layer):
                 log_scale_min: float=-9.0):
        super(ConditionalWaveNet, self).__init__()
        self.encoder = UpsampleNet(upsample_factors)
-        self.decoder = WaveNet(n_stack=n_stack,
+        self.decoder = WaveNet(
+            n_stack=n_stack,
            n_loop=n_loop,
            residual_channels=residual_channels,
            output_dim=output_dim,

@@ -943,8 +963,7 @@ class ConditionalWaveNet(nn.Layer):
        ConditionalWaveNet
            The model built from pretrained result.
        """
-        model = cls(
-            upsample_factors=config.model.upsample_factors,
+        model = cls(upsample_factors=config.model.upsample_factors,
                    n_stack=config.model.n_stack,
                    n_loop=config.model.n_loop,
                    residual_channels=config.model.residual_channels,
View File
@@ -1,3 +1,17 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle
from paddle import nn
from paddle.nn import functional as F

@@ -86,6 +100,7 @@ class STFT(nn.Layer):
        Only ``center`` and ``reflect`` padding is supported now.
    """

    def __init__(self, n_fft, hop_length, win_length, window="hanning"):
        super(STFT, self).__init__()
        self.hop_length = hop_length

@@ -109,7 +124,8 @@ class STFT(nn.Layer):
                      (self.n_bin, 1, 1, self.n_fft))
        w = np.concatenate([w_real, w_imag], axis=0)
-        self.weight = paddle.cast(paddle.to_tensor(w), paddle.get_default_dtype())
+        self.weight = paddle.cast(
+            paddle.to_tensor(w), paddle.get_default_dtype())

    def forward(self, x):
        """Compute the stft transform.
View File
@@ -20,6 +20,7 @@ __all__ = [
    "Conv1dBatchNorm",
]

class Conv1dCell(nn.Conv1D):
    """A subclass of Conv1D layer, which can be used in an autoregressive
    decoder like an RNN cell.

@@ -231,6 +232,7 @@ class Conv1dBatchNorm(nn.Layer):
    epsilon : [type], optional
        The epsilon of the BatchNorm1D layer, by default 1e-05
    """

    def __init__(self,
                 in_channels,
                 out_channels,
View File
@@ -1,6 +1,21 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
import paddle

def shuffle_dim(x, axis, perm=None):
    """Permute input tensor along axis given the permutation or randomly.
View File
@@ -1,3 +1,17 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numba
import numpy as np
import paddle

@@ -11,6 +25,7 @@ __all__ = [
    "diagonal_loss",
]

def weighted_mean(input, weight):
    """Weighted mean. It can also be used as masked mean.

@@ -88,8 +103,7 @@ def masked_softmax_with_cross_entropy(logits, label, mask, axis=-1):
    return loss

-def diagonal_loss(
-        attentions,
+def diagonal_loss(attentions,
                  input_lengths,
                  target_lengths,
                  g=0.2,

@@ -133,6 +147,7 @@ def diagonal_loss(
    else:
        return paddle.mean(attentions * paddle.unsqueeze(W_tensor, 1))

@numba.jit(nopython=True)
def guided_attention(N, max_N, T, max_T, g):
    W = np.zeros((max_T, max_N), dtype=np.float32)

@@ -142,6 +157,7 @@ def guided_attention(N, max_N, T, max_T, g):
    # (T_dec, T_enc)
    return W

def guided_attentions(input_lengths, target_lengths, g=0.2):
    B = len(input_lengths)
    max_input_len = input_lengths.max()
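Only the frame of the numba-jitted `guided_attention` is visible in this hunk; the standard soft-diagonal weight it presumably fills in (the guided-attention penalty of Tachibana et al., 2017) looks like this:

import numpy as np

def guided_attention_sketch(N, max_N, T, max_T, g=0.2):
    W = np.zeros((max_T, max_N), dtype=np.float32)
    for t in range(T):
        for n in range(N):
            # near zero on the diagonal n/N == t/T, approaching 1 off it
            W[t, n] = 1.0 - np.exp(-((n / N - t / T)**2) / (2 * g * g))
    return W  # (T_dec, T_enc)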
View File
@@ -1,3 +1,17 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle
from paddle.fluid.layers import sequence_mask

@@ -8,6 +22,7 @@ __all__ = [
    "future_mask",
]

def id_mask(input, padding_index=0, dtype="bool"):
    """Generate mask with input ids.
View File
@@ -1,3 +1,17 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import math
import numpy as np
import paddle

@@ -5,6 +19,7 @@ from paddle.nn import functional as F
__all__ = ["positional_encoding"]

def positional_encoding(start_index, length, size, dtype=None):
    r"""Generate standard positional encoding matrix.

@@ -37,7 +52,7 @@ def positional_encoding(start_index, length, size, dtype=None):
    dtype = dtype or paddle.get_default_dtype()
    channel = np.arange(0, size, 2)
    index = np.arange(start_index, start_index + length, 1)
-    p = np.expand_dims(index, -1) / (10000 ** (channel / float(size)))
+    p = np.expand_dims(index, -1) / (10000**(channel / float(size)))
    encodings = np.zeros([length, size])
    encodings[:, 0::2] = np.sin(p)
    encodings[:, 1::2] = np.cos(p)
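In formula form, the lines above compute the standard sinusoidal encoding, with `p` holding the phase shared by each sine/cosine channel pair:

PE(pos, 2i)   = \sin\big(pos / 10000^{2i/d}\big)
PE(pos, 2i+1) = \cos\big(pos / 10000^{2i/d}\big)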
View File
@@ -1,3 +1,17 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import math
import paddle
from paddle import nn

@@ -12,6 +26,7 @@ __all__ = [
    "TransformerDecoderLayer",
]

class PositionwiseFFN(nn.Layer):
    """A faithful implementation of Position-wise Feed-Forward Network
    in `Attention is All You Need <https://arxiv.org/abs/1706.03762>`_.

@@ -30,10 +45,8 @@ class PositionwiseFFN(nn.Layer):
        The probability of the Dropout applied to the output of the first
        layer, by default 0.
    """
-    def __init__(self,
-                 input_size: int,
-                 hidden_size: int,
-                 dropout=0.0):
+
+    def __init__(self, input_size: int, hidden_size: int, dropout=0.0):
        super(PositionwiseFFN, self).__init__()
        self.linear1 = nn.Linear(input_size, hidden_size)
        self.linear2 = nn.Linear(hidden_size, input_size)

@@ -86,6 +99,7 @@ class TransformerEncoderLayer(nn.Layer):
    ------
    It uses the PostLN (post layer norm) scheme.
    """

    def __init__(self, d_model, n_heads, d_ffn, dropout=0.):
        super(TransformerEncoderLayer, self).__init__()
        self.self_mha = attn.MultiheadAttention(d_model, n_heads, dropout)

@@ -118,14 +132,12 @@ class TransformerEncoderLayer(nn.Layer):
        """
        context_vector, attn_weights = self.self_mha(x, x, x, mask)
        x = self.layer_norm1(
-            F.dropout(x + context_vector,
-                      self.dropout,
-                      training=self.training))
+            F.dropout(
+                x + context_vector, self.dropout, training=self.training))

        x = self.layer_norm2(
-            F.dropout(x + self.ffn(x),
-                      self.dropout,
-                      training=self.training))
+            F.dropout(
+                x + self.ffn(x), self.dropout, training=self.training))
        return x, attn_weights

@@ -155,6 +167,7 @@ class TransformerDecoderLayer(nn.Layer):
    ------
    It uses the PostLN (post layer norm) scheme.
    """

    def __init__(self, d_model, n_heads, d_ffn, dropout=0.):
        super(TransformerDecoderLayer, self).__init__()
        self.self_mha = attn.MultiheadAttention(d_model, n_heads, dropout)

@@ -197,20 +210,19 @@ class TransformerDecoderLayer(nn.Layer):
        cross_attn_weights : Tensor [shape=(batch_size, n_heads, time_steps_q, time_steps_k)]
            Decoder-encoder cross attention.
        """
-        context_vector, self_attn_weights = self.self_mha(q, q, q, decoder_mask)
+        context_vector, self_attn_weights = self.self_mha(q, q, q,
+                                                          decoder_mask)
        q = self.layer_norm1(
-            F.dropout(q + context_vector,
-                      self.dropout,
-                      training=self.training))
+            F.dropout(
+                q + context_vector, self.dropout, training=self.training))

-        context_vector, cross_attn_weights = self.cross_mha(q, k, v, encoder_mask)
+        context_vector, cross_attn_weights = self.cross_mha(q, k, v,
+                                                            encoder_mask)
        q = self.layer_norm2(
-            F.dropout(q + context_vector,
-                      self.dropout,
-                      training=self.training))
+            F.dropout(
+                q + context_vector, self.dropout, training=self.training))

        q = self.layer_norm3(
-            F.dropout(q + self.ffn(q),
-                      self.dropout,
-                      training=self.training))
+            F.dropout(
+                q + self.ffn(q), self.dropout, training=self.training))
        return q, self_attn_weights, cross_attn_weights
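Both layer types follow the PostLN pattern the reflowed calls make visible: every sublayer output is dropped out, residually added, then normalized,

x \leftarrow \mathrm{LayerNorm}\big(x + \mathrm{Dropout}(\mathrm{Sublayer}(x))\big)

applied once per attention block and once for the feed-forward network.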
View File
@@ -1,2 +1,16 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from parakeet.training.cli import *
from parakeet.training.experiment import *
View File
@@ -1,5 +1,20 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse

def default_argument_parser():
    r"""A simple yet general argument parser for experiments with parakeet.
View File
@@ -1,3 +1,17 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from yacs.config import CfgNode

_C = CfgNode(

@@ -5,8 +19,8 @@ _C = CfgNode(
        valid_interval=1000,  # validation
        save_interval=10000,  # checkpoint
        max_iteration=900000,  # max iteration to train
-    )
-)
+    ))

def get_default_training_config():
    return _C.clone()
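Because `get_default_training_config` returns a clone, experiments can override fields without touching the module-level `_C`. A small usage sketch with the stock yacs API:

cfg = get_default_training_config()
cfg.merge_from_list(["max_iteration", 500000])
assert cfg.max_iteration == 500000
assert cfg.valid_interval == 1000  # other defaults untouched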
View File
@@ -27,6 +27,7 @@ from parakeet.utils import checkpoint, mp_tools
__all__ = ["ExperimentBase"]

class ExperimentBase(object):
    """
    An experiment template in order to structure the training code and take
View File
@@ -45,6 +45,7 @@ def _load_latest_checkpoint(checkpoint_dir: str) -> int:
    return iteration

def _save_checkpoint(checkpoint_dir: str, iteration: int):
    """Save the iteration number of the latest model to be checkpointed.

@@ -60,6 +61,7 @@ def _save_checkpoint(checkpoint_dir: str, iteration: int):
    with open(checkpoint_record, "wt") as handle:
        handle.write("model_checkpoint_path: step-{}".format(iteration))

def load_parameters(model,
                    optimizer=None,
                    checkpoint_dir=None,

@@ -97,18 +99,19 @@ def load_parameters(model,
    params_path = checkpoint_path + ".pdparams"
    model_dict = paddle.load(params_path)
    model.set_state_dict(model_dict)
-    print("[checkpoint] Rank {}: loaded model from {}".format(
-        local_rank, params_path))
+    print("[checkpoint] Rank {}: loaded model from {}".format(local_rank,
+                                                              params_path))

    optimizer_path = checkpoint_path + ".pdopt"
    if optimizer and os.path.isfile(optimizer_path):
        optimizer_dict = paddle.load(optimizer_path)
        optimizer.set_state_dict(optimizer_dict)
-        print("[checkpoint] Rank {}: loaded optimizer state from {}".
-              format(local_rank, optimizer_path))
+        print("[checkpoint] Rank {}: loaded optimizer state from {}".format(
+            local_rank, optimizer_path))

    return iteration

@mp_tools.rank_zero_only
def save_parameters(checkpoint_dir, iteration, model, optimizer=None):
    """Checkpoint the latest trained model parameters.
View File
@@ -1,3 +1,17 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
from paddle.framework import core
View File
@@ -28,6 +28,7 @@ def summary(layer: nn.Layer):
    print("layer has {} parameters, {} elements.".format(num_params,
                                                         num_elements))

def gradient_norm(layer: nn.Layer):
    grad_norm_dict = {}
    for name, param in layer.state_dict().items():

@@ -36,6 +37,7 @@ def gradient_norm(layer: nn.Layer):
            grad_norm_dict[name] = np.linalg.norm(grad) / grad.size
    return grad_norm_dict

def recursively_remove_weight_norm(layer: nn.Layer):
    for layer in layer.sublayers():
        try:

@@ -44,10 +46,12 @@ def recursively_remove_weight_norm(layer: nn.Layer):
            # there is no weight norm hook in this layer
            pass

def freeze(layer: nn.Layer):
    for param in layer.parameters():
        param.trainable = False

def unfreeze(layer: nn.Layer):
    for param in layer.parameters():
        param.trainable = True
View File
@@ -1,3 +1,17 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle
from paddle import distributed as dist
from functools import wraps

@@ -16,6 +30,3 @@ def rank_zero_only(func):
        return result

    return wrapper
View File
@@ -1,3 +1,17 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import math

__all__ = ["SchedulerBase", "Constant", "PieceWise", "StepWise"]

@@ -34,8 +48,8 @@ class PieceWise(SchedulerBase):
            return self.ys[0]
        if i == self.num_anchors:
            return self.ys[-1]
-        k = (self.ys[i] - self.ys[i-1]) / (self.xs[i] - self.xs[i-1])
-        out = self.ys[i-1] + (step - self.xs[i-1]) * k
+        k = (self.ys[i] - self.ys[i - 1]) / (self.xs[i] - self.xs[i - 1])
+        out = self.ys[i - 1] + (step - self.xs[i - 1]) * k
        return out

@@ -58,5 +72,4 @@ class StepWise(SchedulerBase):
            return self.ys[-1]
        if i == 0:
            return self.ys[0]
-        return self.ys[i-1]
+        return self.ys[i - 1]
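The `PieceWise` lookup above is linear interpolation between `(xs, ys)` anchors, clamped at both ends. A standalone equivalent (assuming the anchor index comes from a right bisect, which the boundary checks suggest):

import bisect

def piecewise(xs, ys, step):
    i = bisect.bisect_right(xs, step)
    if i == 0:
        return ys[0]
    if i == len(xs):
        return ys[-1]  # clamp past the last anchor
    k = (ys[i] - ys[i - 1]) / (xs[i] - xs[i - 1])
    return ys[i - 1] + (step - xs[i - 1]) * k

xs, ys = [0, 10000, 20000], [0.0, 1.0, 0.5]
print(piecewise(xs, ys, 5000))   # 0.5, halfway along the first segment
print(piecewise(xs, ys, 30000))  # 0.5, clamped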
View File
@@ -48,7 +48,6 @@ setup_info = dict(
    description='Speech synthesis tools and models based on Paddlepaddle',
    long_description=long_description,
    license='Apache 2',
-
    python_requires='>=3.6',
    install_requires=[
        'numpy',

@@ -64,30 +63,25 @@ setup_info = dict(
        'scipy',
        'pandas',
        'sox',
-        'opencc',
+        # 'opencc',
        'soundfile',
        'g2p_en',
        'g2pM',
        'yacs',
        'tensorboardX',
    ],
-    extras_require={
-        'doc': ["sphinx", "sphinx-rtd-theme", "numpydoc"],
-    },
+    extras_require={'doc': ["sphinx", "sphinx-rtd-theme", "numpydoc"], },

    # Package info
    packages=find_packages(exclude=('tests', 'tests.*')),
    zip_safe=True,
-
-    classifiers = [
+    classifiers=[
        'Development Status :: 4 - Beta',
        'Intended Audience :: Developers',
        'Topic :: Scientific/Engineering :: Artificial Intelligence'
        'License :: OSI Approved :: Apache2 License',
        'Programming Language :: Python :: 3.6',
        'Programming Language :: Python :: 3.7',
-    ],
-)
+    ], )

setup(**setup_info)
View File
@@ -1,101 +0,0 @@
import unittest
import numpy as np
import paddle
paddle.set_default_dtype("float64")
paddle.disable_static(paddle.CPUPlace())

from parakeet.modules import attention as attn

class TestScaledDotProductAttention(unittest.TestCase):
    def test_without_mask(self):
        x = paddle.randn([4, 16, 8])
        context_vector, attention_weights = attn.scaled_dot_product_attention(x, x, x)
        assert (list(context_vector.shape) == [4, 16, 8])
        assert (list(attention_weights.shape) == [4, 16, 16])

    def test_with_mask(self):
        x = paddle.randn([4, 16, 8])
        mask = paddle.fluid.layers.sequence_mask(
            paddle.to_tensor([16, 15, 13, 14]), dtype=x.dtype)
        mask = mask.unsqueeze(1)  # unsqueeze for the decoder time steps
        context_vector, attention_weights = attn.scaled_dot_product_attention(x, x, x, mask)
        assert (list(context_vector.shape) == [4, 16, 8])
        assert (list(attention_weights.shape) == [4, 16, 16])

    def test_4d(self):
        x = paddle.randn([4, 6, 16, 8])
        context_vector, attention_weights = attn.scaled_dot_product_attention(x, x, x)
        assert (list(context_vector.shape) == [4, 6, 16, 8])
        assert (list(attention_weights.shape) == [4, 6, 16, 16])

class TestMonoheadAttention(unittest.TestCase):
    def test_io(self):
        net = attn.MonoheadAttention(6, 0.1)
        q = paddle.randn([4, 18, 6])
        k = paddle.randn([4, 12, 6])
        v = paddle.randn([4, 12, 6])
        mask = paddle.fluid.layers.sequence_mask(
            paddle.to_tensor([12, 10, 8, 9]), dtype=q.dtype)
        mask = paddle.unsqueeze(mask, 1)  # unsqueeze for time_steps_q
        context_vector, attn_weights = net(q, k, v, mask)
        self.assertTupleEqual(context_vector.numpy().shape, (4, 18, 6))
        self.assertTupleEqual(attn_weights.numpy().shape, (4, 18, 12))

class TestDropHead(unittest.TestCase):
    def test_drop(self):
        x = paddle.randn([4, 6, 16, 8])
        out = attn.drop_head(x, 2, training=True)
        # drop 2 heads from 6 at all positions
        np.testing.assert_allclose(np.sum(out.numpy() == 0., axis=1), 2)

    def test_drop_all(self):
        x = paddle.randn([4, 6, 16, 8])
        out = attn.drop_head(x, 6, training=True)
        np.testing.assert_allclose(np.sum(out.numpy()), 0)

    def test_eval(self):
        x = paddle.randn([4, 6, 16, 8])
        out = attn.drop_head(x, 6, training=False)
        self.assertIs(x, out)

class TestMultiheadAttention(unittest.TestCase):
    def __init__(self, methodName="test_io", same_qk=True):
        super(TestMultiheadAttention, self).__init__(methodName)
        self.same_qk = same_qk

    def setUp(self):
        if self.same_qk:
            net = attn.MultiheadAttention(64, 8, dropout=0.3)
        else:
            net = attn.MultiheadAttention(64, 8, k_dim=12, v_dim=6)
        self.net = net

    def test_io(self):
        q = paddle.randn([4, 12, 64])
        mask = paddle.fluid.layers.sequence_mask(
            paddle.to_tensor([12, 10, 8, 9]), dtype=q.dtype)
        mask = paddle.unsqueeze(mask, 1)  # unsqueeze for time_steps_q
        context_vector, attention_weights = self.net(q, q, q, mask)
        self.assertTupleEqual(context_vector.numpy().shape, (4, 12, 64))
        self.assertTupleEqual(attention_weights.numpy().shape, (4, 8, 12, 12))

def load_tests(loader, standard_tests, pattern):
    suite = unittest.TestSuite()
    suite.addTest(TestScaledDotProductAttention("test_without_mask"))
    suite.addTest(TestScaledDotProductAttention("test_with_mask"))
    suite.addTest(TestScaledDotProductAttention("test_4d"))
    suite.addTest(TestDropHead("test_drop"))
    suite.addTest(TestDropHead("test_drop_all"))
    suite.addTest(TestDropHead("test_eval"))
    suite.addTest(TestMonoheadAttention("test_io"))
    suite.addTest(TestMultiheadAttention("test_io", same_qk=True))
    suite.addTest(TestMultiheadAttention("test_io", same_qk=False))
    return suite
View File
@ -1,34 +0,0 @@
import unittest
import paddle
paddle.set_default_dtype("float64")
paddle.disable_static(paddle.CPUPlace())
from parakeet.modules import cbhg
class TestHighway(unittest.TestCase):
def test_io(self):
net = cbhg.Highway(4)
x = paddle.randn([2, 12, 4])
y = net(x)
self.assertTupleEqual(y.numpy().shape, (2, 12, 4))
class TestCBHG(unittest.TestCase):
def __init__(self, methodName="runTest", ):
super(TestCBHG, self).__init__(methodName)
def test_io(self):
self.net = cbhg.CBHG(64, 32, 16,
projection_channels=[64, 128],
num_highways=4, highway_features=128,
gru_features=64)
x = paddle.randn([4, 64, 32])
y = self.net(x)
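# the input is channel-first (4, 64, 32) while the output is time-major
# (4, 32, 128), where 128 = 2 * gru_features from the bidirectional GRU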
self.assertTupleEqual(y.numpy().shape, (4, 32, 128))
def load_tests(loader, standard_tests, pattern):
suite = unittest.TestSuite()
suite.addTest(TestHighway("test_io"))
suite.addTest(TestCBHG("test_io"))
return suite

View File

@ -1,43 +0,0 @@
import unittest
import numpy as np
import paddle
paddle.set_default_dtype("float64")
paddle.disable_static(paddle.CPUPlace())
from parakeet.models import clarinet
from parakeet.modules import stft
class TestParallelWaveNet(unittest.TestCase):
def test_io(self):
net = clarinet.ParallelWaveNet([8, 8, 8], [1, 1, 1], 16, 12, 2)
x = paddle.randn([4, 6073])
condition = paddle.randn([4, 12, 6073])
z, out_mu, out_log_std = net(x, condition)
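# the student is an inverse autoregressive flow: it maps the input samples to
# a latent z plus per-sample Gaussian parameters, all at the input resolution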
self.assertTupleEqual(z.numpy().shape, (4, 6073))
self.assertTupleEqual(out_mu.numpy().shape, (4, 6073))
self.assertTupleEqual(out_log_std.numpy().shape, (4, 6073))
class TestClariNet(unittest.TestCase):
def setUp(self):
encoder = clarinet.UpsampleNet([2, 2])
teacher = clarinet.WaveNet(8, 3, 16, 3, 12, 2, "mog", -9.0)
student = clarinet.ParallelWaveNet([8, 8, 8, 8, 8, 8], [1, 1, 1, 1, 1, 1], 16, 12, 2)
stft_module = stft.STFT(16, 4, 8)
net = clarinet.Clarinet(encoder, teacher, student, stft_module, -6.0, lmd=4)
print("context size is: ", teacher.context_size)
self.net = net
def test_io(self):
audio = paddle.randn([4, 1366])
mel = paddle.randn([4, 12, 512]) # 512 frames * 4x upsampling = 2048 samples
audio_start = paddle.zeros([4], dtype="int64")
loss = self.net(audio, mel, audio_start, clip_kl=True)
loss["loss"].numpy()
def test_synthesis(self):
mel = paddle.randn([4, 12, 512]) # 512 frames * 4x upsampling = 2048 samples
out = self.net.synthesis(mel)
self.assertTupleEqual(out.numpy().shape, (4, 2048))

View File

@ -1,33 +0,0 @@
import unittest
import paddle
from paddle import nn
paddle.disable_static(paddle.CPUPlace())
paddle.set_default_dtype("float64")
from parakeet.modules import connections as conn
class TestPreLayerNormWrapper(unittest.TestCase):
def test_io(self):
net = nn.Linear(8, 8)
net = conn.PreLayerNormWrapper(net, 8)
x = paddle.randn([4, 8])
y = net(x)
self.assertTupleEqual(x.numpy().shape, y.numpy().shape)
class TestPostLayerNormWrapper(unittest.TestCase):
def test_io(self):
net = nn.Linear(8, 8)
net = conn.PostLayerNormWrapper(net, 8)
x = paddle.randn([4, 8])
y = net(x)
self.assertTupleEqual(x.numpy().shape, y.numpy().shape)
class TestResidualWrapper(unittest.TestCase):
def test_io(self):
net = nn.Linear(8, 8)
net = conn.ResidualWrapper(net)
x = paddle.randn([4, 8])
y = net(x)
self.assertTupleEqual(x.numpy().shape, y.numpy().shape)

View File

@ -1,67 +0,0 @@
import paddle
paddle.set_default_dtype("float64")
paddle.disable_static(paddle.CPUPlace())
import unittest
import numpy as np
from parakeet.modules import conv
class TestConv1dCell(unittest.TestCase):
def setUp(self):
self.net = conv.Conv1dCell(4, 6, 5, dilation=2)
def forward_incremental(self, x):
outs = []
self.net.start_sequence()
with paddle.no_grad():
for i in range(x.shape[-1]):
xt = x[:, :, i]
yt = self.net.add_input(xt)
outs.append(yt)
y2 = paddle.stack(outs, axis=-1)
return y2
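# the cell caches past inputs, so feeding one step at a time through add_input
# should reproduce the parallel forward; test_equality below checks exactly that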
def test_equality(self):
x = paddle.randn([2, 4, 16])
y1 = self.net(x)
self.net.eval()
y2 = self.forward_incremental(x)
np.testing.assert_allclose(y2.numpy(), y1.numpy())
class TestConv1dBatchNorm(unittest.TestCase):
def __init__(self, methodName="runTest", causal=False, channel_last=False):
super(TestConv1dBatchNorm, self).__init__(methodName)
self.causal = causal
self.channel_last = channel_last
def setUp(self):
k = 5
padding = (k - 1, 0) if self.causal else ((k - 1) // 2, k // 2)
self.net = conv.Conv1dBatchNorm(4, 6, (k,), 1, padding=padding,
data_format="NLC" if self.channel_last else "NCL")
def test_input_output(self):
x = paddle.randn([4, 16, 4]) if self.channel_last else paddle.randn([4, 4, 16])
out = self.net(x)
out_np = out.numpy()
if self.channel_last:
self.assertTupleEqual(out_np.shape, (4, 16, 6))
else:
self.assertTupleEqual(out_np.shape, (4, 6, 16))
def runTest(self):
self.test_input_output()
def load_tests(loader, standard_tests, pattern):
suite = unittest.TestSuite()
suite.addTest(TestConv1dBatchNorm("runTest", True, True))
suite.addTest(TestConv1dBatchNorm("runTest", False, False))
suite.addTest(TestConv1dBatchNorm("runTest", True, False))
suite.addTest(TestConv1dBatchNorm("runTest", False, True))
suite.addTest(TestConv1dCell("test_equality"))
return suite

View File

@ -1,122 +0,0 @@
import unittest
import numpy as np
import paddle
from paddle import io
from parakeet import data
class MyDataset(io.Dataset):
def __init__(self, size):
self._data = np.random.randn(size, 6)
def __getitem__(self, i):
return self._data[i]
def __len__(self):
return self._data.shape[0]
class TestTransformDataset(unittest.TestCase):
def test(self):
dataset = MyDataset(20)
dataset = data.TransformDataset(dataset, lambda x: np.abs(x))
dataloader = io.DataLoader(dataset, batch_size=4, shuffle=True, num_workers=1)
print("TransformDataset")
for batch, in dataloader:
print(type(batch), batch.dtype, batch.shape)
class TestChainDataset(unittest.TestCase):
def test(self):
dataset1 = MyDataset(20)
dataset2 = MyDataset(40)
dataset = data.ChainDataset(dataset1, dataset2)
dataloader = io.DataLoader(dataset, batch_size=4, shuffle=True, num_workers=1)
print("ChainDataset")
for batch, in dataloader:
print(type(batch), batch.dtype, batch.shape)
class TestTupleDataset(unittest.TestCase):
def test(self):
dataset1 = MyDataset(20)
dataset2 = MyDataset(20)
dataset = data.TupleDataset(dataset1, dataset2)
dataloader = io.DataLoader(dataset, batch_size=4, shuffle=True, num_workers=1)
print("TupleDataset")
for field1, field2 in dataloader:
print(type(field1), field1.dtype, field1.shape)
print(type(field2), field2.dtype, field2.shape)
class TestDictDataset(unittest.TestCase):
def test(self):
dataset1 = MyDataset(20)
dataset2 = MyDataset(20)
dataset = data.DictDataset(field1=dataset1, field2=dataset2)
def collate_fn(examples):
examples_tuples = []
for example in examples:
examples_tuples.append(example.values())
return paddle.fluid.dataloader.dataloader_iter.default_collate_fn(examples_tuples)
dataloader = io.DataLoader(dataset, batch_size=4, shuffle=True, num_workers=1, collate_fn=collate_fn)
print("DictDataset")
for field1, field2 in dataloader:
print(type(field1), field1.dtype, field1.shape)
print(type(field2), field2.dtype, field2.shape)
class TestSliceDataset(unittest.TestCase):
def test(self):
dataset = MyDataset(40)
dataset = data.SliceDataset(dataset, 0, 20)
dataloader = io.DataLoader(dataset, batch_size=4, shuffle=True, num_workers=1)
print("SliceDataset")
for batch, in dataloader:
print(type(batch), batch.dtype, batch.shape)
class TestSplit(unittest.TestCase):
def test(self):
dataset = MyDataset(40)
train, valid = data.split(dataset, 10)
dataloader1 = io.DataLoader(train, batch_size=4, shuffle=True, num_workers=1)
dataloader2 = io.DataLoader(valid, batch_size=4, shuffle=True, num_workers=1)
print("First Dataset")
for batch, in dataloader1:
print(type(batch), batch.dtype, batch.shape)
print("Second Dataset")
for batch, in dataloader2:
print(type(batch), batch.dtype, batch.shape)
class TestSubsetDataset(unittest.TestCase):
def test(self):
dataset = MyDataset(40)
indices = np.random.choice(np.arange(40), [20], replace=False).tolist()
dataset = data.SubsetDataset(dataset, indices)
dataloader = io.DataLoader(dataset, batch_size=4, shuffle=True, num_workers=1)
print("SubsetDataset")
for batch, in dataloader:
print(type(batch), batch.dtype, batch.shape)
class TestFilterDataset(unittest.TestCase):
def test(self):
dataset = MyDataset(40)
dataset = data.FilterDataset(dataset, lambda x: np.mean(x) > 0.3)
dataloader = io.DataLoader(dataset, batch_size=4, shuffle=True, num_workers=1)
print("FilterDataset")
for batch, in dataloader:
print(type(batch), batch.dtype, batch.shape)
class TestCacheDataset(unittest.TestCase):
def test(self):
dataset = MyDataset(40)
dataset = data.CacheDataset(dataset)
dataloader = io.DataLoader(dataset, batch_size=4, shuffle=True, num_workers=1)
print("CacheDataset")
for batch, in dataloader:
print(type(batch), batch.dtype, batch.shape)

View File

@ -1,107 +0,0 @@
import numpy as np
import unittest
import paddle
paddle.set_default_dtype("float64")
paddle.disable_static(paddle.CPUPlace())
from parakeet.models import deepvoice3 as dv3
class TestConvBlock(unittest.TestCase):
def test_io_causal(self):
net = dv3.ConvBlock(6, 5, True, True, 8, 0.9)
x = paddle.randn([4, 32, 6])
condition = paddle.randn([4, 8])
# TODO(chenfeiyu): report an issue about the default data type
padding = paddle.zeros([4, 4, 6], dtype=x.dtype)
y = net.forward(x, condition, padding)
self.assertTupleEqual(y.numpy().shape, (4, 32, 6))
def test_io_non_causal(self):
net = dv3.ConvBlock(6, 5, False, True, 8, 0.9)
x = paddle.randn([4, 32, 6])
condition = paddle.randn([4, 8])
y = net.forward(x, condition)
self.assertTupleEqual(y.numpy().shape, (4, 32, 6))
class TestAffineBlock1(unittest.TestCase):
def test_io(self):
net = dv3.AffineBlock1(6, 16, True, 8)
x = paddle.randn([4, 32, 6])
condition = paddle.randn([4, 8])
y = net(x, condition)
self.assertTupleEqual(y.numpy().shape, (4, 32, 16))
class TestAffineBlock2(unittest.TestCase):
def test_io(self):
net = dv3.AffineBlock2(6, 16, True, 8)
x = paddle.randn([4, 32, 6])
condition = paddle.randn([4, 8])
y = net(x, condition)
self.assertTupleEqual(y.numpy().shape, (4, 32, 16))
class TestEncoder(unittest.TestCase):
def test_io(self):
net = dv3.Encoder(5, 8, 16, 5, True, 6)
x = paddle.randn([4, 32, 8])
condition = paddle.randn([4, 6])
keys, values = net(x, condition)
self.assertTupleEqual(keys.numpy().shape, (4, 32, 8))
self.assertTupleEqual(values.numpy().shape, (4, 32, 8))
class TestAttentionBlock(unittest.TestCase):
def test_io(self):
net = dv3.AttentionBlock(16, 6, has_bias=True, bias_dim=8)
q = paddle.randn([4, 32, 6])
k = paddle.randn([4, 24, 6])
v = paddle.randn([4, 24, 6])
lengths = paddle.to_tensor([24, 20, 19, 23], dtype="int64")
condition = paddle.randn([4, 8])
context_vector, attention_weight = net(q, k, v, lengths, condition, 0)
self.assertTupleEqual(context_vector.numpy().shape, (4, 32, 6))
self.assertTupleEqual(attention_weight.numpy().shape, (4, 32, 24))
def test_io_with_previous_attn(self):
net = dv3.AttentionBlock(16, 6, has_bias=True, bias_dim=8)
q = paddle.randn([4, 32, 6])
k = paddle.randn([4, 24, 6])
v = paddle.randn([4, 24, 6])
lengths = paddle.to_tensor([24, 20, 19, 23], dtype="int64")
condition = paddle.randn([4, 8])
prev_attn_weight = paddle.randn([4, 32, 16])
context_vector, attention_weight = net(
q, k, v, lengths, condition, 0,
force_monotonic=True, prev_coeffs=prev_attn_weight, window=(0, 4))
self.assertTupleEqual(context_vector.numpy().shape, (4, 32, 6))
self.assertTupleEqual(attention_weight.numpy().shape, (4, 32, 24))
class TestDecoder(unittest.TestCase):
def test_io(self):
net = dv3.Decoder(8, 4, [4, 12], 5, 3, 16, 1.0, 1.45, True, 6)
x = paddle.randn([4, 32, 8])
k = paddle.randn([4, 24, 12]) # the prenet's last size should equal k's feature size
v = paddle.randn([4, 24, 12])
lengths = paddle.to_tensor([24, 18, 19, 22])
condition = paddle.randn([4, 6])
decoded, hidden, attentions, final_state = net(x, k, v, lengths, 0, condition)
self.assertTupleEqual(decoded.numpy().shape, (4, 32, 4 * 8))
self.assertTupleEqual(hidden.numpy().shape, (4, 32, 12))
self.assertEqual(len(attentions), 5)
self.assertTupleEqual(attentions[0].numpy().shape, (4, 32, 24))
self.assertEqual(len(final_state), 5)
self.assertTupleEqual(final_state[0].numpy().shape, (4, 2, 12))
class TestPostNet(unittest.TestCase):
def test_io(self):
net = dv3.PostNet(3, 8, 16, 3, 12, 4, True, 6)
x = paddle.randn([4, 32, 8])
condition = paddle.randn([4, 6])
y = net(x, condition)
self.assertTupleEqual(y.numpy().shape, (4, 32 * 4, 12))

View File

@ -1,19 +0,0 @@
import unittest
import numpy as np
import paddle
paddle.set_default_dtype("float64")
paddle.disable_static(paddle.CPUPlace())
from parakeet.modules import geometry as geo
class TestShuffleDim(unittest.TestCase):
def test_perm(self):
x = paddle.randn([2, 3, 4, 6])
y = geo.shuffle_dim(x, 2, [3, 2, 1, 0])
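# the permutation [3, 2, 1, 0] reverses axis 2, hence the comparison with the [::-1] slice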
np.testing.assert_allclose(x.numpy()[0, 0, :, 0], y.numpy()[0, 0, ::-1, 0])
def test_random_perm(self):
x = paddle.randn([2, 3, 4, 6])
y = geo.shuffle_dim(x, 2)
np.testing.assert_allclose(x.numpy().sum(2), y.numpy().sum(2))

View File

@ -1,33 +0,0 @@
import unittest
import paddle
paddle.set_device("cpu")
import numpy as np
from parakeet.modules.losses import weighted_mean, masked_l1_loss, masked_softmax_with_cross_entropy
class TestWeightedMean(unittest.TestCase):
def test(self):
x = paddle.arange(0, 10, dtype="float64").unsqueeze(-1).broadcast_to([10, 3])
mask = (paddle.arange(0, 10, dtype="float64") > 4).unsqueeze(-1)
loss = weighted_mean(x, mask)
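# rows 0..4 are masked out, so the mean runs over rows 5..9: (5+6+7+8+9)/5 = 7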
self.assertAlmostEqual(loss.numpy()[0], 7)
class TestMaskedL1Loss(unittest.TestCase):
def test(self):
x = paddle.arange(0, 10, dtype="float64").unsqueeze(-1).broadcast_to([10, 3])
y = paddle.zeros_like(x)
mask = (paddle.arange(0, 10, dtype="float64") > 4).unsqueeze(-1)
loss = masked_l1_loss(x, y, mask)
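# y is all zeros, so |x - y| = x and the masked mean is again (5+6+7+8+9)/5 = 7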
print(loss)
self.assertAlmostEqual(loss.numpy()[0], 7)
class TestMaskedCrossEntropy(unittest.TestCase):
def test(self):
x = paddle.randn([3, 30, 8], dtype="float64")
y = paddle.randint(0, 8, [3, 30], dtype="int64").unsqueeze(-1) # note the trailing axis for the labels
mask = paddle.fluid.layers.sequence_mask(
paddle.to_tensor([30, 18, 27]), dtype="int64").unsqueeze(-1)
loss = masked_softmax_with_cross_entropy(x, y, mask)
print(loss)

View File

@ -1,54 +0,0 @@
import unittest
import numpy as np
import paddle
paddle.set_default_dtype("float64")
from parakeet.modules import masking
def sequence_mask(lengths, max_length=None, dtype="bool"):
max_length = max_length or np.max(lengths)
ids = np.arange(max_length)
return (ids < np.expand_dims(lengths, -1)).astype(dtype)
def future_mask(lengths, max_length=None, dtype="bool"):
max_length = max_length or np.max(lengths)
return np.tril(np.ones(max_length)).astype(dtype)
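# future_mask returns a lower-triangular matrix: step t may attend only to steps <= t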
class TestIDMask(unittest.TestCase):
def test(self):
ids = paddle.to_tensor(
[[1, 2, 3, 0, 0, 0],
[2, 4, 5, 6, 0, 0],
[7, 8, 9, 0, 0, 0]]
)
mask = masking.id_mask(ids)
self.assertTupleEqual(mask.numpy().shape, ids.numpy().shape)
print(mask.numpy())
class TestFeatureMask(unittest.TestCase):
def test(self):
features = np.random.randn(3, 16, 8)
lengths = [16, 14, 12]
for i, length in enumerate(lengths):
features[i, length:, :] = 0
feature_tensor = paddle.to_tensor(features)
mask = masking.feature_mask(feature_tensor, -1)
self.assertTupleEqual(mask.numpy().shape, (3, 16, 1))
print(mask.numpy().squeeze())
class TestCombineMask(unittest.TestCase):
def test_bool_mask(self):
lengths = np.array([12, 8, 9, 10])
padding_mask = sequence_mask(lengths, dtype="bool")
no_future_mask = future_mask(lengths, dtype="bool")
combined_mask1 = np.expand_dims(padding_mask, 1) * no_future_mask
print(paddle.to_tensor(padding_mask).dtype)
print(paddle.to_tensor(no_future_mask).dtype)
combined_mask2 = masking.combine_mask(
paddle.to_tensor(padding_mask).unsqueeze(1), paddle.to_tensor(no_future_mask)
)
np.testing.assert_allclose(combined_mask2.numpy(), combined_mask1)

View File

@ -1,64 +0,0 @@
import unittest
import numpy as np
import paddle
from parakeet.modules import positional_encoding as pe
def positional_encoding(start_index, length, size, dtype="float32"):
if size % 2 != 0:
raise ValueError("size should be divisible by 2")
channel = np.arange(0, size, 2, dtype=dtype)
index = np.arange(start_index, start_index + length, 1, dtype=dtype)
p = np.expand_dims(index, -1) / (10000 ** (channel / float(size)))
encodings = np.concatenate([np.sin(p), np.cos(p)], axis=-1)
return encodings
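# the standard sinusoidal encoding: channel c of the first half holds
# sin(pos / 10000^(c/size)) and the second half the matching cos; note that
# the sin and cos halves are concatenated here rather than interleaved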
def scalable_positional_encoding(start_index, length, size, omega):
dtype = omega.dtype
index = np.arange(start_index, start_index + length, 1, dtype=dtype)
channel = np.arange(0, size, 2, dtype=dtype)
p = np.reshape(omega, omega.shape + (1, 1)) \
* np.expand_dims(index, -1) \
/ (10000 ** (channel / float(size)))
encodings = np.concatenate([np.sin(p), np.cos(p)], axis=-1)
return encodings
class TestPositionEncoding(unittest.TestCase):
def __init__(self, start=0, length=20, size=16, dtype="float64"):
super(TestPositionEncoding, self).__init__("runTest")
self.spec = (start, length, size, dtype)
def test_equality(self):
start, length, size, dtype = self.spec
position_embed1 = positional_encoding(start, length, size, dtype)
position_embed2 = pe.positional_encoding(start, length, size, dtype)
np.testing.assert_allclose(position_embed2.numpy(), position_embed1)
def runTest(self):
paddle.disable_static(paddle.CPUPlace())
self.test_equality()
class TestScalablePositionEncoding(unittest.TestCase):
def __init__(self, start=0, length=20, size=16, dtype="float64"):
super(TestScalablePositionEncoding, self).__init__("runTest")
self.spec = (start, length, size, dtype)
def test_equality(self):
start, length, size, dtype = self.spec
omega = np.random.uniform(1, 2, size=(4,)).astype(dtype)
position_embed1 = scalable_positional_encoding(start, length, size, omega)
position_embed2 = pe.scalable_positional_encoding(start, length, size, paddle.to_tensor(omega))
np.testing.assert_allclose(position_embed2.numpy(), position_embed1)
def runTest(self):
paddle.disable_static(paddle.CPUPlace())
self.test_equality()
def load_tests(loader, standard_tests, pattern):
suite = unittest.TestSuite()
suite.addTest(TestPositionEncoding(0, 20, 16, "float64"))
suite.addTest(TestScalablePositionEncoding(0, 20, 16))
return suite

View File

@ -1,27 +0,0 @@
import unittest
import numpy as np
import librosa
import paddle
paddle.set_default_dtype("float64")
paddle.disable_static(paddle.CPUPlace())
from parakeet.modules import stft
class TestSTFT(unittest.TestCase):
def test(self):
path = librosa.util.example("choice")
wav, sr = librosa.load(path, duration=5)
wav = wav.astype("float64")
spec = librosa.stft(wav, n_fft=2048, hop_length=256, win_length=1024)
mag1 = np.abs(spec)
wav_in_batch = paddle.unsqueeze(paddle.to_tensor(wav), 0)
mag2 = stft.STFT(2048, 256, 1024).magnitude(wav_in_batch)
mag2 = paddle.squeeze(mag2, [0, 2]).numpy()
print("mag1", mag1)
print("mag2", mag2)
# TODO(chenfeiyu): some elements do not match exactly; this may be due to
# differing padding/centering conventions between librosa and this STFT
# np.testing.assert_allclose(mag2, mag1)

View File

@ -1,43 +0,0 @@
import unittest
import numpy as np
import paddle
paddle.set_default_dtype("float64")
paddle.disable_static(paddle.CPUPlace())
from parakeet.modules import transformer
class TestPositionwiseFFN(unittest.TestCase):
def test_io(self):
net = transformer.PositionwiseFFN(8, 12)
x = paddle.randn([2, 3, 4, 8])
y = net(x)
self.assertTupleEqual(y.numpy().shape, (2, 3, 4, 8))
class TestTransformerEncoderLayer(unittest.TestCase):
def test_io(self):
net = transformer.TransformerEncoderLayer(64, 8, 128, 0.5)
x = paddle.randn([4, 12, 64])
lengths = paddle.to_tensor([12, 8, 9, 10])
mask = paddle.fluid.layers.sequence_mask(lengths, dtype=x.dtype)
y, attn_weights = net(x, mask)
self.assertTupleEqual(y.numpy().shape, (4, 12, 64))
self.assertTupleEqual(attn_weights.numpy().shape, (4, 8, 12, 12))
class TestTransformerDecoderLayer(unittest.TestCase):
def test_io(self):
net = transformer.TransformerDecoderLayer(64, 8, 128, 0.5)
q = paddle.randn([4, 32, 64])
k = paddle.randn([4, 24, 64])
v = paddle.randn([4, 24, 64])
enc_lengths = paddle.to_tensor([24, 18, 20, 22])
dec_lengths = paddle.to_tensor([32, 28, 30, 31])
enc_mask = paddle.fluid.layers.sequence_mask(enc_lengths, dtype=k.dtype)
dec_mask = paddle.fluid.layers.sequence_mask(dec_lengths, dtype=q.dtype)
y, self_attn_weights, cross_attn_weights = net(q, k, v, enc_mask, dec_mask)
self.assertTupleEqual(y.numpy().shape, (4, 32, 64))
self.assertTupleEqual(self_attn_weights.numpy().shape, (4, 8, 32, 32))
self.assertTupleEqual(cross_attn_weights.numpy().shape, (4, 8, 32, 24))

View File

@ -1,121 +0,0 @@
import unittest
import numpy as np
import paddle
paddle.set_default_dtype("float64")
paddle.disable_static(paddle.CPUPlace())
from parakeet.models import transformer_tts as tts
from parakeet.modules import masking
from pprint import pprint
class TestMultiheadAttention(unittest.TestCase):
def test_io_same_qk(self):
net = tts.MultiheadAttention(64, 8)
q = paddle.randn([4, 12, 64])
mask = paddle.fluid.layers.sequence_mask(
paddle.to_tensor([12, 10, 8, 9]), dtype=q.dtype)
mask = paddle.unsqueeze(mask, 1) # unsqueeze for time_steps_q
context_vector, attention_weights = net(q, q, q, mask, drop_n_heads=2)
self.assertTupleEqual(context_vector.numpy().shape, (4, 12, 64))
self.assertTupleEqual(attention_weights.numpy().shape, (4, 8, 12, 12))
def test_io(self):
net = tts.MultiheadAttention(64, 8, k_dim=12, v_dim=6)
q = paddle.randn([4, 12, 64])
mask = paddle.fluid.layers.sequence_mask(
paddle.to_tensor([12, 10, 8, 9]), dtype=q.dtype)
mask = paddle.unsqueeze(mask, 1) # unsqueeze for time_steps_q
context_vector, attention_weights = net(q, q, q, mask, drop_n_heads=2)
self.assertTupleEqual(context_vector.numpy().shape, (4, 12, 64))
self.assertTupleEqual(attention_weights.numpy().shape, (4, 8, 12, 12))
class TestTransformerEncoderLayer(unittest.TestCase):
def test_io(self):
net = tts.TransformerEncoderLayer(64, 8, 128)
x = paddle.randn([4, 12, 64])
mask = paddle.fluid.layers.sequence_mask(
paddle.to_tensor([12, 10, 8, 9]), dtype=x.dtype)
context_vector, attention_weights = net(x, mask)
self.assertTupleEqual(context_vector.numpy().shape, (4, 12, 64))
self.assertTupleEqual(attention_weights.numpy().shape, (4, 8, 12, 12))
class TestTransformerDecoderLayer(unittest.TestCase):
def test_io(self):
net = tts.TransformerDecoderLayer(64, 8, 128, 0.5)
q = paddle.randn([4, 32, 64])
k = paddle.randn([4, 24, 64])
v = paddle.randn([4, 24, 64])
enc_lengths = paddle.to_tensor([24, 18, 20, 22])
dec_lengths = paddle.to_tensor([32, 28, 30, 31])
enc_mask = masking.sequence_mask(enc_lengths, dtype=k.dtype)
dec_padding_mask = masking.sequence_mask(dec_lengths, dtype=q.dtype)
no_future_mask = masking.future_mask(32, dtype=q.dtype)
dec_mask = masking.combine_mask(dec_padding_mask.unsqueeze(-1), no_future_mask)
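# combining the padding mask with the lower-triangular future mask ensures
# each decoder step attends only to valid, non-future positions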
y, self_attn_weights, cross_attn_weights = net(q, k, v, enc_mask, dec_mask)
self.assertTupleEqual(y.numpy().shape, (4, 32, 64))
self.assertTupleEqual(self_attn_weights.numpy().shape, (4, 8, 32, 32))
self.assertTupleEqual(cross_attn_weights.numpy().shape, (4, 8, 32, 24))
class TestTransformerTTS(unittest.TestCase):
def setUp(self):
net = tts.TransformerTTS(
128, 0, 64, 128, 80, 4, 128,
6, 6, 128, 128, 4,
3, 10, 0.1)
self.net = net
def test_encode_io(self):
net = self.net
text = paddle.randint(0, 128, [4, 176])
lengths = paddle.to_tensor([176, 156, 174, 168])
mask = masking.sequence_mask(lengths, dtype=text.dtype)
text = text * mask
encoded, attention_weights, encoder_mask = net.encode(text)
print("output shapes:")
print("encoded:", encoded.numpy().shape)
print("encoder_attentions:", [item.shape for item in attention_weights])
print("encoder_mask:", encoder_mask.numpy().shape)
def test_all_io(self):
net = self.net
text = paddle.randint(0, 128, [4, 176])
lengths = paddle.to_tensor([176, 156, 174, 168])
mask = masking.sequence_mask(lengths, dtype=text.dtype)
text = text * mask
mel = paddle.randn([4, 189, 80])
frames = paddle.to_tensor([189, 186, 179, 174])
mask = masking.sequence_mask(frames, dtype=frames.dtype)
mel = mel * mask.unsqueeze(-1)
encoded, encoder_attention_weights, encoder_mask = net.encode(text)
mel_output, mel_intermediate, cross_attention_weights, stop_logits = net.decode(encoded, mel, encoder_mask)
print("output shapes:")
print("encoder_output:", encoded.numpy().shape)
print("encoder_attentions:", [item.shape for item in encoder_attention_weights])
print("encoder_mask:", encoder_mask.numpy().shape)
print("mel_output: ", mel_output.numpy().shape)
print("mel_intermediate: ", mel_intermediate.numpy().shape)
print("decoder_attentions:", [item.shape for item in cross_attention_weights])
print("stop_logits:", stop_logits.numpy().shape)
def test_predict_io(self):
net = self.net
net.eval()
with paddle.no_grad():
text = paddle.randint(0, 128, [176])
decoder_output, encoder_attention_weights, cross_attention_weights = net.predict(text)
print("output shapes:")
print("mel_output: ", decoder_output.numpy().shape)
print("encoder_attentions:", [item.shape for item in encoder_attention_weights])
print("decoder_attentions:", [item.shape for item in cross_attention_weights])

View File

@ -1,130 +0,0 @@
import numpy as np
import unittest
import paddle
paddle.set_default_dtype("float64")
paddle.disable_static(paddle.CPUPlace())
from parakeet.models import waveflow
class TestFold(unittest.TestCase):
def test_audio(self):
x = paddle.randn([4, 32 * 8])
y = waveflow.fold(x, 8)
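# fold splits the trailing time axis into frames of size 8: 256 -> (32, 8)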
self.assertTupleEqual(y.numpy().shape, (4, 32, 8))
def test_spec(self):
x = paddle.randn([4, 80, 32 * 8])
y = waveflow.fold(x, 8)
self.assertTupleEqual(y.numpy().shape, (4, 80, 32, 8))
class TestUpsampleNet(unittest.TestCase):
def test_io(self):
net = waveflow.UpsampleNet([2, 2])
x = paddle.randn([4, 8, 6])
y = net(x)
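# each scale factor doubles the time axis, so [2, 2] upsamples 6 -> 2 * 2 * 6 = 24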
self.assertTupleEqual(y.numpy().shape, (4, 8, 2 * 2 * 6))
class TestResidualBlock(unittest.TestCase):
def test_io(self):
net = waveflow.ResidualBlock(4, 6, (3, 3), (2, 2))
x = paddle.randn([4, 4, 16, 32])
condition = paddle.randn([4, 6, 16, 32])
res, skip = net(x, condition)
self.assertTupleEqual(res.numpy().shape, (4, 4, 16, 32))
self.assertTupleEqual(skip.numpy().shape, (4, 4, 16, 32))
def test_add_input(self):
net = waveflow.ResidualBlock(4, 6, (3, 3), (2, 2))
net.eval()
net.start_sequence()
x_row = paddle.randn([4, 4, 1, 32])
condition_row = paddle.randn([4, 6, 1, 32])
res, skip = net.add_input(x_row, condition_row)
self.assertTupleEqual(res.numpy().shape, (4, 4, 1, 32))
self.assertTupleEqual(skip.numpy().shape, (4, 4, 1, 32))
class TestResidualNet(unittest.TestCase):
def test_io(self):
net = waveflow.ResidualNet(8, 6, 8, (3, 3), [1, 1, 1, 1, 1, 1, 1, 1])
x = paddle.randn([4, 6, 8, 32])
condition = paddle.randn([4, 8, 8, 32])
y = net(x, condition)
self.assertTupleEqual(y.numpy().shape, (4, 6, 8, 32))
def test_add_input(self):
net = waveflow.ResidualNet(8, 6, 8, (3, 3), [1, 1, 1, 1, 1, 1, 1, 1])
net.eval()
net.start_sequence()
x_row = paddle.randn([4, 6, 1, 32])
condition_row = paddle.randn([4, 8, 1, 32])
y_row = net.add_input(x_row, condition_row)
self.assertTupleEqual(y_row.numpy().shape, (4, 6, 1, 32))
class TestFlow(unittest.TestCase):
def test_io(self):
net = waveflow.Flow(8, 16, 7, (3, 3), 8)
x = paddle.randn([4, 1, 8, 32])
condition = paddle.randn([4, 7, 8, 32])
z, (logs, b) = net(x, condition)
self.assertTupleEqual(z.numpy().shape, (4, 1, 8, 32))
self.assertTupleEqual(logs.numpy().shape, (4, 1, 7, 32))
self.assertTupleEqual(b.numpy().shape, (4, 1, 7, 32))
def test_inverse_row(self):
net = waveflow.Flow(8, 16, 7, (3, 3), 8)
net.eval()
net._start_sequence()
x_row = paddle.randn([4, 1, 1, 32]) # last row
condition_row = paddle.randn([4, 7, 1, 32])
z_row = paddle.randn([4, 1, 1, 32])
x_next_row, (logs, b) = net._inverse_row(z_row, x_row, condition_row)
self.assertTupleEqual(x_next_row.numpy().shape, (4, 1, 1, 32))
self.assertTupleEqual(logs.numpy().shape, (4, 1, 1, 32))
self.assertTupleEqual(b.numpy().shape, (4, 1, 1, 32))
def test_inverse(self):
net = waveflow.Flow(8, 16, 7, (3, 3), 8)
net.eval()
z = paddle.randn([4, 1, 8, 32])
condition = paddle.randn([4, 7, 8, 32])
with paddle.no_grad():
x, (logs, b) = net.inverse(z, condition)
self.assertTupleEqual(x.numpy().shape, (4, 1, 8, 32))
self.assertTupleEqual(logs.numpy().shape, (4, 1, 7, 32))
self.assertTupleEqual(b.numpy().shape, (4, 1, 7, 32))
class TestWaveFlow(unittest.TestCase):
def test_io(self):
x = paddle.randn([4, 32 * 8])
condition = paddle.randn([4, 7, 32 * 8])
net = waveflow.WaveFlow(2, 8, 8, 16, 7, (3, 3))
z, logs_det_jacobian = net(x, condition)
self.assertTupleEqual(z.numpy().shape, (4, 32 * 8))
self.assertTupleEqual(logs_det_jacobian.numpy().shape, (1,))
def test_inverse(self):
z = paddle.randn([4, 32 * 8])
condition = paddle.randn([4, 7, 32 * 8])
net = waveflow.WaveFlow(2, 8, 8, 16, 7, (3, 3))
net.eval()
with paddle.no_grad():
x = net.inverse(z, condition)
self.assertTupleEqual(x.numpy().shape, (4, 32 * 8))
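# WaveFlow is invertible: the forward pass maps audio x to a latent z (plus a
# log-determinant term for likelihood training), and inverse recovers a
# waveform of the same shape from z and the condition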