diff --git a/README_cn.md b/README_cn.md
index 994a4e2..ce88032 100644
--- a/README_cn.md
+++ b/README_cn.md
@@ -228,6 +228,6 @@ Parakeet 同时提供了示例模型的训练好的参数，可从下表中获
 
 正在开发中。
 
-## 版权和许可 
+## 版权和许可
 
 Parakeet 以 [Apache-2.0 license](LICENSE) 提供。
diff --git a/doc/source/conf.py b/doc/source/conf.py
index f7d0af2..dd4a270 100644
--- a/doc/source/conf.py
+++ b/doc/source/conf.py
@@ -1,3 +1,17 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 # Configuration file for the Sphinx documentation builder.
 #
 # This file only contains a selection of the most common options. For a full
@@ -14,7 +28,6 @@
 # import sys
 # sys.path.insert(0, os.path.abspath('.'))
 
-
 # -- Project information -----------------------------------------------------
 
 project = 'parakeet'
@@ -24,7 +37,6 @@ author = 'parakeet-developers'
 # The full version, including alpha/beta/rc tags
 release = '0.2'
 
-
 # -- General configuration ---------------------------------------------------
 
 # Add any Sphinx extension module names here, as strings. They can be
@@ -33,7 +45,7 @@ release = '0.2'
 extensions = [
     'sphinx.ext.autodoc',
     'sphinx.ext.viewcode',
-    "sphinx_rtd_theme",	
+    "sphinx_rtd_theme",
     'sphinx.ext.mathjax',
     'numpydoc',
 ]
@@ -46,7 +58,6 @@ templates_path = ['_templates']
 # This pattern also affects html_static_path and html_extra_path.
 exclude_patterns = []
 
-
 # -- Options for HTML output -------------------------------------------------
 
 # The theme to use for HTML and HTML Help pages.  See the documentation for
diff --git a/docs/config_cn.md b/docs/config_cn.md
index 2b8ce4c..29a80c6 100644
--- a/docs/config_cn.md
+++ b/docs/config_cn.md
@@ -18,7 +18,7 @@
 
 常见的配置文件的格式有 `ini`, `yaml`, `toml`, `json` 等。
 
-`ini` 
+`ini`
 优点：简单，支持字符串插值等操作。
 缺点：仅支持两层结构，值不带类型信息，解析的时候需要手动 cast。
 
@@ -102,11 +102,3 @@ optional arguments:
   --opts ...            options to overwrite --config file and the default
                         config, passing in KEY VALUE pairs
 ```
-
-
-
-
-
-
-
-
diff --git a/docs/data_cn.md b/docs/data_cn.md
index 4a7aab8..6ef6404 100644
--- a/docs/data_cn.md
+++ b/docs/data_cn.md
@@ -21,7 +21,7 @@
 
 一般来说，我们将一个 Dataset 的子类看作是数据集和实验的具体需求之间的适配器。
 
-parakeet 还提供了若干个高阶的 Dataset 类，用于从已有的 Dataset 产生新的 Dataset. 
+parakeet 还提供了若干个高阶的 Dataset 类，用于从已有的 Dataset 产生新的 Dataset.
 
 1. 用于字段组合的有 TupleDataset, DictDataset;
 2. 用于数据集切分合并的有 SliceDataset, SubsetDataset, ChainDataset;
@@ -137,7 +137,7 @@ class Transform(object):
         self.processor = AudioProcessor(
             sample_rate=22050,
             n_fft=1024,
-            win_length=1024, 
+            win_length=1024,
             hop_length=256,
             f_max=8000)
         self.normalizer = LogMagnitude()
@@ -167,7 +167,7 @@ ljspeech = TransformDataset(meta, transform)
 
 当然也可以选择专门写一个转换脚本把转换后的数据集保存下来，然后再写一个适配的 Dataset 子类去加载这些保存的数据。实际这么做的效率会更高。
 
-接下来我们需要写一个可调用对象将多个样例组成批次。因为其中的 ids 和 mel 频谱是序列数据，所以我们需要进行 padding. 
+接下来我们需要写一个可调用对象将多个样例组成批次。因为其中的 ids 和 mel 频谱是序列数据，所以我们需要进行 padding.
 
 ```python
 class LJSpeechCollector(object):
@@ -197,10 +197,10 @@ def create_dataloader(source_path, valid_size, batch_size):
 
     valid_set, train_set = dataset.split(lj, valid_size)
     train_loader = DataLoader(
-        train_set, 
-        return_list=False, 
-        batch_size=batch_size, 
-        shuffle=True, 
+        train_set,
+        return_list=False,
+        batch_size=batch_size,
+        shuffle=True,
         drop_last=True,
         collate_fn=LJSpeechCollector())
     valid_loader = DataLoader(
diff --git a/docs/experiment_cn.md b/docs/experiment_cn.md
index dc6a997..0596dda 100644
--- a/docs/experiment_cn.md
+++ b/docs/experiment_cn.md
@@ -72,4 +72,4 @@ def train(self):
 
 ```python
 exp.run()
-```
\ No newline at end of file
+```
diff --git a/docs/experiment_guide_cn.md b/docs/experiment_guide_cn.md
index c5cc82e..8c9b89d 100644
--- a/docs/experiment_guide_cn.md
+++ b/docs/experiment_guide_cn.md
@@ -72,5 +72,3 @@ Dataset --(transform)--> Dataset  --+
 ```
 
 在这个软件源中包含了几个例子，可以在 [Parakeet/examples](../examples) 中查看。这些实验被作为样例提供给用户，可以直接运行。同时也欢迎用户添加新的模型和实验并为 `Parakeet` 贡献代码。
-
-
diff --git a/docs/installation_cn.md b/docs/installation_cn.md
index a861c86..030b721 100644
--- a/docs/installation_cn.md
+++ b/docs/installation_cn.md
@@ -31,7 +31,7 @@ python -m pip install paddlepaddle==2.0.0rc0 -i https://mirror.baidu.com/pypi/si
 # ubuntu, debian
 sudo apt-get install libsndfile1
 
-# centos, fedora, 
+# centos, fedora
 sudo yum install libsndfile
 
 # openSUSE
diff --git a/docs/overview_cn.md b/docs/overview_cn.md
index 40659af..06a9f93 100644
--- a/docs/overview_cn.md
+++ b/docs/overview_cn.md
@@ -9,10 +9,3 @@ Parakeet 为用户和开发者提供了
 1. 可复用的模型以及常用的模块；
 2. 从数据处理，模型训练到预测等一系列过程的完整实验；
 3. 高质量的开箱即用模型。
-
-
-
-
-
-
-
diff --git a/examples/transformer_tts/config.py b/examples/transformer_tts/config.py
index fef9ed8..bcf8e90 100644
--- a/examples/transformer_tts/config.py
+++ b/examples/transformer_tts/config.py
@@ -1,21 +1,34 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 from yacs.config import CfgNode as CN
 
 _C = CN()
 _C.data = CN(
     dict(
-        batch_size=16, # batch size
-        valid_size=64, # the first N examples are reserved for validation
-        sample_rate=22050, # Hz, sample rate
-        n_fft=1024, # fft frame size
-        win_length=1024, # window size
+        batch_size=16,  # batch size
+        valid_size=64,  # the first N examples are reserved for validation
+        sample_rate=22050,  # Hz, sample rate
+        n_fft=1024,  # fft frame size
+        win_length=1024,  # window size
         hop_length=256,  # hop size between ajacent frame
-        f_max=8000, # Hz, max frequency when converting to mel
+        f_max=8000,  # Hz, max frequency when converting to mel
         d_mel=80,  # mel bands
-        padding_idx=0, # text embedding's padding index
-        mel_start_value=0.5, # value for starting frame
-        mel_end_value=-0.5, # # value for ending frame
-    )
-)
+        padding_idx=0,  # text embedding's padding index
+        mel_start_value=0.5,  # value for starting frame
+        mel_end_value=-0.5,  # value for ending frame
+    ))
 
 _C.model = CN(
     dict(
@@ -31,22 +44,21 @@ _C.model = CN(
         postnet_kernel_size=5,  # decoder postnet(cnn)'s kernel size
         max_reduction_factor=10,  # max_reduction factor
         dropout=0.1,  # global droput probability
-        stop_loss_scale=8.0, # scaler for stop _loss
-        decoder_prenet_dropout=0.5, # decoder prenet dropout probability
-    )
-)
+        stop_loss_scale=8.0,  # scale factor for the stop loss
+        decoder_prenet_dropout=0.5,  # decoder prenet dropout probability
+    ))
 
 _C.training = CN(
     dict(
-        lr=1e-4, # learning rate
+        lr=1e-4,  # learning rate
         drop_n_heads=[[0, 0], [15000, 1]],
         reduction_factor=[[0, 10], [80000, 4], [200000, 2]],
-        plot_interval=1000, # plot attention and spectrogram
-        valid_interval=1000, # validation
-        save_interval=10000, # checkpoint
-        max_iteration=900000, # max iteration to train
-    )
-)
+        plot_interval=1000,  # plot attention and spectrogram
+        valid_interval=1000,  # validation
+        save_interval=10000,  # checkpoint
+        max_iteration=900000,  # max iteration to train
+    ))
+
 
 def get_cfg_defaults():
     """Get a yacs CfgNode object with default values for my_project."""
diff --git a/examples/transformer_tts/ljspeech.py b/examples/transformer_tts/ljspeech.py
index 245b475..137db96 100644
--- a/examples/transformer_tts/ljspeech.py
+++ b/examples/transformer_tts/ljspeech.py
@@ -1,3 +1,17 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 import os
 from pathlib import Path
 import pickle
@@ -7,8 +21,10 @@ from paddle.io import Dataset, DataLoader
 from parakeet.data.batch import batch_spec, batch_text_id
 from parakeet.data import dataset
 
+
 class LJSpeech(Dataset):
     """A simple dataset adaptor for the processed ljspeech dataset."""
+
     def __init__(self, root):
         self.root = Path(root).expanduser()
         records = []
@@ -35,13 +51,13 @@ class Transform(object):
         self.end_value = end_value
 
     def __call__(self, example):
-        ids, mel = example # ids already have <s> and </s>
+        ids, mel = example  # ids already have <s> and </s>
         ids = np.array(ids, dtype=np.int64)
         # add start and end frame
-        mel = np.pad(mel, 
-                     [(0, 0), (1, 1)], 
-                     mode='constant', 
-                     constant_values=[(0, 0), (self.start_value, self.end_value)])
+        mel = np.pad(
+            mel, [(0, 0), (1, 1)],
+            mode='constant',
+            constant_values=[(0, 0), (self.start_value, self.end_value)])
         stop_labels = np.ones([mel.shape[1]], dtype=np.int64)
         stop_labels[-1] = 2
         # actually this thing can also be done within the model
@@ -50,6 +66,7 @@ class Transform(object):
 
 class LJSpeechCollector(object):
     """A simple callable to batch LJSpeech examples."""
+
     def __init__(self, padding_idx=0, padding_value=0.):
         self.padding_idx = padding_idx
         self.padding_value = padding_value
@@ -67,15 +84,16 @@ class LJSpeechCollector(object):
 
 def create_dataloader(config, source_path):
     lj = LJSpeech(source_path)
-    transform = Transform(config.data.mel_start_value, config.data.mel_end_value)
+    transform = Transform(config.data.mel_start_value,
+                          config.data.mel_end_value)
     lj = dataset.TransformDataset(lj, transform)
 
     valid_set, train_set = dataset.split(lj, config.data.valid_size)
     data_collator = LJSpeechCollector(padding_idx=config.data.padding_idx)
     train_loader = DataLoader(
-        train_set, 
-        batch_size=config.data.batch_size, 
-        shuffle=True, 
+        train_set,
+        batch_size=config.data.batch_size,
+        shuffle=True,
         drop_last=True,
         collate_fn=data_collator)
     valid_loader = DataLoader(
@@ -85,4 +103,3 @@ def create_dataloader(config, source_path):
         drop_last=False,
         collate_fn=data_collator)
     return train_loader, valid_loader
-
diff --git a/examples/transformer_tts/preprocess.py b/examples/transformer_tts/preprocess.py
index 001f04c..2ba1985 100644
--- a/examples/transformer_tts/preprocess.py
+++ b/examples/transformer_tts/preprocess.py
@@ -1,3 +1,17 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 import os
 import tqdm
 import pickle
@@ -11,6 +25,7 @@ from parakeet.frontend import English
 
 from config import get_cfg_defaults
 
+
 def create_dataset(config, source_path, target_path, verbose=False):
     # create output dir
     target_path = Path(target_path).expanduser()
@@ -23,11 +38,11 @@ def create_dataset(config, source_path, target_path, verbose=False):
         sample_rate=config.data.sample_rate,
         n_fft=config.data.n_fft,
         n_mels=config.data.d_mel,
-        win_length=config.data.win_length, 
+        win_length=config.data.win_length,
         hop_length=config.data.hop_length,
         f_max=config.data.f_max)
     normalizer = LogMagnitude()
-    
+
     records = []
     for (fname, text, _) in tqdm.tqdm(meta_data):
         wav = processor.read_wav(fname)
@@ -42,12 +57,13 @@ def create_dataset(config, source_path, target_path, verbose=False):
         np.save(mel_path / mel_name, mel)
     if verbose:
         print("save mel spectrograms into {}".format(mel_path))
-    
+
     # save meta data as pickle archive
     with open(target_path / "metadata.pkl", 'wb') as f:
         pickle.dump(records, f)
         if verbose:
-            print("saved metadata into {}".format(target_path / "metadata.pkl"))
+            print("saved metadata into {}".format(target_path /
+                                                  "metadata.pkl"))
 
     # also save meta data into text format for inspection
     with open(target_path / "metadata.txt", 'wt') as f:
@@ -55,21 +71,31 @@ def create_dataset(config, source_path, target_path, verbose=False):
             phoneme_str = "|".join(phonemes)
             f.write("{}\t{}\t{}\n".format(mel_name, text, phoneme_str))
         if verbose:
-            print("saved metadata into {}".format(target_path / "metadata.txt"))
-    
+            print("saved metadata into {}".format(target_path /
+                                                  "metadata.txt"))
+
     print("Done.")
 
 
 if __name__ == "__main__":
     parser = argparse.ArgumentParser(description="create dataset")
-    parser.add_argument("--config", type=str, metavar="FILE", help="extra config to overwrite the default config")
-    parser.add_argument("--input", type=str, help="path of the ljspeech dataset")
-    parser.add_argument("--output", type=str, help="path to save output dataset")
-    parser.add_argument("--opts", nargs=argparse.REMAINDER,
+    parser.add_argument(
+        "--config",
+        type=str,
+        metavar="FILE",
+        help="extra config to overwrite the default config")
+    parser.add_argument(
+        "--input", type=str, help="path of the ljspeech dataset")
+    parser.add_argument(
+        "--output", type=str, help="path to save output dataset")
+    parser.add_argument(
+        "--opts",
+        nargs=argparse.REMAINDER,
         help="options to overwrite --config file and the default config, passing in KEY VALUE pairs"
     )
-    parser.add_argument("-v", "--verbose", action="store_true", help="print msg")
-    
+    parser.add_argument(
+        "-v", "--verbose", action="store_true", help="print msg")
+
     config = get_cfg_defaults()
     args = parser.parse_args()
     if args.config:
diff --git a/examples/transformer_tts/synthesize.py b/examples/transformer_tts/synthesize.py
index b8f352f..6758819 100644
--- a/examples/transformer_tts/synthesize.py
+++ b/examples/transformer_tts/synthesize.py
@@ -1,3 +1,17 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 import argparse
 import time
 from pathlib import Path
@@ -13,21 +27,22 @@ from parakeet.utils.display import add_attention_plots
 
 from config import get_cfg_defaults
 
+
 @paddle.fluid.dygraph.no_grad
 def main(config, args):
     paddle.set_device(args.device)
 
     # model
     frontend = English()
-    model = TransformerTTS.from_pretrained(
-        frontend, config, args.checkpoint_path)
+    model = TransformerTTS.from_pretrained(frontend, config,
+                                           args.checkpoint_path)
     model.eval()
 
     # inputs
     input_path = Path(args.input).expanduser()
-    with open(input_path, "rt") as f: 
+    with open(input_path, "rt") as f:
         sentences = f.readlines()
-    
+
     output_dir = Path(args.output).expanduser()
     output_dir.mkdir(parents=True, exist_ok=True)
 
@@ -38,22 +53,36 @@ def main(config, args):
         mel_output = mel_output.T  #(C, T)
         np.save(str(output_dir / f"sentence_{i}"), mel_output)
         if args.verbose:
-            print("spectrogram saved at {}".format(output_dir / f"sentence_{i}.npy"))
+            print("spectrogram saved at {}".format(output_dir /
+                                                   f"sentence_{i}.npy"))
+
 
 if __name__ == "__main__":
     config = get_cfg_defaults()
 
-    parser = argparse.ArgumentParser(description="generate mel spectrogram with TransformerTTS.")
-    parser.add_argument("--config", type=str, metavar="FILE", help="extra config to overwrite the default config")
-    parser.add_argument("--checkpoint_path", type=str, help="path of the checkpoint to load.")
+    parser = argparse.ArgumentParser(
+        description="generate mel spectrogram with TransformerTTS.")
+    parser.add_argument(
+        "--config",
+        type=str,
+        metavar="FILE",
+        help="extra config to overwrite the default config")
+    parser.add_argument(
+        "--checkpoint_path", type=str, help="path of the checkpoint to load.")
     parser.add_argument("--input", type=str, help="path of the text sentences")
     parser.add_argument("--output", type=str, help="path to save outputs")
-    parser.add_argument("--device", type=str, default="cpu", help="device type to use.")
-    parser.add_argument("--opts", nargs=argparse.REMAINDER, help="options to overwrite --config file and the default config, passing in KEY VALUE pairs")
-    parser.add_argument("-v", "--verbose", action="store_true", help="print msg")
-    
+    parser.add_argument(
+        "--device", type=str, default="cpu", help="device type to use.")
+    parser.add_argument(
+        "--opts",
+        nargs=argparse.REMAINDER,
+        help="options to overwrite --config file and the default config, passing in KEY VALUE pairs"
+    )
+    parser.add_argument(
+        "-v", "--verbose", action="store_true", help="print msg")
+
     args = parser.parse_args()
-    if args.config: 
+    if args.config:
         config.merge_from_file(args.config)
     if args.opts:
         config.merge_from_list(args.opts)
diff --git a/examples/transformer_tts/train.py b/examples/transformer_tts/train.py
index 59ec7aa..b5ae11d 100644
--- a/examples/transformer_tts/train.py
+++ b/examples/transformer_tts/train.py
@@ -1,3 +1,17 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 import time
 import logging
 from pathlib import Path
@@ -19,12 +33,13 @@ from parakeet.training.experiment import ExperimentBase
 from config import get_cfg_defaults
 from ljspeech import LJSpeech, LJSpeechCollector, Transform
 
+
 class Experiment(ExperimentBase):
     def setup_model(self):
         config = self.config
         frontend = English()
         model = TransformerTTS(
-            frontend, 
+            frontend,
             d_encoder=config.model.d_encoder,
             d_decoder=config.model.d_decoder,
             d_mel=config.data.d_mel,
@@ -46,8 +61,7 @@ class Experiment(ExperimentBase):
             beta1=0.9,
             beta2=0.98,
             epsilon=1e-9,
-            parameters=model.parameters()
-        )
+            parameters=model.parameters())
         criterion = TransformerTTSLoss(config.model.stop_loss_scale)
         drop_n_heads = scheduler.StepWise(config.training.drop_n_heads)
         reduction_factor = scheduler.StepWise(config.training.reduction_factor)
@@ -63,21 +77,24 @@ class Experiment(ExperimentBase):
         config = self.config
 
         ljspeech_dataset = LJSpeech(args.data)
-        transform = Transform(config.data.mel_start_value, config.data.mel_end_value)
-        ljspeech_dataset = dataset.TransformDataset(ljspeech_dataset, transform)
-        valid_set, train_set = dataset.split(ljspeech_dataset, config.data.valid_size)
+        transform = Transform(config.data.mel_start_value,
+                              config.data.mel_end_value)
+        ljspeech_dataset = dataset.TransformDataset(ljspeech_dataset,
+                                                    transform)
+        valid_set, train_set = dataset.split(ljspeech_dataset,
+                                             config.data.valid_size)
         batch_fn = LJSpeechCollector(padding_idx=config.data.padding_idx)
-        
+
         if not self.parallel:
             train_loader = DataLoader(
-                train_set, 
-                batch_size=config.data.batch_size, 
-                shuffle=True, 
+                train_set,
+                batch_size=config.data.batch_size,
+                shuffle=True,
                 drop_last=True,
                 collate_fn=batch_fn)
         else:
             sampler = DistributedBatchSampler(
-                train_set, 
+                train_set,
                 batch_size=config.data.batch_size,
                 num_replicas=dist.get_world_size(),
                 rank=dist.get_rank(),
@@ -95,11 +112,11 @@ class Experiment(ExperimentBase):
     def compute_outputs(self, text, mel, stop_label):
         model_core = self.model._layers if self.parallel else self.model
         model_core.set_constants(
-            self.reduction_factor(self.iteration), 
+            self.reduction_factor(self.iteration),
             self.drop_n_heads(self.iteration))
 
         # TODO(chenfeiyu): we can combine these 2 slices
-        mel_input = mel[:,:-1, :]
+        mel_input = mel[:, :-1, :]
         reduced_mel_input = mel_input[:, ::model_core.r, :]
         outputs = self.model(text, reduced_mel_input)
         return outputs
@@ -115,11 +132,8 @@ class Experiment(ExperimentBase):
 
         time_steps = mel_target.shape[1]
         losses = self.criterion(
-            mel_output[:,:time_steps, :], 
-            mel_intermediate[:,:time_steps, :], 
-            mel_target, 
-            stop_logits[:,:time_steps, :], 
-            stop_label_target)
+            mel_output[:, :time_steps, :], mel_intermediate[:, :time_steps, :],
+            mel_target, stop_logits[:, :time_steps, :], stop_label_target)
         return losses
 
     def train_batch(self):
@@ -133,7 +147,7 @@ class Experiment(ExperimentBase):
         outputs = self.compute_outputs(text, mel, stop_label)
         losses = self.compute_losses(batch, outputs)
         loss = losses["loss"]
-        loss.backward() 
+        loss.backward()
         self.optimizer.step()
         iteration_time = time.time() - start
 
@@ -141,14 +155,17 @@ class Experiment(ExperimentBase):
         # logging
         msg = "Rank: {}, ".format(dist.get_rank())
         msg += "step: {}, ".format(self.iteration)
-        msg += "time: {:>.3f}s/{:>.3f}s, ".format(data_loader_time, iteration_time)
-        msg += ', '.join('{}: {:>.6f}'.format(k, v) for k, v in losses_np.items())
+        msg += "time: {:>.3f}s/{:>.3f}s, ".format(data_loader_time,
+                                                  iteration_time)
+        msg += ', '.join('{}: {:>.6f}'.format(k, v)
+                         for k, v in losses_np.items())
         self.logger.info(msg)
-        
+
         if dist.get_rank() == 0:
             for k, v in losses_np.items():
-                self.visualizer.add_scalar(f"train_loss/{k}", v, self.iteration)
-    
+                self.visualizer.add_scalar(f"train_loss/{k}", v,
+                                           self.iteration)
+
     @mp_tools.rank_zero_only
     @paddle.no_grad()
     def valid(self):
@@ -163,10 +180,9 @@ class Experiment(ExperimentBase):
             if i < 2:
                 attention_weights = outputs["cross_attention_weights"]
                 display.add_multi_attention_plots(
-                    self.visualizer, 
-                    f"valid_sentence_{i}_cross_attention_weights", 
-                    attention_weights, 
-                    self.iteration)
+                    self.visualizer,
+                    f"valid_sentence_{i}_cross_attention_weights",
+                    attention_weights, self.iteration)
 
         # write visual log
         valid_losses = {k: np.mean(v) for k, v in valid_losses.items()}
@@ -191,7 +207,7 @@ if __name__ == "__main__":
     config = get_cfg_defaults()
     parser = default_argument_parser()
     args = parser.parse_args()
-    if args.config: 
+    if args.config:
         config.merge_from_file(args.config)
     if args.opts:
         config.merge_from_list(args.opts)
diff --git a/examples/waveflow/config.py b/examples/waveflow/config.py
index 97a877a..5ca2ba1 100644
--- a/examples/waveflow/config.py
+++ b/examples/waveflow/config.py
@@ -1,40 +1,52 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 from yacs.config import CfgNode as CN
 
 _C = CN()
 _C.data = CN(
     dict(
-        batch_size=8, # batch size
-        valid_size=16, # the first N examples are reserved for validation
-        sample_rate=22050, # Hz, sample rate
-        n_fft=1024, # fft frame size
-        win_length=1024, # window size
+        batch_size=8,  # batch size
+        valid_size=16,  # the first N examples are reserved for validation
+        sample_rate=22050,  # Hz, sample rate
+        n_fft=1024,  # fft frame size
+        win_length=1024,  # window size
         hop_length=256,  # hop size between ajacent frame
-        f_max=8000, # Hz, max frequency when converting to mel
+        f_max=8000,  # Hz, max frequency when converting to mel
         n_mels=80,  # mel bands
-        clip_frames=65, # mel clip frames
-    )
-)
+        clip_frames=65,  # mel clip frames
+    ))
 
 _C.model = CN(
     dict(
         upsample_factors=[16, 16],
-        n_flows=8, # number of flows in WaveFlow
-        n_layers=8, # number of conv block in each flow
-        n_group=16, # folding factor of audio and spectrogram
-        channels=128, # resiaudal channel in each flow
-        kernel_size=[3, 3], # kernel size in each conv block
-        sigma=1.0, # stddev of the random noise
-    )
-)
+        n_flows=8,  # number of flows in WaveFlow
+        n_layers=8,  # number of conv block in each flow
+        n_group=16,  # folding factor of audio and spectrogram
+        channels=128,  # residual channels in each flow
+        kernel_size=[3, 3],  # kernel size in each conv block
+        sigma=1.0,  # stddev of the random noise
+    ))
 
 _C.training = CN(
     dict(
-        lr=2e-4, # learning rates
-        valid_interval=1000, # validation
-        save_interval=10000, # checkpoint
-        max_iteration=3000000, # max iteration to train
-    )
-)
+        lr=2e-4,  # learning rate
+        valid_interval=1000,  # validation
+        save_interval=10000,  # checkpoint
+        max_iteration=3000000,  # max iteration to train
+    ))
+
 
 def get_cfg_defaults():
     """Get a yacs CfgNode object with default values for my_project."""
diff --git a/examples/waveflow/ljspeech.py b/examples/waveflow/ljspeech.py
index d7f5425..e07303a 100644
--- a/examples/waveflow/ljspeech.py
+++ b/examples/waveflow/ljspeech.py
@@ -1,3 +1,17 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 import os
 from pathlib import Path
 import pickle
@@ -9,19 +23,20 @@ from parakeet.data.batch import batch_spec, batch_wav
 from parakeet.data import dataset
 from parakeet.audio import AudioProcessor
 
+
 class LJSpeech(Dataset):
     """A simple dataset adaptor for the processed ljspeech dataset."""
+
     def __init__(self, root):
         self.root = Path(root).expanduser()
         meta_data = pandas.read_csv(
             str(self.root / "metadata.csv"),
             sep="\t",
             header=None,
-            names=["fname", "frames", "samples"]
-        )
-        
+            names=["fname", "frames", "samples"])
+
         records = []
-        for row in meta_data.itertuples() :
+        for row in meta_data.itertuples():
             mel_path = str(self.root / "mel" / (row.fname + ".npy"))
             wav_path = str(self.root / "wav" / (row.fname + ".npy"))
             records.append((mel_path, wav_path))
@@ -39,6 +54,7 @@ class LJSpeech(Dataset):
 
 class LJSpeechCollector(object):
     """A simple callable to batch LJSpeech examples."""
+
     def __init__(self, padding_value=0.):
         self.padding_value = padding_value
 
@@ -52,9 +68,9 @@ class LJSpeechCollector(object):
 
 class LJSpeechClipCollector(object):
     def __init__(self, clip_frames=65, hop_length=256):
-        self.clip_frames = clip_frames 
+        self.clip_frames = clip_frames
         self.hop_length = hop_length
-    
+
     def __call__(self, examples):
         mels = []
         wavs = []
@@ -70,9 +86,7 @@ class LJSpeechClipCollector(object):
         mel, wav = example
         frames = mel.shape[-1]
         start = np.random.randint(0, frames - self.clip_frames)
-        mel_clip = mel[:, start: start + self.clip_frames]
-        wav_clip = wav[start * self.hop_length: (start + self.clip_frames) * self.hop_length]
+        mel_clip = mel[:, start:start + self.clip_frames]
+        wav_clip = wav[start * self.hop_length:(start + self.clip_frames) *
+                       self.hop_length]
         return mel_clip, wav_clip
-
-
-
diff --git a/examples/waveflow/preprocess.py b/examples/waveflow/preprocess.py
index d4bdc8e..ac6d62e 100644
--- a/examples/waveflow/preprocess.py
+++ b/examples/waveflow/preprocess.py
@@ -1,3 +1,17 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 import os
 import tqdm
 import csv
@@ -86,12 +100,8 @@ def create_dataset(config, input_dir, output_dir, verbose=True):
     output_dir = Path(output_dir).expanduser()
     output_dir.mkdir(exist_ok=True)
 
-    transform = Transform(
-        config.sample_rate, 
-        config.n_fft, 
-        config.win_length, 
-        config.hop_length, 
-        config.n_mels)
+    transform = Transform(config.sample_rate, config.n_fft, config.win_length,
+                          config.hop_length, config.n_mels)
     file_names = []
 
     for example in tqdm.tqdm(dataset):
@@ -107,23 +117,35 @@ def create_dataset(config, input_dir, output_dir, verbose=True):
         np.save(str(mel_dir / base_name), mel)
 
         file_names.append((base_name, mel.shape[-1], audio.shape[-1]))
-    
+
     meta_data = pd.DataFrame.from_records(file_names)
-    meta_data.to_csv(str(output_dir / "metadata.csv"), sep="\t", index=None, header=None)
-    print("saved meta data in to {}".format(os.path.join(output_dir, "metadata.csv")))
+    meta_data.to_csv(
+        str(output_dir / "metadata.csv"), sep="\t", index=None, header=None)
+    print("saved meta data in to {}".format(
+        os.path.join(output_dir, "metadata.csv")))
 
     print("Done!")
 
+
 if __name__ == "__main__":
     parser = argparse.ArgumentParser(description="create dataset")
-    parser.add_argument("--config", type=str, metavar="FILE", help="extra config to overwrite the default config")
-    parser.add_argument("--input", type=str, help="path of the ljspeech dataset")
-    parser.add_argument("--output", type=str, help="path to save output dataset")
-    parser.add_argument("--opts", nargs=argparse.REMAINDER,
+    parser.add_argument(
+        "--config",
+        type=str,
+        metavar="FILE",
+        help="extra config to overwrite the default config")
+    parser.add_argument(
+        "--input", type=str, help="path of the ljspeech dataset")
+    parser.add_argument(
+        "--output", type=str, help="path to save output dataset")
+    parser.add_argument(
+        "--opts",
+        nargs=argparse.REMAINDER,
         help="options to overwrite --config file and the default config, passing in KEY VALUE pairs"
     )
-    parser.add_argument("-v", "--verbose", action="store_true", help="print msg")
-    
+    parser.add_argument(
+        "-v", "--verbose", action="store_true", help="print msg")
+
     config = get_cfg_defaults()
     args = parser.parse_args()
     if args.config:
diff --git a/examples/waveflow/synthesize.py b/examples/waveflow/synthesize.py
index 1856eb2..45c751a 100644
--- a/examples/waveflow/synthesize.py
+++ b/examples/waveflow/synthesize.py
@@ -1,3 +1,17 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 import argparse
 import numpy as np
 import soundfile as sf
@@ -8,9 +22,9 @@ import parakeet
 from parakeet.models.waveflow import UpsampleNet, WaveFlow, ConditionalWaveFlow
 from parakeet.utils import layer_tools, checkpoint
 
-
 from config import get_cfg_defaults
 
+
 def main(config, args):
     paddle.set_device(args.device)
     model = ConditionalWaveFlow.from_pretrained(config, args.checkpoint_path)
@@ -23,7 +37,8 @@ def main(config, args):
     for file_path in mel_dir.iterdir():
         mel = np.load(str(file_path))
         audio = model.predict(mel)
-        audio_path = output_dir / (os.path.splitext(file_path.name)[0] + ".wav")
+        audio_path = output_dir / (
+            os.path.splitext(file_path.name)[0] + ".wav")
         sf.write(audio_path, audio, config.data.sample_rate)
         print("[synthesize] {} -> {}".format(file_path, audio_path))
 
@@ -31,17 +46,32 @@ def main(config, args):
 if __name__ == "__main__":
     config = get_cfg_defaults()
 
-    parser = argparse.ArgumentParser(description="generate mel spectrogram with TransformerTTS.")
-    parser.add_argument("--config", type=str, metavar="FILE", help="extra config to overwrite the default config")
-    parser.add_argument("--checkpoint_path", type=str, help="path of the checkpoint to load.")
-    parser.add_argument("--input", type=str, help="path of directory containing mel spectrogram (in .npy format)")
+    parser = argparse.ArgumentParser(
+        description="generate mel spectrogram with TransformerTTS.")
+    parser.add_argument(
+        "--config",
+        type=str,
+        metavar="FILE",
+        help="extra config to overwrite the default config")
+    parser.add_argument(
+        "--checkpoint_path", type=str, help="path of the checkpoint to load.")
+    parser.add_argument(
+        "--input",
+        type=str,
+        help="path of directory containing mel spectrogram (in .npy format)")
     parser.add_argument("--output", type=str, help="path to save outputs")
-    parser.add_argument("--device", type=str, default="cpu", help="device type to use.")
-    parser.add_argument("--opts", nargs=argparse.REMAINDER, help="options to overwrite --config file and the default config, passing in KEY VALUE pairs")
-    parser.add_argument("-v", "--verbose", action="store_true", help="print msg")
-    
+    parser.add_argument(
+        "--device", type=str, default="cpu", help="device type to use.")
+    parser.add_argument(
+        "--opts",
+        nargs=argparse.REMAINDER,
+        help="options to overwrite --config file and the default config, passing in KEY VALUE pairs"
+    )
+    parser.add_argument(
+        "-v", "--verbose", action="store_true", help="print msg")
+
     args = parser.parse_args()
-    if args.config: 
+    if args.config:
         config.merge_from_file(args.config)
     if args.opts:
         config.merge_from_list(args.opts)
@@ -49,4 +79,4 @@ if __name__ == "__main__":
     print(config)
     print(args)
 
-    main(config, args)
\ No newline at end of file
+    main(config, args)
diff --git a/examples/waveflow/train.py b/examples/waveflow/train.py
index 1cd68f0..443cc8b 100644
--- a/examples/waveflow/train.py
+++ b/examples/waveflow/train.py
@@ -1,3 +1,17 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 import time
 from pathlib import Path
 import numpy as np
@@ -34,7 +48,8 @@ class Experiment(ExperimentBase):
 
         if self.parallel > 1:
             model = paddle.DataParallel(model)
-        optimizer = paddle.optimizer.Adam(config.training.lr, parameters=model.parameters())
+        optimizer = paddle.optimizer.Adam(
+            config.training.lr, parameters=model.parameters())
         criterion = WaveFlowLoss(sigma=config.model.sigma)
 
         self.model = model
@@ -46,20 +61,22 @@ class Experiment(ExperimentBase):
         args = self.args
 
         ljspeech_dataset = LJSpeech(args.data)
-        valid_set, train_set = dataset.split(ljspeech_dataset, config.data.valid_size)
+        valid_set, train_set = dataset.split(ljspeech_dataset,
+                                             config.data.valid_size)
+
+        batch_fn = LJSpeechClipCollector(config.data.clip_frames,
+                                         config.data.hop_length)
 
-        batch_fn = LJSpeechClipCollector(config.data.clip_frames, config.data.hop_length)
-        
         if not self.parallel:
             train_loader = DataLoader(
-                train_set, 
-                batch_size=config.data.batch_size, 
-                shuffle=True, 
+                train_set,
+                batch_size=config.data.batch_size,
+                shuffle=True,
                 drop_last=True,
                 collate_fn=batch_fn)
         else:
             sampler = DistributedBatchSampler(
-                train_set, 
+                train_set,
                 batch_size=config.data.batch_size,
                 num_replicas=dist.get_world_size(),
                 rank=dist.get_rank(),
@@ -71,7 +88,7 @@ class Experiment(ExperimentBase):
         valid_batch_fn = LJSpeechCollector()
         valid_loader = DataLoader(
             valid_set, batch_size=1, collate_fn=valid_batch_fn)
-        
+
         self.train_loader = train_loader
         self.valid_loader = valid_loader
 
@@ -90,17 +107,19 @@ class Experiment(ExperimentBase):
         mel, wav = batch
         z, log_det_jocobian = self.compute_outputs(mel, wav)
         loss = self.criterion(z, log_det_jocobian)
-        loss.backward() 
+        loss.backward()
         self.optimizer.step()
         iteration_time = time.time() - start
 
         loss_value = float(loss)
         msg = "Rank: {}, ".format(dist.get_rank())
         msg += "step: {}, ".format(self.iteration)
-        msg += "time: {:>.3f}s/{:>.3f}s, ".format(data_loader_time, iteration_time)
+        msg += "time: {:>.3f}s/{:>.3f}s, ".format(data_loader_time,
+                                                  iteration_time)
         msg += "loss: {:>.6f}".format(loss_value)
         self.logger.info(msg)
-        self.visualizer.add_scalar("train/loss", loss_value, global_step=self.iteration)
+        self.visualizer.add_scalar(
+            "train/loss", loss_value, global_step=self.iteration)
 
     @mp_tools.rank_zero_only
     @paddle.no_grad()
@@ -112,7 +131,8 @@ class Experiment(ExperimentBase):
         loss = self.criterion(z, log_det_jocobian)
         valid_losses.append(float(loss))
         valid_loss = np.mean(valid_losses)
-        self.visualizer.add_scalar("valid/loss", valid_loss, global_step=self.iteration)
+        self.visualizer.add_scalar(
+            "valid/loss", valid_loss, global_step=self.iteration)
 
 
 def main_sp(config, args):
@@ -132,7 +152,7 @@ if __name__ == "__main__":
     config = get_cfg_defaults()
     parser = default_argument_parser()
     args = parser.parse_args()
-    if args.config: 
+    if args.config:
         config.merge_from_file(args.config)
     if args.opts:
         config.merge_from_list(args.opts)
diff --git a/examples/wavenet/config.py b/examples/wavenet/config.py
index 58f9beb..658d416 100644
--- a/examples/wavenet/config.py
+++ b/examples/wavenet/config.py
@@ -1,19 +1,32 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 from yacs.config import CfgNode as CN
 
 _C = CN()
 _C.data = CN(
     dict(
-        batch_size=8, # batch size
-        valid_size=16, # the first N examples are reserved for validation
-        sample_rate=22050, # Hz, sample rate
-        n_fft=2048, # fft frame size
-        win_length=1024, # window size
+        batch_size=8,  # batch size
+        valid_size=16,  # the first N examples are reserved for validation
+        sample_rate=22050,  # Hz, sample rate
+        n_fft=2048,  # fft frame size
+        win_length=1024,  # window size
         hop_length=256,  # hop size between ajacent frame
         # f_max=8000, # Hz, max frequency when converting to mel
         n_mels=80,  # mel bands
-        train_clip_seconds=0.5, # audio clip length(in seconds)
-    )
-)
+        train_clip_seconds=0.5,  # audio clip length(in seconds)
+    ))
 
 _C.model = CN(
     dict(
@@ -21,24 +34,22 @@ _C.model = CN(
         n_stack=3,
         n_loop=10,
         filter_size=2,
-        residual_channels=128, # resiaudal channel in each flow
+        residual_channels=128,  # residual channel in each flow
         loss_type="mog",
-        output_dim=3, # single gaussian
-        log_scale_min=-9.0,
-    )
-)
+        output_dim=3,  # single gaussian
+        log_scale_min=-9.0, ))
 
 _C.training = CN(
     dict(
-        lr=1e-3, # learning rates
-        anneal_rate=0.5, # learning rate decay rate
-        anneal_interval=200000, # decrese lr by annel_rate every anneal_interval steps
-        valid_interval=1000, # validation
-        save_interval=10000, # checkpoint
-        max_iteration=3000000, # max iteration to train
-        gradient_max_norm=100.0 # global norm of gradients
-    )
-)
+        lr=1e-3,  # learning rates
+        anneal_rate=0.5,  # learning rate decay rate
+        anneal_interval=200000,  # decrease lr by anneal_rate every anneal_interval steps
+        valid_interval=1000,  # validation
+        save_interval=10000,  # checkpoint
+        max_iteration=3000000,  # max iteration to train
+        gradient_max_norm=100.0  # global norm of gradients
+    ))
+
 
 def get_cfg_defaults():
     """Get a yacs CfgNode object with default values for my_project."""
diff --git a/examples/wavenet/ljspeech.py b/examples/wavenet/ljspeech.py
index 18dc388..d1d3c67 100644
--- a/examples/wavenet/ljspeech.py
+++ b/examples/wavenet/ljspeech.py
@@ -1,3 +1,17 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 import os
 from pathlib import Path
 import pickle
@@ -9,19 +23,20 @@ from parakeet.data.batch import batch_spec, batch_wav
 from parakeet.data import dataset
 from parakeet.audio import AudioProcessor
 
+
 class LJSpeech(Dataset):
     """A simple dataset adaptor for the processed ljspeech dataset."""
+
     def __init__(self, root):
         self.root = Path(root).expanduser()
         meta_data = pandas.read_csv(
             str(self.root / "metadata.csv"),
             sep="\t",
             header=None,
-            names=["fname", "frames", "samples"]
-        )
-        
+            names=["fname", "frames", "samples"])
+
         records = []
-        for row in meta_data.itertuples() :
+        for row in meta_data.itertuples():
             mel_path = str(self.root / "mel" / (row.fname + ".npy"))
             wav_path = str(self.root / "wav" / (row.fname + ".npy"))
             records.append((mel_path, wav_path))
@@ -39,6 +54,7 @@ class LJSpeech(Dataset):
 
 class LJSpeechCollector(object):
     """A simple callable to batch LJSpeech examples."""
+
     def __init__(self, padding_value=0.):
         self.padding_value = padding_value
 
@@ -48,15 +64,15 @@ class LJSpeechCollector(object):
         wavs = [example[1] for example in examples]
         mels = batch_spec(mels, pad_value=self.padding_value)
         wavs = batch_wav(wavs, pad_value=self.padding_value)
-        audio_starts = np.zeros((batch_size,), dtype=np.int64)
+        audio_starts = np.zeros((batch_size, ), dtype=np.int64)
         return mels, wavs, audio_starts
 
 
 class LJSpeechClipCollector(object):
     def __init__(self, clip_frames=65, hop_length=256):
-        self.clip_frames = clip_frames 
+        self.clip_frames = clip_frames
         self.hop_length = hop_length
-    
+
     def __call__(self, examples):
         mels = []
         wavs = []
@@ -75,7 +91,8 @@ class LJSpeechClipCollector(object):
         mel, wav = example
         frames = mel.shape[-1]
         start = np.random.randint(0, frames - self.clip_frames)
-        wav_clip = wav[start * self.hop_length: (start + self.clip_frames) * self.hop_length]
+        wav_clip = wav[start * self.hop_length:(start + self.clip_frames) *
+                       self.hop_length]
         return mel, wav_clip, start
 
 
@@ -132,7 +149,3 @@ class DataCollector(object):
             audios = np.array(audios, dtype=np.float32)
         audio_starts = np.array(audio_starts, dtype=np.int64)
         return audios, mels, audio_starts
-
-
-
-
diff --git a/examples/wavenet/preprocess.py b/examples/wavenet/preprocess.py
index 29b140c..cc83727 100644
--- a/examples/wavenet/preprocess.py
+++ b/examples/wavenet/preprocess.py
@@ -1,3 +1,17 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 import os
 import tqdm
 import csv
@@ -23,7 +37,7 @@ class Transform(object):
         self.win_length = win_length
         self.hop_length = hop_length
         self.n_mels = n_mels
-        
+
         self.spec_normalizer = UnitMagnitude(min=1e-5)
 
     def __call__(self, example):
@@ -87,12 +101,8 @@ def create_dataset(config, input_dir, output_dir, verbose=True):
     output_dir = Path(output_dir).expanduser()
     output_dir.mkdir(exist_ok=True)
 
-    transform = Transform(
-        config.sample_rate, 
-        config.n_fft, 
-        config.win_length, 
-        config.hop_length, 
-        config.n_mels)
+    transform = Transform(config.sample_rate, config.n_fft, config.win_length,
+                          config.hop_length, config.n_mels)
     file_names = []
 
     for example in tqdm.tqdm(dataset):
@@ -108,23 +118,35 @@ def create_dataset(config, input_dir, output_dir, verbose=True):
         np.save(str(mel_dir / base_name), mel)
 
         file_names.append((base_name, mel.shape[-1], audio.shape[-1]))
-    
+
     meta_data = pd.DataFrame.from_records(file_names)
-    meta_data.to_csv(str(output_dir / "metadata.csv"), sep="\t", index=None, header=None)
-    print("saved meta data in to {}".format(os.path.join(output_dir, "metadata.csv")))
+    meta_data.to_csv(
+        str(output_dir / "metadata.csv"), sep="\t", index=None, header=None)
+    print("saved meta data in to {}".format(
+        os.path.join(output_dir, "metadata.csv")))
 
     print("Done!")
 
+
 if __name__ == "__main__":
     parser = argparse.ArgumentParser(description="create dataset")
-    parser.add_argument("--config", type=str, metavar="FILE", help="extra config to overwrite the default config")
-    parser.add_argument("--input", type=str, help="path of the ljspeech dataset")
-    parser.add_argument("--output", type=str, help="path to save output dataset")
-    parser.add_argument("--opts", nargs=argparse.REMAINDER,
+    parser.add_argument(
+        "--config",
+        type=str,
+        metavar="FILE",
+        help="extra config to overwrite the default config")
+    parser.add_argument(
+        "--input", type=str, help="path of the ljspeech dataset")
+    parser.add_argument(
+        "--output", type=str, help="path to save output dataset")
+    parser.add_argument(
+        "--opts",
+        nargs=argparse.REMAINDER,
         help="options to overwrite --config file and the default config, passing in KEY VALUE pairs"
     )
-    parser.add_argument("-v", "--verbose", action="store_true", help="print msg")
-    
+    parser.add_argument(
+        "-v", "--verbose", action="store_true", help="print msg")
+
     config = get_cfg_defaults()
     args = parser.parse_args()
     if args.config:
diff --git a/examples/wavenet/synthesize.py b/examples/wavenet/synthesize.py
index 80b96a2..c5a69fe 100644
--- a/examples/wavenet/synthesize.py
+++ b/examples/wavenet/synthesize.py
@@ -1,3 +1,17 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 import argparse
 import numpy as np
 import soundfile as sf
@@ -10,6 +24,7 @@ from parakeet.utils import layer_tools, checkpoint
 
 from config import get_cfg_defaults
 
+
 def main(config, args):
     paddle.set_device(args.device)
     model = ConditionalWaveNet.from_pretrained(config, args.checkpoint_path)
@@ -22,7 +37,8 @@ def main(config, args):
     for file_path in mel_dir.iterdir():
         mel = np.load(str(file_path))
         audio = model.predict(mel)
-        audio_path = output_dir / (os.path.splitext(file_path.name)[0] + ".wav")
+        audio_path = output_dir / (
+            os.path.splitext(file_path.name)[0] + ".wav")
         sf.write(audio_path, audio, config.data.sample_rate)
         print("[synthesize] {} -> {}".format(file_path, audio_path))
 
@@ -30,17 +46,32 @@ def main(config, args):
 if __name__ == "__main__":
     config = get_cfg_defaults()
 
-    parser = argparse.ArgumentParser(description="generate mel spectrogram with TransformerTTS.")
-    parser.add_argument("--config", type=str, metavar="FILE", help="extra config to overwrite the default config")
-    parser.add_argument("--checkpoint_path", type=str, help="path of the checkpoint to load.")
-    parser.add_argument("--input", type=str, help="path of directory containing mel spectrogram (in .npy format)")
+    parser = argparse.ArgumentParser(
+        description="generate mel spectrogram with TransformerTTS.")
+    parser.add_argument(
+        "--config",
+        type=str,
+        metavar="FILE",
+        help="extra config to overwrite the default config")
+    parser.add_argument(
+        "--checkpoint_path", type=str, help="path of the checkpoint to load.")
+    parser.add_argument(
+        "--input",
+        type=str,
+        help="path of directory containing mel spectrogram (in .npy format)")
     parser.add_argument("--output", type=str, help="path to save outputs")
-    parser.add_argument("--device", type=str, default="cpu", help="device type to use.")
-    parser.add_argument("--opts", nargs=argparse.REMAINDER, help="options to overwrite --config file and the default config, passing in KEY VALUE pairs")
-    parser.add_argument("-v", "--verbose", action="store_true", help="print msg")
-    
+    parser.add_argument(
+        "--device", type=str, default="cpu", help="device type to use.")
+    parser.add_argument(
+        "--opts",
+        nargs=argparse.REMAINDER,
+        help="options to overwrite --config file and the default config, passing in KEY VALUE pairs"
+    )
+    parser.add_argument(
+        "-v", "--verbose", action="store_true", help="print msg")
+
     args = parser.parse_args()
-    if args.config: 
+    if args.config:
         config.merge_from_file(args.config)
     if args.opts:
         config.merge_from_list(args.opts)
@@ -48,4 +79,4 @@ if __name__ == "__main__":
     print(config)
     print(args)
 
-    main(config, args)
\ No newline at end of file
+    main(config, args)
diff --git a/examples/wavenet/train.py b/examples/wavenet/train.py
index 77c54e3..8e9bc0e 100644
--- a/examples/wavenet/train.py
+++ b/examples/wavenet/train.py
@@ -1,3 +1,17 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 import time
 from pathlib import Path
 import math
@@ -26,7 +40,7 @@ class Experiment(ExperimentBase):
         config = self.config
         model = ConditionalWaveNet(
             upsample_factors=config.model.upsample_factors,
-            n_stack=config.model.n_stack, 
+            n_stack=config.model.n_stack,
             n_loop=config.model.n_loop,
             residual_channels=config.model.residual_channels,
             output_dim=config.model.output_dim,
@@ -39,13 +53,13 @@ class Experiment(ExperimentBase):
             model = paddle.DataParallel(model)
 
         lr_scheduler = paddle.optimizer.lr.StepDecay(
-            config.training.lr, 
-            config.training.anneal_interval, 
+            config.training.lr, config.training.anneal_interval,
             config.training.anneal_rate)
         optimizer = paddle.optimizer.Adam(
             lr_scheduler,
             parameters=model.parameters(),
-            grad_clip=paddle.nn.ClipGradByGlobalNorm(config.training.gradient_max_norm))
+            grad_clip=paddle.nn.ClipGradByGlobalNorm(
+                config.training.gradient_max_norm))
 
         self.model = model
         self.model_core = model._layer if self.parallel else model
@@ -56,7 +70,8 @@ class Experiment(ExperimentBase):
         args = self.args
 
         ljspeech_dataset = LJSpeech(args.data)
-        valid_set, train_set = dataset.split(ljspeech_dataset, config.data.valid_size)
+        valid_set, train_set = dataset.split(ljspeech_dataset,
+                                             config.data.valid_size)
 
         # convolutional net's causal padding size
         context_size = config.model.n_stack \
@@ -66,20 +81,21 @@ class Experiment(ExperimentBase):
 
         # frames used to compute loss
         frames_per_second = config.data.sample_rate // config.data.hop_length
-        train_clip_frames = math.ceil(config.data.train_clip_seconds * frames_per_second)
-        
+        train_clip_frames = math.ceil(config.data.train_clip_seconds *
+                                      frames_per_second)
+
         num_frames = train_clip_frames + context_frames
         batch_fn = LJSpeechClipCollector(num_frames, config.data.hop_length)
         if not self.parallel:
             train_loader = DataLoader(
-                train_set, 
-                batch_size=config.data.batch_size, 
-                shuffle=True, 
+                train_set,
+                batch_size=config.data.batch_size,
+                shuffle=True,
                 drop_last=True,
                 collate_fn=batch_fn)
         else:
             sampler = DistributedBatchSampler(
-                train_set, 
+                train_set,
                 batch_size=config.data.batch_size,
                 shuffle=True,
                 drop_last=True)
@@ -89,7 +105,7 @@ class Experiment(ExperimentBase):
         valid_batch_fn = LJSpeechCollector()
         valid_loader = DataLoader(
             valid_set, batch_size=1, collate_fn=valid_batch_fn)
-        
+
         self.train_loader = train_loader
         self.valid_loader = valid_loader
 
@@ -101,20 +117,22 @@ class Experiment(ExperimentBase):
         self.model.train()
         self.optimizer.clear_grad()
         mel, wav, audio_starts = batch
-        
+
         y = self.model(wav, mel, audio_starts)
         loss = self.model.loss(y, wav)
-        loss.backward() 
+        loss.backward()
         self.optimizer.step()
         iteration_time = time.time() - start
 
         loss_value = float(loss)
         msg = "Rank: {}, ".format(dist.get_rank())
         msg += "step: {}, ".format(self.iteration)
-        msg += "time: {:>.3f}s/{:>.3f}s, ".format(data_loader_time, iteration_time)
+        msg += "time: {:>.3f}s/{:>.3f}s, ".format(data_loader_time,
+                                                  iteration_time)
         msg += "loss: {:>.6f}".format(loss_value)
         self.logger.info(msg)
-        self.visualizer.add_scalar("train/loss", loss_value, global_step=self.iteration)
+        self.visualizer.add_scalar(
+            "train/loss", loss_value, global_step=self.iteration)
 
     @mp_tools.rank_zero_only
     @paddle.no_grad()
@@ -126,7 +144,8 @@ class Experiment(ExperimentBase):
         loss = self.model.loss(y, wav)
         valid_losses.append(float(loss))
         valid_loss = np.mean(valid_losses)
-        self.visualizer.add_scalar("valid/loss", valid_loss, global_step=self.iteration)
+        self.visualizer.add_scalar(
+            "valid/loss", valid_loss, global_step=self.iteration)
 
 
 def main_sp(config, args):
@@ -146,7 +165,7 @@ if __name__ == "__main__":
     config = get_cfg_defaults()
     parser = default_argument_parser()
     args = parser.parse_args()
-    if args.config: 
+    if args.config:
         config.merge_from_file(args.config)
     if args.opts:
         config.merge_from_list(args.opts)
diff --git a/parakeet/audio/audio.py b/parakeet/audio/audio.py
index 93d4e6b..3795111 100644
--- a/parakeet/audio/audio.py
+++ b/parakeet/audio/audio.py
@@ -18,15 +18,16 @@ import numpy as np
 
 __all__ = ["AudioProcessor"]
 
+
 class AudioProcessor(object):
     def __init__(self,
-                 sample_rate:int,
-                 n_fft:int,
-                 win_length:int,
-                 hop_length:int,
-                 n_mels:int=80,
-                 f_min:int=0,
-                 f_max:int=None,
+                 sample_rate: int,
+                 n_fft: int,
+                 win_length: int,
+                 hop_length: int,
+                 n_mels: int=80,
+                 f_min: int=0,
+                 f_max: int=None,
                  window="hann",
                  center=True,
                  pad_mode="reflect"):
@@ -40,7 +41,7 @@ class AudioProcessor(object):
         self.window = window
         self.center = center
         self.pad_mode = pad_mode
-        
+
         # mel
         self.n_mels = n_mels
         self.f_min = f_min
@@ -48,19 +49,18 @@ class AudioProcessor(object):
 
         self.mel_filter = self._create_mel_filter()
         self.inv_mel_filter = np.linalg.pinv(self.mel_filter)
-        
+
     def _create_mel_filter(self):
-        mel_filter = librosa.filters.mel(
-            self.sample_rate,
-            self.n_fft,
-            n_mels=self.n_mels,
-            fmin=self.f_min,
-            fmax=self.f_max)
+        mel_filter = librosa.filters.mel(self.sample_rate,
+                                         self.n_fft,
+                                         n_mels=self.n_mels,
+                                         fmin=self.f_min,
+                                         fmax=self.f_max)
         return mel_filter
 
     def read_wav(self, filename):
         # resampling may occur
-        wav, _ = librosa.load(filename, sr=self.sample_rate) 
+        wav, _ = librosa.load(filename, sr=self.sample_rate)
         return wav
 
     def write_wav(self, path, wav):
@@ -69,7 +69,7 @@ class AudioProcessor(object):
     def stft(self, wav):
         D = librosa.core.stft(
             wav,
-            n_fft = self.n_fft,
+            n_fft=self.n_fft,
             hop_length=self.hop_length,
             win_length=self.win_length,
             window=self.window,
diff --git a/parakeet/audio/spec_normalizer.py b/parakeet/audio/spec_normalizer.py
index 08cea1b..069c453 100644
--- a/parakeet/audio/spec_normalizer.py
+++ b/parakeet/audio/spec_normalizer.py
@@ -1,3 +1,16 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
 
 """
 This modules contains normalizers for spectrogram magnitude.
@@ -19,22 +32,24 @@ __all__ = ["NormalizerBase", "LogMagnitude", "UnitMagnitude"]
 class NormalizerBase(object):
     def transform(self, spec):
         raise NotImplementedError("transform must be implemented")
-    
+
     def inverse(self, normalized):
         raise NotImplementedError("inverse must be implemented")
 
+
 class LogMagnitude(NormalizerBase):
     """
     This is a simple normalizer used in Waveglow, Waveflow, tacotron2...
     """
+
     def __init__(self, min=1e-7):
         self.min = min
-    
+
     def transform(self, x):
         x = np.maximum(x, self.min)
         x = np.log(x)
         return x
-    
+
     def inverse(self, x):
         return np.exp(x)
 
@@ -44,15 +59,16 @@ class UnitMagnitude(NormalizerBase):
     """
     This is the normalizer used in the 
     """
+
     def __init__(self, min=1e-5):
         self.min = min
-    
+
     def transform(self, x):
         db_scale = 20 * np.log10(np.maximum(self.min, x)) - 20
         normalized = (db_scale + 100) / 100
         clipped = np.clip(normalized, 0, 1)
         return clipped
-    
+
     def inverse(self, x):
         denormalized = np.clip(x, 0, 1) * 100 - 100
         out = np.exp((denormalized + 20) / 20 * np.log(10))
diff --git a/parakeet/data/batch.py b/parakeet/data/batch.py
index 1551124..4c5be61 100644
--- a/parakeet/data/batch.py
+++ b/parakeet/data/batch.py
@@ -18,10 +18,15 @@ Batch functions for text sequences, audio and spectrograms are provided.
 import numpy as np
 
 __all__ = [
-    "batch_text_id", "batch_wav", "batch_spec",
-    "TextIDBatcher", "WavBatcher", "SpecBatcher",
+    "batch_text_id",
+    "batch_wav",
+    "batch_spec",
+    "TextIDBatcher",
+    "WavBatcher",
+    "SpecBatcher",
 ]
 
+
 class TextIDBatcher(object):
     """A wrapper class for `batch_text_id`."""
 
@@ -99,8 +104,8 @@ def batch_wav(minibatch, pad_value=0., dtype=np.float32):
         pad_len = max_len - example.shape[-1]
         batch.append(
             np.pad(example, [(0, pad_len)],
-                    mode='constant',
-                    constant_values=pad_value))
+                   mode='constant',
+                   constant_values=pad_value))
     return np.array(batch, dtype=dtype)
 
 
@@ -113,7 +118,11 @@ class SpecBatcher(object):
         self.time_major = time_major
 
     def __call__(self, minibatch):
-        out = batch_spec(minibatch, pad_value=self.pad_value, time_major=self.time_major, dtype=self.dtype)
+        out = batch_spec(
+            minibatch,
+            pad_value=self.pad_value,
+            time_major=self.time_major,
+            dtype=self.dtype)
         return out
 
 
@@ -130,7 +139,8 @@ def batch_spec(minibatch, pad_value=0., time_major=False, dtype=np.float32):
     """
     # assume (F, T) or (T, F)
     peek_example = minibatch[0]
-    assert len(peek_example.shape) == 2, "we only handles mono channel spectrogram"
+    assert len(
+        peek_example.shape) == 2, "we only handles mono channel spectrogram"
 
     # assume (F, n_frame) or (n_frame, F)
     time_idx = 0 if time_major else -1
@@ -143,11 +153,11 @@ def batch_spec(minibatch, pad_value=0., time_major=False, dtype=np.float32):
         if time_major:
             batch.append(
                 np.pad(example, [(0, pad_len), (0, 0)],
-                        mode='constant',
-                        constant_values=pad_value))
+                       mode='constant',
+                       constant_values=pad_value))
         else:
             batch.append(
                 np.pad(example, [(0, 0), (0, pad_len)],
-                        mode='constant',
-                        constant_values=pad_value))
+                       mode='constant',
+                       constant_values=pad_value))
     return np.array(batch, dtype=dtype)
diff --git a/parakeet/data/dataset.py b/parakeet/data/dataset.py
index de9b40c..a188767 100644
--- a/parakeet/data/dataset.py
+++ b/parakeet/data/dataset.py
@@ -17,17 +17,25 @@ import paddle
 from paddle.io import Dataset
 
 __all__ = [
-    "split", "TransformDataset", "CacheDataset", "TupleDataset", 
-    "DictDataset", "SliceDataset", "SubsetDataset", "FilterDataset", 
+    "split",
+    "TransformDataset",
+    "CacheDataset",
+    "TupleDataset",
+    "DictDataset",
+    "SliceDataset",
+    "SubsetDataset",
+    "FilterDataset",
     "ChainDataset",
 ]
 
+
 def split(dataset, first_size):
     """A utility function to split a dataset into two datasets."""
     first = SliceDataset(dataset, 0, first_size)
     second = SliceDataset(dataset, first_size, len(dataset))
     return first, second
 
+
 class TransformDataset(Dataset):
     def __init__(self, dataset, transform):
         """Dataset which is transformed from another with a transform.
@@ -141,7 +149,7 @@ class DictDataset(Dataset):
                     for i in six.moves.range(length)]
         else:
             return batches
-    
+
     def __len__(self):
         return self._length
 
diff --git a/parakeet/datasets/__init__.py b/parakeet/datasets/__init__.py
index de7be70..e75da0b 100644
--- a/parakeet/datasets/__init__.py
+++ b/parakeet/datasets/__init__.py
@@ -1,2 +1,16 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 from parakeet.datasets.common import *
 from parakeet.datasets.ljspeech import *
\ No newline at end of file
diff --git a/parakeet/datasets/common.py b/parakeet/datasets/common.py
index e0d91a3..a1d16d6 100644
--- a/parakeet/datasets/common.py
+++ b/parakeet/datasets/common.py
@@ -1,9 +1,24 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 from paddle.io import Dataset
 import os
 import librosa
 
 __all__ = ["AudioFolderDataset"]
 
+
 class AudioFolderDataset(Dataset):
     def __init__(self, path, sample_rate, extension="wav"):
         self.root = os.path.expanduser(path)
@@ -19,5 +34,5 @@ class AudioFolderDataset(Dataset):
 
     def __getitem__(self, i):
         file_name = self.file_names[i]
-        y, _ = librosa.load(file_name, sr=self.sample_rate) # pylint: disable=unused-variable
+        y, _ = librosa.load(file_name, sr=self.sample_rate)  # pylint: disable=unused-variable
         return y
diff --git a/parakeet/datasets/ljspeech.py b/parakeet/datasets/ljspeech.py
index 9c2e0c3..a37863f 100644
--- a/parakeet/datasets/ljspeech.py
+++ b/parakeet/datasets/ljspeech.py
@@ -1,8 +1,23 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 from paddle.io import Dataset
 from pathlib import Path
 
 __all__ = ["LJSpeechMetaData"]
 
+
 class LJSpeechMetaData(Dataset):
     def __init__(self, root):
         self.root = Path(root).expanduser()
@@ -22,4 +37,3 @@ class LJSpeechMetaData(Dataset):
 
     def __len__(self):
         return len(self.records)
-
diff --git a/parakeet/frontend/__init__.py b/parakeet/frontend/__init__.py
index cee73c1..2d06dda 100644
--- a/parakeet/frontend/__init__.py
+++ b/parakeet/frontend/__init__.py
@@ -1,3 +1,17 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 from parakeet.frontend.vocab import *
 from parakeet.frontend.phonectic import *
 from parakeet.frontend.punctuation import *
diff --git a/parakeet/frontend/normalizer/__init__.py b/parakeet/frontend/normalizer/__init__.py
index f098650..37fd580 100644
--- a/parakeet/frontend/normalizer/__init__.py
+++ b/parakeet/frontend/normalizer/__init__.py
@@ -1,2 +1,16 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 from parakeet.frontend.normalizer.normalizer import *
 from parakeet.frontend.normalizer.numbers import *
diff --git a/parakeet/frontend/normalizer/abbrrviation.py b/parakeet/frontend/normalizer/abbrrviation.py
index e69de29..9118340 100644
--- a/parakeet/frontend/normalizer/abbrrviation.py
+++ b/parakeet/frontend/normalizer/abbrrviation.py
@@ -0,0 +1,14 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
diff --git a/parakeet/frontend/normalizer/acronyms.py b/parakeet/frontend/normalizer/acronyms.py
index e69de29..9118340 100644
--- a/parakeet/frontend/normalizer/acronyms.py
+++ b/parakeet/frontend/normalizer/acronyms.py
@@ -0,0 +1,14 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
diff --git a/parakeet/frontend/normalizer/width.py b/parakeet/frontend/normalizer/width.py
index 440557f..b1598af 100644
--- a/parakeet/frontend/normalizer/width.py
+++ b/parakeet/frontend/normalizer/width.py
@@ -1,8 +1,22 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 def full2half_width(ustr):
     half = []
     for u in ustr:
         num = ord(u)
-        if num == 0x3000:    # 全角空格变半角
+        if num == 0x3000:  # full-width space becomes half-width space
             num = 32
         elif 0xFF01 <= num <= 0xFF5E:
             num -= 0xfee0
@@ -10,15 +24,16 @@ def full2half_width(ustr):
         half.append(u)
     return ''.join(half)
 
+
 def half2full_width(ustr):
     full = []
     for u in ustr:
         num = ord(u)
-        if num == 32:    # 半角空格变全角
+        if num == 32:  # half-width space becomes full-width space
             num = 0x3000
         elif 0x21 <= num <= 0x7E:
             num += 0xfee0
-        u = chr(num)    # to unicode
+        u = chr(num)  # to unicode
         full.append(u)
-        
-    return ''.join(full)
\ No newline at end of file
+
+    return ''.join(full)
diff --git a/parakeet/frontend/punctuation.py b/parakeet/frontend/punctuation.py
index 9984970..099e759 100644
--- a/parakeet/frontend/punctuation.py
+++ b/parakeet/frontend/punctuation.py
@@ -1,3 +1,17 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 import abc
 import string
 
@@ -13,15 +27,8 @@ EN_PUNCT = [
     "!",
 ]
 
-CN_PUNCT = [
-    "、",
-    "，",
-    "；",
-    "：",
-    "。",
-    "？",
-    "！"
-]
+CN_PUNCT = ["、", "，", "；", "：", "。", "？", "！"]
+
 
 def get_punctuations(lang):
     if lang == "en":
@@ -30,4 +37,3 @@ def get_punctuations(lang):
         return CN_PUNCT
     else:
         raise ValueError(f"language {lang} Not supported")
-
diff --git a/parakeet/models/transformer_tts.py b/parakeet/models/transformer_tts.py
index f84a9f8..c7f0ccd 100644
--- a/parakeet/models/transformer_tts.py
+++ b/parakeet/models/transformer_tts.py
@@ -559,7 +559,7 @@ class TransformerTTS(nn.Layer):
     @classmethod
     def from_pretrained(cls, frontend, config, checkpoint_path):
         model = TransformerTTS(
-            frontend, 
+            frontend,
             d_encoder=config.model.d_encoder,
             d_decoder=config.model.d_decoder,
             d_mel=config.data.d_mel,
@@ -575,11 +575,12 @@ class TransformerTTS(nn.Layer):
             decoder_prenet_dropout=config.model.decoder_prenet_dropout,
             dropout=config.model.dropout)
 
-        iteration = checkpoint.load_parameters(model, checkpoint_path=checkpoint_path)
+        iteration = checkpoint.load_parameters(
+            model, checkpoint_path=checkpoint_path)
         drop_n_heads = scheduler.StepWise(config.training.drop_n_heads)
         reduction_factor = scheduler.StepWise(config.training.reduction_factor)
         model.set_constants(
-            reduction_factor=reduction_factor(iteration), 
+            reduction_factor=reduction_factor(iteration),
             drop_n_heads=drop_n_heads(iteration))
         return model
 
diff --git a/parakeet/models/waveflow.py b/parakeet/models/waveflow.py
index d58127b..625e61f 100644
--- a/parakeet/models/waveflow.py
+++ b/parakeet/models/waveflow.py
@@ -1,3 +1,17 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 import math
 import numpy as np
 from typing import List, Union, Tuple
@@ -11,6 +25,7 @@ from parakeet.modules import geometry as geo
 
 __all__ = ["WaveFlow", "ConditionalWaveFlow", "WaveFlowLoss"]
 
+
 def fold(x, n_group):
     r"""Fold audio or spectrogram's temporal dimension in to groups.
 
@@ -31,6 +46,7 @@ def fold(x, n_group):
     new_shape = spatial_shape + [time_steps // n_group, n_group]
     return paddle.reshape(x, new_shape)
 
+
 class UpsampleNet(nn.LayerList):
     """Layer to upsample mel spectrogram to the same temporal resolution with 
     the corresponding waveform. 
@@ -60,6 +76,7 @@ class UpsampleNet(nn.LayerList):
     ---------
     ``librosa.core.stft``
     """
+
     def __init__(self, upsample_factors):
         super(UpsampleNet, self).__init__()
         for factor in upsample_factors:
@@ -67,16 +84,18 @@ class UpsampleNet(nn.LayerList):
             init = I.Uniform(-std, std)
             self.append(
                 nn.utils.weight_norm(
-                    nn.Conv2DTranspose(1, 1, (3, 2 * factor), 
+                    nn.Conv2DTranspose(
+                        1,
+                        1, (3, 2 * factor),
                         padding=(1, factor // 2),
                         stride=(1, factor),
                         weight_attr=init,
                         bias_attr=init)))
-            
+
         # upsample factors
         self.upsample_factor = np.prod(upsample_factors)
         self.upsample_factors = upsample_factors
-    
+
     def forward(self, x, trim_conv_artifact=False):
         r"""Forward pass of the ``UpsampleNet``.
         
@@ -131,38 +150,47 @@ class ResidualBlock(nn.Layer):
     dilations : int
         Dilations of the Convolution2d applied to the input.
     """
+
     def __init__(self, channels, cond_channels, kernel_size, dilations):
         super(ResidualBlock, self).__init__()
         # input conv
         std = math.sqrt(1 / channels * np.prod(kernel_size))
         init = I.Uniform(-std, std)
-        receptive_field = [1 + (k - 1) * d for (k, d) in zip(kernel_size, dilations)]
+        receptive_field = [
+            1 + (k - 1) * d for (k, d) in zip(kernel_size, dilations)
+        ]
         rh, rw = receptive_field
-        paddings = [rh - 1, 0, rw // 2, (rw - 1) // 2] # causal & same
-        conv = nn.Conv2D(channels, 2 * channels, kernel_size, 
-                         padding=paddings,
-                         dilation=dilations, 
-                         weight_attr=init, 
-                         bias_attr=init)
+        paddings = [rh - 1, 0, rw // 2, (rw - 1) // 2]  # causal & same
+        conv = nn.Conv2D(
+            channels,
+            2 * channels,
+            kernel_size,
+            padding=paddings,
+            dilation=dilations,
+            weight_attr=init,
+            bias_attr=init)
         self.conv = nn.utils.weight_norm(conv)
         self.rh = rh
         self.rw = rw
         self.dilations = dilations
-        
+
         # condition projection
         std = math.sqrt(1 / cond_channels)
         init = I.Uniform(-std, std)
-        condition_proj = nn.Conv2D(cond_channels, 2 * channels, (1, 1),
-                                   weight_attr=init, bias_attr=init)
+        condition_proj = nn.Conv2D(
+            cond_channels,
+            2 * channels, (1, 1),
+            weight_attr=init,
+            bias_attr=init)
         self.condition_proj = nn.utils.weight_norm(condition_proj)
-        
+
         # parametric residual & skip connection
         std = math.sqrt(1 / channels)
         init = I.Uniform(-std, std)
-        out_proj = nn.Conv2D(channels, 2 * channels, (1, 1),
-                             weight_attr=init, bias_attr=init)
+        out_proj = nn.Conv2D(
+            channels, 2 * channels, (1, 1), weight_attr=init, bias_attr=init)
         self.out_proj = nn.utils.weight_norm(out_proj)
-        
+
     def forward(self, x, condition):
         """Compute output for a whole folded sequence.
         
@@ -185,10 +213,10 @@ class ResidualBlock(nn.Layer):
         x_in = x
         x = self.conv(x)
         x += self.condition_proj(condition)
-        
+
         content, gate = paddle.chunk(x, 2, axis=1)
         x = paddle.tanh(content) * F.sigmoid(gate)
-        
+
         x = self.out_proj(x)
         res, skip = paddle.chunk(x, 2, axis=1)
         res = x_in + res
@@ -249,7 +277,7 @@ class ResidualBlock(nn.Layer):
 
         content, gate = paddle.chunk(x_row, 2, axis=1)
         x_row = paddle.tanh(content) * F.sigmoid(gate)
-        
+
         x_row = self.out_proj(x_row)
         res, skip = paddle.chunk(x_row, 2, axis=1)
         res = x_row_in + res
@@ -290,20 +318,23 @@ class ResidualNet(nn.LayerList):
     ValueError
         If the length of dilations_h does not equals n_layers.
     """
-    def __init__(self, 
-                 n_layer: int, 
-                 residual_channels: int, 
-                 condition_channels: int, 
-                 kernel_size: Tuple[int], 
+
+    def __init__(self,
+                 n_layer: int,
+                 residual_channels: int,
+                 condition_channels: int,
+                 kernel_size: Tuple[int],
                  dilations_h: List[int]):
         if len(dilations_h) != n_layer:
-            raise ValueError("number of dilations_h should equals num of layers")
+            raise ValueError(
+                "number of dilations_h should equals num of layers")
         super(ResidualNet, self).__init__()
         for i in range(n_layer):
-            dilation = (dilations_h[i], 2 ** i)
-            layer = ResidualBlock(residual_channels, condition_channels, kernel_size, dilation)
+            dilation = (dilations_h[i], 2**i)
+            layer = ResidualBlock(residual_channels, condition_channels,
+                                  kernel_size, dilation)
             self.append(layer)
-            
+
     def forward(self, x, condition):
         """Comput the output of given the input and the condition.
 
@@ -332,7 +363,7 @@ class ResidualNet(nn.LayerList):
         """
         for layer in self:
             layer.start_sequence()
-    
+
     def add_input(self, x_row, condition_row):
         """Compute the output for a row and update the buffers.
 
@@ -386,33 +417,37 @@ class Flow(nn.Layer):
         Number of timesteps to the folded into a group.
     """
     dilations_dict = {
-            8: [1, 1, 1, 1, 1, 1, 1, 1],
-            16: [1, 1, 1, 1, 1, 1, 1, 1],
-            32: [1, 2, 4, 1, 2, 4, 1, 2],
-            64: [1, 2, 4, 8, 16, 1, 2, 4],
-            128: [1, 2, 4, 8, 16, 32, 64, 1]
+        8: [1, 1, 1, 1, 1, 1, 1, 1],
+        16: [1, 1, 1, 1, 1, 1, 1, 1],
+        32: [1, 2, 4, 1, 2, 4, 1, 2],
+        64: [1, 2, 4, 8, 16, 1, 2, 4],
+        128: [1, 2, 4, 8, 16, 32, 64, 1]
     }
-    
+
     def __init__(self, n_layers, channels, mel_bands, kernel_size, n_group):
         super(Flow, self).__init__()
         # input projection
         self.input_proj = nn.utils.weight_norm(
-            nn.Conv2D(1, channels, (1, 1), 
-                      weight_attr=I.Uniform(-1., 1.), 
-                      bias_attr=I.Uniform(-1., 1.)))
-        
+            nn.Conv2D(
+                1,
+                channels, (1, 1),
+                weight_attr=I.Uniform(-1., 1.),
+                bias_attr=I.Uniform(-1., 1.)))
+
         # residual net
-        self.resnet = ResidualNet(n_layers, channels, mel_bands, kernel_size, 
+        self.resnet = ResidualNet(n_layers, channels, mel_bands, kernel_size,
                                   self.dilations_dict[n_group])
-        
+
         # output projection
-        self.output_proj = nn.Conv2D(channels, 2, (1, 1),
-                                   weight_attr=I.Constant(0.),
-                                   bias_attr=I.Constant(0.))
-        
+        self.output_proj = nn.Conv2D(
+            channels,
+            2, (1, 1),
+            weight_attr=I.Constant(0.),
+            bias_attr=I.Constant(0.))
+
         # specs
         self.n_group = n_group
-    
+
     def _predict_parameters(self, x, condition):
         x = self.input_proj(x)
         x = self.resnet(x, condition)
@@ -421,11 +456,11 @@ class Flow(nn.Layer):
         return logs, b
 
     def _transform(self, x, logs, b):
-        z_0 = x[:, :, :1, :] # the first row, just copy it
-        z_out = x[:, :, 1:, :] * paddle.exp(logs) + b            
+        z_0 = x[:, :, :1, :]  # the first row, just copy it
+        z_out = x[:, :, 1:, :] * paddle.exp(logs) + b
         z_out = paddle.concat([z_0, z_out], axis=2)
         return z_out
-    
+
     def forward(self, x, condition):
         """Probability density estimation. It is done by inversely transform 
         a sample from p(X) into a sample from p(Z).
@@ -452,8 +487,8 @@ class Flow(nn.Layer):
             transformation from x to z.
         """
         # (B, C, H-1, W)
-        logs, b = self._predict_parameters(
-            x[:, :, :-1, :], condition[:, :, 1:, :]) 
+        logs, b = self._predict_parameters(x[:, :, :-1, :],
+                                           condition[:, :, 1:, :])
         z = self._transform(x, logs, b)
         return z, (logs, b)
 
@@ -467,7 +502,7 @@ class Flow(nn.Layer):
     def _inverse_transform_row(self, z_row, logs, b):
         x_row = (z_row - b) * paddle.exp(-logs)
         return x_row
-    
+
     def _inverse_row(self, z_row, x_row, condition_row):
         logs, b = self._predict_row_parameters(x_row, condition_row)
         x_next_row = self._inverse_transform_row(z_row, logs, b)
@@ -475,7 +510,7 @@ class Flow(nn.Layer):
 
     def _start_sequence(self):
         self.resnet.start_sequence()
-    
+
     def inverse(self, z, condition):
         """Sampling from the the distrition p(X). It is done by sample form 
         p(Z) and transform the sample. It is a auto regressive transformation.
@@ -510,15 +545,16 @@ class Flow(nn.Layer):
 
         self._start_sequence()
         for i in range(1, self.n_group):
-            x_row = x[-1] # actuallt i-1:i
-            z_row = z[:, :, i:i+1, :]
-            condition_row = condition[:, :, i:i+1, :]
+            x_row = x[-1]  # actually i-1:i
+            z_row = z[:, :, i:i + 1, :]
+            condition_row = condition[:, :, i:i + 1, :]
 
-            x_next_row, (logs, b) = self._inverse_row(z_row, x_row, condition_row)
+            x_next_row, (logs, b) = self._inverse_row(z_row, x_row,
+                                                      condition_row)
             x.append(x_next_row)
             logs_list.append(logs)
             b_list.append(b)
-        
+
         x = paddle.concat(x, 2)
         logs = paddle.concat(logs_list, 2)
         b = paddle.concat(b_list, 2)
@@ -549,21 +585,25 @@ class WaveFlow(nn.LayerList):
     kernel_size : Union[int, List[int]]
         Kernel size of the convolution layer in each ResidualBlock.
     """
-    def __init__(self, n_flows, n_layers, n_group, channels, mel_bands, kernel_size):
+
+    def __init__(self, n_flows, n_layers, n_group, channels, mel_bands,
+                 kernel_size):
         if n_group % 2 or n_flows % 2:
-            raise ValueError("number of flows and number of group must be even "
-                             "since a permutation along group among flows is used.")
+            raise ValueError(
+                "number of flows and number of group must be even "
+                "since a permutation along group among flows is used.")
         super(WaveFlow, self).__init__()
         for _ in range(n_flows):
-            self.append(Flow(n_layers, channels, mel_bands, kernel_size, n_group))
-        
+            self.append(
+                Flow(n_layers, channels, mel_bands, kernel_size, n_group))
+
         # permutations in h
         self.perms = self._create_perm(n_group, n_flows)
 
         # specs
         self.n_group = n_group
         self.n_flows = n_flows
-    
+
     def _create_perm(self, n_group, n_flows):
         indices = list(range(n_group))
         half = n_group // 2
@@ -572,20 +612,21 @@ class WaveFlow(nn.LayerList):
             if i < n_flows // 2:
                 perms.append(indices[::-1])
             else:
-                perm = list(reversed(indices[:half])) + list(reversed(indices[half:]))
+                perm = list(reversed(indices[:half])) + list(
+                    reversed(indices[half:]))
                 perms.append(perm)
         return perms
-        
+
     def _trim(self, x, condition):
         assert condition.shape[-1] >= x.shape[-1]
         pruned_len = int(x.shape[-1] // self.n_group * self.n_group)
-        
+
         if x.shape[-1] > pruned_len:
             x = x[:, :pruned_len]
         if condition.shape[-1] > pruned_len:
             condition = condition[:, :, :pruned_len]
         return x, condition
-    
+
     def forward(self, x, condition):
         """Probability density estimation of random variable x given the 
         condition.
@@ -610,21 +651,23 @@ class WaveFlow(nn.LayerList):
         # x: (B, T)
         # condition: (B, C, T) upsampled condition
         x, condition = self._trim(x, condition)
-        
+
         # to (B, C, h, T//h) layout
-        x = paddle.unsqueeze(paddle.transpose(fold(x, self.n_group), [0, 2, 1]), 1)
-        condition = paddle.transpose(fold(condition, self.n_group), [0, 1, 3, 2])
-        
+        x = paddle.unsqueeze(
+            paddle.transpose(fold(x, self.n_group), [0, 2, 1]), 1)
+        condition = paddle.transpose(
+            fold(condition, self.n_group), [0, 1, 3, 2])
+
         # flows
         logs_list = []
         for i, layer in enumerate(self):
-            x, (logs, b) = layer(x, condition)          
+            x, (logs, b) = layer(x, condition)
             logs_list.append(logs)
             # permute paddle has no shuffle dim
             x = geo.shuffle_dim(x, 2, perm=self.perms[i])
             condition = geo.shuffle_dim(condition, 2, perm=self.perms[i])
 
-        z = paddle.squeeze(x, 1) # (B, H, W)
+        z = paddle.squeeze(x, 1)  # (B, H, W)
         batch_size = z.shape[0]
         z = paddle.reshape(paddle.transpose(z, [0, 2, 1]), [batch_size, -1])
 
@@ -654,8 +697,10 @@ class WaveFlow(nn.LayerList):
 
         z, condition = self._trim(z, condition)
         # to (B, C, h, T//h) layout
-        z = paddle.unsqueeze(paddle.transpose(fold(z, self.n_group), [0, 2, 1]), 1)
-        condition = paddle.transpose(fold(condition, self.n_group), [0, 1, 3, 2])
+        z = paddle.unsqueeze(
+            paddle.transpose(fold(z, self.n_group), [0, 2, 1]), 1)
+        condition = paddle.transpose(
+            fold(condition, self.n_group), [0, 1, 3, 2])
 
         # reverse it flow by flow
         for i in reversed(range(self.n_flows)):
@@ -663,7 +708,7 @@ class WaveFlow(nn.LayerList):
             condition = geo.shuffle_dim(condition, 2, perm=self.perms[i])
             z, (logs, b) = self[i].inverse(z, condition)
 
-        x = paddle.squeeze(z, 1) # (B, H, W)
+        x = paddle.squeeze(z, 1)  # (B, H, W)
         batch_size = x.shape[0]
         x = paddle.reshape(paddle.transpose(x, [0, 2, 1]), [batch_size, -1])
         return x
@@ -695,23 +740,24 @@ class ConditionalWaveFlow(nn.LayerList):
     kernel_size : Union[int, List[int]]
         Kernel size of the convolution layer in each ResidualBlock.
     """
-    def __init__(self, 
-                upsample_factors: List[int], 
-                n_flows: int, 
-                n_layers: int, 
-                n_group: int, 
-                channels: int, 
-                n_mels: int, 
-                kernel_size: Union[int, List[int]]):
+
+    def __init__(self,
+                 upsample_factors: List[int],
+                 n_flows: int,
+                 n_layers: int,
+                 n_group: int,
+                 channels: int,
+                 n_mels: int,
+                 kernel_size: Union[int, List[int]]):
         super(ConditionalWaveFlow, self).__init__()
         self.encoder = UpsampleNet(upsample_factors)
         self.decoder = WaveFlow(
-        n_flows=n_flows,
-        n_layers=n_layers,
-        n_group=n_group,
-        channels=channels,
-        mel_bands=n_mels,
-        kernel_size=kernel_size)
+            n_flows=n_flows,
+            n_layers=n_layers,
+            n_group=n_group,
+            channels=channels,
+            mel_bands=n_mels,
+            kernel_size=kernel_size)
 
     def forward(self, audio, mel):
         """Compute the transformed random variable z (x to z) and the log of 
@@ -737,7 +783,7 @@ class ConditionalWaveFlow(nn.LayerList):
         condition = self.encoder(mel)
         z, log_det_jacobian = self.decoder(audio, condition)
         return z, log_det_jacobian
-    
+
     @paddle.no_grad()
     def infer(self, mel):
         r"""Generate raw audio given mel spectrogram.
@@ -752,12 +798,12 @@ class ConditionalWaveFlow(nn.LayerList):
         Tensor : [shape=(B, T)] 
             The synthesized audio, where``T <= T_mel \* upsample_factors``.
         """
-        condition = self.encoder(mel, trim_conv_artifact=True) #(B, C, T)
+        condition = self.encoder(mel, trim_conv_artifact=True)  #(B, C, T)
         batch_size, _, time_steps = condition.shape
         z = paddle.randn([batch_size, time_steps], dtype=mel.dtype)
         x = self.decoder.inverse(z, condition)
         return x
-    
+
     @paddle.no_grad()
     def predict(self, mel):
         """Generate raw audio given mel spectrogram.
@@ -777,7 +823,7 @@ class ConditionalWaveFlow(nn.LayerList):
         audio = self.infer(mel)
         audio = audio[0].numpy()
         return audio
-    
+
     @classmethod
     def from_pretrained(cls, config, checkpoint_path):
         """Build a ConditionalWaveFlow model from a pretrained model.
@@ -795,14 +841,13 @@ class ConditionalWaveFlow(nn.LayerList):
         ConditionalWaveFlow
             The model built from pretrained result.
         """
-        model = cls(
-            upsample_factors=config.model.upsample_factors,
-            n_flows=config.model.n_flows,
-            n_layers=config.model.n_layers,
-            n_group=config.model.n_group,
-            channels=config.model.channels,
-            n_mels=config.data.n_mels,
-            kernel_size=config.model.kernel_size)
+        model = cls(upsample_factors=config.model.upsample_factors,
+                    n_flows=config.model.n_flows,
+                    n_layers=config.model.n_layers,
+                    n_group=config.model.n_group,
+                    channels=config.model.channels,
+                    n_mels=config.data.n_mels,
+                    kernel_size=config.model.kernel_size)
         checkpoint.load_parameters(model, checkpoint_path=checkpoint_path)
         return model
 
@@ -816,6 +861,7 @@ class WaveFlowLoss(nn.Layer):
         The standard deviation of the gaussian noise used in WaveFlow, by 
         default 1.0.
     """
+
     def __init__(self, sigma=1.0):
         super(WaveFlowLoss, self).__init__()
         self.sigma = sigma
@@ -839,6 +885,7 @@ class WaveFlowLoss(nn.Layer):
         Tensor [shape=(1,)]
             The loss.
         """
-        loss = paddle.sum(z * z) / (2 * self.sigma * self.sigma) - log_det_jacobian
+        loss = paddle.sum(z * z) / (2 * self.sigma * self.sigma
+                                    ) - log_det_jacobian
         loss = loss / np.prod(z.shape)
         return loss + self.const
diff --git a/parakeet/models/wavenet.py b/parakeet/models/wavenet.py
index 8e6f272..5ff3435 100644
--- a/parakeet/models/wavenet.py
+++ b/parakeet/models/wavenet.py
@@ -18,7 +18,7 @@ from typing import Union, Sequence, List
 from tqdm import trange
 import numpy as np
 
-import paddle 
+import paddle
 from paddle import nn
 from paddle.nn import functional as F
 import paddle.fluid.initializer as I
@@ -30,6 +30,7 @@ from parakeet.utils import checkpoint, layer_tools
 
 __all__ = ["WaveNet", "ConditionalWaveNet"]
 
+
 def crop(x, audio_start, audio_length):
     """Crop the upsampled condition to match audio_length. 
     
@@ -96,6 +97,7 @@ class UpsampleNet(nn.LayerList):
     ---------
     ``librosa.core.stft``
     """
+
     def __init__(self, upscale_factors=[16, 16]):
         super(UpsampleNet, self).__init__()
         self.upscale_factors = list(upscale_factors)
@@ -106,9 +108,11 @@ class UpsampleNet(nn.LayerList):
         for factor in self.upscale_factors:
             self.append(
                 nn.utils.weight_norm(
-                    nn.Conv2DTranspose(1, 1, 
-                        kernel_size=(3, 2 * factor), 
-                        stride=(1, factor), 
+                    nn.Conv2DTranspose(
+                        1,
+                        1,
+                        kernel_size=(3, 2 * factor),
+                        stride=(1, factor),
                         padding=(1, factor // 2))))
 
     def forward(self, x):
@@ -159,29 +163,34 @@ class ResidualBlock(nn.Layer):
     dilation :int
         Dilation of the internal convolution cells.
     """
-    def __init__(self, 
-                 residual_channels: int, 
-                 condition_dim: int, 
+
+    def __init__(self,
+                 residual_channels: int,
+                 condition_dim: int,
                  filter_size: Union[int, Sequence[int]],
                  dilation: int):
-        
+
         super(ResidualBlock, self).__init__()
         dilated_channels = 2 * residual_channels
         # following clarinet's implementation, we do not have parametric residual
         # & skip connection.
 
-        _filter_size = filter_size[0] if isinstance(filter_size, (list, tuple)) else filter_size
+        _filter_size = filter_size[0] if isinstance(filter_size, (
+            list, tuple)) else filter_size
         std = math.sqrt(1 / (_filter_size * residual_channels))
-        conv = Conv1dCell(residual_channels, 
-                          dilated_channels, 
-                          filter_size, 
-                          dilation=dilation, 
-                          weight_attr=I.Normal(scale=std))
+        conv = Conv1dCell(
+            residual_channels,
+            dilated_channels,
+            filter_size,
+            dilation=dilation,
+            weight_attr=I.Normal(scale=std))
         self.conv = nn.utils.weight_norm(conv)
 
         std = math.sqrt(1 / condition_dim)
-        condition_proj = Conv1dCell(condition_dim, dilated_channels, (1,), 
-                                    weight_attr=I.Normal(scale=std))
+        condition_proj = Conv1dCell(
+            condition_dim,
+            dilated_channels, (1, ),
+            weight_attr=I.Normal(scale=std))
         self.condition_proj = nn.utils.weight_norm(condition_proj)
 
         self.filter_size = filter_size
@@ -309,10 +318,11 @@ class ResidualNet(nn.LayerList):
         Kernel size of the internal ``Conv1dCell`` of each ``ResidualBlock``.
 
     """
-    def __init__(self, 
-                 n_stack: int, 
-                 n_loop: int, 
-                 residual_channels: int, 
+
+    def __init__(self,
+                 n_stack: int,
+                 n_loop: int,
+                 residual_channels: int,
                  condition_dim: int,
                  filter_size: int):
         super(ResidualNet, self).__init__()
@@ -320,7 +330,9 @@ class ResidualNet(nn.LayerList):
         dilations = [2**i for i in range(n_loop)] * n_stack
         self.context_size = 1 + sum(dilations)
         for dilation in dilations:
-            self.append(ResidualBlock(residual_channels, condition_dim, filter_size, dilation))
+            self.append(
+                ResidualBlock(residual_channels, condition_dim, filter_size,
+                              dilation))
 
     def forward(self, x, condition=None):
         """Forward pass of ``ResidualNet``.
@@ -345,7 +357,7 @@ class ResidualNet(nn.LayerList):
                 skip_connections = skip
             else:
                 skip_connections = paddle.scale(skip_connections + skip,
-                                        math.sqrt(0.5))
+                                                math.sqrt(0.5))
         return skip_connections
 
     def start_sequence(self):
@@ -381,7 +393,7 @@ class ResidualNet(nn.LayerList):
                 skip_connections = skip
             else:
                 skip_connections = paddle.scale(skip_connections + skip,
-                                        math.sqrt(0.5))
+                                                math.sqrt(0.5))
         return skip_connections
 
 
@@ -426,6 +438,7 @@ class WaveNet(nn.Layer):
         This is only used for computing loss when ``loss_type`` is "mog", If 
         the predicted log scale is less than -9.0, it is clipped at -9.0.
     """
+
     def __init__(self, n_stack, n_loop, residual_channels, output_dim,
                  condition_dim, filter_size, loss_type, log_scale_min):
 
@@ -437,19 +450,24 @@ class WaveNet(nn.Layer):
         else:
             if (output_dim % 3 != 0):
                 raise ValueError(
-                    "with Mixture of Gaussians(mog) output, the output dim must be divisible by 3, but get {}".format(output_dim))
-            self.embed = nn.utils.weight_norm(nn.Linear(1, residual_channels), dim=1)
+                    "with Mixture of Gaussians(mog) output, the output dim must be divisible by 3, but get {}".
+                    format(output_dim))
+            self.embed = nn.utils.weight_norm(
+                nn.Linear(1, residual_channels), dim=1)
 
         self.resnet = ResidualNet(n_stack, n_loop, residual_channels,
                                   condition_dim, filter_size)
         self.context_size = self.resnet.context_size
 
         skip_channels = residual_channels  # assume the same channel
-        self.proj1 = nn.utils.weight_norm(nn.Linear(skip_channels, skip_channels), dim=1)
-        self.proj2 = nn.utils.weight_norm(nn.Linear(skip_channels, skip_channels), dim=1)
+        self.proj1 = nn.utils.weight_norm(
+            nn.Linear(skip_channels, skip_channels), dim=1)
+        self.proj2 = nn.utils.weight_norm(
+            nn.Linear(skip_channels, skip_channels), dim=1)
         # if loss_type is softmax, output_dim is n_vocab of waveform magnitude.
         # if loss_type is mog, output_dim is 3 * gaussian, (weight, mean and stddev)
-        self.proj3 = nn.utils.weight_norm(nn.Linear(skip_channels, output_dim), dim=1)
+        self.proj3 = nn.utils.weight_norm(
+            nn.Linear(skip_channels, output_dim), dim=1)
 
         self.loss_type = loss_type
         self.output_dim = output_dim
@@ -781,26 +799,28 @@ class ConditionalWaveNet(nn.Layer):
         This is only used for computing loss when ``loss_type`` is "mog", If 
         the predicted log scale is less than -9.0, it is clipped at -9.0.
     """
-    def __init__(self, 
-                 upsample_factors: List[int], 
-                 n_stack: int, 
-                 n_loop: int, 
-                 residual_channels: int, 
+
+    def __init__(self,
+                 upsample_factors: List[int],
+                 n_stack: int,
+                 n_loop: int,
+                 residual_channels: int,
                  output_dim: int,
-                 n_mels: int, 
-                 filter_size: int=2, 
-                 loss_type: str="mog", 
+                 n_mels: int,
+                 filter_size: int=2,
+                 loss_type: str="mog",
                  log_scale_min: float=-9.0):
         super(ConditionalWaveNet, self).__init__()
         self.encoder = UpsampleNet(upsample_factors)
-        self.decoder = WaveNet(n_stack=n_stack, 
-                               n_loop=n_loop,
-                               residual_channels=residual_channels,
-                               output_dim=output_dim,
-                               condition_dim=n_mels,
-                               filter_size=filter_size,
-                               loss_type=loss_type,
-                               log_scale_min=log_scale_min)
+        self.decoder = WaveNet(
+            n_stack=n_stack,
+            n_loop=n_loop,
+            residual_channels=residual_channels,
+            output_dim=output_dim,
+            condition_dim=n_mels,
+            filter_size=filter_size,
+            loss_type=loss_type,
+            log_scale_min=log_scale_min)
 
     def forward(self, audio, mel, audio_start):
         """Compute the output distribution given the mel spectrogram and the input(for teacher force training).
@@ -895,11 +915,11 @@ class ConditionalWaveNet(nn.Layer):
         self.decoder.start_sequence()
         x_t = paddle.zeros((batch_size, ), dtype=mel.dtype)
         for i in trange(time_steps):
-            c_t = condition[:, :, i] # (B, C)
-            y_t = self.decoder.add_input(x_t, c_t) #(B, C)
+            c_t = condition[:, :, i]  # (B, C)
+            y_t = self.decoder.add_input(x_t, c_t)  #(B, C)
             y_t = paddle.unsqueeze(y_t, 1)
-            x_t = self.sample(y_t) # (B, 1)
-            x_t = paddle.squeeze(x_t, 1) #(B,)
+            x_t = self.sample(y_t)  # (B, 1)
+            x_t = paddle.squeeze(x_t, 1)  #(B,)
             samples.append(x_t)
         samples = paddle.stack(samples, -1)
         return samples
@@ -943,16 +963,15 @@ class ConditionalWaveNet(nn.Layer):
         ConditionalWaveNet
             The model built from pretrained result.
         """
-        model = cls(
-            upsample_factors=config.model.upsample_factors,
-            n_stack=config.model.n_stack, 
-            n_loop=config.model.n_loop,
-            residual_channels=config.model.residual_channels,
-            output_dim=config.model.output_dim,
-            n_mels=config.data.n_mels,
-            filter_size=config.model.filter_size,
-            loss_type=config.model.loss_type,
-            log_scale_min=config.model.log_scale_min)
+        model = cls(upsample_factors=config.model.upsample_factors,
+                    n_stack=config.model.n_stack,
+                    n_loop=config.model.n_loop,
+                    residual_channels=config.model.residual_channels,
+                    output_dim=config.model.output_dim,
+                    n_mels=config.data.n_mels,
+                    filter_size=config.model.filter_size,
+                    loss_type=config.model.loss_type,
+                    log_scale_min=config.model.log_scale_min)
         layer_tools.summary(model)
         checkpoint.load_parameters(model, checkpoint_path=checkpoint_path)
         return model
diff --git a/parakeet/modules/audio.py b/parakeet/modules/audio.py
index ebcc6c6..03e42b0 100644
--- a/parakeet/modules/audio.py
+++ b/parakeet/modules/audio.py
@@ -1,8 +1,22 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 import paddle
 from paddle import nn
 from paddle.nn import functional as F
 from scipy import signal
-import numpy as np 
+import numpy as np
 
 __all__ = ["quantize", "dequantize", "STFT"]
 
@@ -86,6 +100,7 @@ class STFT(nn.Layer):
     Ony ``center`` and ``reflect`` padding is supported now.
     
     """
+
     def __init__(self, n_fft, hop_length, win_length, window="hanning"):
         super(STFT, self).__init__()
         self.hop_length = hop_length
@@ -109,7 +124,8 @@ class STFT(nn.Layer):
                             (self.n_bin, 1, 1, self.n_fft))
 
         w = np.concatenate([w_real, w_imag], axis=0)
-        self.weight = paddle.cast(paddle.to_tensor(w), paddle.get_default_dtype())
+        self.weight = paddle.cast(
+            paddle.to_tensor(w), paddle.get_default_dtype())
 
     def forward(self, x):
         """Compute the stft transform.
diff --git a/parakeet/modules/conv.py b/parakeet/modules/conv.py
index b57abf2..d984605 100644
--- a/parakeet/modules/conv.py
+++ b/parakeet/modules/conv.py
@@ -20,6 +20,7 @@ __all__ = [
     "Conv1dBatchNorm",
 ]
 
+
 class Conv1dCell(nn.Conv1D):
     """A subclass of Conv1D layer, which can be used in an autoregressive 
     decoder like an RNN cell. 
@@ -231,6 +232,7 @@ class Conv1dBatchNorm(nn.Layer):
     epsilon : [type], optional
         The epsilon of the BatchNorm1D layer, by default 1e-05
     """
+
     def __init__(self,
                  in_channels,
                  out_channels,
diff --git a/parakeet/modules/geometry.py b/parakeet/modules/geometry.py
index ec96daf..05a5931 100644
--- a/parakeet/modules/geometry.py
+++ b/parakeet/modules/geometry.py
@@ -1,6 +1,21 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 import numpy as np
 import paddle
 
+
 def shuffle_dim(x, axis, perm=None):
     """Permute input tensor along aixs given the permutation or randomly.
 
@@ -32,7 +47,7 @@ def shuffle_dim(x, axis, perm=None):
         perm = np.array(perm)
     else:
         perm = np.random.permutation(size)
-    
+
     perm = paddle.to_tensor(perm)
     out = paddle.gather(x, perm, axis)
     return out
diff --git a/parakeet/modules/losses.py b/parakeet/modules/losses.py
index 3e22480..ab188fd 100644
--- a/parakeet/modules/losses.py
+++ b/parakeet/modules/losses.py
@@ -1,3 +1,17 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 import numba
 import numpy as np
 import paddle
@@ -5,12 +19,13 @@ from paddle import nn
 from paddle.nn import functional as F
 
 __all__ = [
-    "weighted_mean", 
-    "masked_l1_loss", 
-    "masked_softmax_with_cross_entropy", 
+    "weighted_mean",
+    "masked_l1_loss",
+    "masked_softmax_with_cross_entropy",
     "diagonal_loss",
 ]
 
+
 def weighted_mean(input, weight):
     """Weighted mean. It can also be used as masked mean.
 
@@ -88,12 +103,11 @@ def masked_softmax_with_cross_entropy(logits, label, mask, axis=-1):
     return loss
 
 
-def diagonal_loss(
-    attentions, 
-    input_lengths, 
-    target_lengths, 
-    g=0.2, 
-    multihead=False):
+def diagonal_loss(attentions,
+                  input_lengths,
+                  target_lengths,
+                  g=0.2,
+                  multihead=False):
     """A metric to evaluate how diagonal a attention distribution is.
     
     It is computed for batch attention distributions. For each attention 
@@ -133,6 +147,7 @@ def diagonal_loss(
     else:
         return paddle.mean(attentions * paddle.unsqueeze(W_tensor, 1))
 
+
 @numba.jit(nopython=True)
 def guided_attention(N, max_N, T, max_T, g):
     W = np.zeros((max_T, max_N), dtype=np.float32)
@@ -142,6 +157,7 @@ def guided_attention(N, max_N, T, max_T, g):
     # (T_dec, T_enc)
     return W
 
+
 def guided_attentions(input_lengths, target_lengths, g=0.2):
     B = len(input_lengths)
     max_input_len = input_lengths.max()
@@ -151,4 +167,4 @@ def guided_attentions(input_lengths, target_lengths, g=0.2):
         W[b] = guided_attention(input_lengths[b], max_input_len,
                                 target_lengths[b], max_target_len, g)
     # (B, T_dec, T_enc)
-    return W
\ No newline at end of file
+    return W
diff --git a/parakeet/modules/masking.py b/parakeet/modules/masking.py
index c54a5b1..96871a9 100644
--- a/parakeet/modules/masking.py
+++ b/parakeet/modules/masking.py
@@ -1,3 +1,17 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 import paddle
 from paddle.fluid.layers import sequence_mask
 
@@ -8,6 +22,7 @@ __all__ = [
     "future_mask",
 ]
 
+
 def id_mask(input, padding_index=0, dtype="bool"):
     """Generate mask with input ids. 
     
diff --git a/parakeet/modules/positional_encoding.py b/parakeet/modules/positional_encoding.py
index 084ccf3..07a86c9 100644
--- a/parakeet/modules/positional_encoding.py
+++ b/parakeet/modules/positional_encoding.py
@@ -1,3 +1,17 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 import math
 import numpy as np
 import paddle
@@ -5,6 +19,7 @@ from paddle.nn import functional as F
 
 __all__ = ["positional_encoding"]
 
+
 def positional_encoding(start_index, length, size, dtype=None):
     r"""Generate standard positional encoding matrix.
     
@@ -37,7 +52,7 @@ def positional_encoding(start_index, length, size, dtype=None):
     dtype = dtype or paddle.get_default_dtype()
     channel = np.arange(0, size, 2)
     index = np.arange(start_index, start_index + length, 1)
-    p = np.expand_dims(index, -1) / (10000 ** (channel / float(size)))
+    p = np.expand_dims(index, -1) / (10000**(channel / float(size)))
     encodings = np.zeros([length, size])
     encodings[:, 0::2] = np.sin(p)
     encodings[:, 1::2] = np.cos(p)
diff --git a/parakeet/modules/transformer.py b/parakeet/modules/transformer.py
index 18a7523..e857990 100644
--- a/parakeet/modules/transformer.py
+++ b/parakeet/modules/transformer.py
@@ -1,3 +1,17 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 import math
 import paddle
 from paddle import nn
@@ -12,6 +26,7 @@ __all__ = [
     "TransformerDecoderLayer",
 ]
 
+
 class PositionwiseFFN(nn.Layer):
     """A faithful implementation of Position-wise Feed-Forward Network 
     in `Attention is All You Need <https://arxiv.org/abs/1706.03762>`_.
@@ -30,10 +45,8 @@ class PositionwiseFFN(nn.Layer):
         The probability of the Dropout applied to the output of the first 
         layer, by default 0.
     """
-    def __init__(self, 
-                 input_size: int, 
-                 hidden_size: int, 
-                 dropout=0.0):
+
+    def __init__(self, input_size: int, hidden_size: int, dropout=0.0):
         super(PositionwiseFFN, self).__init__()
         self.linear1 = nn.Linear(input_size, hidden_size)
         self.linear2 = nn.Linear(hidden_size, input_size)
@@ -86,16 +99,17 @@ class TransformerEncoderLayer(nn.Layer):
     ------
     It uses the PostLN (post layer norm) scheme. 
     """
+
     def __init__(self, d_model, n_heads, d_ffn, dropout=0.):
         super(TransformerEncoderLayer, self).__init__()
         self.self_mha = attn.MultiheadAttention(d_model, n_heads, dropout)
         self.layer_norm1 = nn.LayerNorm([d_model], epsilon=1e-6)
-        
+
         self.ffn = PositionwiseFFN(d_model, d_ffn, dropout)
         self.layer_norm2 = nn.LayerNorm([d_model], epsilon=1e-6)
-        
+
         self.dropout = dropout
-    
+
     def forward(self, x, mask):
         """Forward pass of TransformerEncoderLayer.
         
@@ -118,14 +132,12 @@ class TransformerEncoderLayer(nn.Layer):
         """
         context_vector, attn_weights = self.self_mha(x, x, x, mask)
         x = self.layer_norm1(
-            F.dropout(x + context_vector,
-                      self.dropout,
-                      training=self.training))
-        
+            F.dropout(
+                x + context_vector, self.dropout, training=self.training))
+
         x = self.layer_norm2(
-            F.dropout(x + self.ffn(x),
-                      self.dropout,
-                      training=self.training))
+            F.dropout(
+                x + self.ffn(x), self.dropout, training=self.training))
         return x, attn_weights
 
 
@@ -155,19 +167,20 @@ class TransformerDecoderLayer(nn.Layer):
     ------
     It uses the PostLN (post layer norm) scheme. 
     """
+
     def __init__(self, d_model, n_heads, d_ffn, dropout=0.):
         super(TransformerDecoderLayer, self).__init__()
         self.self_mha = attn.MultiheadAttention(d_model, n_heads, dropout)
         self.layer_norm1 = nn.LayerNorm([d_model], epsilon=1e-6)
-        
+
         self.cross_mha = attn.MultiheadAttention(d_model, n_heads, dropout)
         self.layer_norm2 = nn.LayerNorm([d_model], epsilon=1e-6)
-        
+
         self.ffn = PositionwiseFFN(d_model, d_ffn, dropout)
         self.layer_norm3 = nn.LayerNorm([d_model], epsilon=1e-6)
-        
+
         self.dropout = dropout
-    
+
     def forward(self, q, k, v, encoder_mask, decoder_mask):
         """Forward pass of TransformerEncoderLayer.
         
@@ -197,20 +210,19 @@ class TransformerDecoderLayer(nn.Layer):
         cross_attn_weights : Tensor [shape=(batch_size, n_heads, time_steps_q, time_steps_k)] 
             Decoder-encoder cross attention.
         """
-        context_vector, self_attn_weights = self.self_mha(q, q, q, decoder_mask)
+        context_vector, self_attn_weights = self.self_mha(q, q, q,
+                                                          decoder_mask)
         q = self.layer_norm1(
-            F.dropout(q + context_vector, 
-                      self.dropout, 
-                      training=self.training))
-        
-        context_vector, cross_attn_weights = self.cross_mha(q, k, v, encoder_mask)
+            F.dropout(
+                q + context_vector, self.dropout, training=self.training))
+
+        context_vector, cross_attn_weights = self.cross_mha(q, k, v,
+                                                            encoder_mask)
         q = self.layer_norm2(
-            F.dropout(q + context_vector,
-                      self.dropout,
-                      training=self.training))
-        
+            F.dropout(
+                q + context_vector, self.dropout, training=self.training))
+
         q = self.layer_norm3(
-            F.dropout(q + self.ffn(q),
-                      self.dropout,
-                      training=self.training))
+            F.dropout(
+                q + self.ffn(q), self.dropout, training=self.training))
         return q, self_attn_weights, cross_attn_weights
diff --git a/parakeet/training/__init__.py b/parakeet/training/__init__.py
index cb1c59b..aec401c 100644
--- a/parakeet/training/__init__.py
+++ b/parakeet/training/__init__.py
@@ -1,2 +1,16 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 from parakeet.training.cli import *
 from parakeet.training.experiment import *
diff --git a/parakeet/training/cli.py b/parakeet/training/cli.py
index e6b6fe5..a3cfbda 100644
--- a/parakeet/training/cli.py
+++ b/parakeet/training/cli.py
@@ -1,5 +1,20 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 import argparse
 
+
 def default_argument_parser():
-    r"""A simple yet genral argument parser for experiments with parakeet.
+    r"""A simple yet general argument parser for experiments with parakeet.
     
@@ -46,5 +61,5 @@ def default_argument_parser():
     # overwrite extra config and default config
     parser.add_argument("--opts", nargs=argparse.REMAINDER, help="options to overwrite --config file and the default config, passing in KEY VALUE pairs")
     # yapd: enable
-    
+
     return parser
diff --git a/parakeet/training/default_config.py b/parakeet/training/default_config.py
index f4b9c29..583f6e6 100644
--- a/parakeet/training/default_config.py
+++ b/parakeet/training/default_config.py
@@ -1,12 +1,26 @@
-from yacs.config import CfgNode 
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from yacs.config import CfgNode
 
 _C = CfgNode(
     dict(
-        valid_interval=1000, # validation
-        save_interval=10000, # checkpoint
-        max_iteration=900000, # max iteration to train
-    )
-)
+        valid_interval=1000,  # validation
+        save_interval=10000,  # checkpoint
+        max_iteration=900000,  # max iteration to train
+    ))
+
 
 def get_default_training_config():
     return _C.clone()
diff --git a/parakeet/training/experiment.py b/parakeet/training/experiment.py
index 1bf0af6..16da93d 100644
--- a/parakeet/training/experiment.py
+++ b/parakeet/training/experiment.py
@@ -27,6 +27,7 @@ from parakeet.utils import checkpoint, mp_tools
 
 __all__ = ["ExperimentBase"]
 
+
 class ExperimentBase(object):
     """
     An experiment template in order to structure the training code and take 
diff --git a/parakeet/utils/checkpoint.py b/parakeet/utils/checkpoint.py
index ec6f282..0d2a2e2 100644
--- a/parakeet/utils/checkpoint.py
+++ b/parakeet/utils/checkpoint.py
@@ -45,6 +45,7 @@ def _load_latest_checkpoint(checkpoint_dir: str) -> int:
 
     return iteration
 
+
 def _save_checkpoint(checkpoint_dir: str, iteration: int):
     """Save the iteration number of the latest model to be checkpointed.
 
@@ -60,6 +61,7 @@ def _save_checkpoint(checkpoint_dir: str, iteration: int):
     with open(checkpoint_record, "wt") as handle:
         handle.write("model_checkpoint_path: step-{}".format(iteration))
 
+
 def load_parameters(model,
                     optimizer=None,
                     checkpoint_dir=None,
@@ -97,18 +99,19 @@ def load_parameters(model,
     params_path = checkpoint_path + ".pdparams"
     model_dict = paddle.load(params_path)
     model.set_state_dict(model_dict)
-    print("[checkpoint] Rank {}: loaded model from {}".format(
-        local_rank, params_path))
-    
+    print("[checkpoint] Rank {}: loaded model from {}".format(local_rank,
+                                                              params_path))
+
     optimizer_path = checkpoint_path + ".pdopt"
     if optimizer and os.path.isfile(optimizer_path):
         optimizer_dict = paddle.load(optimizer_path)
         optimizer.set_state_dict(optimizer_dict)
-        print("[checkpoint] Rank {}: loaded optimizer state from {}".
-              format(local_rank, optimizer_path))
+        print("[checkpoint] Rank {}: loaded optimizer state from {}".format(
+            local_rank, optimizer_path))
 
     return iteration
 
+
 @mp_tools.rank_zero_only
 def save_parameters(checkpoint_dir, iteration, model, optimizer=None):
     """Checkpoint the latest trained model parameters.
@@ -124,7 +127,7 @@ def save_parameters(checkpoint_dir, iteration, model, optimizer=None):
         None
     """
     checkpoint_path = os.path.join(checkpoint_dir, "step-{}".format(iteration))
-    
+
     model_dict = model.state_dict()
     params_path = checkpoint_path + ".pdparams"
     paddle.save(model_dict, params_path)
diff --git a/parakeet/utils/internals.py b/parakeet/utils/internals.py
index c72a9b0..968a604 100644
--- a/parakeet/utils/internals.py
+++ b/parakeet/utils/internals.py
@@ -1,3 +1,17 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 import numpy as np
 from paddle.framework import core
 
diff --git a/parakeet/utils/layer_tools.py b/parakeet/utils/layer_tools.py
index 2268377..fcda44f 100644
--- a/parakeet/utils/layer_tools.py
+++ b/parakeet/utils/layer_tools.py
@@ -28,6 +28,7 @@ def summary(layer: nn.Layer):
     print("layer has {} parameters, {} elements.".format(num_params,
                                                          num_elements))
 
+
 def gradient_norm(layer: nn.Layer):
     grad_norm_dict = {}
     for name, param in layer.state_dict().items():
@@ -36,6 +37,7 @@ def gradient_norm(layer: nn.Layer):
             grad_norm_dict[name] = np.linalg.norm(grad) / grad.size
     return grad_norm_dict
 
+
 def recursively_remove_weight_norm(layer: nn.Layer):
     for layer in layer.sublayers():
         try:
@@ -44,10 +46,12 @@ def recursively_remove_weight_norm(layer: nn.Layer):
-            # ther is not weight norm hoom in this layer
+            # there is no weight norm hook in this layer
             pass
 
+
 def freeze(layer: nn.Layer):
     for param in layer.parameters():
         param.trainable = False
 
+
 def unfreeze(layer: nn.Layer):
     for param in layer.parameters():
         param.trainable = True
diff --git a/parakeet/utils/mp_tools.py b/parakeet/utils/mp_tools.py
index 0b9c6dc..a4bc97a 100644
--- a/parakeet/utils/mp_tools.py
+++ b/parakeet/utils/mp_tools.py
@@ -1,3 +1,17 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 import paddle
 from paddle import distributed as dist
 from functools import wraps
@@ -11,11 +25,8 @@ def rank_zero_only(func):
     @wraps(func)
     def wrapper(*args, **kwargs):
         if local_rank != 0:
-            return 
+            return
         result = func(*args, **kwargs)
         return result
-    
+
     return wrapper
-
-
-
diff --git a/parakeet/utils/scheduler.py b/parakeet/utils/scheduler.py
index 97e98ec..4d41aca 100644
--- a/parakeet/utils/scheduler.py
+++ b/parakeet/utils/scheduler.py
@@ -1,3 +1,17 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 import math
 
 __all__ = ["SchedulerBase", "Constant", "PieceWise", "StepWise"]
@@ -24,7 +38,7 @@ class PieceWise(SchedulerBase):
         self.xs = [item[0] for item in anchors]
         self.ys = [item[1] for item in anchors]
         self.num_anchors = len(self.xs)
-    
+
     def __call__(self, step):
         i = 0
         for x in self.xs:
@@ -34,8 +48,8 @@ class PieceWise(SchedulerBase):
             return self.ys[0]
         if i == self.num_anchors:
             return self.ys[-1]
-        k = (self.ys[i] - self.ys[i-1]) / (self.xs[i] - self.xs[i-1]) 
-        out = self.ys[i-1] + (step - self.xs[i-1]) * k
+        k = (self.ys[i] - self.ys[i - 1]) / (self.xs[i] - self.xs[i - 1])
+        out = self.ys[i - 1] + (step - self.xs[i - 1]) * k
         return out
 
 
@@ -47,7 +61,7 @@ class StepWise(SchedulerBase):
         self.xs = [item[0] for item in anchors]
         self.ys = [item[1] for item in anchors]
         self.num_anchors = len(self.xs)
-    
+
     def __call__(self, step):
         i = 0
         for x in self.xs:
@@ -58,5 +72,4 @@ class StepWise(SchedulerBase):
             return self.ys[-1]
         if i == 0:
             return self.ys[0]
-        return self.ys[i-1]
-
+        return self.ys[i - 1]
diff --git a/setup.py b/setup.py
index ee5f215..0fa9eb7 100644
--- a/setup.py
+++ b/setup.py
@@ -48,7 +48,6 @@ setup_info = dict(
     description='Speech synthesis tools and models based on Paddlepaddle',
     long_description=long_description,
     license='Apache 2',
-    
     python_requires='>=3.6',
     install_requires=[
         'numpy',
@@ -71,23 +70,18 @@ setup_info = dict(
         'yacs',
         'tensorboardX',
     ],
-    extras_require={
-        'doc': ["sphinx", "sphinx-rtd-theme", "numpydoc"],
-    },
+    extras_require={'doc': ["sphinx", "sphinx-rtd-theme", "numpydoc"], },
 
     # Package info
     packages=find_packages(exclude=('tests', 'tests.*')),
-    zip_safe=True, 
-    
-    classifiers = [
+    zip_safe=True,
+    classifiers=[
         'Development Status :: 4 - Beta',
         'Intended Audience :: Developers',
-        'Topic :: Scientific/Engineering :: Artificial Intelligence'
+        'Topic :: Scientific/Engineering :: Artificial Intelligence',
         'License :: OSI Approved :: Apache2 License',
         'Programming Language :: Python :: 3.6',
         'Programming Language :: Python :: 3.7',
-    ],
-    
-    )
+    ], )
 
 setup(**setup_info)