diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 4102b69..9d6da44 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -25,3 +25,11 @@ files: \.md$ - id: remove-tabs files: \.md$ +- repo: local + hooks: + - id: copyright_checker + name: copyright_checker + entry: python ./tools/copyright.hook + language: system + files: \.(c|cc|cxx|cpp|cu|h|hpp|hxx|proto|py)$ + exclude: (?!.*third_party)^.*$ | (?!.*book)^.*$ diff --git a/examples/deepvoice3/README.md b/examples/deepvoice3/README.md index 43e1939..0138414 100644 --- a/examples/deepvoice3/README.md +++ b/examples/deepvoice3/README.md @@ -1,4 +1,4 @@ -# Deepvoice 3 +# Deepvoice 3 Paddle implementation of deepvoice 3 in dynamic graph, a convolutional network based text-to-speech synthesis model. The implementation is based on [Deep Voice 3: Scaling Text-to-Speech with Convolutional Sequence Learning](https://arxiv.org/abs/1710.07654). @@ -22,7 +22,7 @@ The model consists of an encoder, a decoder and a converter (and a speaker embed ## Project Structure ```text -├── data.py data_processing +├── data.py data_processing ├── ljspeech.yaml (example) configuration file ├── sentences.txt sample sentences ├── synthesis.py script to synthesize waveform from text @@ -50,7 +50,7 @@ optional arguments: The directory to save result. -g DEVICE, --device DEVICE device to use -``` +``` 1. `--config` is the configuration file to use. The provided `ljspeech.yaml` can be used directly. And you can change some values in the configuration file and train the model with a different config. 2. `--data` is the path of the LJSpeech dataset, the extracted folder from the downloaded archive (the folder which contains metadata.txt). @@ -61,7 +61,7 @@ optional arguments: ├── checkpoints # checkpoint ├── log # tensorboard log └── states # train and evaluation results - ├── alignments # attention + ├── alignments # attention ├── lin_spec # linear spectrogram ├── mel_spec # mel spectrogram └── waveform # waveform (.wav files) @@ -112,4 +112,3 @@ example script: ```bash python synthesis.py --config=./ljspeech.yaml --device=0 experiment/checkpoints/model_step_005000000 sentences.txt generated ``` - diff --git a/examples/deepvoice3/data.py b/examples/deepvoice3/data.py index 8f6b2ce..68f54cd 100644 --- a/examples/deepvoice3/data.py +++ b/examples/deepvoice3/data.py @@ -1,3 +1,17 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
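The new `copyright_checker` entry above runs `tools/copyright.hook` over staged sources, but the hook script itself is not part of this diff. For orientation, a minimal sketch of what such a checker might look like, assuming (as `language: system` hooks do) that pre-commit invokes the entry with the matched filenames as arguments; the real script may differ:

```python
# Hypothetical sketch of a header checker like tools/copyright.hook;
# the actual script is not shown in this diff and may differ.
import sys

MARK = "Licensed under the Apache License, Version 2.0"


def has_header(path):
    # pre-commit passes every staged file matching the `files` regex.
    with open(path, "r", encoding="utf-8", errors="ignore") as f:
        return MARK in f.read(2048)  # headers sit at the top of a file


if __name__ == "__main__":
    bad = [p for p in sys.argv[1:] if not has_header(p)]
    for p in bad:
        print("missing copyright header: " + p)
    sys.exit(1 if bad else 0)  # a non-zero exit blocks the commit
```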
+ import os import csv from pathlib import Path @@ -79,10 +93,11 @@ class Transform(object): y = signal.lfilter([1., -self.preemphasis], [1.], wav) # STFT - D = librosa.stft(y=y, - n_fft=self.n_fft, - win_length=self.win_length, - hop_length=self.hop_length) + D = librosa.stft( + y=y, + n_fft=self.n_fft, + win_length=self.win_length, + hop_length=self.hop_length) S = np.abs(D) # to db and normalize to 0-1 @@ -96,11 +111,8 @@ class Transform(object): # mel scale and to db and normalize to 0-1, # CAUTION: pass linear scale S, not dbscaled S - S_mel = librosa.feature.melspectrogram(S=S, - n_mels=self.n_mels, - fmin=self.fmin, - fmax=self.fmax, - power=1.) + S_mel = librosa.feature.melspectrogram( + S=S, n_mels=self.n_mels, fmin=self.fmin, fmax=self.fmax, power=1.) S_mel = 20 * np.log10(np.maximum(amplitude_min, S_mel)) - self.ref_level_db S_mel_norm = (S_mel - self.min_level_db) / (-self.min_level_db) @@ -148,20 +160,18 @@ class DataCollector(object): (mix_grapheme_phonemes, text_length, speaker_id, S_norm, S_mel_norm, num_frames) = example text_sequences.append( - np.pad(mix_grapheme_phonemes, - (0, max_text_length - text_length))) + np.pad(mix_grapheme_phonemes, (0, max_text_length - text_length + ))) lin_specs.append( - np.pad(S_norm, - ((0, 0), (self._pad_begin, - max_frames - self._pad_begin - num_frames)))) + np.pad(S_norm, ((0, 0), (self._pad_begin, max_frames - + self._pad_begin - num_frames)))) mel_specs.append( - np.pad(S_mel_norm, - ((0, 0), (self._pad_begin, - max_frames - self._pad_begin - num_frames)))) + np.pad(S_mel_norm, ((0, 0), (self._pad_begin, max_frames - + self._pad_begin - num_frames)))) done_flags.append( np.pad(np.zeros((int(np.ceil(num_frames // self._factor)), )), - (0, max_decoder_length - - int(np.ceil(num_frames // self._factor))), + (0, max_decoder_length - int( + np.ceil(num_frames // self._factor))), constant_values=1)) text_sequences = np.array(text_sequences).astype(np.int64) lin_specs = np.transpose(np.array(lin_specs), diff --git a/examples/deepvoice3/synthesis.py b/examples/deepvoice3/synthesis.py index 303c182..5162e07 100644 --- a/examples/deepvoice3/synthesis.py +++ b/examples/deepvoice3/synthesis.py @@ -1,3 +1,17 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
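The hunks above reformat the feature-extraction pipeline in `data.py`: preemphasis → STFT → magnitude → dB → [0, 1] normalization, with the mel branch deliberately fed the linear-scale `S` rather than the dB-scaled one (per the in-code CAUTION). The same steps as a standalone sketch; the constants are illustrative stand-ins for the values in `ljspeech.yaml`, which this diff does not show:

```python
# Standalone sketch of the Transform pipeline in examples/deepvoice3/data.py.
# All constants here are illustrative, not the actual ljspeech.yaml settings.
import librosa
import numpy as np
from scipy import signal

preemphasis = 0.97
n_fft, win_length, hop_length = 1024, 1024, 256
n_mels, fmin, fmax = 80, 0, 8000
ref_level_db, min_level_db, amplitude_min = 20, -100, 1e-5

sr = 22050
wav = np.sin(2 * np.pi * 440 * np.arange(sr) / sr).astype(np.float32)  # dummy audio

# preemphasis: y[n] = x[n] - a * x[n - 1]
y = signal.lfilter([1., -preemphasis], [1.], wav)

# STFT -> linear-scale magnitude
D = librosa.stft(y=y, n_fft=n_fft, win_length=win_length, hop_length=hop_length)
S = np.abs(D)

# to dB, then normalize to [0, 1]
S_db = 20 * np.log10(np.maximum(amplitude_min, S)) - ref_level_db
S_norm = (S_db - min_level_db) / (-min_level_db)

# mel scale from the *linear* S (the CAUTION in data.py), then dB + normalize
S_mel = librosa.feature.melspectrogram(
    S=S, sr=sr, n_mels=n_mels, fmin=fmin, fmax=fmax, power=1.)
S_mel_db = 20 * np.log10(np.maximum(amplitude_min, S_mel)) - ref_level_db
S_mel_norm = (S_mel_db - min_level_db) / (-min_level_db)
```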
+ import os import argparse import ruamel.yaml @@ -22,11 +36,8 @@ if __name__ == "__main__": parser.add_argument("checkpoint", type=str, help="checkpoint to load.") parser.add_argument("text", type=str, help="text file to synthesize") parser.add_argument("output_path", type=str, help="path to save results") - parser.add_argument("-g", - "--device", - type=int, - default=-1, - help="device to use") + parser.add_argument( + "-g", "--device", type=int, default=-1, help="device to use") args = parser.parse_args() with open(args.config, 'rt') as f: @@ -76,15 +87,14 @@ if __name__ == "__main__": window_ahead = model_config["window_ahead"] key_projection = model_config["key_projection"] value_projection = model_config["value_projection"] - dv3 = make_model(n_speakers, speaker_dim, speaker_embed_std, embed_dim, - padding_idx, embedding_std, max_positions, n_vocab, - freeze_embedding, filter_size, encoder_channels, - n_mels, decoder_channels, r, - trainable_positional_encodings, use_memory_mask, - query_position_rate, key_position_rate, - window_backward, window_ahead, key_projection, - value_projection, downsample_factor, linear_dim, - use_decoder_states, converter_channels, dropout) + dv3 = make_model( + n_speakers, speaker_dim, speaker_embed_std, embed_dim, padding_idx, + embedding_std, max_positions, n_vocab, freeze_embedding, + filter_size, encoder_channels, n_mels, decoder_channels, r, + trainable_positional_encodings, use_memory_mask, + query_position_rate, key_position_rate, window_backward, + window_ahead, key_projection, value_projection, downsample_factor, + linear_dim, use_decoder_states, converter_channels, dropout) summary(dv3) state, _ = dg.load_dygraph(args.checkpoint) diff --git a/examples/deepvoice3/train.py b/examples/deepvoice3/train.py index 6d9aef6..ad42822 100644 --- a/examples/deepvoice3/train.py +++ b/examples/deepvoice3/train.py @@ -1,3 +1,17 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import os import argparse import ruamel.yaml diff --git a/examples/deepvoice3/utils.py b/examples/deepvoice3/utils.py index 02118af..756d008 100644 --- a/examples/deepvoice3/utils.py +++ b/examples/deepvoice3/utils.py @@ -1,3 +1,17 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
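The `synthesis.py` hunk above re-wraps a `make_model` call that threads roughly twenty-five values read from `model_config` positionally, so argument order is load-bearing at this call site. As a hypothetical refactor (not part of this diff, and assuming the keys match the yaml's model section), unpacking the config by name would make such call sites harder to get wrong:

```python
# Hypothetical keyword-based variant of the make_model call site; the key
# names below are examples, not the full signature used in utils.py.
KEYS = ("n_speakers", "speaker_dim", "embed_dim", "max_positions", "dropout")


def build_from_config(model_config, make_model_fn):
    missing = [k for k in KEYS if k not in model_config]
    if missing:
        # fail fast with names instead of a silent positional-order bug
        raise KeyError("config is missing: {}".format(", ".join(missing)))
    return make_model_fn(**{k: model_config[k] for k in KEYS})
```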
+ import os import numpy as np from matplotlib import cm @@ -28,8 +42,9 @@ def make_model(n_speakers, speaker_dim, speaker_embed_std, embed_dim, converter_channels, dropout): """just a simple function to create a deepvoice 3 model""" if n_speakers > 1: - spe = dg.Embedding((n_speakers, speaker_dim), - param_attr=I.Normal(scale=speaker_embed_std)) + spe = dg.Embedding( + (n_speakers, speaker_dim), + param_attr=I.Normal(scale=speaker_embed_std)) else: spe = None @@ -45,17 +60,17 @@ def make_model(n_speakers, speaker_dim, speaker_embed_std, embed_dim, ConvSpec(h, k, 9), ConvSpec(h, k, 27), ConvSpec(h, k, 1), - ConvSpec(h, k, 3), - ) - enc = Encoder(n_vocab, - embed_dim, - n_speakers, - speaker_dim, - padding_idx=None, - embedding_weight_std=embedding_std, - convolutions=encoder_convolutions, - max_positions=max_positions, - dropout=dropout) + ConvSpec(h, k, 3), ) + enc = Encoder( + n_vocab, + embed_dim, + n_speakers, + speaker_dim, + padding_idx=None, + embedding_weight_std=embedding_std, + convolutions=encoder_convolutions, + max_positions=max_positions, + dropout=dropout) if freeze_embedding: freeze(enc.embed) @@ -66,28 +81,28 @@ def make_model(n_speakers, speaker_dim, speaker_embed_std, embed_dim, ConvSpec(h, k, 3), ConvSpec(h, k, 9), ConvSpec(h, k, 27), - ConvSpec(h, k, 1), - ) + ConvSpec(h, k, 1), ) attention = [True, False, False, False, True] force_monotonic_attention = [True, False, False, False, True] - dec = Decoder(n_speakers, - speaker_dim, - embed_dim, - mel_dim, - r=r, - max_positions=max_positions, - padding_idx=padding_idx, - preattention=prenet_convolutions, - convolutions=attentive_convolutions, - attention=attention, - dropout=dropout, - use_memory_mask=use_memory_mask, - force_monotonic_attention=force_monotonic_attention, - query_position_rate=query_position_rate, - key_position_rate=key_position_rate, - window_range=WindowRange(window_behind, window_ahead), - key_projection=key_projection, - value_projection=value_projection) + dec = Decoder( + n_speakers, + speaker_dim, + embed_dim, + mel_dim, + r=r, + max_positions=max_positions, + padding_idx=padding_idx, + preattention=prenet_convolutions, + convolutions=attentive_convolutions, + attention=attention, + dropout=dropout, + use_memory_mask=use_memory_mask, + force_monotonic_attention=force_monotonic_attention, + query_position_rate=query_position_rate, + key_position_rate=key_position_rate, + window_range=WindowRange(window_behind, window_ahead), + key_projection=key_projection, + value_projection=value_projection) if not trainable_positional_encodings: freeze(dec.embed_keys_positions) freeze(dec.embed_query_positions) @@ -97,15 +112,15 @@ def make_model(n_speakers, speaker_dim, speaker_embed_std, embed_dim, ConvSpec(h, k, 1), ConvSpec(h, k, 3), ConvSpec(2 * h, k, 1), - ConvSpec(2 * h, k, 3), - ) - cvt = Converter(n_speakers, - speaker_dim, - dec.state_dim if use_decoder_states else mel_dim, - linear_dim, - time_upsampling=downsample_factor, - convolutions=postnet_convolutions, - dropout=dropout) + ConvSpec(2 * h, k, 3), ) + cvt = Converter( + n_speakers, + speaker_dim, + dec.state_dim if use_decoder_states else mel_dim, + linear_dim, + time_upsampling=downsample_factor, + convolutions=postnet_convolutions, + dropout=dropout) dv3 = DeepVoice3(enc, dec, cvt, spe, use_decoder_states) return dv3 @@ -115,8 +130,10 @@ def eval_model(model, text, replace_pronounciation_prob, min_level_db, ref_level_db, power, n_iter, win_length, hop_length, preemphasis): """generate waveform from text using a deepvoice 3 model""" - text = 
np.array(en.text_to_sequence(text, p=replace_pronounciation_prob), - dtype=np.int64) + text = np.array( + en.text_to_sequence( + text, p=replace_pronounciation_prob), + dtype=np.int64) length = len(text) print("text sequence's length: {}".format(length)) text_positions = np.arange(1, 1 + length) @@ -145,10 +162,11 @@ def spec_to_waveform(spec, min_level_db, ref_level_db, power, n_iter, """ denormalized = np.clip(spec, 0, 1) * (-min_level_db) + min_level_db lin_scaled = np.exp((denormalized + ref_level_db) / 20 * np.log(10)) - wav = librosa.griffinlim(lin_scaled**power, - n_iter=n_iter, - hop_length=hop_length, - win_length=win_length) + wav = librosa.griffinlim( + lin_scaled**power, + n_iter=n_iter, + hop_length=hop_length, + win_length=win_length) if preemphasis > 0: wav = signal.lfilter([1.], [1., -preemphasis], wav) return wav @@ -225,28 +243,30 @@ def save_state(save_dir, plt.colorbar() plt.title("mel_input") plt.savefig( - os.path.join(path, - "target_mel_spec_step{:09d}.png".format(global_step))) + os.path.join(path, "target_mel_spec_step{:09d}.png".format( + global_step))) plt.close() - writer.add_image("target/mel_spec", - cm.viridis(mel_input), - global_step, - dataformats="HWC") + writer.add_image( + "target/mel_spec", + cm.viridis(mel_input), + global_step, + dataformats="HWC") plt.figure(figsize=(10, 3)) display.specshow(mel_output) plt.colorbar() plt.title("mel_output") plt.savefig( - os.path.join( - path, "predicted_mel_spec_step{:09d}.png".format(global_step))) + os.path.join(path, "predicted_mel_spec_step{:09d}.png".format( + global_step))) plt.close() - writer.add_image("predicted/mel_spec", - cm.viridis(mel_output), - global_step, - dataformats="HWC") + writer.add_image( + "predicted/mel_spec", + cm.viridis(mel_output), + global_step, + dataformats="HWC") if lin_input is not None and lin_output is not None: lin_input = lin_input[0].numpy().T @@ -258,28 +278,30 @@ def save_state(save_dir, plt.colorbar() plt.title("lin_input") plt.savefig( - os.path.join(path, - "target_lin_spec_step{:09d}.png".format(global_step))) + os.path.join(path, "target_lin_spec_step{:09d}.png".format( + global_step))) plt.close() - writer.add_image("target/lin_spec", - cm.viridis(lin_input), - global_step, - dataformats="HWC") + writer.add_image( + "target/lin_spec", + cm.viridis(lin_input), + global_step, + dataformats="HWC") plt.figure(figsize=(10, 3)) display.specshow(lin_output) plt.colorbar() plt.title("lin_output") plt.savefig( - os.path.join( - path, "predicted_lin_spec_step{:09d}.png".format(global_step))) + os.path.join(path, "predicted_lin_spec_step{:09d}.png".format( + global_step))) plt.close() - writer.add_image("predicted/lin_spec", - cm.viridis(lin_output), - global_step, - dataformats="HWC") + writer.add_image( + "predicted/lin_spec", + cm.viridis(lin_output), + global_step, + dataformats="HWC") if alignments is not None and len(alignments.shape) == 4: path = os.path.join(save_dir, "alignments") @@ -290,10 +312,11 @@ def save_state(save_dir, "train_attn_layer_{}_step_{}.png".format(idx, global_step)) plot_alignment(attn_layer, save_path) - writer.add_image("train_attn/layer_{}".format(idx), - cm.viridis(attn_layer), - global_step, - dataformats="HWC") + writer.add_image( + "train_attn/layer_{}".format(idx), + cm.viridis(attn_layer), + global_step, + dataformats="HWC") if lin_output is not None: wav = spec_to_waveform(lin_output, min_level_db, ref_level_db, power, @@ -302,7 +325,5 @@ def save_state(save_dir, save_path = os.path.join( path,
"train_sample_step_{:09d}.wav".format(global_step)) sf.write(save_path, wav, sample_rate) - writer.add_audio("train_sample", - wav, - global_step, - sample_rate=sample_rate) + writer.add_audio( + "train_sample", wav, global_step, sample_rate=sample_rate) diff --git a/examples/fastspeech/README.md b/examples/fastspeech/README.md index 007b6b2..1199b8b 100644 --- a/examples/fastspeech/README.md +++ b/examples/fastspeech/README.md @@ -57,7 +57,7 @@ python -m paddle.distributed.launch --selected_gpus=0,1,2,3 --log_dir ./mylog tr if you wish to resume from an exists model, please set ``--checkpoint_path`` and ``--fastspeech_step`` -For more help on arguments: +For more help on arguments: ``python train.py --help``. ## Synthesis @@ -75,5 +75,5 @@ or you can run the script file directly. sh synthesis.sh ``` -For more help on arguments: +For more help on arguments: ``python synthesis.py --help``. diff --git a/examples/fastspeech/parse.py b/examples/fastspeech/parse.py index a6c2d99..690f4b2 100644 --- a/examples/fastspeech/parse.py +++ b/examples/fastspeech/parse.py @@ -1,36 +1,90 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. import argparse + def add_config_options_to_parser(parser): - parser.add_argument('--config_path', type=str, default='config/fastspeech.yaml', + parser.add_argument( + '--config_path', + type=str, + default='config/fastspeech.yaml', help="the yaml config file path.") - parser.add_argument('--batch_size', type=int, default=32, - help="batch size for training.") - parser.add_argument('--epochs', type=int, default=10000, + parser.add_argument( + '--batch_size', type=int, default=32, help="batch size for training.") + parser.add_argument( + '--epochs', + type=int, + default=10000, help="the number of epoch for training.") - parser.add_argument('--lr', type=float, default=0.001, + parser.add_argument( + '--lr', + type=float, + default=0.001, help="the learning rate for training.") - parser.add_argument('--save_step', type=int, default=500, + parser.add_argument( + '--save_step', + type=int, + default=500, help="checkpointing interval during training.") - parser.add_argument('--fastspeech_step', type=int, default=70000, + parser.add_argument( + '--fastspeech_step', + type=int, + default=70000, help="Global step to restore checkpoint of fastspeech.") - parser.add_argument('--use_gpu', type=int, default=1, + parser.add_argument( + '--use_gpu', + type=int, + default=1, help="use gpu or not during training.") - parser.add_argument('--use_data_parallel', type=int, default=0, + parser.add_argument( + '--use_data_parallel', + type=int, + default=0, help="use data parallel or not during training.") - parser.add_argument('--data_path', type=str, default='./dataset/LJSpeech-1.1', + parser.add_argument( + '--data_path', + type=str, + default='./dataset/LJSpeech-1.1', help="the path of dataset.") - parser.add_argument('--checkpoint_path', type=str, default=None, + parser.add_argument( + '--checkpoint_path', + type=str, + 
default=None, help="the path to load checkpoint or pretrain model.") - parser.add_argument('--save_path', type=str, default='./checkpoint', + parser.add_argument( + '--save_path', + type=str, + default='./checkpoint', help="the path to save checkpoint.") - parser.add_argument('--log_dir', type=str, default='./log', + parser.add_argument( + '--log_dir', + type=str, + default='./log', help="the directory to save tensorboard log.") - parser.add_argument('--sample_path', type=str, default='./sample', + parser.add_argument( + '--sample_path', + type=str, + default='./sample', help="the directory to save audio sample in synthesis.") - parser.add_argument('--transtts_path', type=str, default='./log', + parser.add_argument( + '--transtts_path', + type=str, + default='./log', help="the directory to load pretrain transformerTTS model.") - parser.add_argument('--transformer_step', type=int, default=160000, + parser.add_argument( + '--transformer_step', + type=int, + default=160000, help="the step to load transformerTTS model.") - - diff --git a/examples/fastspeech/synthesis.py b/examples/fastspeech/synthesis.py index 6a3d146..802d4e4 100644 --- a/examples/fastspeech/synthesis.py +++ b/examples/fastspeech/synthesis.py @@ -1,3 +1,16 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
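The `load_checkpoint` helper that follows (and that `train.py`, `train_transformer.py`, and `train_vocoder.py` repeat almost verbatim) exists because checkpoints saved from a `DataParallel`-wrapped model prefix every parameter name with `_layers.`, which must be stripped before `set_dict` on an unwrapped model. The renaming logic in isolation, run on a plain dict:

```python
# The prefix-stripping logic shared by the load_checkpoint helpers in this
# diff, demonstrated on a plain dict instead of a real dygraph checkpoint.
from collections import OrderedDict


def strip_data_parallel_prefix(model_dict):
    new_state_dict = OrderedDict()
    for param in model_dict:
        if param.startswith('_layers.'):
            # DataParallel wraps the model, prefixing parameter names
            new_state_dict[param[8:]] = model_dict[param]
        else:
            new_state_dict[param] = model_dict[param]
    return new_state_dict


fake = OrderedDict([('_layers.encoder.weight', 1), ('decoder.bias', 2)])
assert strip_data_parallel_prefix(fake) == OrderedDict(
    [('encoder.weight', 1), ('decoder.bias', 2)])
```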
import os from tensorboardX import SummaryWriter from collections import OrderedDict @@ -12,6 +25,7 @@ from parakeet.g2p.en import text_to_sequence from parakeet import audio from parakeet.models.fastspeech.fastspeech import FastSpeech + def load_checkpoint(step, model_path): model_dict, _ = fluid.dygraph.load_dygraph(os.path.join(model_path, step)) new_state_dict = OrderedDict() @@ -22,13 +36,14 @@ def load_checkpoint(step, model_path): new_state_dict[param] = model_dict[param] return new_state_dict + def synthesis(text_input, args): place = (fluid.CUDAPlace(0) if args.use_gpu else fluid.CPUPlace()) # tensorboard if not os.path.exists(args.log_dir): - os.mkdir(args.log_dir) - path = os.path.join(args.log_dir,'synthesis') + os.mkdir(args.log_dir) + path = os.path.join(args.log_dir, 'synthesis') with open(args.config_path) as f: cfg = yaml.load(f, Loader=yaml.Loader) @@ -37,24 +52,28 @@ def synthesis(text_input, args): with dg.guard(place): model = FastSpeech(cfg) - model.set_dict(load_checkpoint(str(args.fastspeech_step), os.path.join(args.checkpoint_path, "fastspeech"))) + model.set_dict( + load_checkpoint( + str(args.fastspeech_step), + os.path.join(args.checkpoint_path, "fastspeech"))) model.eval() text = np.asarray(text_to_sequence(text_input)) - text = fluid.layers.unsqueeze(dg.to_variable(text),[0]) - pos_text = np.arange(1, text.shape[1]+1) - pos_text = fluid.layers.unsqueeze(dg.to_variable(pos_text),[0]) + text = fluid.layers.unsqueeze(dg.to_variable(text), [0]) + pos_text = np.arange(1, text.shape[1] + 1) + pos_text = fluid.layers.unsqueeze(dg.to_variable(pos_text), [0]) - mel_output, mel_output_postnet = model(text, pos_text, alpha=args.alpha) + mel_output, mel_output_postnet = model( + text, pos_text, alpha=args.alpha) _ljspeech_processor = audio.AudioProcessor( - sample_rate=cfg['audio']['sr'], - num_mels=cfg['audio']['num_mels'], - min_level_db=cfg['audio']['min_level_db'], - ref_level_db=cfg['audio']['ref_level_db'], - n_fft=cfg['audio']['n_fft'], - win_length= cfg['audio']['win_length'], - hop_length= cfg['audio']['hop_length'], + sample_rate=cfg['audio']['sr'], + num_mels=cfg['audio']['num_mels'], + min_level_db=cfg['audio']['min_level_db'], + ref_level_db=cfg['audio']['ref_level_db'], + n_fft=cfg['audio']['n_fft'], + win_length=cfg['audio']['win_length'], + hop_length=cfg['audio']['hop_length'], power=cfg['audio']['power'], preemphasis=cfg['audio']['preemphasis'], signal_norm=True, @@ -67,14 +86,17 @@ def synthesis(text_input, args): do_trim_silence=False, sound_norm=False) - mel_output_postnet = fluid.layers.transpose(fluid.layers.squeeze(mel_output_postnet,[0]), [1,0]) - wav = _ljspeech_processor.inv_melspectrogram(mel_output_postnet.numpy()) + mel_output_postnet = fluid.layers.transpose( + fluid.layers.squeeze(mel_output_postnet, [0]), [1, 0]) + wav = _ljspeech_processor.inv_melspectrogram(mel_output_postnet.numpy( + )) writer.add_audio(text_input, wav, 0, cfg['audio']['sr']) print("Synthesis completed !!!") writer.close() + if __name__ == '__main__': parser = argparse.ArgumentParser(description="Train Fastspeech model") add_config_options_to_parser(parser) args = parser.parse_args() - synthesis("Transformer model is so fast!", args) \ No newline at end of file + synthesis("Transformer model is so fast!", args) diff --git a/examples/fastspeech/train.py b/examples/fastspeech/train.py index 52b5725..f1b59a2 100644 --- a/examples/fastspeech/train.py +++ b/examples/fastspeech/train.py @@ -1,3 +1,16 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. import numpy as np import argparse import os @@ -20,8 +33,10 @@ import sys sys.path.append("../transformer_tts") from data import LJSpeechLoader + def load_checkpoint(step, model_path): - model_dict, opti_dict = fluid.dygraph.load_dygraph(os.path.join(model_path, step)) + model_dict, opti_dict = fluid.dygraph.load_dygraph( + os.path.join(model_path, step)) new_state_dict = OrderedDict() for param in model_dict: if param.startswith('_layers.'): @@ -30,6 +45,7 @@ def load_checkpoint(step, model_path): new_state_dict[param] = model_dict[param] return new_state_dict, opti_dict + def main(args): local_rank = dg.parallel.Env().local_rank if args.use_data_parallel else 0 nranks = dg.parallel.Env().nranks if args.use_data_parallel else 1 @@ -43,26 +59,33 @@ def main(args): if args.use_gpu else fluid.CPUPlace()) if not os.path.exists(args.log_dir): - os.mkdir(args.log_dir) - path = os.path.join(args.log_dir,'fastspeech') + os.mkdir(args.log_dir) + path = os.path.join(args.log_dir, 'fastspeech') writer = SummaryWriter(path) if local_rank == 0 else None with dg.guard(place): with fluid.unique_name.guard(): transformerTTS = TransformerTTS(cfg) - model_dict, _ = load_checkpoint(str(args.transformer_step), os.path.join(args.transtts_path, "transformer")) + model_dict, _ = load_checkpoint( + str(args.transformer_step), + os.path.join(args.transtts_path, "transformer")) transformerTTS.set_dict(model_dict) transformerTTS.eval() model = FastSpeech(cfg) model.train() - optimizer = fluid.optimizer.AdamOptimizer(learning_rate=dg.NoamDecay(1/(cfg['warm_up_step'] *( args.lr ** 2)), cfg['warm_up_step']), - parameter_list=model.parameters()) - reader = LJSpeechLoader(cfg, args, nranks, local_rank, shuffle=True).reader() - + optimizer = fluid.optimizer.AdamOptimizer( + learning_rate=dg.NoamDecay(1 / ( + cfg['warm_up_step'] * (args.lr**2)), cfg['warm_up_step']), + parameter_list=model.parameters()) + reader = LJSpeechLoader( + cfg, args, nranks, local_rank, shuffle=True).reader() + if args.checkpoint_path is not None: - model_dict, opti_dict = load_checkpoint(str(args.fastspeech_step), os.path.join(args.checkpoint_path, "fastspeech")) + model_dict, opti_dict = load_checkpoint( + str(args.fastspeech_step), + os.path.join(args.checkpoint_path, "fastspeech")) model.set_dict(model_dict) optimizer.set_dict(opti_dict) global_step = args.fastspeech_step @@ -76,31 +99,42 @@ def main(args): pbar = tqdm(reader) for i, data in enumerate(pbar): - pbar.set_description('Processing at epoch %d'%epoch) + pbar.set_description('Processing at epoch %d' % epoch) character, mel, mel_input, pos_text, pos_mel, text_length, mel_lens = data - _, _, attn_probs, _, _, _ = transformerTTS(character, mel_input, pos_text, pos_mel) - alignment = dg.to_variable(get_alignment(attn_probs, mel_lens, cfg['transformer_head'])).astype(np.float32) + _, _, attn_probs, _, _, _ = transformerTTS( + character, mel_input, pos_text, pos_mel) + alignment = dg.to_variable( + get_alignment(attn_probs, mel_lens, cfg[ + 
'transformer_head'])).astype(np.float32) global_step += 1 - + #Forward - result= model(character, - pos_text, - mel_pos=pos_mel, - length_target=alignment) + result = model( + character, + pos_text, + mel_pos=pos_mel, + length_target=alignment) mel_output, mel_output_postnet, duration_predictor_output, _, _ = result mel_loss = layers.mse_loss(mel_output, mel) mel_postnet_loss = layers.mse_loss(mel_output_postnet, mel) - duration_loss = layers.mean(layers.abs(layers.elementwise_sub(duration_predictor_output, alignment))) + duration_loss = layers.mean( + layers.abs( + layers.elementwise_sub(duration_predictor_output, + alignment))) total_loss = mel_loss + mel_postnet_loss + duration_loss - if local_rank==0: - writer.add_scalar('mel_loss', mel_loss.numpy(), global_step) - writer.add_scalar('post_mel_loss', mel_postnet_loss.numpy(), global_step) - writer.add_scalar('duration_loss', duration_loss.numpy(), global_step) - writer.add_scalar('learning_rate', optimizer._learning_rate.step().numpy(), global_step) - + if local_rank == 0: + writer.add_scalar('mel_loss', + mel_loss.numpy(), global_step) + writer.add_scalar('post_mel_loss', + mel_postnet_loss.numpy(), global_step) + writer.add_scalar('duration_loss', + duration_loss.numpy(), global_step) + writer.add_scalar('learning_rate', + optimizer._learning_rate.step().numpy(), + global_step) if args.use_data_parallel: total_loss = model.scale_loss(total_loss) @@ -108,21 +142,25 @@ def main(args): model.apply_collective_grads() else: total_loss.backward() - optimizer.minimize(total_loss, grad_clip = fluid.dygraph_grad_clip.GradClipByGlobalNorm(cfg['grad_clip_thresh'])) + optimizer.minimize( + total_loss, + grad_clip=fluid.dygraph_grad_clip.GradClipByGlobalNorm(cfg[ + 'grad_clip_thresh'])) model.clear_gradients() - # save checkpoint - if local_rank==0 and global_step % args.save_step == 0: + # save checkpoint + if local_rank == 0 and global_step % args.save_step == 0: if not os.path.exists(args.save_path): os.mkdir(args.save_path) - save_path = os.path.join(args.save_path,'fastspeech/%d' % global_step) + save_path = os.path.join(args.save_path, + 'fastspeech/%d' % global_step) dg.save_dygraph(model.state_dict(), save_path) dg.save_dygraph(optimizer.state_dict(), save_path) - if local_rank==0: + if local_rank == 0: writer.close() -if __name__ =='__main__': +if __name__ == '__main__': parser = argparse.ArgumentParser(description="Train Fastspeech model") add_config_options_to_parser(parser) args = parser.parse_args() diff --git a/examples/transformer_tts/README.md b/examples/transformer_tts/README.md index afdfdd2..6fda6d1 100644 --- a/examples/transformer_tts/README.md +++ b/examples/transformer_tts/README.md @@ -50,7 +50,7 @@ python -m paddle.distributed.launch --selected_gpus=0,1,2,3 --log_dir ./mylog tr if you wish to resume from an existing model, please set ``--checkpoint_path`` and ``--transformer_step`` -For more help on arguments: +For more help on arguments: ``python train_transformer.py --help``. ## Train Vocoder @@ -78,7 +78,7 @@ python -m paddle.distributed.launch --selected_gpus=0,1,2,3 --log_dir ./mylog tr ``` if you wish to resume from an existing model, please set ``--checkpoint_path`` and ``--vocoder_step`` -For more help on arguments: +For more help on arguments: ``python train_vocoder.py --help``. ## Synthesis @@ -101,5 +101,5 @@ sh synthesis.sh And the audio file will be saved in ``--sample_path``. -For more help on arguments: +For more help on arguments: ``python synthesis.py --help``.
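Both `train.py` files in this diff build their optimizer with `dg.NoamDecay(1 / (cfg['warm_up_step'] * (args.lr**2)), cfg['warm_up_step'])`. Assuming Paddle's `NoamDecay(d_model, warmup_steps)` follows the standard Noam schedule `d_model**-0.5 * min(step**-0.5, step * warmup**-1.5)`, this particular `d_model` makes the schedule peak at exactly `args.lr` when `step == warm_up_step`. A quick check (the warm-up value is illustrative; the real one lives in the yaml config, which the diff does not show):

```python
# Why NoamDecay(1 / (warm_up_step * lr**2), warm_up_step) peaks at lr,
# assuming the standard Noam formula; 4000 is an illustrative warm-up.
lr, warm_up_step = 0.001, 4000  # --lr default from parse.py
d_model = 1 / (warm_up_step * lr ** 2)  # the value handed to NoamDecay


def noam(step):
    return d_model ** -0.5 * min(step ** -0.5, step * warm_up_step ** -1.5)


assert abs(noam(warm_up_step) - lr) < 1e-9  # peak learning rate == args.lr
print(noam(1), noam(warm_up_step), noam(10 * warm_up_step))
```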
diff --git a/examples/transformer_tts/data.py b/examples/transformer_tts/data.py index 9401b7b..99c6739 100644 --- a/examples/transformer_tts/data.py +++ b/examples/transformer_tts/data.py @@ -1,3 +1,16 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. from pathlib import Path import numpy as np import pandas as pd @@ -12,23 +25,43 @@ from parakeet.data.datacargo import DataCargo from parakeet.data.batch import TextIDBatcher, SpecBatcher from parakeet.data.dataset import DatasetMixin, TransformDataset + class LJSpeechLoader: - def __init__(self, config, args, nranks, rank, is_vocoder=False, shuffle=True): + def __init__(self, + config, + args, + nranks, + rank, + is_vocoder=False, + shuffle=True): place = fluid.CUDAPlace(rank) if args.use_gpu else fluid.CPUPlace() LJSPEECH_ROOT = Path(args.data_path) metadata = LJSpeechMetaData(LJSPEECH_ROOT) transformer = LJSpeech(config) dataset = TransformDataset(metadata, transformer) - sampler = DistributedSampler(len(metadata), nranks, rank, shuffle=shuffle) + sampler = DistributedSampler( + len(metadata), nranks, rank, shuffle=shuffle) assert args.batch_size % nranks == 0 each_bs = args.batch_size // nranks if is_vocoder: - dataloader = DataCargo(dataset, sampler=sampler, batch_size=each_bs, shuffle=shuffle, batch_fn=batch_examples_vocoder, drop_last=True) + dataloader = DataCargo( + dataset, + sampler=sampler, + batch_size=each_bs, + shuffle=shuffle, + batch_fn=batch_examples_vocoder, + drop_last=True) else: - dataloader = DataCargo(dataset, sampler=sampler, batch_size=each_bs, shuffle=shuffle, batch_fn=batch_examples, drop_last=True) - + dataloader = DataCargo( + dataset, + sampler=sampler, + batch_size=each_bs, + shuffle=shuffle, + batch_fn=batch_examples, + drop_last=True) + self.reader = fluid.io.DataLoader.from_generator( capacity=32, iterable=True, @@ -63,13 +96,13 @@ class LJSpeech(object): super(LJSpeech, self).__init__() self.config = config self._ljspeech_processor = audio.AudioProcessor( - sample_rate=config['audio']['sr'], - num_mels=config['audio']['num_mels'], - min_level_db=config['audio']['min_level_db'], - ref_level_db=config['audio']['ref_level_db'], - n_fft=config['audio']['n_fft'], - win_length= config['audio']['win_length'], - hop_length= config['audio']['hop_length'], + sample_rate=config['audio']['sr'], + num_mels=config['audio']['num_mels'], + min_level_db=config['audio']['min_level_db'], + ref_level_db=config['audio']['ref_level_db'], + n_fft=config['audio']['n_fft'], + win_length=config['audio']['win_length'], + hop_length=config['audio']['hop_length'], power=config['audio']['power'], preemphasis=config['audio']['preemphasis'], signal_norm=True, @@ -81,7 +114,7 @@ class LJSpeech(object): griffin_lim_iters=60, do_trim_silence=False, sound_norm=False) - + def __call__(self, metadatum): """All the code for generating an Example from a metadatum. If you want a different preprocessing pipeline, you can override this method. 
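`batch_examples` in the hunks below sorts seven parallel lists by descending text length, repeating the idiom `[i for i, _ in sorted(zip(xs, text_lens), key=lambda x: x[1], reverse=True)]` once per list so they all end up in the same order. The idiom on toy data:

```python
# The reordering idiom repeated in batch_examples below: sort any list by
# descending text length while keeping it aligned with its siblings.
texts = ['ab', 'abcd', 'a']
text_lens = [2, 4, 1]
mels = ['mel0', 'mel1', 'mel2']


def by_text_len(xs):
    return [
        i for i, _ in sorted(
            zip(xs, text_lens), key=lambda x: x[1], reverse=True)
    ]


assert by_text_len(texts) == ['abcd', 'ab', 'a']
assert by_text_len(mels) == ['mel1', 'mel0', 'mel2']  # same permutation
```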
@@ -90,13 +123,15 @@ class LJSpeech(object): method. """ fname, raw_text, normalized_text = metadatum - + # load -> trim -> preemphasis -> stft -> magnitude -> mel_scale -> logscale -> normalize wav = self._ljspeech_processor.load_wav(str(fname)) mag = self._ljspeech_processor.spectrogram(wav).astype(np.float32) mel = self._ljspeech_processor.melspectrogram(wav).astype(np.float32) - phonemes = np.array(g2p.en.text_to_sequence(normalized_text), dtype=np.int64) - return (mag, mel, phonemes) # maybe we need to implement it as a map in the future + phonemes = np.array( + g2p.en.text_to_sequence(normalized_text), dtype=np.int64) + return (mag, mel, phonemes + ) # maybe we need to implement it as a map in the future def batch_examples(batch): @@ -109,44 +144,71 @@ def batch_examples(batch): pos_mels = [] for data in batch: _, mel, text = data - mel_inputs.append(np.concatenate([np.zeros([mel.shape[0], 1], np.float32), mel[:,:-1]], axis=-1)) + mel_inputs.append( + np.concatenate( + [np.zeros([mel.shape[0], 1], np.float32), mel[:, :-1]], + axis=-1)) mel_lens.append(mel.shape[1]) text_lens.append(len(text)) pos_texts.append(np.arange(1, len(text) + 1)) pos_mels.append(np.arange(1, mel.shape[1] + 1)) mels.append(mel) texts.append(text) - + # Sort by text_len in descending order - texts = [i for i,_ in sorted(zip(texts, text_lens), key=lambda x: x[1], reverse=True)] - mels = [i for i,_ in sorted(zip(mels, text_lens), key=lambda x: x[1], reverse=True)] - mel_inputs = [i for i,_ in sorted(zip(mel_inputs, text_lens), key=lambda x: x[1], reverse=True)] - mel_lens = [i for i,_ in sorted(zip(mel_lens, text_lens), key=lambda x: x[1], reverse=True)] - pos_texts = [i for i,_ in sorted(zip(pos_texts, text_lens), key=lambda x: x[1], reverse=True)] - pos_mels = [i for i,_ in sorted(zip(pos_mels, text_lens), key=lambda x: x[1], reverse=True)] + texts = [ + i + for i, _ in sorted( + zip(texts, text_lens), key=lambda x: x[1], reverse=True) + ] + mels = [ + i + for i, _ in sorted( + zip(mels, text_lens), key=lambda x: x[1], reverse=True) + ] + mel_inputs = [ + i + for i, _ in sorted( + zip(mel_inputs, text_lens), key=lambda x: x[1], reverse=True) + ] + mel_lens = [ + i + for i, _ in sorted( + zip(mel_lens, text_lens), key=lambda x: x[1], reverse=True) + ] + pos_texts = [ + i + for i, _ in sorted( + zip(pos_texts, text_lens), key=lambda x: x[1], reverse=True) + ] + pos_mels = [ + i + for i, _ in sorted( + zip(pos_mels, text_lens), key=lambda x: x[1], reverse=True) + ] text_lens = sorted(text_lens, reverse=True) # Pad sequence with largest len of the batch - texts = TextIDBatcher(pad_id=0)(texts) #(B, T) - pos_texts = TextIDBatcher(pad_id=0)(pos_texts) #(B,T) - pos_mels = TextIDBatcher(pad_id=0)(pos_mels) #(B,T) - mels = np.transpose(SpecBatcher(pad_value=0.)(mels), axes=(0,2,1)) #(B,T,num_mels) - mel_inputs = np.transpose(SpecBatcher(pad_value=0.)(mel_inputs), axes=(0,2,1))#(B,T,num_mels) - return (texts, mels, mel_inputs, pos_texts, pos_mels, np.array(text_lens), np.array(mel_lens)) + texts = TextIDBatcher(pad_id=0)(texts) #(B, T) + pos_texts = TextIDBatcher(pad_id=0)(pos_texts) #(B,T) + pos_mels = TextIDBatcher(pad_id=0)(pos_mels) #(B,T) + mels = np.transpose( + SpecBatcher(pad_value=0.)(mels), axes=(0, 2, 1)) #(B,T,num_mels) + mel_inputs = np.transpose( + SpecBatcher(pad_value=0.)(mel_inputs), axes=(0, 2, 1)) #(B,T,num_mels) + return (texts, mels, mel_inputs, pos_texts, pos_mels, np.array(text_lens), + np.array(mel_lens)) + def batch_examples_vocoder(batch): - mels=[] - mags=[] + mels = [] + mags = [] for data in 
batch: mag, mel, _ = data mels.append(mel) mags.append(mag) - mels = np.transpose(SpecBatcher(pad_value=0.)(mels), axes=(0,2,1)) - mags = np.transpose(SpecBatcher(pad_value=0.)(mags), axes=(0,2,1)) + mels = np.transpose(SpecBatcher(pad_value=0.)(mels), axes=(0, 2, 1)) + mags = np.transpose(SpecBatcher(pad_value=0.)(mags), axes=(0, 2, 1)) return (mels, mags) - - - - diff --git a/examples/transformer_tts/parse.py b/examples/transformer_tts/parse.py index aebce96..e7f124a 100644 --- a/examples/transformer_tts/parse.py +++ b/examples/transformer_tts/parse.py @@ -1,38 +1,100 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. import argparse + def add_config_options_to_parser(parser): - parser.add_argument('--config_path', type=str, default='config/train_transformer.yaml', + parser.add_argument( + '--config_path', + type=str, + default='config/train_transformer.yaml', help="the yaml config file path.") - parser.add_argument('--batch_size', type=int, default=32, - help="batch size for training.") - parser.add_argument('--epochs', type=int, default=10000, + parser.add_argument( + '--batch_size', type=int, default=32, help="batch size for training.") + parser.add_argument( + '--epochs', + type=int, + default=10000, help="the number of epoch for training.") - parser.add_argument('--lr', type=float, default=0.001, + parser.add_argument( + '--lr', + type=float, + default=0.001, help="the learning rate for training.") - parser.add_argument('--save_step', type=int, default=500, + parser.add_argument( + '--save_step', + type=int, + default=500, help="checkpointing interval during training.") - parser.add_argument('--image_step', type=int, default=2000, + parser.add_argument( + '--image_step', + type=int, + default=2000, help="attention image interval during training.") - parser.add_argument('--max_len', type=int, default=400, + parser.add_argument( + '--max_len', + type=int, + default=400, help="The max length of audio during synthesis.") - parser.add_argument('--transformer_step', type=int, default=160000, + parser.add_argument( + '--transformer_step', + type=int, + default=160000, help="Global step to restore checkpoint of transformer.") - parser.add_argument('--vocoder_step', type=int, default=90000, + parser.add_argument( + '--vocoder_step', + type=int, + default=90000, help="Global step to restore checkpoint of postnet.") - parser.add_argument('--use_gpu', type=int, default=1, + parser.add_argument( + '--use_gpu', + type=int, + default=1, help="use gpu or not during training.") - parser.add_argument('--use_data_parallel', type=int, default=0, + parser.add_argument( + '--use_data_parallel', + type=int, + default=0, help="use data parallel or not during training.") - parser.add_argument('--stop_token', type=int, default=0, + parser.add_argument( + '--stop_token', + type=int, + default=0, help="use stop token loss in network or not.") - parser.add_argument('--data_path', type=str, default='./dataset/LJSpeech-1.1', + parser.add_argument(
'--data_path', + type=str, + default='./dataset/LJSpeech-1.1', help="the path of dataset.") - parser.add_argument('--checkpoint_path', type=str, default=None, + parser.add_argument( + '--checkpoint_path', + type=str, + default=None, help="the path to load checkpoint or pretrain model.") - parser.add_argument('--save_path', type=str, default='./checkpoint', + parser.add_argument( + '--save_path', + type=str, + default='./checkpoint', help="the path to save checkpoint.") - parser.add_argument('--log_dir', type=str, default='./log', + parser.add_argument( + '--log_dir', + type=str, + default='./log', help="the directory to save tensorboard log.") - parser.add_argument('--sample_path', type=str, default='./sample', + parser.add_argument( + '--sample_path', + type=str, + default='./sample', help="the directory to save audio sample in synthesis.") diff --git a/examples/transformer_tts/synthesis.py b/examples/transformer_tts/synthesis.py index fb1bd2f..de83362 100644 --- a/examples/transformer_tts/synthesis.py +++ b/examples/transformer_tts/synthesis.py @@ -1,3 +1,16 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. import os from scipy.io.wavfile import write from parakeet.g2p.en import text_to_sequence @@ -16,6 +29,7 @@ from parakeet import audio from parakeet.models.transformer_tts.vocoder import Vocoder from parakeet.models.transformer_tts.transformer_tts import TransformerTTS + def load_checkpoint(step, model_path): model_dict, _ = fluid.dygraph.load_dygraph(os.path.join(model_path, step)) new_state_dict = OrderedDict() @@ -26,6 +40,7 @@ def load_checkpoint(step, model_path): new_state_dict[param] = model_dict[param] return new_state_dict + def synthesis(text_input, args): place = (fluid.CUDAPlace(0) if args.use_gpu else fluid.CPUPlace()) @@ -34,46 +49,53 @@ def synthesis(text_input, args): # tensorboard if not os.path.exists(args.log_dir): - os.mkdir(args.log_dir) - path = os.path.join(args.log_dir,'synthesis') + os.mkdir(args.log_dir) + path = os.path.join(args.log_dir, 'synthesis') writer = SummaryWriter(path) with dg.guard(place): with fluid.unique_name.guard(): model = TransformerTTS(cfg) - model.set_dict(load_checkpoint(str(args.transformer_step), os.path.join(args.checkpoint_path, "transformer"))) + model.set_dict( + load_checkpoint( + str(args.transformer_step), + os.path.join(args.checkpoint_path, "transformer"))) model.eval() - + with fluid.unique_name.guard(): model_vocoder = Vocoder(cfg, args.batch_size) - model_vocoder.set_dict(load_checkpoint(str(args.vocoder_step), os.path.join(args.checkpoint_path, "vocoder"))) + model_vocoder.set_dict( + load_checkpoint( + str(args.vocoder_step), + os.path.join(args.checkpoint_path, "vocoder"))) model_vocoder.eval() # init input text = np.asarray(text_to_sequence(text_input)) - text = fluid.layers.unsqueeze(dg.to_variable(text),[0]) - mel_input = dg.to_variable(np.zeros([1,1,80])).astype(np.float32) - pos_text = np.arange(1, text.shape[1]+1) - pos_text = 
fluid.layers.unsqueeze(dg.to_variable(pos_text),[0]) - + text = fluid.layers.unsqueeze(dg.to_variable(text), [0]) + mel_input = dg.to_variable(np.zeros([1, 1, 80])).astype(np.float32) + pos_text = np.arange(1, text.shape[1] + 1) + pos_text = fluid.layers.unsqueeze(dg.to_variable(pos_text), [0]) pbar = tqdm(range(args.max_len)) for i in pbar: - pos_mel = np.arange(1, mel_input.shape[1]+1) - pos_mel = fluid.layers.unsqueeze(dg.to_variable(pos_mel),[0]) - mel_pred, postnet_pred, attn_probs, stop_preds, attn_enc, attn_dec = model(text, mel_input, pos_text, pos_mel) - mel_input = fluid.layers.concat([mel_input, postnet_pred[:,-1:,:]], axis=1) + pos_mel = np.arange(1, mel_input.shape[1] + 1) + pos_mel = fluid.layers.unsqueeze(dg.to_variable(pos_mel), [0]) + mel_pred, postnet_pred, attn_probs, stop_preds, attn_enc, attn_dec = model( + text, mel_input, pos_text, pos_mel) + mel_input = fluid.layers.concat( + [mel_input, postnet_pred[:, -1:, :]], axis=1) mag_pred = model_vocoder(postnet_pred) _ljspeech_processor = audio.AudioProcessor( - sample_rate=cfg['audio']['sr'], - num_mels=cfg['audio']['num_mels'], - min_level_db=cfg['audio']['min_level_db'], - ref_level_db=cfg['audio']['ref_level_db'], - n_fft=cfg['audio']['n_fft'], - win_length= cfg['audio']['win_length'], - hop_length= cfg['audio']['hop_length'], + sample_rate=cfg['audio']['sr'], + num_mels=cfg['audio']['num_mels'], + min_level_db=cfg['audio']['min_level_db'], + ref_level_db=cfg['audio']['ref_level_db'], + n_fft=cfg['audio']['n_fft'], + win_length=cfg['audio']['win_length'], + hop_length=cfg['audio']['hop_length'], power=cfg['audio']['power'], preemphasis=cfg['audio']['preemphasis'], signal_norm=True, @@ -86,13 +108,18 @@ def synthesis(text_input, args): do_trim_silence=False, sound_norm=False) - wav = _ljspeech_processor.inv_spectrogram(fluid.layers.transpose(fluid.layers.squeeze(mag_pred,[0]), [1,0]).numpy()) + wav = _ljspeech_processor.inv_spectrogram( + fluid.layers.transpose( + fluid.layers.squeeze(mag_pred, [0]), [1, 0]).numpy()) writer.add_audio(text_input, wav, 0, cfg['audio']['sr']) if not os.path.exists(args.sample_path): os.mkdir(args.sample_path) - write(os.path.join(args.sample_path,'test.wav'), cfg['audio']['sr'], wav) + write( + os.path.join(args.sample_path, 'test.wav'), cfg['audio']['sr'], + wav) writer.close() + if __name__ == '__main__': parser = argparse.ArgumentParser(description="Synthesis model") add_config_options_to_parser(parser) diff --git a/examples/transformer_tts/train_transformer.py b/examples/transformer_tts/train_transformer.py index cbca569..f3dd023 100644 --- a/examples/transformer_tts/train_transformer.py +++ b/examples/transformer_tts/train_transformer.py @@ -1,3 +1,16 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
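The loop above decodes autoregressively: `mel_input` starts as a single all-zero frame of shape `[1, 1, 80]`, and every iteration appends only the newest postnet frame (`postnet_pred[:, -1:, :]`), so the mel positions run 1, 2, … up to `--max_len`. The shape bookkeeping with a stub in place of the real model:

```python
# Shape bookkeeping of the synthesis loop above, with a numpy stub standing
# in for the TransformerTTS forward pass.
import numpy as np

num_mels, max_len = 80, 5
mel_input = np.zeros([1, 1, num_mels], np.float32)  # the all-zero "go" frame


def stub_model(mel_input, pos_mel):
    # pretend decoder: one 80-dim output frame per input frame
    return np.random.rand(1, mel_input.shape[1], num_mels).astype(np.float32)


for _ in range(max_len):
    pos_mel = np.arange(1, mel_input.shape[1] + 1)[None, :]  # 1-based positions
    postnet_pred = stub_model(mel_input, pos_mel)
    # grow the input by exactly one frame, as postnet_pred[:, -1:, :] does
    mel_input = np.concatenate([mel_input, postnet_pred[:, -1:, :]], axis=1)

assert mel_input.shape == (1, 1 + max_len, num_mels)
```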
import os from tqdm import tqdm from tensorboardX import SummaryWriter @@ -16,8 +29,10 @@ from parakeet.models.transformer_tts.utils import cross_entropy from data import LJSpeechLoader from parakeet.models.transformer_tts.transformer_tts import TransformerTTS + def load_checkpoint(step, model_path): - model_dict, opti_dict = fluid.dygraph.load_dygraph(os.path.join(model_path, step)) + model_dict, opti_dict = fluid.dygraph.load_dygraph( + os.path.join(model_path, step)) new_state_dict = OrderedDict() for param in model_dict: if param.startswith('_layers.'): @@ -40,22 +55,27 @@ def main(args): if args.use_gpu else fluid.CPUPlace()) if not os.path.exists(args.log_dir): - os.mkdir(args.log_dir) - path = os.path.join(args.log_dir,'transformer') + os.mkdir(args.log_dir) + path = os.path.join(args.log_dir, 'transformer') writer = SummaryWriter(path) if local_rank == 0 else None - + with dg.guard(place): model = TransformerTTS(cfg) model.train() - optimizer = fluid.optimizer.AdamOptimizer(learning_rate=dg.NoamDecay(1/(cfg['warm_up_step'] *( args.lr ** 2)), cfg['warm_up_step']), - parameter_list=model.parameters()) - - reader = LJSpeechLoader(cfg, args, nranks, local_rank, shuffle=True).reader() + optimizer = fluid.optimizer.AdamOptimizer( + learning_rate=dg.NoamDecay(1 / ( + cfg['warm_up_step'] * (args.lr**2)), cfg['warm_up_step']), + parameter_list=model.parameters()) + + reader = LJSpeechLoader( + cfg, args, nranks, local_rank, shuffle=True).reader() if args.checkpoint_path is not None: - model_dict, opti_dict = load_checkpoint(str(args.transformer_step), os.path.join(args.checkpoint_path, "transformer")) + model_dict, opti_dict = load_checkpoint( + str(args.transformer_step), + os.path.join(args.checkpoint_path, "transformer")) model.set_dict(model_dict) optimizer.set_dict(opti_dict) global_step = args.transformer_step @@ -64,86 +84,112 @@ def main(args): if args.use_data_parallel: strategy = dg.parallel.prepare_context() model = fluid.dygraph.parallel.DataParallel(model, strategy) - + for epoch in range(args.epochs): pbar = tqdm(reader) for i, data in enumerate(pbar): - pbar.set_description('Processing at epoch %d'%epoch) + pbar.set_description('Processing at epoch %d' % epoch) character, mel, mel_input, pos_text, pos_mel, text_length, _ = data global_step += 1 - mel_pred, postnet_pred, attn_probs, stop_preds, attn_enc, attn_dec = model(character, mel_input, pos_text, pos_mel) - + mel_pred, postnet_pred, attn_probs, stop_preds, attn_enc, attn_dec = model( + character, mel_input, pos_text, pos_mel) label = (pos_mel == 0).astype(np.float32) - - mel_loss = layers.mean(layers.abs(layers.elementwise_sub(mel_pred, mel))) - post_mel_loss = layers.mean(layers.abs(layers.elementwise_sub(postnet_pred, mel))) + + mel_loss = layers.mean( + layers.abs(layers.elementwise_sub(mel_pred, mel))) + post_mel_loss = layers.mean( + layers.abs(layers.elementwise_sub(postnet_pred, mel))) loss = mel_loss + post_mel_loss # Note: When used stop token loss the learning did not work. 
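The stop-token target above is derived from the position indices: `pos_mel` numbers real frames from 1 and leaves padding at 0, so `(pos_mel == 0)` marks exactly the padded tail of each utterance (the in-code note records that training did not converge with this loss enabled, which is why `--stop_token` defaults to 0). In numpy terms:

```python
# How the stop-token label in train_transformer.py is built from pos_mel.
import numpy as np

pos_mel = np.array([[1, 2, 3, 0, 0]])      # 3 real frames, 2 padded
label = (pos_mel == 0).astype(np.float32)  # 1.0 exactly on the padding
assert label.tolist() == [[0.0, 0.0, 0.0, 1.0, 1.0]]
```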
if args.stop_token: stop_loss = cross_entropy(stop_preds, label) loss = loss + stop_loss - if local_rank==0: + if local_rank == 0: writer.add_scalars('training_loss', { - 'mel_loss':mel_loss.numpy(), - 'post_mel_loss':post_mel_loss.numpy() + 'mel_loss': mel_loss.numpy(), + 'post_mel_loss': post_mel_loss.numpy() }, global_step) if args.stop_token: - writer.add_scalar('stop_loss', stop_loss.numpy(), global_step) + writer.add_scalar('stop_loss', + stop_loss.numpy(), global_step) if args.use_data_parallel: writer.add_scalars('alphas', { - 'encoder_alpha':model._layers.encoder.alpha.numpy(), - 'decoder_alpha':model._layers.decoder.alpha.numpy(), + 'encoder_alpha': + model._layers.encoder.alpha.numpy(), + 'decoder_alpha': + model._layers.decoder.alpha.numpy(), }, global_step) else: writer.add_scalars('alphas', { - 'encoder_alpha':model.encoder.alpha.numpy(), - 'decoder_alpha':model.decoder.alpha.numpy(), + 'encoder_alpha': model.encoder.alpha.numpy(), + 'decoder_alpha': model.decoder.alpha.numpy(), }, global_step) - writer.add_scalar('learning_rate', optimizer._learning_rate.step().numpy(), global_step) + writer.add_scalar('learning_rate', + optimizer._learning_rate.step().numpy(), + global_step) if global_step % args.image_step == 1: for i, prob in enumerate(attn_probs): for j in range(4): - x = np.uint8(cm.viridis(prob.numpy()[j*16]) * 255) - writer.add_image('Attention_%d_0'%global_step, x, i*4+j, dataformats="HWC") + x = np.uint8( + cm.viridis(prob.numpy()[j * 16]) * 255) + writer.add_image( + 'Attention_%d_0' % global_step, + x, + i * 4 + j, + dataformats="HWC") for i, prob in enumerate(attn_enc): for j in range(4): - x = np.uint8(cm.viridis(prob.numpy()[j*16]) * 255) - writer.add_image('Attention_enc_%d_0'%global_step, x, i*4+j, dataformats="HWC") + x = np.uint8( + cm.viridis(prob.numpy()[j * 16]) * 255) + writer.add_image( + 'Attention_enc_%d_0' % global_step, + x, + i * 4 + j, + dataformats="HWC") for i, prob in enumerate(attn_dec): for j in range(4): - x = np.uint8(cm.viridis(prob.numpy()[j*16]) * 255) - writer.add_image('Attention_dec_%d_0'%global_step, x, i*4+j, dataformats="HWC") - + x = np.uint8( + cm.viridis(prob.numpy()[j * 16]) * 255) + writer.add_image( + 'Attention_dec_%d_0' % global_step, + x, + i * 4 + j, + dataformats="HWC") + if args.use_data_parallel: loss = model.scale_loss(loss) loss.backward() model.apply_collective_grads() else: loss.backward() - optimizer.minimize(loss, grad_clip = fluid.dygraph_grad_clip.GradClipByGlobalNorm(cfg['grad_clip_thresh'])) + optimizer.minimize( + loss, + grad_clip=fluid.dygraph_grad_clip.GradClipByGlobalNorm(cfg[ + 'grad_clip_thresh'])) model.clear_gradients() - + # save checkpoint - if local_rank==0 and global_step % args.save_step == 0: + if local_rank == 0 and global_step % args.save_step == 0: if not os.path.exists(args.save_path): os.mkdir(args.save_path) - save_path = os.path.join(args.save_path,'transformer/%d' % global_step) + save_path = os.path.join(args.save_path, + 'transformer/%d' % global_step) dg.save_dygraph(model.state_dict(), save_path) dg.save_dygraph(optimizer.state_dict(), save_path) - if local_rank==0: + if local_rank == 0: writer.close() - -if __name__ =='__main__': + +if __name__ == '__main__': parser = argparse.ArgumentParser(description="Train TransformerTTS model") add_config_options_to_parser(parser) diff --git a/examples/transformer_tts/train_vocoder.py b/examples/transformer_tts/train_vocoder.py index 857fdf0..7896223 100644 --- a/examples/transformer_tts/train_vocoder.py +++ 
b/examples/transformer_tts/train_vocoder.py @@ -1,3 +1,16 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. from tensorboardX import SummaryWriter import os from tqdm import tqdm @@ -13,6 +26,7 @@ import paddle.fluid.layers as layers from data import LJSpeechLoader from parakeet.models.transformer_tts.vocoder import Vocoder + def load_checkpoint(step, model_path): model_dict, opti_dict = dg.load_dygraph(os.path.join(model_path, step)) new_state_dict = OrderedDict() @@ -23,8 +37,9 @@ def load_checkpoint(step, model_path): new_state_dict[param] = model_dict[param] return new_state_dict, opti_dict + def main(args): - + local_rank = dg.parallel.Env().local_rank if args.use_data_parallel else 0 nranks = dg.parallel.Env().nranks if args.use_data_parallel else 1 @@ -35,23 +50,26 @@ def main(args): place = (fluid.CUDAPlace(dg.parallel.Env().dev_id) if args.use_data_parallel else fluid.CUDAPlace(0) if args.use_gpu else fluid.CPUPlace()) - + if not os.path.exists(args.log_dir): - os.mkdir(args.log_dir) - path = os.path.join(args.log_dir,'vocoder') + os.mkdir(args.log_dir) + path = os.path.join(args.log_dir, 'vocoder') writer = SummaryWriter(path) if local_rank == 0 else None - with dg.guard(place): + with dg.guard(place): model = Vocoder(cfg, args.batch_size) model.train() - optimizer = fluid.optimizer.AdamOptimizer(learning_rate=dg.NoamDecay(1/(cfg['warm_up_step'] *( args.lr ** 2)), cfg['warm_up_step']), - parameter_list=model.parameters()) - + optimizer = fluid.optimizer.AdamOptimizer( + learning_rate=dg.NoamDecay(1 / ( + cfg['warm_up_step'] * (args.lr**2)), cfg['warm_up_step']), + parameter_list=model.parameters()) if args.checkpoint_path is not None: - model_dict, opti_dict = load_checkpoint(str(args.vocoder_step), os.path.join(args.checkpoint_path, "vocoder")) + model_dict, opti_dict = load_checkpoint( + str(args.vocoder_step), + os.path.join(args.checkpoint_path, "vocoder")) model.set_dict(model_dict) optimizer.set_dict(opti_dict) global_step = args.vocoder_step @@ -61,48 +79,55 @@ def main(args): strategy = dg.parallel.prepare_context() model = fluid.dygraph.parallel.DataParallel(model, strategy) - reader = LJSpeechLoader(cfg, args, nranks, local_rank, is_vocoder=True).reader() + reader = LJSpeechLoader( + cfg, args, nranks, local_rank, is_vocoder=True).reader() for epoch in range(args.epochs): pbar = tqdm(reader) for i, data in enumerate(pbar): - pbar.set_description('Processing at epoch %d'%epoch) + pbar.set_description('Processing at epoch %d' % epoch) mel, mag = data mag = dg.to_variable(mag.numpy()) mel = dg.to_variable(mel.numpy()) global_step += 1 mag_pred = model(mel) - loss = layers.mean(layers.abs(layers.elementwise_sub(mag_pred, mag))) - + loss = layers.mean( + layers.abs(layers.elementwise_sub(mag_pred, mag))) + if args.use_data_parallel: loss = model.scale_loss(loss) loss.backward() model.apply_collective_grads() else: loss.backward() - optimizer.minimize(loss, grad_clip = 
fluid.dygraph_grad_clip.GradClipByGlobalNorm(cfg['grad_clip_thresh'])) + optimizer.minimize( + loss, + grad_clip=fluid.dygraph_grad_clip.GradClipByGlobalNorm(cfg[ + 'grad_clip_thresh'])) model.clear_gradients() - - if local_rank==0: - writer.add_scalars('training_loss',{ - 'loss':loss.numpy(), + + if local_rank == 0: + writer.add_scalars('training_loss', { + 'loss': loss.numpy(), }, global_step) if global_step % args.save_step == 0: if not os.path.exists(args.save_path): os.mkdir(args.save_path) - save_path = os.path.join(args.save_path,'vocoder/%d' % global_step) + save_path = os.path.join(args.save_path, + 'vocoder/%d' % global_step) dg.save_dygraph(model.state_dict(), save_path) dg.save_dygraph(optimizer.state_dict(), save_path) - if local_rank==0: + if local_rank == 0: writer.close() + if __name__ == '__main__': parser = argparse.ArgumentParser(description="Train vocoder model") add_config_options_to_parser(parser) args = parser.parse_args() # Print the whole config setting. pprint(args) - main(args) \ No newline at end of file + main(args) diff --git a/examples/waveflow/benchmark.py b/examples/waveflow/benchmark.py index 24d83c4..3badeda 100644 --- a/examples/waveflow/benchmark.py +++ b/examples/waveflow/benchmark.py @@ -1,3 +1,17 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import os import random from pprint import pprint diff --git a/examples/waveflow/synthesis.py b/examples/waveflow/synthesis.py index 76df229..0647e94 100644 --- a/examples/waveflow/synthesis.py +++ b/examples/waveflow/synthesis.py @@ -1,3 +1,17 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import os import random from pprint import pprint diff --git a/examples/waveflow/train.py b/examples/waveflow/train.py index 92bb9ef..32059c8 100644 --- a/examples/waveflow/train.py +++ b/examples/waveflow/train.py @@ -1,3 +1,17 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + import os import random import subprocess diff --git a/examples/waveflow/utils.py b/examples/waveflow/utils.py index 51f6296..da9b4ba 100644 --- a/examples/waveflow/utils.py +++ b/examples/waveflow/utils.py @@ -1,3 +1,17 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import itertools import os import time diff --git a/parakeet/__init__.py b/parakeet/__init__.py index 9dbb99b..9be1aaf 100644 --- a/parakeet/__init__.py +++ b/parakeet/__init__.py @@ -1,3 +1,17 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + __version__ = "0.0.0" from . import data, g2p, models, modules diff --git a/parakeet/audio/__init__.py b/parakeet/audio/__init__.py index 6212dee..253a887 100644 --- a/parakeet/audio/__init__.py +++ b/parakeet/audio/__init__.py @@ -1 +1,15 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + from .audio import AudioProcessor \ No newline at end of file diff --git a/parakeet/audio/audio.py b/parakeet/audio/audio.py index b861a39..9133a47 100644 --- a/parakeet/audio/audio.py +++ b/parakeet/audio/audio.py @@ -1,30 +1,46 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+
import librosa
import soundfile as sf
import numpy as np
import scipy.io
import scipy.signal
+
class AudioProcessor(object):
- def __init__(self,
- sample_rate=None, # int, sampling rate
- num_mels=None, # int, bands of mel spectrogram
- min_level_db=None, # float, minimum level db
- ref_level_db=None, # float, reference level db
- n_fft=None, # int: number of samples in a frame for stft
- win_length=None, # int: the same meaning with n_fft
- hop_length=None, # int: number of samples between neighboring frame
- power=None, # float:power to raise before griffin-lim
- preemphasis=None, # float: preemphasis coefficident
- signal_norm=None, #
- symmetric_norm=False, # bool, apply clip norm in [-max_norm, max_form]
- max_norm=None, # float, max norm
- mel_fmin=None, # int: mel spectrogram's minimum frequency
- mel_fmax=None, # int: mel spectrogram's maximum frequency
- clip_norm=True, # bool: clip spectrogram's norm
- griffin_lim_iters=None, # int:
- do_trim_silence=False, # bool: trim silence
- sound_norm=False,
- **kwargs):
+ def __init__(
+ self,
+ sample_rate=None, # int, sampling rate
+ num_mels=None, # int, bands of mel spectrogram
+ min_level_db=None, # float, minimum level db
+ ref_level_db=None, # float, reference level db
+ n_fft=None, # int: number of samples in a frame for stft
+ win_length=None, # int: the same meaning as n_fft
+ hop_length=None, # int: number of samples between neighboring frames
+ power=None, # float: power to raise before griffin-lim
+ preemphasis=None, # float: preemphasis coefficient
+ signal_norm=None, #
+ symmetric_norm=False, # bool, apply clip norm in [-max_norm, max_norm]
+ max_norm=None, # float, max norm
+ mel_fmin=None, # int: mel spectrogram's minimum frequency
+ mel_fmax=None, # int: mel spectrogram's maximum frequency
+ clip_norm=True, # bool: clip spectrogram's norm
+ griffin_lim_iters=None, # int:
+ do_trim_silence=False, # bool: trim silence
+ sound_norm=False,
+ **kwargs):
self.sample_rate = sample_rate
self.num_mels = num_mels
self.min_level_db = min_level_db
@@ -34,8 +50,8 @@ class AudioProcessor(object):
self.n_fft = n_fft
self.win_length = win_length or n_fft
# hop length defaults to 1/4 window_length
- self.hop_length = hop_length or 0.25 * self.win_length
-
+ self.hop_length = hop_length or 0.25 * self.win_length
+
self.power = power
self.preemphasis = float(preemphasis)
@@ -52,7 +68,8 @@ class AudioProcessor(object):
self.do_trim_silence = do_trim_silence
self.sound_norm = sound_norm
- self.num_freq, self.frame_length_ms, self.frame_shift_ms = self._stft_parameters()
+ self.num_freq, self.frame_length_ms, self.frame_shift_ms = self._stft_parameters(
+ )
def _stft_parameters(self):
"""compute frame length and hop length in ms"""
@@ -65,44 +82,54 @@ class AudioProcessor(object):
"""object repr"""
cls_name_str = self.__class__.__name__
members = vars(self)
- dict_str = "\n".join([" {}: {},".format(k, v) for k, v in members.items()])
+ dict_str = "\n".join(
+ [" {}: {},".format(k, v) for k, v in members.items()])
repr_str = "{}(\n{})\n".format(cls_name_str, dict_str)
return repr_str
def save_wav(self, path, wav):
"""save audio with scipy.io.wavfile in 16bit integers"""
wav_norm = wav * (32767 / max(0.01, np.max(np.abs(wav))))
- scipy.io.wavfile.write(path, self.sample_rate, wav_norm.as_type(np.int16))
+ scipy.io.wavfile.write(path, self.sample_rate,
+ wav_norm.astype(np.int16))
def load_wav(self, path, sr=None):
"""load wav -> trim_silence -> rescale"""
x, sr = librosa.load(path, sr=None)
- assert self.sample_rate == sr, "audio sample rate: {}Hz != processor sample rate: {}Hz".format(sr, self.sample_rate)
+ assert self.sample_rate == sr, "audio sample rate: {}Hz != processor sample rate: {}Hz".format(
+ sr, self.sample_rate)
if self.do_trim_silence:
try:
x = self.trim_silence(x)
except ValueError:
- print(" [!] File cannot be trimmed for silence - {}".format(path))
+ print(" [!] File cannot be trimmed for silence - {}".format(
+ path))
if self.sound_norm:
- x = x / x.max() * 0.9 # why 0.9 ?
+ x = x / x.max() * 0.9  # why 0.9 ?
return x
def trim_silence(self, wav):
"""Trim silent parts with a threshold and 0.01s margin"""
margin = int(self.sample_rate * 0.01)
- wav = wav[margin: -margin]
- trimed_wav = librosa.effects.trim(wav, top_db=60, frame_length=self.win_length, hop_length=self.hop_length)[0]
+ wav = wav[margin:-margin]
+ trimed_wav = librosa.effects.trim(
+ wav,
+ top_db=60,
+ frame_length=self.win_length,
+ hop_length=self.hop_length)[0]
return trimed_wav
def apply_preemphasis(self, x):
if self.preemphasis == 0.:
- raise RuntimeError(" !! Preemphasis coefficient should be positive. ")
+ raise RuntimeError(
+ " !! Preemphasis coefficient should be positive. ")
return scipy.signal.lfilter([1., -self.preemphasis], [1.], x)
def apply_inv_preemphasis(self, x):
if self.preemphasis == 0.:
- raise RuntimeError(" !! Preemphasis coefficient should be positive. ")
+ raise RuntimeError(
+ " !! Preemphasis coefficient should be positive. ")
return scipy.signal.lfilter([1.], [1., -self.preemphasis], x)
def _amplitude_to_db(self, x):
@@ -125,12 +152,11 @@ class AudioProcessor(object):
"""return mel basis for mel scale"""
if self.mel_fmax is not None:
assert self.mel_fmax <= self.sample_rate // 2
- return librosa.filters.mel(
- self.sample_rate,
- self.n_fft,
- n_mels=self.num_mels,
- fmin=self.mel_fmin,
- fmax=self.mel_fmax)
+ return librosa.filters.mel(self.sample_rate,
+ self.n_fft,
+ n_mels=self.num_mels,
+ fmin=self.mel_fmin,
+ fmax=self.mel_fmax)
def _normalize(self, S):
"""put values in [0, self.max_norm] or [-self.max_norm, self.max_norm]"""
@@ -156,25 +182,29 @@ class AudioProcessor(object):
if self.symmetric_norm:
if self.clip_norm:
S_denorm = np.clip(S_denorm, -self.max_norm, self.max_norm)
- S_denorm = (S_denorm + self.max_norm) * (-self.min_level_db) / (2 * self.max_norm) + self.min_level_db
+ S_denorm = (S_denorm + self.max_norm) * (
+ -self.min_level_db) / (2 * self.max_norm
+ ) + self.min_level_db
return S_denorm
else:
if self.clip_norm:
S_denorm = np.clip(S_denorm, 0, self.max_norm)
- S_denorm = S_denorm * (-self.min_level_db)/ self.max_norm + self.min_level_db
+ S_denorm = S_denorm * (-self.min_level_db
+ ) / self.max_norm + self.min_level_db
return S_denorm
else:
return S
def _stft(self, y):
return librosa.stft(
- y=y,
+ y=y,
n_fft=self.n_fft,
win_length=self.win_length,
hop_length=self.hop_length)
def _istft(self, S):
- return librosa.istft(S, hop_length=self.hop_length, win_length=self.win_length)
+ return librosa.istft(
+ S, hop_length=self.hop_length, win_length=self.win_length)
def spectrogram(self, y):
"""compute linear spectrogram(amplitude)
@@ -195,7 +225,8 @@ class AudioProcessor(object):
D = self._stft(self.apply_preemphasis(y))
else:
D = self._stft(y)
- S = self._amplitude_to_db(self._linear_to_mel(np.abs(D))) - self.ref_level_db
+ S = self._amplitude_to_db(self._linear_to_mel(np.abs(
+ D))) - self.ref_level_db
return self._normalize(S)
def inv_spectrogram(self, spectrogram):
S = self._denormalize(spectrogram)
S = self._db_to_amplitude(S +
self.ref_level_db) if self.preemphasis: - return self.apply_inv_preemphasis(self._griffin_lim(S ** self.power)) - return self._griffin_lim(S ** self.power) + return self.apply_inv_preemphasis(self._griffin_lim(S**self.power)) + return self._griffin_lim(S**self.power) def inv_melspectrogram(self, mel_spectrogram): S = self._denormalize(mel_spectrogram) S = self._db_to_amplitude(S + self.ref_level_db) S = self._mel_to_linear(np.abs(S)) if self.preemphasis: - return self.apply_inv_preemphasis(self._griffin_lim(S ** self.power)) - return self._griffin_lim(S ** self.power) + return self.apply_inv_preemphasis(self._griffin_lim(S**self.power)) + return self._griffin_lim(S**self.power) def out_linear_to_mel(self, linear_spec): """convert output linear spec to mel spec""" @@ -222,7 +253,7 @@ class AudioProcessor(object): S = self._amplitude_to_db(S) - self.ref_level_db mel = self._normalize(S) return mel - + def _griffin_lim(self, S): angles = np.exp(2j * np.pi * np.random.rand(*S.shape)) S_complex = np.abs(S).astype(np.complex) @@ -234,18 +265,18 @@ class AudioProcessor(object): @staticmethod def mulaw_encode(wav, qc): - mu = 2 ** qc - 1 + mu = 2**qc - 1 # wav_abs = np.minimum(np.abs(wav), 1.0) signal = np.sign(wav) * np.log(1 + mu * np.abs(wav)) / np.log(1. + mu) # Quantize signal to the specified number of levels. signal = (signal + 1) / 2 * mu + 0.5 - return np.floor(signal,) + return np.floor(signal, ) @staticmethod def mulaw_decode(wav, qc): """Recovers waveform from quantized values.""" - mu = 2 ** qc - 1 - x = np.sign(wav) / mu * ((1 + mu) ** np.abs(wav) - 1) + mu = 2**qc - 1 + x = np.sign(wav) / mu * ((1 + mu)**np.abs(wav) - 1) return x @staticmethod diff --git a/parakeet/data/__init__.py b/parakeet/data/__init__.py index ed86edd..be28f11 100644 --- a/parakeet/data/__init__.py +++ b/parakeet/data/__init__.py @@ -1,3 +1,17 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + from .dataset import * from .datacargo import * from .sampler import * diff --git a/parakeet/data/batch.py b/parakeet/data/batch.py index 8777472..22c24e4 100644 --- a/parakeet/data/batch.py +++ b/parakeet/data/batch.py @@ -1,18 +1,34 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. """ functions to make batch for arrays which satisfy some conditions. 
""" import numpy as np + class TextIDBatcher(object): """A wrapper class for a function to build a functor, which holds the configs to pass to the function.""" + def __init__(self, pad_id=0, dtype=np.int64): self.pad_id = pad_id self.dtype = dtype - + def __call__(self, minibatch): out = batch_text_id(minibatch, pad_id=self.pad_id, dtype=self.dtype) return out + def batch_text_id(minibatch, pad_id=0, dtype=np.int64): """ minibatch: List[Example] @@ -20,26 +36,32 @@ def batch_text_id(minibatch, pad_id=0, dtype=np.int64): """ peek_example = minibatch[0] assert len(peek_example.shape) == 1, "text example is an 1D tensor" - - lengths = [example.shape[0] for example in minibatch] # assume (channel, n_samples) or (n_samples, ) + + lengths = [example.shape[0] for example in minibatch + ] # assume (channel, n_samples) or (n_samples, ) max_len = np.max(lengths) - + batch = [] for example in minibatch: pad_len = max_len - example.shape[0] - batch.append(np.pad(example, [(0, pad_len)], mode='constant', constant_values=pad_id)) + batch.append( + np.pad(example, [(0, pad_len)], + mode='constant', + constant_values=pad_id)) return np.array(batch, dtype=dtype) + class WavBatcher(object): def __init__(self, pad_value=0., dtype=np.float32): self.pad_value = pad_value self.dtype = dtype - + def __call__(self, minibatch): out = batch_wav(minibatch, pad_value=self.pad_value, dtype=self.dtype) return out + def batch_wav(minibatch, pad_value=0., dtype=np.float32): """ minibatch: List[Example] @@ -51,18 +73,25 @@ def batch_wav(minibatch, pad_value=0., dtype=np.float32): mono_channel = True elif len(peek_example.shape) == 2: mono_channel = False - - lengths = [example.shape[-1] for example in minibatch] # assume (channel, n_samples) or (n_samples, ) + + lengths = [example.shape[-1] for example in minibatch + ] # assume (channel, n_samples) or (n_samples, ) max_len = np.max(lengths) - + batch = [] for example in minibatch: pad_len = max_len - example.shape[-1] if mono_channel: - batch.append(np.pad(example, [(0, pad_len)], mode='constant', constant_values=pad_value)) + batch.append( + np.pad(example, [(0, pad_len)], + mode='constant', + constant_values=pad_value)) else: - batch.append(np.pad(example, [(0, 0), (0, pad_len)], mode='constant', constant_values=pad_value)) # what about PCM, no - + batch.append( + np.pad(example, [(0, 0), (0, pad_len)], + mode='constant', + constant_values=pad_value)) # what about PCM, no + return np.array(batch, dtype=dtype) @@ -75,6 +104,7 @@ class SpecBatcher(object): out = batch_spec(minibatch, pad_value=self.pad_value, dtype=self.dtype) return out + def batch_spec(minibatch, pad_value=0., dtype=np.float32): """ minibatch: List[Example] @@ -86,16 +116,23 @@ def batch_spec(minibatch, pad_value=0., dtype=np.float32): mono_channel = True elif len(peek_example.shape) == 3: mono_channel = False - - lengths = [example.shape[-1] for example in minibatch] # assume (channel, F, n_frame) or (F, n_frame) - max_len = np.max(lengths) - + + lengths = [example.shape[-1] for example in minibatch + ] # assume (channel, F, n_frame) or (F, n_frame) + max_len = np.max(lengths) + batch = [] for example in minibatch: pad_len = max_len - example.shape[-1] if mono_channel: - batch.append(np.pad(example, [(0, 0), (0, pad_len)], mode='constant', constant_values=pad_value)) + batch.append( + np.pad(example, [(0, 0), (0, pad_len)], + mode='constant', + constant_values=pad_value)) else: - batch.append(np.pad(example, [(0, 0), (0, 0), (0, pad_len)], mode='constant', constant_values=pad_value)) # what about PCM, 
-
- return np.array(batch, dtype=dtype)
\ No newline at end of file
+ batch.append(
+ np.pad(example, [(0, 0), (0, 0), (0, pad_len)],
+ mode='constant',
+ constant_values=pad_value)) # what about PCM, no
+
+ return np.array(batch, dtype=dtype)
diff --git a/parakeet/data/datacargo.py b/parakeet/data/datacargo.py
index 8c9a3b2..904cd3c 100644
--- a/parakeet/data/datacargo.py
+++ b/parakeet/data/datacargo.py
@@ -1,3 +1,17 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
import six
from .sampler import SequentialSampler, RandomSampler, BatchSampler
diff --git a/parakeet/data/dataset.py b/parakeet/data/dataset.py
index d9f9a1f..d577f9e 100644
--- a/parakeet/data/dataset.py
+++ b/parakeet/data/dataset.py
@@ -1,3 +1,17 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
import six
import numpy as np
@@ -9,8 +23,7 @@ class DatasetMixin(object):
if isinstance(index, slice):
start, stop, step = index.indices(len(self))
return [
- self.get_example(i)
- for i in six.moves.range(start, stop, step)
+ self.get_example(i) for i in six.moves.range(start, stop, step)
]
elif isinstance(index, (list, np.ndarray)):
return [self.get_example(i) for i in index]
@@ -180,8 +193,7 @@ class ChainDataset(DatasetMixin):
def get_example(self, i):
if i < 0:
- raise IndexError(
- "ChainDataset doesnot support negative indexing.")
+ raise IndexError("ChainDataset does not support negative indexing.")
for dataset in self._datasets:
if i < len(dataset):
diff --git a/parakeet/data/sampler.py b/parakeet/data/sampler.py
index 60aa5db..b4ef097 100644
--- a/parakeet/data/sampler.py
+++ b/parakeet/data/sampler.py
@@ -1,3 +1,16 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
"""
In most cases, we have a non-stream dataset, which means we can randomly access it with __getitem__, and we can get the length of the dataset with __len__.
@@ -6,10 +19,10 @@
This suffices for a sampler. We implement the sampler as an iterable of valid indices. So the sampler is only responsible for generating valid indices.
"""
-
import numpy as np
import random
+
class Sampler(object):
def __init__(self, data_source):
pass
@@ -23,7 +36,7 @@ class Sampler(object):
class SequentialSampler(Sampler):
def __init__(self, data_source):
self.data_source = data_source
-
+
def __iter__(self):
return iter(range(len(self.data_source)))
@@ -42,12 +55,14 @@ class RandomSampler(Sampler):
"replacement={}".format(self.replacement))
if self._num_samples is not None and not replacement:
- raise ValueError("With replacement=False, num_samples should not be specified, "
- "since a random permutation will be performed.")
+ raise ValueError(
+ "With replacement=False, num_samples should not be specified, "
+ "since a random permutation will be performed.")
if not isinstance(self.num_samples, int) or self.num_samples <= 0:
raise ValueError("num_samples should be a positive integer "
- "value, but got num_samples={}".format(self.num_samples))
+ "value, but got num_samples={}".format(
+ self.num_samples))
@property
def num_samples(self):
@@ -59,7 +74,9 @@ class RandomSampler(Sampler):
def __iter__(self):
n = len(self.data_source)
if self.replacement:
- return iter(np.random.randint(0, n, size=(self.num_samples,), dtype=np.int64).tolist())
+ return iter(
+ np.random.randint(
+ 0, n, size=(self.num_samples, ), dtype=np.int64).tolist())
return iter(np.random.permutation(n).tolist())
def __len__(self):
@@ -76,7 +93,8 @@ class SubsetRandomSampler(Sampler):
self.indices = indices
def __iter__(self):
- return (self.indices[i] for i in np.random.permutation(len(self.indices)))
+ return (self.indices[i]
+ for i in np.random.permutation(len(self.indices)))
def __len__(self):
return len(self.indices)
@@ -89,9 +107,14 @@ class PartialyRandomizedSimilarTimeLengthSampler(Sampler):
3. Permute mini-batches
"""
- def __init__(self, lengths, batch_size=4, batch_group_size=None,
+ def __init__(self,
+ lengths,
+ batch_size=4,
+ batch_group_size=None,
permutate=True):
- _lengths = np.array(lengths, dtype=np.int64) # maybe better implement length as a sort key
+ _lengths = np.array(
+ lengths,
+ dtype=np.int64) # maybe better implement length as a sort key
self.lengths = np.sort(_lengths)
self.sorted_indices = np.argsort(_lengths)
@@ -112,20 +135,21 @@ class PartialyRandomizedSimilarTimeLengthSampler(Sampler):
for i in range(len(indices) // batch_group_size):
s = i * batch_group_size
e = s + batch_group_size
- random.shuffle(indices[s: e]) # inplace
+ random.shuffle(indices[s:e]) # inplace
# Permutate batches
if self.permutate:
perm = np.arange(len(indices[:e]) // self.batch_size)
random.shuffle(perm)
- indices[:e] = indices[:e].reshape(-1, self.batch_size)[perm, :].reshape(-1)
+ indices[:e] = indices[:e].reshape(
+ -1, self.batch_size)[perm, :].reshape(-1)
# Handle last elements
s += batch_group_size
#print(indices)
if s < len(indices):
random.shuffle(indices[s:])
-
+
return iter(indices)
def __len__(self):
@@ -150,14 +174,19 @@ class WeightedRandomSampler(Sampler):
def __init__(self, weights, num_samples, replacement):
if not isinstance(num_samples, int) or num_samples <= 0:
raise ValueError("num_samples should be a positive integer "
- "value, but got num_samples={}".format(num_samples))
+ "value, but got num_samples={}".format(
+ num_samples))
self.weights = np.array(weights, dtype=np.float64)
self.num_samples = num_samples
self.replacement = replacement
def __iter__(self):
- return iter(np.random.choice(len(self.weights), size=(self.num_samples, ),
- replace=self.replacement, p=self.weights).tolist())
+ return iter(
+ np.random.choice(
+ len(self.weights),
+ size=(self.num_samples, ),
+ replace=self.replacement,
+ p=self.weights).tolist())
def __len__(self):
return self.num_samples
@@ -184,7 +213,7 @@ class DistributedSampler(Sampler):
# Subset samples for each trainer.
indices = indices[self.rank:self.total_size:self.num_trainers]
- assert len(indices) == self.num_samples
+ assert len(indices) == self.num_samples
return iter(indices)
@@ -209,8 +238,7 @@ class BatchSampler(Sampler):
def __init__(self, sampler, batch_size, drop_last):
if not isinstance(sampler, Sampler):
raise ValueError("sampler should be an instance of "
- "Sampler, but got sampler={}"
- .format(sampler))
+ "Sampler, but got sampler={}".format(sampler))
if not isinstance(batch_size, int) or batch_size <= 0:
raise ValueError("batch_size should be a positive integer value, "
"but got batch_size={}".format(batch_size))
diff --git a/parakeet/datasets/README.md b/parakeet/datasets/README.md
index 96509ca..cd4f8f4 100644
--- a/parakeet/datasets/README.md
+++ b/parakeet/datasets/README.md
@@ -14,9 +14,4 @@ One of the reasons we choose to load data lazily (only load metadata before hand)
For deep learning practice, we typically batch examples. So the dataset should come with a method to batch examples. Assuming the record is implemented as a tuple with several items, when an item is represented as a fixed-size array, batching it is trivial: `np.stack` suffices. But for arrays with dynamic size, padding is needed, as sketched below. We decide to implement a batching method for each item. Then batching a record can be implemented by these methods. For a dataset, a `_batch_examples` method should be implemented. But in most cases, you can choose one from `batching.py`.
-That is it!
-
-
-
-
-
+That is it!
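The padding-based batching described in the README above is what the batchers in `parakeet/data/batch.py` implement. A minimal numpy sketch of the idea, mirroring `batch_text_id` (the example arrays below are illustrative, not taken from the library):

```python
import numpy as np

def batch_text_id(minibatch, pad_id=0, dtype=np.int64):
    # Pad every 1D id sequence on the right up to the longest
    # length in the minibatch, then stack into one 2D array.
    max_len = max(example.shape[0] for example in minibatch)
    batch = [
        np.pad(example, (0, max_len - example.shape[0]),
               mode="constant", constant_values=pad_id)
        for example in minibatch
    ]
    return np.array(batch, dtype=dtype)

# Three "sentences" of different lengths batch into a (3, 5) array.
texts = [np.array([1, 2, 3]), np.array([4, 5]), np.array([6, 7, 8, 9, 10])]
assert batch_text_id(texts).shape == (3, 5)
```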
diff --git a/parakeet/datasets/__init__.py b/parakeet/datasets/__init__.py index e69de29..abf198b 100644 --- a/parakeet/datasets/__init__.py +++ b/parakeet/datasets/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/parakeet/datasets/ljspeech.py b/parakeet/datasets/ljspeech.py index 7d4dffe..62209e9 100644 --- a/parakeet/datasets/ljspeech.py +++ b/parakeet/datasets/ljspeech.py @@ -1,3 +1,17 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + from pathlib import Path import numpy as np import pandas as pd diff --git a/parakeet/datasets/vctk.py b/parakeet/datasets/vctk.py index b6d2f0c..66e4f70 100644 --- a/parakeet/datasets/vctk.py +++ b/parakeet/datasets/vctk.py @@ -1,3 +1,17 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ from pathlib import Path import pandas as pd from ruamel.yaml import YAML @@ -11,23 +25,25 @@ from parakeet.data.dataset import Dataset from parakeet.data.datacargo import DataCargo from parakeet.data.batch import TextIDBatcher, WavBatcher + class VCTK(Dataset): def __init__(self, root): - assert isinstance(root, (str, Path)), "root should be a string or Path object" + assert isinstance(root, ( + str, Path)), "root should be a string or Path object" self.root = root if isinstance(root, Path) else Path(root) self.text_root = self.root.joinpath("txt") self.wav_root = self.root.joinpath("wav48") - if not (self.root.joinpath("metadata.csv").exists() and + if not (self.root.joinpath("metadata.csv").exists() and self.root.joinpath("speaker_indices.yaml").exists()): self._prepare_metadata() self.speaker_indices, self.metadata = self._load_metadata() def _load_metadata(self): - yaml=YAML(typ='safe') + yaml = YAML(typ='safe') speaker_indices = yaml.load(self.root.joinpath("speaker_indices.yaml")) - metadata = pd.read_csv(self.root.joinpath("metadata.csv"), - sep="|", quoting=3, header=1) + metadata = pd.read_csv( + self.root.joinpath("metadata.csv"), sep="|", quoting=3, header=1) return speaker_indices, metadata def _prepare_metadata(self): @@ -41,15 +57,19 @@ class VCTK(Dataset): with io.open(str(text_file)) as f: transcription = f.read().strip() wav_file = text_file.with_suffix(".wav") - metadata.append((wav_file.name, speaker_folder.name, transcription)) - metadata = pd.DataFrame.from_records(metadata, - columns=["wave_file", "speaker", "text"]) - + metadata.append( + (wav_file.name, speaker_folder.name, transcription)) + metadata = pd.DataFrame.from_records( + metadata, columns=["wave_file", "speaker", "text"]) + # save them - yaml=YAML(typ='safe') + yaml = YAML(typ='safe') yaml.dump(speaker_to_index, self.root.joinpath("speaker_indices.yaml")) - metadata.to_csv(self.root.joinpath("metadata.csv"), - sep="|", quoting=3, index=False) + metadata.to_csv( + self.root.joinpath("metadata.csv"), + sep="|", + quoting=3, + index=False) def _get_example(self, metadatum): wave_file, speaker, text = metadatum @@ -77,5 +97,3 @@ class VCTK(Dataset): speaker_batch = np.array(speaker_batch) phoneme_batch = TextIDBatcher(pad_id=0)(phoneme_batch) return wav_batch, speaker_batch, phoneme_batch - - \ No newline at end of file diff --git a/parakeet/g2p/__init__.py b/parakeet/g2p/__init__.py index 2b88bdc..5840f33 100644 --- a/parakeet/g2p/__init__.py +++ b/parakeet/g2p/__init__.py @@ -1,5 +1,4 @@ # coding: utf-8 - """Text processing frontend All frontend module should have the following functions: diff --git a/parakeet/g2p/en/__init__.py b/parakeet/g2p/en/__init__.py index 92faf11..01dd223 100644 --- a/parakeet/g2p/en/__init__.py +++ b/parakeet/g2p/en/__init__.py @@ -32,6 +32,3 @@ def text_to_sequence(text, p=0.0): from ..text import text_to_sequence text = text_to_sequence(text, ["english_cleaners"]) return text - - - diff --git a/parakeet/g2p/es/__init__.py b/parakeet/g2p/es/__init__.py index fce4d18..8ac385f 100644 --- a/parakeet/g2p/es/__init__.py +++ b/parakeet/g2p/es/__init__.py @@ -12,6 +12,3 @@ def text_to_sequence(text, p=0.0): from ..text import text_to_sequence text = text_to_sequence(text, ["basic_cleaners"]) return text - - - diff --git a/parakeet/g2p/jp/__init__.py b/parakeet/g2p/jp/__init__.py index dcb0845..36c7fd8 100644 --- a/parakeet/g2p/jp/__init__.py +++ b/parakeet/g2p/jp/__init__.py @@ -1,6 +1,5 @@ # coding: utf-8 - import MeCab import jaconv from random import random @@ -30,9 +29,9 @@ def 
_yomi(mecab_result): def _mix_pronunciation(tokens, yomis, p): - return "".join( - yomis[idx] if yomis[idx] is not None and random() < p else tokens[idx] - for idx in range(len(tokens))) + return "".join(yomis[idx] + if yomis[idx] is not None and random() < p else tokens[idx] + for idx in range(len(tokens))) def mix_pronunciation(text, p): @@ -59,8 +58,7 @@ def normalize_delimitor(text): def text_to_sequence(text, p=0.0): - for c in [" ", " ", "「", "」", "『", "』", "・", "【", "】", - "(", ")", "(", ")"]: + for c in [" ", " ", "「", "」", "『", "』", "・", "【", "】", "(", ")", "(", ")"]: text = text.replace(c, "") text = text.replace("!", "!") text = text.replace("?", "?") diff --git a/parakeet/g2p/ko/__init__.py b/parakeet/g2p/ko/__init__.py index 2a6465b..ccb8b5f 100644 --- a/parakeet/g2p/ko/__init__.py +++ b/parakeet/g2p/ko/__init__.py @@ -1,6 +1,5 @@ # coding: utf-8 - from random import random n_vocab = 0xffff @@ -13,5 +12,6 @@ _tagger = None def text_to_sequence(text, p=0.0): return [ord(c) for c in text] + [_eos] # EOS + def sequence_to_text(seq): return "".join(chr(n) for n in seq) diff --git a/parakeet/g2p/text/__init__.py b/parakeet/g2p/text/__init__.py index 3942998..312b720 100644 --- a/parakeet/g2p/text/__init__.py +++ b/parakeet/g2p/text/__init__.py @@ -1,8 +1,21 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import re from . import cleaners from .symbols import symbols - # Mappings from symbol to numeric ID and vice versa: _symbol_to_id = {s: i for i, s in enumerate(symbols)} _id_to_symbol = {i: s for i, s in enumerate(symbols)} @@ -32,7 +45,8 @@ def text_to_sequence(text, cleaner_names): if not m: sequence += _symbols_to_sequence(_clean_text(text, cleaner_names)) break - sequence += _symbols_to_sequence(_clean_text(m.group(1), cleaner_names)) + sequence += _symbols_to_sequence( + _clean_text(m.group(1), cleaner_names)) sequence += _arpabet_to_sequence(m.group(2)) text = m.group(3) diff --git a/parakeet/g2p/text/cleaners.py b/parakeet/g2p/text/cleaners.py index 779a977..58553c1 100644 --- a/parakeet/g2p/text/cleaners.py +++ b/parakeet/g2p/text/cleaners.py @@ -1,3 +1,16 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. ''' Cleaners are transformations that run over the input text at both training and eval time. 
@@ -14,31 +27,31 @@ import re from unidecode import unidecode from .numbers import normalize_numbers - # Regular expression matching whitespace: _whitespace_re = re.compile(r'\s+') # List of (regular expression, replacement) pairs for abbreviations: -_abbreviations = [(re.compile('\\b%s\\.' % x[0], re.IGNORECASE), x[1]) for x in [ - ('mrs', 'misess'), - ('mr', 'mister'), - ('dr', 'doctor'), - ('st', 'saint'), - ('co', 'company'), - ('jr', 'junior'), - ('maj', 'major'), - ('gen', 'general'), - ('drs', 'doctors'), - ('rev', 'reverend'), - ('lt', 'lieutenant'), - ('hon', 'honorable'), - ('sgt', 'sergeant'), - ('capt', 'captain'), - ('esq', 'esquire'), - ('ltd', 'limited'), - ('col', 'colonel'), - ('ft', 'fort'), -]] +_abbreviations = [(re.compile('\\b%s\\.' % x[0], re.IGNORECASE), x[1]) + for x in [ + ('mrs', 'misess'), + ('mr', 'mister'), + ('dr', 'doctor'), + ('st', 'saint'), + ('co', 'company'), + ('jr', 'junior'), + ('maj', 'major'), + ('gen', 'general'), + ('drs', 'doctors'), + ('rev', 'reverend'), + ('lt', 'lieutenant'), + ('hon', 'honorable'), + ('sgt', 'sergeant'), + ('capt', 'captain'), + ('esq', 'esquire'), + ('ltd', 'limited'), + ('col', 'colonel'), + ('ft', 'fort'), + ]] def expand_abbreviations(text): diff --git a/parakeet/g2p/text/cmudict.py b/parakeet/g2p/text/cmudict.py index 1f1ea9b..bbe7903 100644 --- a/parakeet/g2p/text/cmudict.py +++ b/parakeet/g2p/text/cmudict.py @@ -1,14 +1,28 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ import re - valid_symbols = [ - 'AA', 'AA0', 'AA1', 'AA2', 'AE', 'AE0', 'AE1', 'AE2', 'AH', 'AH0', 'AH1', 'AH2', - 'AO', 'AO0', 'AO1', 'AO2', 'AW', 'AW0', 'AW1', 'AW2', 'AY', 'AY0', 'AY1', 'AY2', - 'B', 'CH', 'D', 'DH', 'EH', 'EH0', 'EH1', 'EH2', 'ER', 'ER0', 'ER1', 'ER2', 'EY', - 'EY0', 'EY1', 'EY2', 'F', 'G', 'HH', 'IH', 'IH0', 'IH1', 'IH2', 'IY', 'IY0', 'IY1', - 'IY2', 'JH', 'K', 'L', 'M', 'N', 'NG', 'OW', 'OW0', 'OW1', 'OW2', 'OY', 'OY0', - 'OY1', 'OY2', 'P', 'R', 'S', 'SH', 'T', 'TH', 'UH', 'UH0', 'UH1', 'UH2', 'UW', - 'UW0', 'UW1', 'UW2', 'V', 'W', 'Y', 'Z', 'ZH' + 'AA', 'AA0', 'AA1', 'AA2', 'AE', 'AE0', 'AE1', 'AE2', 'AH', 'AH0', 'AH1', + 'AH2', 'AO', 'AO0', 'AO1', 'AO2', 'AW', 'AW0', 'AW1', 'AW2', 'AY', 'AY0', + 'AY1', 'AY2', 'B', 'CH', 'D', 'DH', 'EH', 'EH0', 'EH1', 'EH2', 'ER', 'ER0', + 'ER1', 'ER2', 'EY', 'EY0', 'EY1', 'EY2', 'F', 'G', 'HH', 'IH', 'IH0', + 'IH1', 'IH2', 'IY', 'IY0', 'IY1', 'IY2', 'JH', 'K', 'L', 'M', 'N', 'NG', + 'OW', 'OW0', 'OW1', 'OW2', 'OY', 'OY0', 'OY1', 'OY2', 'P', 'R', 'S', 'SH', + 'T', 'TH', 'UH', 'UH0', 'UH1', 'UH2', 'UW', 'UW0', 'UW1', 'UW2', 'V', 'W', + 'Y', 'Z', 'ZH' ] _valid_symbol_set = set(valid_symbols) @@ -24,7 +38,10 @@ class CMUDict: else: entries = _parse_cmudict(file_or_path) if not keep_ambiguous: - entries = {word: pron for word, pron in entries.items() if len(pron) == 1} + entries = { + word: pron + for word, pron in entries.items() if len(pron) == 1 + } self._entries = entries def __len__(self): diff --git a/parakeet/g2p/text/numbers.py b/parakeet/g2p/text/numbers.py index 93f676d..24b5817 100644 --- a/parakeet/g2p/text/numbers.py +++ b/parakeet/g2p/text/numbers.py @@ -3,7 +3,6 @@ import inflect import re - _inflect = inflect.engine() _comma_number_re = re.compile(r'([0-9][0-9\,]+[0-9])') _decimal_number_re = re.compile(r'([0-9]+\.[0-9]+)') @@ -56,7 +55,8 @@ def _expand_number(m): elif num % 100 == 0: return _inflect.number_to_words(num // 100) + ' hundred' else: - return _inflect.number_to_words(num, andword='', zero='oh', group=2).replace(', ', ' ') + return _inflect.number_to_words( + num, andword='', zero='oh', group=2).replace(', ', ' ') else: return _inflect.number_to_words(num, andword='') diff --git a/parakeet/g2p/text/symbols.py b/parakeet/g2p/text/symbols.py index da87c93..299ca58 100644 --- a/parakeet/g2p/text/symbols.py +++ b/parakeet/g2p/text/symbols.py @@ -1,3 +1,16 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. ''' Defines the set of symbols used in text input to the model. diff --git a/parakeet/models/__init__.py b/parakeet/models/__init__.py index e69de29..abf198b 100644 --- a/parakeet/models/__init__.py +++ b/parakeet/models/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/parakeet/models/deepvoice3/__init__.py b/parakeet/models/deepvoice3/__init__.py index 0430987..86f91e0 100644 --- a/parakeet/models/deepvoice3/__init__.py +++ b/parakeet/models/deepvoice3/__init__.py @@ -1,3 +1,17 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + from parakeet.models.deepvoice3.encoder import Encoder, ConvSpec from parakeet.models.deepvoice3.decoder import Decoder, WindowRange from parakeet.models.deepvoice3.converter import Converter diff --git a/parakeet/models/deepvoice3/attention.py b/parakeet/models/deepvoice3/attention.py index 8f2c2c5..33ffc11 100644 --- a/parakeet/models/deepvoice3/attention.py +++ b/parakeet/models/deepvoice3/attention.py @@ -1,3 +1,17 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ import numpy as np from collections import namedtuple from paddle import fluid @@ -19,23 +33,19 @@ class Attention(dg.Layer): value_projection=True): super(Attention, self).__init__() std = np.sqrt(1 / query_dim) - self.query_proj = Linear(query_dim, - embed_dim, - param_attr=I.Normal(scale=std)) + self.query_proj = Linear( + query_dim, embed_dim, param_attr=I.Normal(scale=std)) if key_projection: std = np.sqrt(1 / embed_dim) - self.key_proj = Linear(embed_dim, - embed_dim, - param_attr=I.Normal(scale=std)) + self.key_proj = Linear( + embed_dim, embed_dim, param_attr=I.Normal(scale=std)) if value_projection: std = np.sqrt(1 / embed_dim) - self.value_proj = Linear(embed_dim, - embed_dim, - param_attr=I.Normal(scale=std)) + self.value_proj = Linear( + embed_dim, embed_dim, param_attr=I.Normal(scale=std)) std = np.sqrt(1 / embed_dim) - self.out_proj = Linear(embed_dim, - query_dim, - param_attr=I.Normal(scale=std)) + self.out_proj = Linear( + embed_dim, query_dim, param_attr=I.Normal(scale=std)) self.key_projection = key_projection self.value_projection = value_projection @@ -102,9 +112,8 @@ class Attention(dg.Layer): x = F.softmax(x) attn_scores = x - x = F.dropout(x, - self.dropout, - dropout_implementation="upscale_in_train") + x = F.dropout( + x, self.dropout, dropout_implementation="upscale_in_train") x = F.matmul(x, values) encoder_length = keys.shape[1] # CAUTION: is it wrong? let it be now diff --git a/parakeet/models/deepvoice3/conv1dglu.py b/parakeet/models/deepvoice3/conv1dglu.py index 23f0109..584c3d7 100644 --- a/parakeet/models/deepvoice3/conv1dglu.py +++ b/parakeet/models/deepvoice3/conv1dglu.py @@ -1,3 +1,17 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import numpy as np from paddle import fluid @@ -15,6 +29,7 @@ class Conv1DGLU(dg.Layer): has residual connection from the input x, and scale the output by np.sqrt(0.5). """ + def __init__(self, n_speakers, speaker_dim, @@ -50,20 +65,20 @@ class Conv1DGLU(dg.Layer): ), "this block uses residual connection"\ "the input_channes should equals num_filters" std = np.sqrt(std_mul * (1 - dropout) / (filter_size * in_channels)) - self.conv = Conv1DCell(in_channels, - 2 * num_filters, - filter_size, - dilation, - causal, - param_attr=I.Normal(scale=std)) + self.conv = Conv1DCell( + in_channels, + 2 * num_filters, + filter_size, + dilation, + causal, + param_attr=I.Normal(scale=std)) if n_speakers > 1: assert (speaker_dim is not None ), "speaker embed should not be null in multi-speaker case" std = np.sqrt(1 / speaker_dim) - self.fc = Linear(speaker_dim, - num_filters, - param_attr=I.Normal(scale=std)) + self.fc = Linear( + speaker_dim, num_filters, param_attr=I.Normal(scale=std)) def forward(self, x, speaker_embed=None): """ @@ -82,9 +97,8 @@ class Conv1DGLU(dg.Layer): C_out means the output channels of Conv1DGLU. 
""" residual = x - x = F.dropout(x, - self.dropout, - dropout_implementation="upscale_in_train") + x = F.dropout( + x, self.dropout, dropout_implementation="upscale_in_train") x = self.conv(x) content, gate = F.split(x, num_or_sections=2, dim=1) @@ -118,9 +132,8 @@ class Conv1DGLU(dg.Layer): C_out means the output channels of Conv1DGLU. """ residual = x_t - x_t = F.dropout(x_t, - self.dropout, - dropout_implementation="upscale_in_train") + x_t = F.dropout( + x_t, self.dropout, dropout_implementation="upscale_in_train") x_t = self.conv.add_input(x_t) content_t, gate_t = F.split(x_t, num_or_sections=2, dim=1) diff --git a/parakeet/models/deepvoice3/converter.py b/parakeet/models/deepvoice3/converter.py index 7f94805..5181a5c 100644 --- a/parakeet/models/deepvoice3/converter.py +++ b/parakeet/models/deepvoice3/converter.py @@ -1,3 +1,17 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import numpy as np from itertools import chain @@ -19,44 +33,45 @@ def upsampling_4x_blocks(n_speakers, speaker_dim, target_channels, dropout): 2, stride=2, param_attr=I.Normal(scale=np.sqrt(1 / (2 * target_channels)))), - Conv1DGLU(n_speakers, - speaker_dim, - target_channels, - target_channels, - 3, - dilation=1, - std_mul=1., - dropout=dropout), - Conv1DGLU(n_speakers, - speaker_dim, - target_channels, - target_channels, - 3, - dilation=3, - std_mul=4., - dropout=dropout), - Conv1DTranspose( + Conv1DGLU( + n_speakers, + speaker_dim, target_channels, target_channels, - 2, - stride=2, - param_attr=I.Normal(scale=np.sqrt(4. / (2 * target_channels)))), - Conv1DGLU(n_speakers, - speaker_dim, - target_channels, - target_channels, - 3, - dilation=1, - std_mul=1., - dropout=dropout), - Conv1DGLU(n_speakers, - speaker_dim, - target_channels, - target_channels, - 3, - dilation=3, - std_mul=4., - dropout=dropout) + 3, + dilation=1, + std_mul=1., + dropout=dropout), Conv1DGLU( + n_speakers, + speaker_dim, + target_channels, + target_channels, + 3, + dilation=3, + std_mul=4., + dropout=dropout), Conv1DTranspose( + target_channels, + target_channels, + 2, + stride=2, + param_attr=I.Normal(scale=np.sqrt( + 4. / (2 * target_channels)))), Conv1DGLU( + n_speakers, + speaker_dim, + target_channels, + target_channels, + 3, + dilation=1, + std_mul=1., + dropout=dropout), Conv1DGLU( + n_speakers, + speaker_dim, + target_channels, + target_channels, + 3, + dilation=3, + std_mul=4., + dropout=dropout) ] return upsampling_convolutions @@ -69,36 +84,38 @@ def upsampling_2x_blocks(n_speakers, speaker_dim, target_channels, dropout): 2, stride=2, param_attr=I.Normal(scale=np.sqrt(1. 
/ (2 * target_channels)))), - Conv1DGLU(n_speakers, - speaker_dim, - target_channels, - target_channels, - 3, - dilation=1, - std_mul=1., - dropout=dropout), - Conv1DGLU(n_speakers, - speaker_dim, - target_channels, - target_channels, - 3, - dilation=3, - std_mul=4., - dropout=dropout) + Conv1DGLU( + n_speakers, + speaker_dim, + target_channels, + target_channels, + 3, + dilation=1, + std_mul=1., + dropout=dropout), Conv1DGLU( + n_speakers, + speaker_dim, + target_channels, + target_channels, + 3, + dilation=3, + std_mul=4., + dropout=dropout) ] return upsampling_convolutions def upsampling_1x_blocks(n_speakers, speaker_dim, target_channels, dropout): upsampling_convolutions = [ - Conv1DGLU(n_speakers, - speaker_dim, - target_channels, - target_channels, - 3, - dilation=3, - std_mul=4., - dropout=dropout) + Conv1DGLU( + n_speakers, + speaker_dim, + target_channels, + target_channels, + 3, + dilation=3, + std_mul=4., + dropout=dropout) ] return upsampling_convolutions @@ -108,6 +125,7 @@ class Converter(dg.Layer): Vocoder that transforms mel spectrogram (or ecoder hidden states) to waveform. """ + def __init__(self, n_speakers, speaker_dim, @@ -161,33 +179,36 @@ class Converter(dg.Layer): std = np.sqrt(std_mul / in_channels) # CAUTION: relu self.convolutions.append( - Conv1D(in_channels, - out_channels, - 1, - act="relu", - param_attr=I.Normal(scale=std))) + Conv1D( + in_channels, + out_channels, + 1, + act="relu", + param_attr=I.Normal(scale=std))) in_channels = out_channels std_mul = 2.0 self.convolutions.append( - Conv1DGLU(n_speakers, - speaker_dim, - in_channels, - out_channels, - filter_size, - dilation=dilation, - std_mul=std_mul, - dropout=dropout)) + Conv1DGLU( + n_speakers, + speaker_dim, + in_channels, + out_channels, + filter_size, + dilation=dilation, + std_mul=std_mul, + dropout=dropout)) in_channels = out_channels std_mul = 4.0 # final conv proj, channel transformed to linear dim std = np.sqrt(std_mul * (1 - dropout) / in_channels) # CAUTION: sigmoid - self.last_conv_proj = Conv1D(in_channels, - linear_dim, - 1, - act="sigmoid", - param_attr=I.Normal(scale=std)) + self.last_conv_proj = Conv1D( + in_channels, + linear_dim, + 1, + act="sigmoid", + param_attr=I.Normal(scale=std)) def forward(self, x, speaker_embed=None): """ @@ -229,4 +250,4 @@ class Converter(dg.Layer): out = self.last_conv_proj(x) out = F.transpose(out, [0, 2, 1]) - return out \ No newline at end of file + return out diff --git a/parakeet/models/deepvoice3/decoder.py b/parakeet/models/deepvoice3/decoder.py index 8e6a46b..7b7f581 100644 --- a/parakeet/models/deepvoice3/decoder.py +++ b/parakeet/models/deepvoice3/decoder.py @@ -1,3 +1,17 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ import numpy as np import paddle.fluid.layers as F import paddle.fluid.initializer as I @@ -80,25 +94,25 @@ def unfold_adjacent_frames(folded_frames, r): class Decoder(dg.Layer): def __init__( - self, - n_speakers, - speaker_dim, - embed_dim, - mel_dim, - r=1, - max_positions=512, - padding_idx=None, # remove it! - preattention=(ConvSpec(128, 5, 1), ) * 4, - convolutions=(ConvSpec(128, 5, 1), ) * 4, - attention=True, - dropout=0.0, - use_memory_mask=False, - force_monotonic_attention=False, - query_position_rate=1.0, - key_position_rate=1.0, - window_range=WindowRange(-1, 3), - key_projection=True, - value_projection=True): + self, + n_speakers, + speaker_dim, + embed_dim, + mel_dim, + r=1, + max_positions=512, + padding_idx=None, # remove it! + preattention=(ConvSpec(128, 5, 1), ) * 4, + convolutions=(ConvSpec(128, 5, 1), ) * 4, + attention=True, + dropout=0.0, + use_memory_mask=False, + force_monotonic_attention=False, + query_position_rate=1.0, + key_position_rate=1.0, + window_range=WindowRange(-1, 3), + key_projection=True, + value_projection=True): super(Decoder, self).__init__() self.dropout = dropout @@ -111,23 +125,17 @@ class Decoder(dg.Layer): conv_channels = convolutions[0].out_channels # only when padding idx is 0 can we easilt handle it - self.embed_keys_positions = PositionEmbedding(max_positions, - embed_dim, - padding_idx=0) - self.embed_query_positions = PositionEmbedding(max_positions, - conv_channels, - padding_idx=0) + self.embed_keys_positions = PositionEmbedding( + max_positions, embed_dim, padding_idx=0) + self.embed_query_positions = PositionEmbedding( + max_positions, conv_channels, padding_idx=0) if n_speakers > 1: std = np.sqrt((1 - dropout) / speaker_dim) - self.speaker_proj1 = Linear(speaker_dim, - 1, - act="sigmoid", - param_attr=I.Normal(scale=std)) - self.speaker_proj2 = Linear(speaker_dim, - 1, - act="sigmoid", - param_attr=I.Normal(scale=std)) + self.speaker_proj1 = Linear( + speaker_dim, 1, act="sigmoid", param_attr=I.Normal(scale=std)) + self.speaker_proj2 = Linear( + speaker_dim, 1, act="sigmoid", param_attr=I.Normal(scale=std)) # prenet self.prenet = dg.LayerList() @@ -138,24 +146,26 @@ class Decoder(dg.Layer): # conv1d & relu std = np.sqrt(std_mul / in_channels) self.prenet.append( - Conv1D(in_channels, - out_channels, - 1, - act="relu", - param_attr=I.Normal(scale=std))) + Conv1D( + in_channels, + out_channels, + 1, + act="relu", + param_attr=I.Normal(scale=std))) in_channels = out_channels std_mul = 2.0 self.prenet.append( - Conv1DGLU(n_speakers, - speaker_dim, - in_channels, - out_channels, - filter_size, - dilation, - std_mul, - dropout, - causal=True, - residual=True)) + Conv1DGLU( + n_speakers, + speaker_dim, + in_channels, + out_channels, + filter_size, + dilation, + std_mul, + dropout, + causal=True, + residual=True)) in_channels = out_channels std_mul = 4.0 @@ -184,16 +194,17 @@ class Decoder(dg.Layer): assert ( in_channels == out_channels ), "the stack of convolution & attention does not change channels" - conv_layer = Conv1DGLU(n_speakers, - speaker_dim, - in_channels, - out_channels, - filter_size, - dilation, - std_mul, - dropout, - causal=True, - residual=False) + conv_layer = Conv1DGLU( + n_speakers, + speaker_dim, + in_channels, + out_channels, + filter_size, + dilation, + std_mul, + dropout, + causal=True, + residual=False) attn_layer = Attention( out_channels, embed_dim, @@ -211,10 +222,8 @@ class Decoder(dg.Layer): # 1 * 1 conv to transform channels std = np.sqrt(std_mul * (1 - dropout) / in_channels) - self.last_conv = 
Conv1D(in_channels, - mel_dim * r, - 1, - param_attr=I.Normal(scale=std)) + self.last_conv = Conv1D( + in_channels, mel_dim * r, 1, param_attr=I.Normal(scale=std)) # mel (before sigmoid) to done hat std = np.sqrt(1 / in_channels) @@ -308,9 +317,8 @@ class Decoder(dg.Layer): # (B, C, T) frames = F.transpose(frames, [0, 2, 1]) x = frames - x = F.dropout(x, - self.dropout, - dropout_implementation="upscale_in_train") + x = F.dropout( + x, self.dropout, dropout_implementation="upscale_in_train") # Prenet for layer in self.prenet: if isinstance(layer, Conv1DGLU): @@ -408,14 +416,13 @@ class Decoder(dg.Layer): test_inputs = fold_adjacent_frames(test_inputs, self.r) test_inputs = F.transpose(test_inputs, [0, 2, 1]) - initial_input = F.zeros((batch_size, self.mel_dim * self.r, 1), - dtype=keys.dtype) + initial_input = F.zeros( + (batch_size, self.mel_dim * self.r, 1), dtype=keys.dtype) t = 0 # decoder time step while True: - frame_pos = F.fill_constant((batch_size, 1), - value=t + 1, - dtype="int64") + frame_pos = F.fill_constant( + (batch_size, 1), value=t + 1, dtype="int64") w = self.query_position_rate if self.n_speakers > 1: w = w * F.squeeze(self.speaker_proj2(speaker_embed), [-1]) @@ -433,9 +440,8 @@ class Decoder(dg.Layer): current_input = initial_input x_t = current_input - x_t = F.dropout(x_t, - self.dropout, - dropout_implementation="upscale_in_train") + x_t = F.dropout( + x_t, self.dropout, dropout_implementation="upscale_in_train") # Prenet for layer in self.prenet: @@ -453,15 +459,15 @@ class Decoder(dg.Layer): x_t = F.transpose(x_t, [0, 2, 1]) if frame_pos_embed is not None: x_t += frame_pos_embed - x_t, attn_scores = attn( - x_t, (keys, values), mask, - last_attended[i] if test_inputs is None else None) + x_t, attn_scores = attn(x_t, (keys, values), mask, + last_attended[i] + if test_inputs is None else None) x_t = F.transpose(x_t, [0, 2, 1]) step_attn_scores.append(attn_scores) #(B, T_dec=1, T_enc) # update last attended when necessary if self.force_monotonic_attention[i]: - last_attended[i] = np.argmax(attn_scores.numpy(), - axis=-1)[0][0] + last_attended[i] = np.argmax( + attn_scores.numpy(), axis=-1)[0][0] x_t = F.scale(residual + x_t, np.sqrt(0.5)) if len(step_attn_scores): # (B, 1, T_enc) again @@ -485,8 +491,8 @@ class Decoder(dg.Layer): t += 1 if test_inputs is None: - if F.reduce_min(done_t).numpy( - )[0] > 0.5 and t > self.min_decoder_steps: + if F.reduce_min(done_t).numpy()[ + 0] > 0.5 and t > self.min_decoder_steps: break elif t > self.max_decoder_steps: break diff --git a/parakeet/models/deepvoice3/encoder.py b/parakeet/models/deepvoice3/encoder.py index ebcd62f..b3e8bfb 100644 --- a/parakeet/models/deepvoice3/encoder.py +++ b/parakeet/models/deepvoice3/encoder.py @@ -1,3 +1,17 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
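The `Decoder.decode` hunks above implement free-running inference: each step emits a done probability, `force_monotonic_attention` remembers the argmax of the attention row, and decoding stops once every sequence in the batch reports done (or a step cap is hit). A sketch of just that control flow, with a random toy step standing in for the real decoder (all names and sizes here are illustrative):

```python
import numpy as np

rng = np.random.default_rng(0)

def toy_decode_step(t):
    # stand-ins for one real decoder step: a random attention row over
    # T_enc positions and a "done" probability that grows with t
    attn_scores = rng.random((1, 1, 12))      # (B, T_dec=1, T_enc)
    done_t = np.array([min(1.0, t / 20.0)])   # (B,)
    return attn_scores, done_t

min_decoder_steps, max_decoder_steps = 10, 100
last_attended = 0   # kept per attention layer in the real code
t = 0
while True:
    attn_scores, done_t = toy_decode_step(t)
    # force_monotonic_attention: remember where we attended so the next
    # step can restrict attention to a window around this position
    last_attended = int(np.argmax(attn_scores, axis=-1)[0][0])
    t += 1
    # same stop rule as above: the whole batch reports done (min > 0.5)
    # and a minimum number of steps has passed, or the cap is reached
    if float(done_t.min()) > 0.5 and t > min_decoder_steps:
        break
    if t > max_decoder_steps:
        break
print("stopped at step", t, "last attended position", last_attended)
```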
+ import numpy as np from collections import namedtuple @@ -33,14 +47,16 @@ class Encoder(dg.Layer): self.dropout = dropout if n_speakers > 1: std = np.sqrt((1 - dropout) / speaker_dim) - self.sp_proj1 = Linear(speaker_dim, - embed_dim, - act="softsign", - param_attr=I.Normal(scale=std)) - self.sp_proj2 = Linear(speaker_dim, - embed_dim, - act="softsign", - param_attr=I.Normal(scale=std)) + self.sp_proj1 = Linear( + speaker_dim, + embed_dim, + act="softsign", + param_attr=I.Normal(scale=std)) + self.sp_proj2 = Linear( + speaker_dim, + embed_dim, + act="softsign", + param_attr=I.Normal(scale=std)) self.n_speakers = n_speakers self.convolutions = dg.LayerList() @@ -51,31 +67,34 @@ class Encoder(dg.Layer): if in_channels != out_channels: std = np.sqrt(std_mul / in_channels) self.convolutions.append( - Conv1D(in_channels, - out_channels, - 1, - act="relu", - param_attr=I.Normal(scale=std))) + Conv1D( + in_channels, + out_channels, + 1, + act="relu", + param_attr=I.Normal(scale=std))) in_channels = out_channels std_mul = 2.0 self.convolutions.append( - Conv1DGLU(n_speakers, - speaker_dim, - in_channels, - out_channels, - filter_size, - dilation, - std_mul, - dropout, - causal=False, - residual=True)) + Conv1DGLU( + n_speakers, + speaker_dim, + in_channels, + out_channels, + filter_size, + dilation, + std_mul, + dropout, + causal=False, + residual=True)) in_channels = out_channels std_mul = 4.0 std = np.sqrt(std_mul * (1 - dropout) / in_channels) self.convolutions.append( - Conv1D(in_channels, embed_dim, 1, param_attr=I.Normal(scale=std))) + Conv1D( + in_channels, embed_dim, 1, param_attr=I.Normal(scale=std))) def forward(self, x, speaker_embed=None): """ @@ -96,9 +115,8 @@ class Encoder(dg.Layer): representation for values. """ x = self.embed(x) - x = F.dropout(x, - self.dropout, - dropout_implementation="upscale_in_train") + x = F.dropout( + x, self.dropout, dropout_implementation="upscale_in_train") x = F.transpose(x, [0, 2, 1]) if self.n_speakers > 1 and speaker_embed is not None: diff --git a/parakeet/models/deepvoice3/loss.py b/parakeet/models/deepvoice3/loss.py index 86412e7..be6f0bd 100644 --- a/parakeet/models/deepvoice3/loss.py +++ b/parakeet/models/deepvoice3/loss.py @@ -1,3 +1,17 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
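Several hunks above scale residual sums by `np.sqrt(0.5)` (for example `F.scale(residual + x_t, np.sqrt(0.5))` in the decoder). A two-line check of the motivation: summing two roughly independent unit-variance signals doubles the variance, and multiplying by sqrt(0.5) restores it:

```python
import numpy as np

rng = np.random.default_rng(0)
a = rng.standard_normal(1_000_000)
b = rng.standard_normal(1_000_000)

print(np.var(a + b))                    # ~2.0: the sum doubles the variance
print(np.var((a + b) * np.sqrt(0.5)))   # ~1.0: sqrt(0.5) restores unit variance
```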
+ import numpy as np from numba import jit @@ -31,9 +45,7 @@ def guided_attention(N, max_N, T, max_T, g): return W -def guided_attentions(encoder_lengths, - decoder_lengths, - max_decoder_len, +def guided_attentions(encoder_lengths, decoder_lengths, max_decoder_len, g=0.2): B = len(encoder_lengths) max_input_len = encoder_lengths.max() @@ -93,9 +105,8 @@ class TTSLoss(object): def binary_divergence(self, prediction, target, mask): flattened_prediction = F.reshape(prediction, [-1, 1]) flattened_target = F.reshape(target, [-1, 1]) - flattened_loss = F.log_loss(flattened_prediction, - flattened_target, - epsilon=1e-8) + flattened_loss = F.log_loss( + flattened_prediction, flattened_target, epsilon=1e-8) bin_div = fluid.layers.reshape(flattened_loss, prediction.shape) w = self.masked_weight @@ -163,23 +174,20 @@ class TTSLoss(object): max_mel_steps = max_frames // self.downsample_factor max_decoder_steps = max_mel_steps // self.r - decoder_mask = F.sequence_mask(n_frames // self.downsample_factor // - self.r, - max_decoder_steps, - dtype="float32") - mel_mask = F.sequence_mask(n_frames // self.downsample_factor, - max_mel_steps, - dtype="float32") + decoder_mask = F.sequence_mask( + n_frames // self.downsample_factor // self.r, + max_decoder_steps, + dtype="float32") + mel_mask = F.sequence_mask( + n_frames // self.downsample_factor, max_mel_steps, dtype="float32") lin_mask = F.sequence_mask(n_frames, max_frames, dtype="float32") if compute_lin_loss: lin_hyp = lin_hyp[:, :-self.time_shift, :] lin_ref = lin_ref[:, self.time_shift:, :] lin_mask = lin_mask[:, self.time_shift:, :] - lin_l1_loss = self.l1_loss(lin_hyp, - lin_ref, - lin_mask, - priority_bin=self.priority_bin) + lin_l1_loss = self.l1_loss( + lin_hyp, lin_ref, lin_mask, priority_bin=self.priority_bin) lin_bce_loss = self.binary_divergence(lin_hyp, lin_ref, lin_mask) lin_loss = self.binary_divergence_weight * lin_bce_loss \ + (1 - self.binary_divergence_weight) * lin_l1_loss @@ -197,9 +205,10 @@ class TTSLoss(object): total_loss += mel_loss if compute_attn_loss: - attn_loss = self.attention_loss( - attn_hyp, input_lengths.numpy(), - n_frames.numpy() // (self.downsample_factor * self.r)) + attn_loss = self.attention_loss(attn_hyp, + input_lengths.numpy(), + n_frames.numpy() // + (self.downsample_factor * self.r)) total_loss += attn_loss if compute_done_loss: diff --git a/parakeet/models/deepvoice3/model.py b/parakeet/models/deepvoice3/model.py index 57c3fcf..f2fb271 100644 --- a/parakeet/models/deepvoice3/model.py +++ b/parakeet/models/deepvoice3/model.py @@ -1,3 +1,17 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
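`guided_attention` above returns a penalty matrix `W`, but its body falls outside this diff. Assuming the usual diagonal-prior formulation (Tachibana et al.; the exact indexing convention is a guess), a NumPy sketch:

```python
import numpy as np

def guided_attention(N, max_N, T, max_T, g=0.2):
    # soft diagonal prior: positions far from the n/N ~ t/T diagonal are
    # penalized; padding (beyond N, T) stays zero
    W = np.zeros((max_N, max_T), dtype=np.float32)
    for n in range(N):
        for t in range(T):
            W[n, t] = 1.0 - np.exp(-((n / N - t / T) ** 2) / (2 * g ** 2))
    return W

W = guided_attention(N=5, max_N=6, T=10, max_T=12)
print(W.shape)           # (6, 12)
print(W[0, 0], W[0, 9])  # ~0 on the diagonal, close to 1 far off it
```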
+ import numpy as np import paddle.fluid.layers as F @@ -29,9 +43,9 @@ class DeepVoice3(dg.Layer): mel_outputs, alignments, done, decoder_states = self.decoder( (keys, values), valid_lengths, mel_inputs, text_positions, frame_positions, speaker_embed) - linear_outputs = self.converter( - decoder_states if self.use_decoder_states else mel_outputs, - speaker_embed) + linear_outputs = self.converter(decoder_states + if self.use_decoder_states else + mel_outputs, speaker_embed) return mel_outputs, linear_outputs, alignments, done def transduce(self, text_sequences, text_positions, speaker_indices=None): @@ -43,7 +57,7 @@ class DeepVoice3(dg.Layer): keys, values = self.encoder(text_sequences, speaker_embed) mel_outputs, alignments, done, decoder_states = self.decoder.decode( (keys, values), text_positions, speaker_embed) - linear_outputs = self.converter( - decoder_states if self.use_decoder_states else mel_outputs, - speaker_embed) + linear_outputs = self.converter(decoder_states + if self.use_decoder_states else + mel_outputs, speaker_embed) return mel_outputs, linear_outputs, alignments, done diff --git a/parakeet/models/deepvoice3/position_embedding.py b/parakeet/models/deepvoice3/position_embedding.py index aefb00c..88ef5cb 100644 --- a/parakeet/models/deepvoice3/position_embedding.py +++ b/parakeet/models/deepvoice3/position_embedding.py @@ -1,3 +1,17 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import numpy as np from paddle import fluid import paddle.fluid.layers as F @@ -95,10 +109,11 @@ class PositionEmbedding(dg.Layer): speaker_position_rate) # (B, V, C) # make indices for gather_nd batch_id = F.expand( - F.unsqueeze(F.range(0, batch_size, 1, dtype="int64"), [1]), - [1, time_steps]) + F.unsqueeze( + F.range( + 0, batch_size, 1, dtype="int64"), [1]), [1, time_steps]) # (B, T, 2) gather_nd_id = F.stack([batch_id, indices], -1) out = F.gather_nd(weight, gather_nd_id) - return out \ No newline at end of file + return out diff --git a/parakeet/models/fastspeech/__init__.py b/parakeet/models/fastspeech/__init__.py index e69de29..131e065 100644 --- a/parakeet/models/fastspeech/__init__.py +++ b/parakeet/models/fastspeech/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
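`PositionEmbedding.forward` above gathers rows of a sinusoid table computed at a per-speaker rate (`speaker_position_rate`). The table construction itself is not in this diff; a hypothetical NumPy version of a rate-scaled sinusoid table, following the standard transformer formula:

```python
import numpy as np

def scaled_sinusoid_table(n_positions, d_model, w=1.0):
    # sinusoidal position encodings with a position rate w; DeepVoice 3
    # uses a per-speaker w to stretch or compress the encodings in time
    pos = np.arange(n_positions)[:, None]   # (T, 1)
    dim = np.arange(d_model)[None, :]       # (1, C)
    angle = w * pos / np.power(1.0e4, 2 * (dim // 2) / d_model)
    return np.where(dim % 2 == 0, np.sin(angle), np.cos(angle))  # (T, C)

print(scaled_sinusoid_table(8, 4, w=1.0))
print(scaled_sinusoid_table(8, 4, w=1.2)[1])  # same position, faster rate
```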
\ No newline at end of file diff --git a/parakeet/models/fastspeech/decoder.py b/parakeet/models/fastspeech/decoder.py index 732fed4..46eb391 100644 --- a/parakeet/models/fastspeech/decoder.py +++ b/parakeet/models/fastspeech/decoder.py @@ -1,8 +1,22 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. import paddle.fluid.dygraph as dg import paddle.fluid as fluid from parakeet.models.transformer_tts.utils import * from parakeet.models.fastspeech.fft_block import FFTBlock + class Decoder(dg.Layer): def __init__(self, len_max_seq, @@ -18,16 +32,29 @@ class Decoder(dg.Layer): super(Decoder, self).__init__() n_position = len_max_seq + 1 - self.pos_inp = get_sinusoid_encoding_table(n_position, d_model, padding_idx=0) - self.position_enc = dg.Embedding(size=[n_position, d_model], - padding_idx=0, - param_attr=fluid.ParamAttr( - initializer=fluid.initializer.NumpyArrayInitializer(self.pos_inp), - trainable=False)) - self.layer_stack = [FFTBlock(d_model, d_inner, n_head, d_k, d_v, fft_conv1d_kernel, fft_conv1d_padding, dropout=dropout) for _ in range(n_layers)] + self.pos_inp = get_sinusoid_encoding_table( + n_position, d_model, padding_idx=0) + self.position_enc = dg.Embedding( + size=[n_position, d_model], + padding_idx=0, + param_attr=fluid.ParamAttr( + initializer=fluid.initializer.NumpyArrayInitializer( + self.pos_inp), + trainable=False)) + self.layer_stack = [ + FFTBlock( + d_model, + d_inner, + n_head, + d_k, + d_v, + fft_conv1d_kernel, + fft_conv1d_padding, + dropout=dropout) for _ in range(n_layers) + ] for i, layer in enumerate(self.layer_stack): self.add_sublayer('fft_{}'.format(i), layer) - + def forward(self, enc_seq, enc_pos): """ Decoder layer of FastSpeech. @@ -57,4 +84,4 @@ class Decoder(dg.Layer): slf_attn_mask=slf_attn_mask) dec_slf_attn_list += [dec_slf_attn] - return dec_output, dec_slf_attn_list \ No newline at end of file + return dec_output, dec_slf_attn_list diff --git a/parakeet/models/fastspeech/encoder.py b/parakeet/models/fastspeech/encoder.py index ac96e39..15c8d60 100644 --- a/parakeet/models/fastspeech/encoder.py +++ b/parakeet/models/fastspeech/encoder.py @@ -1,8 +1,22 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
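The FastSpeech encoder and decoder above gate their FFT blocks with masks built by helpers imported from `parakeet.models.transformer_tts.utils`, which this diff never shows. Hypothetical NumPy equivalents, assuming padding id 0 (the triangular mask is the causal one used by the transformer_tts decoder further down):

```python
import numpy as np

def get_non_pad_mask(seq):
    # 1.0 where a position holds a real token, 0.0 where it is padding (id 0)
    return (seq != 0).astype(np.float32)[:, :, None]      # (B, T, 1)

def get_attn_key_pad_mask(seq_k, len_q):
    # 1.0 where attention must be blocked because the key is padding
    mask = (seq_k == 0).astype(np.float32)[:, None, :]    # (B, 1, T_k)
    return np.repeat(mask, len_q, axis=1)                 # (B, T_q, T_k)

def get_triu_mask(T):
    # strictly upper-triangular mask: blocks attention to future frames
    return np.triu(np.ones((T, T), dtype=np.float32), k=1)

enc_pos = np.array([[1, 2, 3, 0, 0]])           # one sequence, 3 real positions
print(get_non_pad_mask(enc_pos)[0, :, 0])       # [1. 1. 1. 0. 0.]
print(get_attn_key_pad_mask(enc_pos, 5).shape)  # (1, 5, 5)
print(get_triu_mask(4))
```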
import paddle.fluid.dygraph as dg import paddle.fluid as fluid from parakeet.models.transformer_tts.utils import * from parakeet.models.fastspeech.fft_block import FFTBlock + class Encoder(dg.Layer): def __init__(self, n_src_vocab, @@ -19,14 +33,28 @@ class Encoder(dg.Layer): super(Encoder, self).__init__() n_position = len_max_seq + 1 - self.src_word_emb = dg.Embedding(size=[n_src_vocab, d_model], padding_idx=0) - self.pos_inp = get_sinusoid_encoding_table(n_position, d_model, padding_idx=0) - self.position_enc = dg.Embedding(size=[n_position, d_model], - padding_idx=0, - param_attr=fluid.ParamAttr( - initializer=fluid.initializer.NumpyArrayInitializer(self.pos_inp), - trainable=False)) - self.layer_stack = [FFTBlock(d_model, d_inner, n_head, d_k, d_v, fft_conv1d_kernel, fft_conv1d_padding, dropout=dropout) for _ in range(n_layers)] + self.src_word_emb = dg.Embedding( + size=[n_src_vocab, d_model], padding_idx=0) + self.pos_inp = get_sinusoid_encoding_table( + n_position, d_model, padding_idx=0) + self.position_enc = dg.Embedding( + size=[n_position, d_model], + padding_idx=0, + param_attr=fluid.ParamAttr( + initializer=fluid.initializer.NumpyArrayInitializer( + self.pos_inp), + trainable=False)) + self.layer_stack = [ + FFTBlock( + d_model, + d_inner, + n_head, + d_k, + d_v, + fft_conv1d_kernel, + fft_conv1d_padding, + dropout=dropout) for _ in range(n_layers) + ] for i, layer in enumerate(self.layer_stack): self.add_sublayer('fft_{}'.format(i), layer) @@ -52,7 +80,8 @@ class Encoder(dg.Layer): non_pad_mask = get_non_pad_mask(character) # -- Forward - enc_output = self.src_word_emb(character) + self.position_enc(text_pos) #(N, T, C) + enc_output = self.src_word_emb(character) + self.position_enc( + text_pos) #(N, T, C) for enc_layer in self.layer_stack: enc_output, enc_slf_attn = enc_layer( @@ -60,5 +89,5 @@ class Encoder(dg.Layer): non_pad_mask=non_pad_mask, slf_attn_mask=slf_attn_mask) enc_slf_attn_list += [enc_slf_attn] - - return enc_output, non_pad_mask, enc_slf_attn_list \ No newline at end of file + + return enc_output, non_pad_mask, enc_slf_attn_list diff --git a/parakeet/models/fastspeech/fastspeech.py b/parakeet/models/fastspeech/fastspeech.py index 4a01b95..91478af 100644 --- a/parakeet/models/fastspeech/fastspeech.py +++ b/parakeet/models/fastspeech/fastspeech.py @@ -1,3 +1,16 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
import math import paddle.fluid.dygraph as dg import paddle.fluid as fluid @@ -7,54 +20,67 @@ from parakeet.models.fastspeech.length_regulator import LengthRegulator from parakeet.models.fastspeech.encoder import Encoder from parakeet.models.fastspeech.decoder import Decoder + class FastSpeech(dg.Layer): def __init__(self, cfg): " FastSpeech" super(FastSpeech, self).__init__() - self.encoder = Encoder(n_src_vocab=len(symbols)+1, - len_max_seq=cfg['max_seq_len'], - n_layers=cfg['encoder_n_layer'], - n_head=cfg['encoder_head'], - d_k=cfg['fs_hidden_size'] // cfg['encoder_head'], - d_v=cfg['fs_hidden_size'] // cfg['encoder_head'], - d_model=cfg['fs_hidden_size'], - d_inner=cfg['encoder_conv1d_filter_size'], - fft_conv1d_kernel=cfg['fft_conv1d_filter'], - fft_conv1d_padding=cfg['fft_conv1d_padding'], - dropout=0.1) - self.length_regulator = LengthRegulator(input_size=cfg['fs_hidden_size'], - out_channels=cfg['duration_predictor_output_size'], - filter_size=cfg['duration_predictor_filter_size'], - dropout=cfg['dropout']) - self.decoder = Decoder(len_max_seq=cfg['max_seq_len'], - n_layers=cfg['decoder_n_layer'], - n_head=cfg['decoder_head'], - d_k=cfg['fs_hidden_size'] // cfg['decoder_head'], - d_v=cfg['fs_hidden_size'] // cfg['decoder_head'], - d_model=cfg['fs_hidden_size'], - d_inner=cfg['decoder_conv1d_filter_size'], - fft_conv1d_kernel=cfg['fft_conv1d_filter'], - fft_conv1d_padding=cfg['fft_conv1d_padding'], - dropout=0.1) - self.weight = fluid.ParamAttr(initializer = fluid.initializer.XavierInitializer()) + self.encoder = Encoder( + n_src_vocab=len(symbols) + 1, + len_max_seq=cfg['max_seq_len'], + n_layers=cfg['encoder_n_layer'], + n_head=cfg['encoder_head'], + d_k=cfg['fs_hidden_size'] // cfg['encoder_head'], + d_v=cfg['fs_hidden_size'] // cfg['encoder_head'], + d_model=cfg['fs_hidden_size'], + d_inner=cfg['encoder_conv1d_filter_size'], + fft_conv1d_kernel=cfg['fft_conv1d_filter'], + fft_conv1d_padding=cfg['fft_conv1d_padding'], + dropout=0.1) + self.length_regulator = LengthRegulator( + input_size=cfg['fs_hidden_size'], + out_channels=cfg['duration_predictor_output_size'], + filter_size=cfg['duration_predictor_filter_size'], + dropout=cfg['dropout']) + self.decoder = Decoder( + len_max_seq=cfg['max_seq_len'], + n_layers=cfg['decoder_n_layer'], + n_head=cfg['decoder_head'], + d_k=cfg['fs_hidden_size'] // cfg['decoder_head'], + d_v=cfg['fs_hidden_size'] // cfg['decoder_head'], + d_model=cfg['fs_hidden_size'], + d_inner=cfg['decoder_conv1d_filter_size'], + fft_conv1d_kernel=cfg['fft_conv1d_filter'], + fft_conv1d_padding=cfg['fft_conv1d_padding'], + dropout=0.1) + self.weight = fluid.ParamAttr( + initializer=fluid.initializer.XavierInitializer()) k = math.sqrt(1 / cfg['fs_hidden_size']) - self.bias = fluid.ParamAttr(initializer = fluid.initializer.Uniform(low=-k, high=k)) - self.mel_linear = dg.Linear(cfg['fs_hidden_size'], - cfg['audio']['num_mels']* cfg['audio']['outputs_per_step'], - param_attr = self.weight, - bias_attr = self.bias,) - self.postnet = PostConvNet(n_mels=cfg['audio']['num_mels'], - num_hidden=512, - filter_size=5, - padding=int(5 / 2), - num_conv=5, - outputs_per_step=cfg['audio']['outputs_per_step'], - use_cudnn=True, - dropout=0.1, - batchnorm_last=True) + self.bias = fluid.ParamAttr(initializer=fluid.initializer.Uniform( + low=-k, high=k)) + self.mel_linear = dg.Linear( + cfg['fs_hidden_size'], + cfg['audio']['num_mels'] * cfg['audio']['outputs_per_step'], + param_attr=self.weight, + bias_attr=self.bias, ) + self.postnet = PostConvNet( + n_mels=cfg['audio']['num_mels'], 
+ num_hidden=512, + filter_size=5, + padding=int(5 / 2), + num_conv=5, + outputs_per_step=cfg['audio']['outputs_per_step'], + use_cudnn=True, + dropout=0.1, + batchnorm_last=True) - def forward(self, character, text_pos, mel_pos=None, length_target=None, alpha=1.0): + def forward(self, + character, + text_pos, + mel_pos=None, + length_target=None, + alpha=1.0): """ FastSpeech model. @@ -80,22 +106,25 @@ class FastSpeech(dg.Layer): dec_slf_attn_list (Variable), Shape(B, mel_T, mel_T), the decoder self attention list. """ - encoder_output, non_pad_mask, enc_slf_attn_list = self.encoder(character, text_pos) + encoder_output, non_pad_mask, enc_slf_attn_list = self.encoder( + character, text_pos) if fluid.framework._dygraph_tracer()._train_mode: - - length_regulator_output, duration_predictor_output = self.length_regulator(encoder_output, - target=length_target, - alpha=alpha) - decoder_output, dec_slf_attn_list = self.decoder(length_regulator_output, mel_pos) + + length_regulator_output, duration_predictor_output = self.length_regulator( + encoder_output, target=length_target, alpha=alpha) + decoder_output, dec_slf_attn_list = self.decoder( + length_regulator_output, mel_pos) mel_output = self.mel_linear(decoder_output) mel_output_postnet = self.postnet(mel_output) + mel_output return mel_output, mel_output_postnet, duration_predictor_output, enc_slf_attn_list, dec_slf_attn_list else: - length_regulator_output, decoder_pos = self.length_regulator(encoder_output, alpha=alpha) - decoder_output, _ = self.decoder(length_regulator_output, decoder_pos) + length_regulator_output, decoder_pos = self.length_regulator( + encoder_output, alpha=alpha) + decoder_output, _ = self.decoder(length_regulator_output, + decoder_pos) mel_output = self.mel_linear(decoder_output) mel_output_postnet = self.postnet(mel_output) + mel_output - return mel_output, mel_output_postnet \ No newline at end of file + return mel_output, mel_output_postnet diff --git a/parakeet/models/fastspeech/fft_block.py b/parakeet/models/fastspeech/fft_block.py index ea86328..f50f11a 100644 --- a/parakeet/models/fastspeech/fft_block.py +++ b/parakeet/models/fastspeech/fft_block.py @@ -1,3 +1,16 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
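`FastSpeech.forward` above branches on train mode: during training the length regulator expands encoder states by the ground-truth durations (`length_target`), while at inference it rounds the duration predictor's output. A toy NumPy sketch of that branch (the `np.repeat` expansion is a stand-in for `LengthRegulator.expand`):

```python
import numpy as np

def length_regulate(hidden, durations):
    # stand-in for LengthRegulator.expand: repeat each phoneme's hidden
    # state along the time axis by its integer duration
    return np.repeat(hidden, durations, axis=0)

hidden = np.arange(8.0).reshape(4, 2)        # 4 phonemes, 2 channels
length_target = np.array([2, 1, 0, 3])       # ground truth, training branch
predicted = np.array([1.7, 0.9, 0.2, 3.4])   # duration predictor, inference branch

print(length_regulate(hidden, length_target).shape)              # (6, 2)
print(length_regulate(hidden, np.round(predicted).astype(int)))  # rounded, as above
```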
import numpy as np import math import paddle.fluid.dygraph as dg @@ -6,11 +19,32 @@ import paddle.fluid as fluid from parakeet.modules.multihead_attention import MultiheadAttention from parakeet.modules.ffn import PositionwiseFeedForward + class FFTBlock(dg.Layer): - def __init__(self, d_model, d_inner, n_head, d_k, d_v, filter_size, padding, dropout=0.2): + def __init__(self, + d_model, + d_inner, + n_head, + d_k, + d_v, + filter_size, + padding, + dropout=0.2): super(FFTBlock, self).__init__() - self.slf_attn = MultiheadAttention(d_model, d_k, d_v, num_head=n_head, is_bias=True, dropout=dropout, is_concat=False) - self.pos_ffn = PositionwiseFeedForward(d_model, d_inner, filter_size =filter_size, padding =padding, dropout=dropout) + self.slf_attn = MultiheadAttention( + d_model, + d_k, + d_v, + num_head=n_head, + is_bias=True, + dropout=dropout, + is_concat=False) + self.pos_ffn = PositionwiseFeedForward( + d_model, + d_inner, + filter_size=filter_size, + padding=padding, + dropout=dropout) def forward(self, enc_input, non_pad_mask=None, slf_attn_mask=None): """ @@ -27,10 +61,11 @@ class FFTBlock(dg.Layer): output (Variable), Shape(B, T, C), the output after self-attention & ffn. slf_attn (Variable), Shape(B * n_head, T, T), the self attention. """ - output, slf_attn = self.slf_attn(enc_input, enc_input, enc_input, mask=slf_attn_mask) + output, slf_attn = self.slf_attn( + enc_input, enc_input, enc_input, mask=slf_attn_mask) output *= non_pad_mask output = self.pos_ffn(output) output *= non_pad_mask - return output, slf_attn \ No newline at end of file + return output, slf_attn diff --git a/parakeet/models/fastspeech/length_regulator.py b/parakeet/models/fastspeech/length_regulator.py index d90eaa5..331597a 100644 --- a/parakeet/models/fastspeech/length_regulator.py +++ b/parakeet/models/fastspeech/length_regulator.py @@ -1,3 +1,16 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
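`LengthRegulator.pad` above brings the per-utterance expansions back to a common length so they can be stacked into one batch tensor. The same logic in NumPy (toy shapes):

```python
import numpy as np

def pad_and_stack(outputs):
    # expanded utterances in a batch have different lengths, so pad each
    # along time to the longest before stacking (cf. layers.pad + stack)
    max_len = max(o.shape[0] for o in outputs)
    padded = [np.pad(o, ((0, max_len - o.shape[0]), (0, 0))) for o in outputs]
    return np.stack(padded)          # (B, max_len, C)

a = np.ones((3, 2))
b = np.ones((5, 2))
print(pad_and_stack([a, b]).shape)   # (2, 5, 2)
```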
import numpy as np import math import parakeet.models.fastspeech.utils @@ -6,47 +19,50 @@ import paddle.fluid.layers as layers import paddle.fluid as fluid from parakeet.modules.customized import Conv1D + class LengthRegulator(dg.Layer): def __init__(self, input_size, out_channels, filter_size, dropout=0.1): super(LengthRegulator, self).__init__() - self.duration_predictor = DurationPredictor(input_size=input_size, - out_channels=out_channels, - filter_size=filter_size, - dropout=dropout) + self.duration_predictor = DurationPredictor( + input_size=input_size, + out_channels=out_channels, + filter_size=filter_size, + dropout=dropout) def LR(self, x, duration_predictor_output, alpha=1.0): output = [] batch_size = x.shape[0] for i in range(batch_size): - output.append(self.expand(x[i:i+1], duration_predictor_output[i:i+1], alpha)) + output.append( + self.expand(x[i:i + 1], duration_predictor_output[i:i + 1], + alpha)) output = self.pad(output) return output - + def pad(self, input_ele): max_len = max([input_ele[i].shape[0] for i in range(len(input_ele))]) out_list = [] for i in range(len(input_ele)): pad_len = max_len - input_ele[i].shape[0] - one_batch_padded = layers.pad( - input_ele[i], [0, pad_len, 0, 0], pad_value=0.0) + one_batch_padded = layers.pad(input_ele[i], [0, pad_len, 0, 0], + pad_value=0.0) out_list.append(one_batch_padded) out_padded = layers.stack(out_list) return out_padded - + def expand(self, batch, predicted, alpha): out = [] time_steps = batch.shape[1] fertilities = predicted.numpy() - batch = layers.squeeze(batch,[0]) - - + batch = layers.squeeze(batch, [0]) + for i in range(time_steps): - if fertilities[0,i]==0: + if fertilities[0, i] == 0: continue - out.append(layers.expand(batch[i: i + 1, :], [int(fertilities[0,i]), 1])) + out.append( + layers.expand(batch[i:i + 1, :], [int(fertilities[0, i]), 1])) out = layers.concat(out, axis=0) return out - def forward(self, x, alpha=1.0, target=None): """ @@ -70,10 +86,11 @@ class LengthRegulator(dg.Layer): else: duration_predictor_output = layers.round(duration_predictor_output) output = self.LR(x, duration_predictor_output, alpha) - mel_pos = dg.to_variable(np.arange(1, output.shape[1]+1)) + mel_pos = dg.to_variable(np.arange(1, output.shape[1] + 1)) mel_pos = layers.unsqueeze(mel_pos, [0]) return output, mel_pos + class DurationPredictor(dg.Layer): def __init__(self, input_size, out_channels, filter_size, dropout=0.1): super(DurationPredictor, self).__init__() @@ -83,30 +100,38 @@ class DurationPredictor(dg.Layer): self.dropout = dropout k = math.sqrt(1 / self.input_size) - self.conv1 = Conv1D(num_channels = self.input_size, - num_filters = self.out_channels, - filter_size = self.filter_size, - padding=1, - param_attr = fluid.ParamAttr(initializer=fluid.initializer.XavierInitializer()), - bias_attr = fluid.ParamAttr(initializer=fluid.initializer.Uniform(low=-k, high=k))) - #data_format='NTC') + self.conv1 = Conv1D( + num_channels=self.input_size, + num_filters=self.out_channels, + filter_size=self.filter_size, + padding=1, + param_attr=fluid.ParamAttr( + initializer=fluid.initializer.XavierInitializer()), + bias_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform( + low=-k, high=k))) + #data_format='NTC') k = math.sqrt(1 / self.out_channels) - self.conv2 = Conv1D(num_channels = self.out_channels, - num_filters = self.out_channels, - filter_size = self.filter_size, - padding=1, - param_attr = fluid.ParamAttr(initializer=fluid.initializer.XavierInitializer()), - bias_attr = 
fluid.ParamAttr(initializer=fluid.initializer.Uniform(low=-k, high=k))) - #data_format='NTC') + self.conv2 = Conv1D( + num_channels=self.out_channels, + num_filters=self.out_channels, + filter_size=self.filter_size, + padding=1, + param_attr=fluid.ParamAttr( + initializer=fluid.initializer.XavierInitializer()), + bias_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform( + low=-k, high=k))) + #data_format='NTC') self.layer_norm1 = dg.LayerNorm(self.out_channels) self.layer_norm2 = dg.LayerNorm(self.out_channels) - self.weight = fluid.ParamAttr(initializer = fluid.initializer.XavierInitializer()) + self.weight = fluid.ParamAttr( + initializer=fluid.initializer.XavierInitializer()) k = math.sqrt(1 / self.out_channels) - self.bias = fluid.ParamAttr(initializer = fluid.initializer.Uniform(low=-k, high=k)) + self.bias = fluid.ParamAttr(initializer=fluid.initializer.Uniform( + low=-k, high=k)) - self.linear =dg.Linear(self.out_channels, 1, param_attr = self.weight, - bias_attr = self.bias) + self.linear = dg.Linear( + self.out_channels, 1, param_attr=self.weight, bias_attr=self.bias) def forward(self, encoder_output): """ @@ -118,18 +143,15 @@ class DurationPredictor(dg.Layer): out (Variable), Shape(B, T, C), the output of duration predictor. """ # encoder_output.shape(N, T, C) - out = layers.transpose(encoder_output, [0,2,1]) + out = layers.transpose(encoder_output, [0, 2, 1]) out = self.conv1(out) - out = layers.transpose(out, [0,2,1]) + out = layers.transpose(out, [0, 2, 1]) out = layers.dropout(layers.relu(self.layer_norm1(out)), self.dropout) - out = layers.transpose(out, [0,2,1]) + out = layers.transpose(out, [0, 2, 1]) out = self.conv2(out) - out = layers.transpose(out, [0,2,1]) + out = layers.transpose(out, [0, 2, 1]) out = layers.dropout(layers.relu(self.layer_norm2(out)), self.dropout) out = layers.relu(self.linear(out)) out = layers.squeeze(out, axes=[-1]) - - - return out - + return out diff --git a/parakeet/models/fastspeech/utils.py b/parakeet/models/fastspeech/utils.py index a94de8d..5e680f0 100644 --- a/parakeet/models/fastspeech/utils.py +++ b/parakeet/models/fastspeech/utils.py @@ -1,5 +1,19 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
import numpy as np + def get_alignment(attn_probs, mel_lens, n_head): max_F = 0 assert attn_probs[0].shape[0] % n_head == 0 @@ -8,27 +22,27 @@ def get_alignment(attn_probs, mel_lens, n_head): for i in range(len(attn_probs)): multi_attn = attn_probs[i].numpy() for j in range(n_head): - attn = multi_attn[j*batch_size:(j+1)*batch_size] + attn = multi_attn[j * batch_size:(j + 1) * batch_size] F = score_F(attn) if max_F < F: max_F = F max_attn = attn alignment = compute_duration(max_attn, mel_lens) return alignment - + + def score_F(attn): max = np.max(attn, axis=-1) mean = np.mean(max) return mean + def compute_duration(attn, mel_lens): - alignment = np.zeros([attn.shape[0],attn.shape[2]]) + alignment = np.zeros([attn.shape[0], attn.shape[2]]) mel_lens = mel_lens.numpy() for i in range(attn.shape[0]): for j in range(mel_lens[i]): - max_index = np.argmax(attn[i,j]) - alignment[i,max_index] += 1 + max_index = np.argmax(attn[i, j]) + alignment[i, max_index] += 1 return alignment - - diff --git a/parakeet/models/transformer_tts/__init__.py b/parakeet/models/transformer_tts/__init__.py index e69de29..131e065 100644 --- a/parakeet/models/transformer_tts/__init__.py +++ b/parakeet/models/transformer_tts/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. \ No newline at end of file diff --git a/parakeet/models/transformer_tts/cbhg.py b/parakeet/models/transformer_tts/cbhg.py index 94b907f..ca93536 100644 --- a/parakeet/models/transformer_tts/cbhg.py +++ b/parakeet/models/transformer_tts/cbhg.py @@ -1,3 +1,16 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
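`compute_duration` above turns a teacher attention matrix into per-phoneme durations by letting each decoder frame vote (argmax) for one encoder position. A self-contained toy run of that idea:

```python
import numpy as np

def durations_from_attention(attn, mel_len):
    # each decoder frame votes for one encoder position; a phoneme's
    # duration is its number of votes (cf. compute_duration above)
    counts = np.zeros(attn.shape[1], dtype=np.int64)
    for j in range(mel_len):
        counts[np.argmax(attn[j])] += 1
    return counts

# toy (mel_T=6, text_T=3) attention: frames 0-1 -> phoneme 0, 2 -> 1, 3-5 -> 2
attn = np.array([
    [0.9, 0.05, 0.05],
    [0.8, 0.1, 0.1],
    [0.1, 0.8, 0.1],
    [0.1, 0.1, 0.8],
    [0.0, 0.2, 0.8],
    [0.1, 0.2, 0.7],
])
print(durations_from_attention(attn, mel_len=6))  # [2 1 3]
```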
import math from parakeet.g2p.text.symbols import symbols import paddle.fluid.dygraph as dg @@ -7,9 +20,16 @@ from parakeet.modules.customized import Pool1D, Conv1D from parakeet.modules.dynamic_gru import DynamicGRU import numpy as np + class CBHG(dg.Layer): - def __init__(self, hidden_size, batch_size, K=16, projection_size = 256, num_gru_layers=2, - max_pool_kernel_size=2, is_post=False): + def __init__(self, + hidden_size, + batch_size, + K=16, + projection_size=256, + num_gru_layers=2, + max_pool_kernel_size=2, + is_post=False): super(CBHG, self).__init__() """ :param hidden_size: dimension of hidden unit @@ -24,28 +44,39 @@ class CBHG(dg.Layer): self.projection_size = projection_size self.conv_list = [] k = math.sqrt(1 / projection_size) - self.conv_list.append(Conv1D(num_channels = projection_size, - num_filters = hidden_size, - filter_size = 1, - padding = int(np.floor(1/2)), - param_attr = fluid.ParamAttr(initializer=fluid.initializer.XavierInitializer()), - bias_attr = fluid.ParamAttr(initializer=fluid.initializer.Uniform(low=-k, high=k)))) + self.conv_list.append( + Conv1D( + num_channels=projection_size, + num_filters=hidden_size, + filter_size=1, + padding=int(np.floor(1 / 2)), + param_attr=fluid.ParamAttr( + initializer=fluid.initializer.XavierInitializer()), + bias_attr=fluid.ParamAttr( + initializer=fluid.initializer.Uniform( + low=-k, high=k)))) k = math.sqrt(1 / hidden_size) - for i in range(2,K+1): - self.conv_list.append(Conv1D(num_channels = hidden_size, - num_filters = hidden_size, - filter_size = i, - padding = int(np.floor(i/2)), - param_attr = fluid.ParamAttr(initializer=fluid.initializer.XavierInitializer()), - bias_attr = fluid.ParamAttr(initializer=fluid.initializer.Uniform(low=-k, high=k)))) + for i in range(2, K + 1): + self.conv_list.append( + Conv1D( + num_channels=hidden_size, + num_filters=hidden_size, + filter_size=i, + padding=int(np.floor(i / 2)), + param_attr=fluid.ParamAttr( + initializer=fluid.initializer.XavierInitializer()), + bias_attr=fluid.ParamAttr( + initializer=fluid.initializer.Uniform( + low=-k, high=k)))) for i, layer in enumerate(self.conv_list): self.add_sublayer("conv_list_{}".format(i), layer) self.batchnorm_list = [] for i in range(K): - self.batchnorm_list.append(dg.BatchNorm(hidden_size, - data_layout='NCHW')) + self.batchnorm_list.append( + dg.BatchNorm( + hidden_size, data_layout='NCHW')) for i, layer in enumerate(self.batchnorm_list): self.add_sublayer("batchnorm_list_{}".format(i), layer) @@ -53,91 +84,120 @@ class CBHG(dg.Layer): conv_outdim = hidden_size * K k = math.sqrt(1 / conv_outdim) - self.conv_projection_1 = Conv1D(num_channels = conv_outdim, - num_filters = hidden_size, - filter_size = 3, - padding = int(np.floor(3/2)), - param_attr = fluid.ParamAttr(initializer=fluid.initializer.XavierInitializer()), - bias_attr = fluid.ParamAttr(initializer=fluid.initializer.Uniform(low=-k, high=k))) + self.conv_projection_1 = Conv1D( + num_channels=conv_outdim, + num_filters=hidden_size, + filter_size=3, + padding=int(np.floor(3 / 2)), + param_attr=fluid.ParamAttr( + initializer=fluid.initializer.XavierInitializer()), + bias_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform( + low=-k, high=k))) k = math.sqrt(1 / hidden_size) - self.conv_projection_2 = Conv1D(num_channels = hidden_size, - num_filters = projection_size, - filter_size = 3, - padding = int(np.floor(3/2)), - param_attr = fluid.ParamAttr(initializer=fluid.initializer.XavierInitializer()), - bias_attr = fluid.ParamAttr(initializer=fluid.initializer.Uniform(low=-k, 
high=k))) + self.conv_projection_2 = Conv1D( + num_channels=hidden_size, + num_filters=projection_size, + filter_size=3, + padding=int(np.floor(3 / 2)), + param_attr=fluid.ParamAttr( + initializer=fluid.initializer.XavierInitializer()), + bias_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform( + low=-k, high=k))) - self.batchnorm_proj_1 = dg.BatchNorm(hidden_size, - data_layout='NCHW') - self.batchnorm_proj_2 = dg.BatchNorm(projection_size, - data_layout='NCHW') - self.max_pool = Pool1D(pool_size = max_pool_kernel_size, - pool_type='max', - pool_stride=1, - pool_padding=1, - data_format = "NCT") + self.batchnorm_proj_1 = dg.BatchNorm(hidden_size, data_layout='NCHW') + self.batchnorm_proj_2 = dg.BatchNorm( + projection_size, data_layout='NCHW') + self.max_pool = Pool1D( + pool_size=max_pool_kernel_size, + pool_type='max', + pool_stride=1, + pool_padding=1, + data_format="NCT") self.highway = Highwaynet(self.projection_size) h_0 = np.zeros((batch_size, hidden_size // 2), dtype="float32") h_0 = dg.to_variable(h_0) k = math.sqrt(1 / hidden_size) - self.fc_forward1 = dg.Linear(hidden_size, hidden_size // 2 * 3, - param_attr=fluid.ParamAttr(initializer = fluid.initializer.XavierInitializer()), - bias_attr=fluid.ParamAttr(initializer = fluid.initializer.Uniform(low=-k, high=k))) - self.fc_reverse1 = dg.Linear(hidden_size, hidden_size // 2 * 3, - param_attr=fluid.ParamAttr(initializer = fluid.initializer.XavierInitializer()), - bias_attr=fluid.ParamAttr(initializer = fluid.initializer.Uniform(low=-k, high=k))) - self.gru_forward1 = DynamicGRU(size = self.hidden_size // 2, - is_reverse = False, - origin_mode = True, - h_0 = h_0) - self.gru_reverse1 = DynamicGRU(size = self.hidden_size // 2, - is_reverse=True, - origin_mode=True, - h_0 = h_0) + self.fc_forward1 = dg.Linear( + hidden_size, + hidden_size // 2 * 3, + param_attr=fluid.ParamAttr( + initializer=fluid.initializer.XavierInitializer()), + bias_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform( + low=-k, high=k))) + self.fc_reverse1 = dg.Linear( + hidden_size, + hidden_size // 2 * 3, + param_attr=fluid.ParamAttr( + initializer=fluid.initializer.XavierInitializer()), + bias_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform( + low=-k, high=k))) + self.gru_forward1 = DynamicGRU( + size=self.hidden_size // 2, + is_reverse=False, + origin_mode=True, + h_0=h_0) + self.gru_reverse1 = DynamicGRU( + size=self.hidden_size // 2, + is_reverse=True, + origin_mode=True, + h_0=h_0) - self.fc_forward2 = dg.Linear(hidden_size, hidden_size // 2 * 3, - param_attr=fluid.ParamAttr(initializer = fluid.initializer.XavierInitializer()), - bias_attr=fluid.ParamAttr(initializer = fluid.initializer.Uniform(low=-k, high=k))) - self.fc_reverse2 = dg.Linear(hidden_size, hidden_size // 2 * 3, - param_attr=fluid.ParamAttr(initializer = fluid.initializer.XavierInitializer()), - bias_attr=fluid.ParamAttr(initializer = fluid.initializer.Uniform(low=-k, high=k))) - self.gru_forward2 = DynamicGRU(size = self.hidden_size // 2, - is_reverse = False, - origin_mode = True, - h_0 = h_0) - self.gru_reverse2 = DynamicGRU(size = self.hidden_size // 2, - is_reverse=True, - origin_mode=True, - h_0 = h_0) + self.fc_forward2 = dg.Linear( + hidden_size, + hidden_size // 2 * 3, + param_attr=fluid.ParamAttr( + initializer=fluid.initializer.XavierInitializer()), + bias_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform( + low=-k, high=k))) + self.fc_reverse2 = dg.Linear( + hidden_size, + hidden_size // 2 * 3, + param_attr=fluid.ParamAttr( + 
initializer=fluid.initializer.XavierInitializer()), + bias_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform( + low=-k, high=k))) + self.gru_forward2 = DynamicGRU( + size=self.hidden_size // 2, + is_reverse=False, + origin_mode=True, + h_0=h_0) + self.gru_reverse2 = DynamicGRU( + size=self.hidden_size // 2, + is_reverse=True, + origin_mode=True, + h_0=h_0) def _conv_fit_dim(self, x, filter_size=3): if filter_size % 2 == 0: - return x[:,:,:-1] + return x[:, :, :-1] else: - return x + return x def forward(self, input_): # input_.shape = [N, C, T] conv_list = [] conv_input = input_ - - for i, (conv, batchnorm) in enumerate(zip(self.conv_list, self.batchnorm_list)): - conv_input = self._conv_fit_dim(conv(conv_input), i+1) + + for i, (conv, batchnorm + ) in enumerate(zip(self.conv_list, self.batchnorm_list)): + conv_input = self._conv_fit_dim(conv(conv_input), i + 1) conv_input = layers.relu(batchnorm(conv_input)) conv_list.append(conv_input) - + conv_cat = layers.concat(conv_list, axis=1) - conv_pool = self.max_pool(conv_cat)[:,:,:-1] - - - conv_proj = layers.relu(self.batchnorm_proj_1(self._conv_fit_dim(self.conv_projection_1(conv_pool)))) - conv_proj = self.batchnorm_proj_2(self._conv_fit_dim(self.conv_projection_2(conv_proj))) + input_ - + conv_pool = self.max_pool(conv_cat)[:, :, :-1] + + conv_proj = layers.relu( + self.batchnorm_proj_1( + self._conv_fit_dim(self.conv_projection_1(conv_pool)))) + conv_proj = self.batchnorm_proj_2( + self._conv_fit_dim(self.conv_projection_2(conv_proj))) + input_ + # conv_proj.shape = [N, C, T] - highway = layers.transpose(conv_proj, [0,2,1]) + highway = layers.transpose(conv_proj, [0, 2, 1]) highway = self.highway(highway) # highway.shape = [N, T, C] @@ -151,9 +211,10 @@ class CBHG(dg.Layer): out_forward = self.gru_forward2(fc_forward) out_reverse = self.gru_reverse2(fc_reverse) out = layers.concat([out_forward, out_reverse], axis=-1) - out = layers.transpose(out, [0,2,1]) + out = layers.transpose(out, [0, 2, 1]) return out + class Highwaynet(dg.Layer): def __init__(self, num_units, num_layers=4): super(Highwaynet, self).__init__() @@ -164,14 +225,26 @@ class Highwaynet(dg.Layer): self.linears = [] k = math.sqrt(1 / num_units) for i in range(num_layers): - self.linears.append(dg.Linear(num_units, num_units, - param_attr=fluid.ParamAttr(initializer = fluid.initializer.XavierInitializer()), - bias_attr=fluid.ParamAttr(initializer = fluid.initializer.Uniform(low=-k, high=k)))) - self.gates.append(dg.Linear(num_units, num_units, - param_attr=fluid.ParamAttr(initializer = fluid.initializer.XavierInitializer()), - bias_attr=fluid.ParamAttr(initializer = fluid.initializer.Uniform(low=-k, high=k)))) - - for i, (linear, gate) in enumerate(zip(self.linears,self.gates)): + self.linears.append( + dg.Linear( + num_units, + num_units, + param_attr=fluid.ParamAttr( + initializer=fluid.initializer.XavierInitializer()), + bias_attr=fluid.ParamAttr( + initializer=fluid.initializer.Uniform( + low=-k, high=k)))) + self.gates.append( + dg.Linear( + num_units, + num_units, + param_attr=fluid.ParamAttr( + initializer=fluid.initializer.XavierInitializer()), + bias_attr=fluid.ParamAttr( + initializer=fluid.initializer.Uniform( + low=-k, high=k)))) + + for i, (linear, gate) in enumerate(zip(self.linears, self.gates)): self.add_sublayer("linears_{}".format(i), linear) self.add_sublayer("gates_{}".format(i), gate) @@ -183,12 +256,6 @@ class Highwaynet(dg.Layer): t_ = fluid.layers.sigmoid(gate(out)) c = 1 - t_ - out = h * t_ + out * c - + out = h * t_ + out * c + return out - 
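`Highwaynet` above computes `out = h * t + x * (1 - t)` per layer, with `h = relu(linear(x))` and a sigmoid transform gate `t`. A single highway layer in NumPy (the weights are random stand-ins; the negative gate bias is a common initialization choice, not something this diff sets):

```python
import numpy as np

def relu(x):
    return np.maximum(x, 0.0)

def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

def highway_layer(x, W_h, b_h, W_t, b_t):
    # transform gate t blends the nonlinear transform h with the
    # untouched input: out = h * t + x * (1 - t)
    h = relu(x @ W_h + b_h)
    t = sigmoid(x @ W_t + b_t)
    return h * t + x * (1.0 - t)

rng = np.random.default_rng(0)
x = rng.standard_normal((2, 4))              # (batch, num_units)
W_h, W_t = rng.standard_normal((2, 4, 4))
b_h, b_t = np.zeros(4), np.zeros(4) - 1.0    # bias < 0 favors carrying x through
print(highway_layer(x, W_h, b_h, W_t, b_t).shape)  # (2, 4)
```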
- - - - - diff --git a/parakeet/models/transformer_tts/decoder.py b/parakeet/models/transformer_tts/decoder.py index b0da788..3d7adf1 100644 --- a/parakeet/models/transformer_tts/decoder.py +++ b/parakeet/models/transformer_tts/decoder.py @@ -1,3 +1,16 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. import math import paddle.fluid.dygraph as dg import paddle.fluid as fluid @@ -7,70 +20,110 @@ from parakeet.modules.ffn import PositionwiseFeedForward from parakeet.models.transformer_tts.prenet import PreNet from parakeet.models.transformer_tts.post_convnet import PostConvNet + class Decoder(dg.Layer): def __init__(self, num_hidden, config, num_head=4): super(Decoder, self).__init__() self.num_hidden = num_hidden param = fluid.ParamAttr() - self.alpha = self.create_parameter(shape=(1,), attr=param, dtype='float32', - default_initializer = fluid.initializer.ConstantInitializer(value=1.0)) - self.pos_inp = get_sinusoid_encoding_table(1024, self.num_hidden, padding_idx=0) - self.pos_emb = dg.Embedding(size=[1024, num_hidden], - padding_idx=0, - param_attr=fluid.ParamAttr( - initializer=fluid.initializer.NumpyArrayInitializer(self.pos_inp), - trainable=False)) - self.decoder_prenet = PreNet(input_size = config['audio']['num_mels'], - hidden_size = num_hidden * 2, - output_size = num_hidden, - dropout_rate=0.2) + self.alpha = self.create_parameter( + shape=(1, ), + attr=param, + dtype='float32', + default_initializer=fluid.initializer.ConstantInitializer( + value=1.0)) + self.pos_inp = get_sinusoid_encoding_table( + 1024, self.num_hidden, padding_idx=0) + self.pos_emb = dg.Embedding( + size=[1024, num_hidden], + padding_idx=0, + param_attr=fluid.ParamAttr( + initializer=fluid.initializer.NumpyArrayInitializer( + self.pos_inp), + trainable=False)) + self.decoder_prenet = PreNet( + input_size=config['audio']['num_mels'], + hidden_size=num_hidden * 2, + output_size=num_hidden, + dropout_rate=0.2) k = math.sqrt(1 / num_hidden) - self.linear = dg.Linear(num_hidden, num_hidden, - param_attr=fluid.ParamAttr(initializer = fluid.initializer.XavierInitializer()), - bias_attr=fluid.ParamAttr(initializer = fluid.initializer.Uniform(low=-k, high=k))) + self.linear = dg.Linear( + num_hidden, + num_hidden, + param_attr=fluid.ParamAttr( + initializer=fluid.initializer.XavierInitializer()), + bias_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform( + low=-k, high=k))) - self.selfattn_layers = [MultiheadAttention(num_hidden, num_hidden//num_head, num_hidden//num_head) for _ in range(3)] + self.selfattn_layers = [ + MultiheadAttention(num_hidden, num_hidden // num_head, + num_hidden // num_head) for _ in range(3) + ] for i, layer in enumerate(self.selfattn_layers): self.add_sublayer("self_attn_{}".format(i), layer) - self.attn_layers = [MultiheadAttention(num_hidden, num_hidden//num_head, num_hidden//num_head) for _ in range(3)] + self.attn_layers = [ + MultiheadAttention(num_hidden, num_hidden // num_head, + num_hidden // num_head) for _ in 
range(3) + ] for i, layer in enumerate(self.attn_layers): self.add_sublayer("attn_{}".format(i), layer) - self.ffns = [PositionwiseFeedForward(num_hidden, num_hidden*num_head, filter_size=1) for _ in range(3)] + self.ffns = [ + PositionwiseFeedForward( + num_hidden, num_hidden * num_head, filter_size=1) + for _ in range(3) + ] for i, layer in enumerate(self.ffns): self.add_sublayer("ffns_{}".format(i), layer) - self.mel_linear = dg.Linear(num_hidden, config['audio']['num_mels'] * config['audio']['outputs_per_step'], - param_attr=fluid.ParamAttr(initializer = fluid.initializer.XavierInitializer()), - bias_attr=fluid.ParamAttr(initializer = fluid.initializer.Uniform(low=-k, high=k))) - self.stop_linear = dg.Linear(num_hidden, 1, - param_attr=fluid.ParamAttr(initializer = fluid.initializer.XavierInitializer()), - bias_attr=fluid.ParamAttr(initializer = fluid.initializer.Uniform(low=-k, high=k))) + self.mel_linear = dg.Linear( + num_hidden, + config['audio']['num_mels'] * config['audio']['outputs_per_step'], + param_attr=fluid.ParamAttr( + initializer=fluid.initializer.XavierInitializer()), + bias_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform( + low=-k, high=k))) + self.stop_linear = dg.Linear( + num_hidden, + 1, + param_attr=fluid.ParamAttr( + initializer=fluid.initializer.XavierInitializer()), + bias_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform( + low=-k, high=k))) - self.postconvnet = PostConvNet(config['audio']['num_mels'], config['hidden_size'], - filter_size = 5, padding = 4, num_conv=5, - outputs_per_step=config['audio']['outputs_per_step'], - use_cudnn = True) + self.postconvnet = PostConvNet( + config['audio']['num_mels'], + config['hidden_size'], + filter_size=5, + padding=4, + num_conv=5, + outputs_per_step=config['audio']['outputs_per_step'], + use_cudnn=True) def forward(self, key, value, query, c_mask, positional): # get decoder mask with triangular matrix - + if fluid.framework._dygraph_tracer()._train_mode: m_mask = get_non_pad_mask(positional) - mask = get_attn_key_pad_mask((positional==0).astype(np.float32), query) - triu_tensor = dg.to_variable(get_triu_tensor(query.numpy(), query.numpy())).astype(np.float32) + mask = get_attn_key_pad_mask((positional == 0).astype(np.float32), + query) + triu_tensor = dg.to_variable( + get_triu_tensor(query.numpy(), query.numpy())).astype( + np.float32) mask = mask + triu_tensor mask = fluid.layers.cast(mask == 0, np.float32) - + # (batch_size, decoder_len, encoder_len) - zero_mask = get_attn_key_pad_mask(layers.squeeze(c_mask,[-1]), query) + zero_mask = get_attn_key_pad_mask( + layers.squeeze(c_mask, [-1]), query) else: - mask = get_triu_tensor(query.numpy(), query.numpy()).astype(np.float32) + mask = get_triu_tensor(query.numpy(), + query.numpy()).astype(np.float32) mask = fluid.layers.cast(dg.to_variable(mask == 0), np.float32) m_mask, zero_mask = None, None # Decoder pre-network query = self.decoder_prenet(query) - + # Centered position query = self.linear(query) @@ -84,10 +137,13 @@ class Decoder(dg.Layer): # Attention decoder-decoder, encoder-decoder selfattn_list = list() attn_list = list() - - for selfattn, attn, ffn in zip(self.selfattn_layers, self.attn_layers, self.ffns): - query, attn_dec = selfattn(query, query, query, mask = mask, query_mask = m_mask) - query, attn_dot = attn(key, value, query, mask = zero_mask, query_mask = m_mask) + + for selfattn, attn, ffn in zip(self.selfattn_layers, self.attn_layers, + self.ffns): + query, attn_dec = selfattn( + query, query, query, mask=mask, query_mask=m_mask) 
+ query, attn_dot = attn( + key, value, query, mask=zero_mask, query_mask=m_mask) query = ffn(query) selfattn_list.append(attn_dec) attn_list.append(attn_dot) @@ -96,7 +152,7 @@ class Decoder(dg.Layer): # Post Mel Network out = self.postconvnet(mel_out) out = mel_out + out - + # Stop tokens stop_tokens = self.stop_linear(query) stop_tokens = layers.squeeze(stop_tokens, [-1]) diff --git a/parakeet/models/transformer_tts/encoder.py b/parakeet/models/transformer_tts/encoder.py index 8cd37b2..548ea8e 100644 --- a/parakeet/models/transformer_tts/encoder.py +++ b/parakeet/models/transformer_tts/encoder.py @@ -1,3 +1,16 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. import paddle.fluid.dygraph as dg import paddle.fluid as fluid from parakeet.models.transformer_tts.utils import * @@ -5,25 +18,41 @@ from parakeet.modules.multihead_attention import MultiheadAttention from parakeet.modules.ffn import PositionwiseFeedForward from parakeet.models.transformer_tts.encoderprenet import EncoderPrenet + class Encoder(dg.Layer): def __init__(self, embedding_size, num_hidden, num_head=4): super(Encoder, self).__init__() self.num_hidden = num_hidden - param = fluid.ParamAttr(initializer=fluid.initializer.Constant(value=1.0)) - self.alpha = self.create_parameter(shape=(1, ), attr=param, dtype='float32') - self.pos_inp = get_sinusoid_encoding_table(1024, self.num_hidden, padding_idx=0) - self.pos_emb = dg.Embedding(size=[1024, num_hidden], - padding_idx=0, - param_attr=fluid.ParamAttr( - initializer=fluid.initializer.NumpyArrayInitializer(self.pos_inp), - trainable=False)) - self.encoder_prenet = EncoderPrenet(embedding_size = embedding_size, - num_hidden = num_hidden, - use_cudnn=True) - self.layers = [MultiheadAttention(num_hidden, num_hidden//num_head, num_hidden//num_head) for _ in range(3)] + param = fluid.ParamAttr(initializer=fluid.initializer.Constant( + value=1.0)) + self.alpha = self.create_parameter( + shape=(1, ), attr=param, dtype='float32') + self.pos_inp = get_sinusoid_encoding_table( + 1024, self.num_hidden, padding_idx=0) + self.pos_emb = dg.Embedding( + size=[1024, num_hidden], + padding_idx=0, + param_attr=fluid.ParamAttr( + initializer=fluid.initializer.NumpyArrayInitializer( + self.pos_inp), + trainable=False)) + self.encoder_prenet = EncoderPrenet( + embedding_size=embedding_size, + num_hidden=num_hidden, + use_cudnn=True) + self.layers = [ + MultiheadAttention(num_hidden, num_hidden // num_head, + num_hidden // num_head) for _ in range(3) + ] for i, layer in enumerate(self.layers): self.add_sublayer("self_attn_{}".format(i), layer) - self.ffns = [PositionwiseFeedForward(num_hidden, num_hidden*num_head, filter_size=1, use_cudnn = True) for _ in range(3)] + self.ffns = [ + PositionwiseFeedForward( + num_hidden, + num_hidden * num_head, + filter_size=1, + use_cudnn=True) for _ in range(3) + ] for i, layer in enumerate(self.ffns): self.add_sublayer("ffns_{}".format(i), layer) @@ -33,25 +62,23 @@ class Encoder(dg.Layer): mask = 
get_attn_key_pad_mask(positional, x) else: query_mask, mask = None, None - + # Encoder pre_network - x = self.encoder_prenet(x) #(N,T,C) - - + x = self.encoder_prenet(x) #(N,T,C) + # Get positional encoding - positional = self.pos_emb(positional) - - x = positional * self.alpha + x #(N, T, C) - + positional = self.pos_emb(positional) + + x = positional * self.alpha + x #(N, T, C) # Positional dropout x = layers.dropout(x, 0.1) - + # Self attention encoder attentions = list() for layer, ffn in zip(self.layers, self.ffns): - x, attention = layer(x, x, x, mask = mask, query_mask = query_mask) + x, attention = layer(x, x, x, mask=mask, query_mask=query_mask) x = ffn(x) attentions.append(attention) - return x, query_mask, attentions \ No newline at end of file + return x, query_mask, attentions diff --git a/parakeet/models/transformer_tts/encoderprenet.py b/parakeet/models/transformer_tts/encoderprenet.py index b27f2fe..d701424 100644 --- a/parakeet/models/transformer_tts/encoderprenet.py +++ b/parakeet/models/transformer_tts/encoderprenet.py @@ -1,3 +1,16 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. import math from parakeet.g2p.text.symbols import symbols import paddle.fluid.dygraph as dg @@ -13,47 +26,63 @@ class EncoderPrenet(dg.Layer): self.embedding_size = embedding_size self.num_hidden = num_hidden self.use_cudnn = use_cudnn - self.embedding = dg.Embedding( size = [len(symbols), embedding_size], - padding_idx = None) + self.embedding = dg.Embedding( + size=[len(symbols), embedding_size], padding_idx=None) self.conv_list = [] k = math.sqrt(1 / embedding_size) - self.conv_list.append(Conv1D(num_channels = embedding_size, - num_filters = num_hidden, - filter_size = 5, - padding = int(np.floor(5/2)), - param_attr = fluid.ParamAttr(initializer=fluid.initializer.XavierInitializer()), - bias_attr = fluid.ParamAttr(initializer=fluid.initializer.Uniform(low=-k, high=k)), - use_cudnn = use_cudnn)) + self.conv_list.append( + Conv1D( + num_channels=embedding_size, + num_filters=num_hidden, + filter_size=5, + padding=int(np.floor(5 / 2)), + param_attr=fluid.ParamAttr( + initializer=fluid.initializer.XavierInitializer()), + bias_attr=fluid.ParamAttr( + initializer=fluid.initializer.Uniform( + low=-k, high=k)), + use_cudnn=use_cudnn)) k = math.sqrt(1 / num_hidden) for _ in range(2): - self.conv_list.append(Conv1D(num_channels = num_hidden, - num_filters = num_hidden, - filter_size = 5, - padding = int(np.floor(5/2)), - param_attr = fluid.ParamAttr(initializer=fluid.initializer.XavierInitializer()), - bias_attr = fluid.ParamAttr(initializer=fluid.initializer.Uniform(low=-k, high=k)), - use_cudnn = use_cudnn)) + self.conv_list.append( + Conv1D( + num_channels=num_hidden, + num_filters=num_hidden, + filter_size=5, + padding=int(np.floor(5 / 2)), + param_attr=fluid.ParamAttr( + initializer=fluid.initializer.XavierInitializer()), + bias_attr=fluid.ParamAttr( + initializer=fluid.initializer.Uniform( + low=-k, high=k)), + use_cudnn=use_cudnn)) 
         for i, layer in enumerate(self.conv_list):
             self.add_sublayer("conv_list_{}".format(i), layer)

-        self.batch_norm_list = [dg.BatchNorm(num_hidden,
-            data_layout='NCHW') for _ in range(3)]
+        self.batch_norm_list = [
+            dg.BatchNorm(
+                num_hidden, data_layout='NCHW') for _ in range(3)
+        ]
         for i, layer in enumerate(self.batch_norm_list):
             self.add_sublayer("batch_norm_list_{}".format(i), layer)

         k = math.sqrt(1 / num_hidden)
-        self.projection = dg.Linear(num_hidden, num_hidden,
-            param_attr=fluid.ParamAttr(initializer = fluid.initializer.XavierInitializer()),
-            bias_attr=fluid.ParamAttr(initializer = fluid.initializer.Uniform(low=-k, high=k)))
+        self.projection = dg.Linear(
+            num_hidden,
+            num_hidden,
+            param_attr=fluid.ParamAttr(
+                initializer=fluid.initializer.XavierInitializer()),
+            bias_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform(
+                low=-k, high=k)))

     def forward(self, x):
-        x = self.embedding(x) #(batch_size, seq_len, embending_size)
-        x = layers.transpose(x,[0,2,1])
+        x = self.embedding(x)  #(batch_size, seq_len, embedding_size)
+        x = layers.transpose(x, [0, 2, 1])
         for batch_norm, conv in zip(self.batch_norm_list, self.conv_list):
             x = layers.dropout(layers.relu(batch_norm(conv(x))), 0.2)
-        x = layers.transpose(x,[0,2,1]) #(N,T,C)
+        x = layers.transpose(x, [0, 2, 1])  #(N,T,C)
         x = self.projection(x)
-        return x
\ No newline at end of file
+        return x
diff --git a/parakeet/models/transformer_tts/post_convnet.py b/parakeet/models/transformer_tts/post_convnet.py
index 3e393ee..8882e79 100644
--- a/parakeet/models/transformer_tts/post_convnet.py
+++ b/parakeet/models/transformer_tts/post_convnet.py
@@ -1,11 +1,25 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
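An editor's aside before the `post_convnet.py` body: the `EncoderPrenet` hunks above only re-indent the code, so it is worth restating the shape flow they preserve. A minimal NumPy sketch (sizes invented for illustration; this is not the fluid API):

```python
import numpy as np

# EncoderPrenet shape flow (sizes invented for illustration):
# the embedding yields (N, T, C); Conv1D expects channels first, so the
# data is transposed to (N, C, T); three (conv -> batch norm -> relu ->
# dropout) blocks keep T unchanged because filter_size=5 is paired with
# padding=floor(5/2)=2; then a transpose back to (N, T, C) precedes the
# final Linear projection.
N, T, C = 2, 10, 256
x = np.random.randn(N, T, C)   # stands in for self.embedding(character_ids)
x = x.transpose(0, 2, 1)       # (N, C, T) for the conv stack
# ... conv/bn/relu/dropout blocks would act here without changing the shape ...
x = x.transpose(0, 2, 1)       # (N, T, C) again
assert x.shape == (N, T, C)    # ready for self.projection
```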
import math import paddle.fluid.dygraph as dg import paddle.fluid as fluid import paddle.fluid.layers as layers from parakeet.modules.customized import Conv1D + class PostConvNet(dg.Layer): - def __init__(self, + def __init__(self, n_mels=80, num_hidden=512, filter_size=5, @@ -16,49 +30,66 @@ class PostConvNet(dg.Layer): dropout=0.1, batchnorm_last=False): super(PostConvNet, self).__init__() - + self.dropout = dropout self.num_conv = num_conv self.batchnorm_last = batchnorm_last self.conv_list = [] k = math.sqrt(1 / (n_mels * outputs_per_step)) - self.conv_list.append(Conv1D(num_channels = n_mels * outputs_per_step, - num_filters = num_hidden, - filter_size = filter_size, - padding = padding, - param_attr = fluid.ParamAttr(initializer=fluid.initializer.XavierInitializer()), - bias_attr = fluid.ParamAttr(initializer=fluid.initializer.Uniform(low=-k, high=k)), - use_cudnn = use_cudnn)) + self.conv_list.append( + Conv1D( + num_channels=n_mels * outputs_per_step, + num_filters=num_hidden, + filter_size=filter_size, + padding=padding, + param_attr=fluid.ParamAttr( + initializer=fluid.initializer.XavierInitializer()), + bias_attr=fluid.ParamAttr( + initializer=fluid.initializer.Uniform( + low=-k, high=k)), + use_cudnn=use_cudnn)) k = math.sqrt(1 / num_hidden) - for _ in range(1, num_conv-1): - self.conv_list.append(Conv1D(num_channels = num_hidden, - num_filters = num_hidden, - filter_size = filter_size, - padding = padding, - param_attr = fluid.ParamAttr(initializer=fluid.initializer.XavierInitializer()), - bias_attr = fluid.ParamAttr(initializer=fluid.initializer.Uniform(low=-k, high=k)), - use_cudnn = use_cudnn)) + for _ in range(1, num_conv - 1): + self.conv_list.append( + Conv1D( + num_channels=num_hidden, + num_filters=num_hidden, + filter_size=filter_size, + padding=padding, + param_attr=fluid.ParamAttr( + initializer=fluid.initializer.XavierInitializer()), + bias_attr=fluid.ParamAttr( + initializer=fluid.initializer.Uniform( + low=-k, high=k)), + use_cudnn=use_cudnn)) - self.conv_list.append(Conv1D(num_channels = num_hidden, - num_filters = n_mels * outputs_per_step, - filter_size = filter_size, - padding = padding, - param_attr = fluid.ParamAttr(initializer=fluid.initializer.XavierInitializer()), - bias_attr = fluid.ParamAttr(initializer=fluid.initializer.Uniform(low=-k, high=k)), - use_cudnn = use_cudnn)) + self.conv_list.append( + Conv1D( + num_channels=num_hidden, + num_filters=n_mels * outputs_per_step, + filter_size=filter_size, + padding=padding, + param_attr=fluid.ParamAttr( + initializer=fluid.initializer.XavierInitializer()), + bias_attr=fluid.ParamAttr( + initializer=fluid.initializer.Uniform( + low=-k, high=k)), + use_cudnn=use_cudnn)) for i, layer in enumerate(self.conv_list): self.add_sublayer("conv_list_{}".format(i), layer) - self.batch_norm_list = [dg.BatchNorm(num_hidden, - data_layout='NCHW') for _ in range(num_conv-1)] + self.batch_norm_list = [ + dg.BatchNorm( + num_hidden, data_layout='NCHW') for _ in range(num_conv - 1) + ] if self.batchnorm_last: - self.batch_norm_list.append(dg.BatchNorm(n_mels * outputs_per_step, - data_layout='NCHW')) + self.batch_norm_list.append( + dg.BatchNorm( + n_mels * outputs_per_step, data_layout='NCHW')) for i, layer in enumerate(self.batch_norm_list): self.add_sublayer("batch_norm_list_{}".format(i), layer) - def forward(self, input): """ @@ -69,18 +100,19 @@ class PostConvNet(dg.Layer): Returns: output (Variable), Shape(B, T, C), the result after postconvnet. 
""" - - input = layers.transpose(input, [0,2,1]) + + input = layers.transpose(input, [0, 2, 1]) len = input.shape[-1] - for i in range(self.num_conv-1): + for i in range(self.num_conv - 1): batch_norm = self.batch_norm_list[i] conv = self.conv_list[i] - - input = layers.dropout(layers.tanh(batch_norm(conv(input)[:,:,:len])), self.dropout) - conv = self.conv_list[self.num_conv-1] - input = conv(input)[:,:,:len] + + input = layers.dropout( + layers.tanh(batch_norm(conv(input)[:, :, :len])), self.dropout) + conv = self.conv_list[self.num_conv - 1] + input = conv(input)[:, :, :len] if self.batchnorm_last: - batch_norm = self.batch_norm_list[self.num_conv-1] + batch_norm = self.batch_norm_list[self.num_conv - 1] input = layers.dropout(batch_norm(input), self.dropout) - output = layers.transpose(input, [0,2,1]) - return output \ No newline at end of file + output = layers.transpose(input, [0, 2, 1]) + return output diff --git a/parakeet/models/transformer_tts/prenet.py b/parakeet/models/transformer_tts/prenet.py index e9b0667..6039b60 100644 --- a/parakeet/models/transformer_tts/prenet.py +++ b/parakeet/models/transformer_tts/prenet.py @@ -1,8 +1,22 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. import math import paddle.fluid.dygraph as dg import paddle.fluid as fluid import paddle.fluid.layers as layers + class PreNet(dg.Layer): def __init__(self, input_size, hidden_size, output_size, dropout_rate=0.2): """ @@ -17,13 +31,21 @@ class PreNet(dg.Layer): self.dropout_rate = dropout_rate k = math.sqrt(1 / input_size) - self.linear1 = dg.Linear(input_size, hidden_size, - param_attr=fluid.ParamAttr(initializer = fluid.initializer.XavierInitializer()), - bias_attr=fluid.ParamAttr(initializer = fluid.initializer.Uniform(low=-k, high=k))) + self.linear1 = dg.Linear( + input_size, + hidden_size, + param_attr=fluid.ParamAttr( + initializer=fluid.initializer.XavierInitializer()), + bias_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform( + low=-k, high=k))) k = math.sqrt(1 / hidden_size) - self.linear2 = dg.Linear(hidden_size, output_size, - param_attr=fluid.ParamAttr(initializer = fluid.initializer.XavierInitializer()), - bias_attr=fluid.ParamAttr(initializer = fluid.initializer.Uniform(low=-k, high=k))) + self.linear2 = dg.Linear( + hidden_size, + output_size, + param_attr=fluid.ParamAttr( + initializer=fluid.initializer.XavierInitializer()), + bias_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform( + low=-k, high=k))) def forward(self, x): """ diff --git a/parakeet/models/transformer_tts/transformer_tts.py b/parakeet/models/transformer_tts/transformer_tts.py index bf2924a..1205c6b 100644 --- a/parakeet/models/transformer_tts/transformer_tts.py +++ b/parakeet/models/transformer_tts/transformer_tts.py @@ -1,8 +1,22 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
 import paddle.fluid.dygraph as dg
 import paddle.fluid as fluid
 from parakeet.models.transformer_tts.encoder import Encoder
 from parakeet.models.transformer_tts.decoder import Decoder

+
 class TransformerTTS(dg.Layer):
     def __init__(self, config):
         super(TransformerTTS, self).__init__()
@@ -11,16 +25,10 @@ class TransformerTTS(dg.Layer):
         self.config = config

     def forward(self, characters, mel_input, pos_text, pos_mel):
-
+
         key, c_mask, attns_enc = self.encoder(characters, pos_text)
-
-        mel_output, postnet_output, attn_probs, stop_preds, attns_dec = self.decoder(key, key, mel_input, c_mask, pos_mel)
+
+        mel_output, postnet_output, attn_probs, stop_preds, attns_dec = self.decoder(
+            key, key, mel_input, c_mask, pos_mel)

         return mel_output, postnet_output, attn_probs, stop_preds, attns_enc, attns_dec
-
-
-
-
-
-
-
diff --git a/parakeet/models/transformer_tts/utils.py b/parakeet/models/transformer_tts/utils.py
index ab575f9..2212744 100644
--- a/parakeet/models/transformer_tts/utils.py
+++ b/parakeet/models/transformer_tts/utils.py
@@ -1,3 +1,16 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
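The `Encoder` shown earlier registers a fixed sinusoid table as a non-trainable `Embedding`; the generating function, `get_sinusoid_encoding_table`, appears in the `utils.py` hunk just below. As a sanity check, an equivalent standalone NumPy computation (our reading of the code, not part of the patch):

```python
import numpy as np


def sinusoid_table(n_position, d_hid, padding_idx=None):
    # angle(pos, j) = pos / 10000 ** (2 * (j // 2) / d_hid);
    # even dimensions get sin, odd dimensions get cos.
    pos = np.arange(n_position)[:, None].astype(np.float64)
    j = np.arange(d_hid)[None, :]
    table = pos / np.power(10000.0, 2 * (j // 2) / d_hid)
    table[:, 0::2] = np.sin(table[:, 0::2])
    table[:, 1::2] = np.cos(table[:, 1::2])
    if padding_idx is not None:
        table[padding_idx] = 0.0  # reserve the padding position
    return table


# matches the (1024, num_hidden) table the Encoder feeds to dg.Embedding
print(sinusoid_table(1024, 256, padding_idx=0).shape)  # (1024, 256)
```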
import numpy as np import librosa import os, copy @@ -6,14 +19,15 @@ import paddle.fluid.layers as layers def get_positional_table(d_pos_vec, n_position=1024): - position_enc = np.array([ - [pos / np.power(10000, 2*i/d_pos_vec) for i in range(d_pos_vec)] - if pos != 0 else np.zeros(d_pos_vec) for pos in range(n_position)]) + position_enc = np.array( + [[pos / np.power(10000, 2 * i / d_pos_vec) for i in range(d_pos_vec)] + if pos != 0 else np.zeros(d_pos_vec) for pos in range(n_position)]) - position_enc[1:, 0::2] = np.sin(position_enc[1:, 0::2]) # dim 2i - position_enc[1:, 1::2] = np.cos(position_enc[1:, 1::2]) # dim 2i+1 + position_enc[1:, 0::2] = np.sin(position_enc[1:, 0::2]) # dim 2i + position_enc[1:, 1::2] = np.cos(position_enc[1:, 1::2]) # dim 2i+1 return position_enc + def get_sinusoid_encoding_table(n_position, d_hid, padding_idx=None): ''' Sinusoid position encoding table ''' @@ -23,7 +37,8 @@ def get_sinusoid_encoding_table(n_position, d_hid, padding_idx=None): def get_posi_angle_vec(position): return [cal_angle(position, hid_j) for hid_j in range(d_hid)] - sinusoid_table = np.array([get_posi_angle_vec(pos_i) for pos_i in range(n_position)]) + sinusoid_table = np.array( + [get_posi_angle_vec(pos_i) for pos_i in range(n_position)]) sinusoid_table[:, 0::2] = np.sin(sinusoid_table[:, 0::2]) # dim 2i sinusoid_table[:, 1::2] = np.cos(sinusoid_table[:, 1::2]) # dim 2i+1 @@ -34,8 +49,10 @@ def get_sinusoid_encoding_table(n_position, d_hid, padding_idx=None): return sinusoid_table + def get_non_pad_mask(seq): - return layers.unsqueeze((seq != 0).astype(np.float32),[-1]) + return layers.unsqueeze((seq != 0).astype(np.float32), [-1]) + def get_attn_key_pad_mask(seq_k, seq_q): ''' For masking out the padding part of key sequence. ''' @@ -43,32 +60,37 @@ def get_attn_key_pad_mask(seq_k, seq_q): # Expand to fit the shape of key query attention matrix. len_q = seq_q.shape[1] padding_mask = (seq_k != 0).astype(np.float32) - padding_mask = layers.expand(layers.unsqueeze(padding_mask,[1]), [1, len_q, 1]) + padding_mask = layers.expand( + layers.unsqueeze(padding_mask, [1]), [1, len_q, 1]) return padding_mask + def get_triu_tensor(seq_k, seq_q): ''' For make a triu tensor ''' len_k = seq_k.shape[1] len_q = seq_q.shape[1] batch_size = seq_k.shape[0] triu_tensor = np.triu(np.ones([len_k, len_q]), 1) - triu_tensor = np.repeat(np.expand_dims(triu_tensor, axis=0) ,batch_size, axis=0) - + triu_tensor = np.repeat( + np.expand_dims( + triu_tensor, axis=0), batch_size, axis=0) + return triu_tensor + def guided_attention(N, T, g=0.2): '''Guided attention. 
Refer to page 3 on the paper.''' W = np.zeros((N, T), dtype=np.float32) for n_pos in range(W.shape[0]): for t_pos in range(W.shape[1]): - W[n_pos, t_pos] = 1 - np.exp(-(t_pos / float(T) - n_pos / float(N)) ** 2 / (2 * g * g)) + W[n_pos, t_pos] = 1 - np.exp(-(t_pos / float(T) - n_pos / float(N)) + **2 / (2 * g * g)) return W def cross_entropy(input, label, position_weight=1.0, epsilon=1e-30): - output = -1 * label * layers.log(input + epsilon) - (1-label) * layers.log(1 - input + epsilon) + output = -1 * label * layers.log(input + epsilon) - ( + 1 - label) * layers.log(1 - input + epsilon) output = output * (label * (position_weight - 1) + 1) return layers.reduce_sum(output, dim=[0, 1]) - - diff --git a/parakeet/models/transformer_tts/vocoder.py b/parakeet/models/transformer_tts/vocoder.py index 3fa19a6..33ffe1c 100644 --- a/parakeet/models/transformer_tts/vocoder.py +++ b/parakeet/models/transformer_tts/vocoder.py @@ -1,27 +1,44 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. import paddle.fluid.dygraph as dg import paddle.fluid as fluid from parakeet.modules.customized import Conv1D from parakeet.models.transformer_tts.utils import * from parakeet.models.transformer_tts.cbhg import CBHG + class Vocoder(dg.Layer): """ CBHG Network (mel -> linear) """ + def __init__(self, config, batch_size): super(Vocoder, self).__init__() - self.pre_proj = Conv1D(num_channels = config['audio']['num_mels'], - num_filters = config['hidden_size'], - filter_size=1) + self.pre_proj = Conv1D( + num_channels=config['audio']['num_mels'], + num_filters=config['hidden_size'], + filter_size=1) self.cbhg = CBHG(config['hidden_size'], batch_size) - self.post_proj = Conv1D(num_channels = config['hidden_size'], - num_filters = (config['audio']['n_fft'] // 2) + 1, - filter_size=1) + self.post_proj = Conv1D( + num_channels=config['hidden_size'], + num_filters=(config['audio']['n_fft'] // 2) + 1, + filter_size=1) def forward(self, mel): - mel = layers.transpose(mel, [0,2,1]) + mel = layers.transpose(mel, [0, 2, 1]) mel = self.pre_proj(mel) mel = self.cbhg(mel) mag_pred = self.post_proj(mel) - mag_pred = layers.transpose(mag_pred, [0,2,1]) + mag_pred = layers.transpose(mag_pred, [0, 2, 1]) return mag_pred diff --git a/parakeet/models/waveflow/__init__.py b/parakeet/models/waveflow/__init__.py index 20475cd..73a7914 100644 --- a/parakeet/models/waveflow/__init__.py +++ b/parakeet/models/waveflow/__init__.py @@ -1 +1,15 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + from parakeet.models.waveflow.waveflow import WaveFlow diff --git a/parakeet/models/waveflow/data.py b/parakeet/models/waveflow/data.py index b5ad2c9..0c1e914 100644 --- a/parakeet/models/waveflow/data.py +++ b/parakeet/models/waveflow/data.py @@ -1,3 +1,17 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import random import librosa diff --git a/parakeet/models/waveflow/waveflow.py b/parakeet/models/waveflow/waveflow.py index 1b1b8bf..a8bd8af 100644 --- a/parakeet/models/waveflow/waveflow.py +++ b/parakeet/models/waveflow/waveflow.py @@ -1,3 +1,17 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import itertools import os import time diff --git a/parakeet/models/waveflow/waveflow_modules.py b/parakeet/models/waveflow/waveflow_modules.py index 1b8938a..e5b9a3e 100644 --- a/parakeet/models/waveflow/waveflow_modules.py +++ b/parakeet/models/waveflow/waveflow_modules.py @@ -1,3 +1,17 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import itertools import numpy as np import paddle.fluid.dygraph as dg diff --git a/parakeet/models/wavenet/README.md b/parakeet/models/wavenet/README.md index 18efd0b..21a0f92 100644 --- a/parakeet/models/wavenet/README.md +++ b/parakeet/models/wavenet/README.md @@ -2,7 +2,7 @@ Paddle fluid implementation of WaveNet, a deep generative model of raw audio waveforms. WaveNet model is originally proposed in [WaveNet: A Generative Model for Raw Audio](https://arxiv.org/abs/1609.03499). 
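For orientation before the WaveNet hunks (our summary, not text from the README): WaveNet is autoregressive, factorizing the joint probability of waveform samples conditioned on acoustic features, and the code this patch reformats implements that factorization:

```latex
p(\mathbf{x} \mid \mathbf{c}) = \prod_{t=1}^{T} p(x_t \mid x_1, \ldots, x_{t-1}, \mathbf{c})
```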
-Our implementation is based on the WaveNet architecture described in [ClariNet: Parallel Wave Generation in End-to-End Text-to-Speech](https://arxiv.org/abs/1807.07281) and can provide various output distributions, including single Gaussian, mixture of Gaussian, and softmax with linearly quantized channels. 
+Our implementation is based on the WaveNet architecture described in [ClariNet: Parallel Wave Generation in End-to-End Text-to-Speech](https://arxiv.org/abs/1807.07281) and can provide various output distributions, including single Gaussian, mixture of Gaussian, and softmax with linearly quantized channels.

 We implement the WaveNet model in paddle fluid with dynamic graph, which is convenient for flexible network architectures.

@@ -51,10 +51,10 @@ python -u train.py --config=${yaml} \

 #### Save and Load checkpoints

 Our model will save model parameters as checkpoints in `./runs/wavenet/${ModelName}/checkpoint/` every 10000 iterations by default.
-The saved checkpoint will have the format of `step-${iteration_number}.pdparams` for model parameters and `step-${iteration_number}.pdopt` for optimizer parameters. 
+The saved checkpoint will have the format of `step-${iteration_number}.pdparams` for model parameters and `step-${iteration_number}.pdopt` for optimizer parameters.

 There are three ways to load a checkpoint and resume training (suppose, for example, that you want to load the checkpoint from iteration 500000):

-1. Use `--checkpoint=./runs/wavenet/${ModelName}/checkpoint/step-500000` to provide a specific path to load. Note that you only need to provide the base name of the parameter file, which is `step-500000`, no extension name `.pdparams` or `.pdopt` is needed. 
+1. Use `--checkpoint=./runs/wavenet/${ModelName}/checkpoint/step-500000` to provide a specific path to load. Note that you only need to provide the base name of the parameter file, which is `step-500000`; no extension `.pdparams` or `.pdopt` is needed.
 2. Use `--iteration=500000`.
 3. If you don't specify either `--checkpoint` or `--iteration`, the model will automatically load the latest checkpoint in `./runs/wavenet/${ModelName}/checkpoint`.

@@ -91,7 +91,7 @@ python -u synthesis.py --config=${yaml} \
     --root=./data/LJSpeech-1.1 \
     --name=${ModelName} --use_gpu=true \
     --output=./syn_audios \
-    --sample=${SAMPLE} 
+    --sample=${SAMPLE}
 ```

 In this example, `--output` specifies where to save the synthesized audios, and `--sample` specifies which sample in the valid dataset (a split of the whole LJSpeech dataset, which by default contains the first 16 audio samples) to synthesize, based on the mel-spectrograms computed from the ground-truth sample audio; e.g., `--sample=0` means to synthesize the first audio in the valid dataset.
diff --git a/parakeet/models/wavenet/data.py b/parakeet/models/wavenet/data.py
index a4f1b70..db19667 100644
--- a/parakeet/models/wavenet/data.py
+++ b/parakeet/models/wavenet/data.py
@@ -1,3 +1,17 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
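Returning to the checkpoint convention documented in the README above, a small sketch of how a base name maps to files (the model name is hypothetical; per the README, only the base name is passed on the command line and the extensions are appended by the loader):

```python
import os


def checkpoint_base(model_name, iteration, root="./runs/wavenet"):
    # e.g. ./runs/wavenet/<ModelName>/checkpoint/step-500000
    return os.path.join(root, model_name, "checkpoint",
                        "step-{}".format(iteration))


base = checkpoint_base("my_model", 500000)  # "my_model" is hypothetical
params_file = base + ".pdparams"  # model parameters
optimizer_file = base + ".pdopt"  # optimizer state
print(base)
```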
+ import random import librosa @@ -18,8 +32,8 @@ class Dataset(ljspeech.LJSpeech): self.fft_window_shift = config.fft_window_shift # Calculate context frames. frames_per_second = config.sample_rate // self.fft_window_shift - train_clip_frames = int(np.ceil( - config.train_clip_second * frames_per_second)) + train_clip_frames = int( + np.ceil(config.train_clip_second * frames_per_second)) context_frames = config.context_size // self.fft_window_shift self.num_frames = train_clip_frames + context_frames @@ -32,7 +46,7 @@ class Dataset(ljspeech.LJSpeech): fft_window_shift = config.fft_window_shift fft_window_size = config.fft_window_size fft_size = config.fft_size - + audio, loaded_sr = librosa.load(wav_path, sr=None) assert loaded_sr == sr @@ -41,42 +55,46 @@ class Dataset(ljspeech.LJSpeech): fft_padding = (fft_size - fft_window_shift) // 2 desired_length = frames * fft_window_shift + fft_padding * 2 pad_amount = (desired_length - audio.size) // 2 - + if audio.size % 2 == 0: audio = np.pad(audio, (pad_amount, pad_amount), mode='reflect') else: audio = np.pad(audio, (pad_amount, pad_amount + 1), mode='reflect') - + # Normalize audio. audio = audio / np.abs(audio).max() * 0.999 - + # Compute mel-spectrogram. # Turn center to False to prevent internal padding. spectrogram = librosa.core.stft( - audio, hop_length=fft_window_shift, - win_length=fft_window_size, n_fft=fft_size, center=False) + audio, + hop_length=fft_window_shift, + win_length=fft_window_size, + n_fft=fft_size, + center=False) spectrogram_magnitude = np.abs(spectrogram) - + # Compute mel-spectrograms. - mel_filter_bank = librosa.filters.mel(sr=sr, n_fft=fft_size, + mel_filter_bank = librosa.filters.mel(sr=sr, + n_fft=fft_size, n_mels=config.mel_bands) mel_spectrogram = np.dot(mel_filter_bank, spectrogram_magnitude) mel_spectrogram = mel_spectrogram.T - + # Rescale mel_spectrogram. min_level, ref_level = 1e-5, 20 mel_spectrogram = 20 * np.log10(np.maximum(min_level, mel_spectrogram)) mel_spectrogram = mel_spectrogram - ref_level mel_spectrogram = np.clip((mel_spectrogram + 100) / 100, 0, 1) - + # Extract the center of audio that corresponds to mel spectrograms. - audio = audio[fft_padding : -fft_padding] + audio = audio[fft_padding:-fft_padding] assert mel_spectrogram.shape[0] * fft_window_shift == audio.size return audio, mel_spectrogram -class Subset(dataset.Dataset): +class Subset(dataset.Dataset): def __init__(self, dataset, indices, valid): self.dataset = dataset self.indices = indices @@ -100,23 +118,23 @@ class Subset(dataset.Dataset): audio_start = frame_start * fft_window_shift audio_end = frame_end * fft_window_shift - - audio = audio[audio_start : audio_end] + + audio = audio[audio_start:audio_end] return audio, mel, audio_start def _batch_examples(self, batch): audios = [sample[0] for sample in batch] audio_starts = [sample[2] for sample in batch] - + # mels shape [num_frames, mel_bands] - max_frames = max(sample[1].shape[0] for sample in batch) + max_frames = max(sample[1].shape[0] for sample in batch) mels = [utils.pad_to_size(sample[1], max_frames) for sample in batch] - + audios = np.array(audios, dtype=np.float32) mels = np.array(mels, dtype=np.float32) audio_starts = np.array(audio_starts, dtype=np.int32) - + return audios, mels, audio_starts def __len__(self): @@ -138,17 +156,17 @@ class LJSpeech: # Train dataset. 
trainset = Subset(ds, train_indices, valid=False) - sampler = DistributedSampler(len(trainset), nranks, rank) + sampler = DistributedSampler(len(trainset), nranks, rank) total_bs = config.batch_size assert total_bs % nranks == 0 - train_sampler = BatchSampler(sampler, total_bs // nranks, - drop_last=True) + train_sampler = BatchSampler( + sampler, total_bs // nranks, drop_last=True) trainloader = DataCargo(trainset, batch_sampler=train_sampler) trainreader = fluid.io.PyReader(capacity=50, return_list=True) trainreader.decorate_batch_generator(trainloader, place) self.trainloader = (data for _ in iter(int, 1) - for data in trainreader()) + for data in trainreader()) # Valid dataset. validset = Subset(ds, valid_indices, valid=True) @@ -156,5 +174,5 @@ class LJSpeech: validloader = DataCargo(validset, batch_size=1, shuffle=False) validreader = fluid.io.PyReader(capacity=20, return_list=True) - validreader.decorate_batch_generator(validloader, place) + validreader.decorate_batch_generator(validloader, place) self.validloader = validreader diff --git a/parakeet/models/wavenet/slurm.py b/parakeet/models/wavenet/slurm.py index 47af2dc..dfd22e4 100644 --- a/parakeet/models/wavenet/slurm.py +++ b/parakeet/models/wavenet/slurm.py @@ -1,3 +1,16 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. """ Utility module for restarting training when using SLURM. """ @@ -45,8 +58,8 @@ def parse_time(text): try: return parse_hours(hours) * 3600 + int(minutes) * 60 + int(seconds) except ValueError as e: - raise ValueError("Error parsing time {}. Got error {}.".format( - text, str(e))) + raise ValueError("Error parsing time {}. 
Got error {}.".format(text, + str(e))) def restart_command(): @@ -76,8 +89,10 @@ def restart_command(): gres, partition = info.get("Gres"), info.get("Partition") stderr, stdout = info.get("StdErr"), info.get("StdOut") job_name = info.get("JobName") - command = ["sbatch", "--job-name={}".format(job_name), - "--ntasks={}".format(num_tasks)] + command = [ + "sbatch", "--job-name={}".format(job_name), + "--ntasks={}".format(num_tasks) + ] if partition: command.extend(["--partition", partition]) @@ -98,12 +113,13 @@ def restart_command(): dist_setting = ['-m', 'paddle.distributed.launch'] wrap_cmd = ["srun", python, '-u'] + dist_setting + sys.argv - command.append( - "--wrap={}".format(" ".join(shlex.quote(arg) for arg in wrap_cmd))) + command.append("--wrap={}".format(" ".join( + shlex.quote(arg) for arg in wrap_cmd))) time_limit_string = info["TimeLimit"] if time_limit_string.lower() == "unlimited": - print("UNLIMITED detected: restart OFF, infinite learning ON.", - flush=True) + print( + "UNLIMITED detected: restart OFF, infinite learning ON.", + flush=True) return command, None time_limit = parse_time(time_limit_string) runtime = parse_time(info["RunTime"]) diff --git a/parakeet/models/wavenet/synthesis.py b/parakeet/models/wavenet/synthesis.py index d87a188..43d78de 100644 --- a/parakeet/models/wavenet/synthesis.py +++ b/parakeet/models/wavenet/synthesis.py @@ -1,3 +1,17 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
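A condensed, hedged sketch of the `--wrap` assembly in `restart_command` above (the job name and task count are placeholders; the quoting logic mirrors the patch):

```python
import shlex
import sys

# Re-submit the current script under srun inside a new sbatch job.
# shlex.quote protects every argument so the shell that interprets
# --wrap sees each one verbatim.
python = sys.executable
wrap_cmd = ["srun", python, "-u", "-m", "paddle.distributed.launch"] + sys.argv
command = [
    "sbatch",
    "--job-name={}".format("demo_job"),  # placeholder job name
    "--ntasks={}".format(8),             # placeholder task count
    "--wrap={}".format(" ".join(shlex.quote(arg) for arg in wrap_cmd)),
]
print(command[-1])
```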
+ import os import random from pprint import pprint @@ -12,25 +26,42 @@ from wavenet import WaveNet def add_options_to_parser(parser): - parser.add_argument('--model', type=str, default='wavenet', + parser.add_argument( + '--model', + type=str, + default='wavenet', help="general name of the model") - parser.add_argument('--name', type=str, - help="specific name of the training model") - parser.add_argument('--root', type=str, - help="root path of the LJSpeech dataset") + parser.add_argument( + '--name', type=str, help="specific name of the training model") + parser.add_argument( + '--root', type=str, help="root path of the LJSpeech dataset") - parser.add_argument('--use_gpu', type=bool, default=True, + parser.add_argument( + '--use_gpu', + type=bool, + default=True, help="option to use gpu training") - parser.add_argument('--iteration', type=int, default=None, + parser.add_argument( + '--iteration', + type=int, + default=None, help=("which iteration of checkpoint to load, " "default to load the latest checkpoint")) - parser.add_argument('--checkpoint', type=str, default=None, + parser.add_argument( + '--checkpoint', + type=str, + default=None, help="path of the checkpoint to load") - parser.add_argument('--output', type=str, default="./syn_audios", + parser.add_argument( + '--output', + type=str, + default="./syn_audios", help="path to write synthesized audio files") - parser.add_argument('--sample', type=int, + parser.add_argument( + '--sample', + type=int, help="which of the valid samples to synthesize audio") @@ -52,7 +83,7 @@ def synthesize(config): fluid.default_startup_program().random_seed = seed fluid.default_main_program().random_seed = seed print("Random Seed: ", seed) - + # Build model. model = WaveNet(config, checkpoint_dir) model.build(training=False) diff --git a/parakeet/models/wavenet/train.py b/parakeet/models/wavenet/train.py index 1a17bbd..7ebf58d 100644 --- a/parakeet/models/wavenet/train.py +++ b/parakeet/models/wavenet/train.py @@ -1,3 +1,17 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
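One caveat about the boolean options above, unchanged by this patch: when such flags are parsed with the standard library's `argparse`, `type=bool` converts any non-empty string (including `"False"`) to `True`, so `--use_gpu=False` would still enable the GPU. A common defensive pattern (the converter is our sketch, not code from the repository):

```python
import argparse


def str2bool(v):
    # argparse's type=bool treats any non-empty string (even "False")
    # as True; an explicit converter avoids that foot-gun.
    if isinstance(v, bool):
        return v
    if v.lower() in ("yes", "true", "t", "1"):
        return True
    if v.lower() in ("no", "false", "f", "0"):
        return False
    raise argparse.ArgumentTypeError("boolean value expected, got %r" % v)


parser = argparse.ArgumentParser()
parser.add_argument("--use_gpu", type=str2bool, default=True)
print(parser.parse_args(["--use_gpu=False"]).use_gpu)  # False
```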
+ import os import random import subprocess @@ -18,24 +32,42 @@ MAXIMUM_SAVE_TIME = 10 * 60 def add_options_to_parser(parser): - parser.add_argument('--model', type=str, default='wavenet', + parser.add_argument( + '--model', + type=str, + default='wavenet', help="general name of the model") - parser.add_argument('--name', type=str, - help="specific name of the training model") - parser.add_argument('--root', type=str, - help="root path of the LJSpeech dataset") + parser.add_argument( + '--name', type=str, help="specific name of the training model") + parser.add_argument( + '--root', type=str, help="root path of the LJSpeech dataset") - parser.add_argument('--parallel', type=bool, default=True, + parser.add_argument( + '--parallel', + type=bool, + default=True, help="option to use data parallel training") - parser.add_argument('--use_gpu', type=bool, default=True, + parser.add_argument( + '--use_gpu', + type=bool, + default=True, help="option to use gpu training") - parser.add_argument('--iteration', type=int, default=None, + parser.add_argument( + '--iteration', + type=int, + default=None, help=("which iteration of checkpoint to load, " "default to load the latest checkpoint")) - parser.add_argument('--checkpoint', type=str, default=None, + parser.add_argument( + '--checkpoint', + type=str, + default=None, help="path of the checkpoint to load") - parser.add_argument('--slurm', type=bool, default=False, + parser.add_argument( + '--slurm', + type=bool, + default=False, help="whether you are using slurm to submit training jobs") @@ -104,8 +136,8 @@ def train(config): # Check whether reaching the time limit. if config.slurm: - done = (death_time is not None and death_time - time.time() < - MAXIMUM_SAVE_TIME) + done = (death_time is not None and + death_time - time.time() < MAXIMUM_SAVE_TIME) if rank == 0 and done: print("Saving progress before exiting.") @@ -127,8 +159,8 @@ def train(config): if __name__ == "__main__": # Create parser. - parser = jsonargparse.ArgumentParser(description="Train WaveNet model", - formatter_class='default_argparse') + parser = jsonargparse.ArgumentParser( + description="Train WaveNet model", formatter_class='default_argparse') add_options_to_parser(parser) utils.add_config_options_to_parser(parser) @@ -136,4 +168,4 @@ if __name__ == "__main__": # For conflicting updates to the same field, # the preceding update will be overwritten by the following one. config = parser.parse_args() - train(config) + train(config) diff --git a/parakeet/models/wavenet/utils.py b/parakeet/models/wavenet/utils.py index c2b6601..bb21b93 100644 --- a/parakeet/models/wavenet/utils.py +++ b/parakeet/models/wavenet/utils.py @@ -1,3 +1,17 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
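The SLURM time-limit check in the training loop above reduces to a small predicate; a self-contained sketch (`death_time` is the wall-clock time at which SLURM will kill the job, `None` when the limit is UNLIMITED):

```python
import time

MAXIMUM_SAVE_TIME = 10 * 60  # seconds reserved for writing a checkpoint


def should_stop(death_time):
    # Stop training when less than MAXIMUM_SAVE_TIME seconds remain
    # before SLURM kills the job, so a final checkpoint can be saved.
    return (death_time is not None and
            death_time - time.time() < MAXIMUM_SAVE_TIME)


print(should_stop(time.time() + 5 * 60))  # True: 5 minutes left, save now
print(should_stop(time.time() + 3600))    # False: plenty of time left
print(should_stop(None))                  # False: no time limit (UNLIMITED)
```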
+ import itertools import os import time @@ -8,57 +22,82 @@ import paddle.fluid.dygraph as dg def add_config_options_to_parser(parser): - parser.add_argument('--valid_size', type=int, - help="size of the valid dataset") - parser.add_argument('--train_clip_second', type=float, + parser.add_argument( + '--valid_size', type=int, help="size of the valid dataset") + parser.add_argument( + '--train_clip_second', + type=float, help="the length of audio clip for training") - parser.add_argument('--sample_rate', type=int, - help="sampling rate of audio data file") - parser.add_argument('--fft_window_shift', type=int, + parser.add_argument( + '--sample_rate', type=int, help="sampling rate of audio data file") + parser.add_argument( + '--fft_window_shift', + type=int, help="the shift of fft window for each frame") - parser.add_argument('--fft_window_size', type=int, + parser.add_argument( + '--fft_window_size', + type=int, help="the size of fft window for each frame") - parser.add_argument('--fft_size', type=int, - help="the size of fft filter on each frame") - parser.add_argument('--mel_bands', type=int, + parser.add_argument( + '--fft_size', type=int, help="the size of fft filter on each frame") + parser.add_argument( + '--mel_bands', + type=int, help="the number of mel bands when calculating mel spectrograms") - parser.add_argument('--seed', type=int, - help="seed of random initialization for the model") - parser.add_argument('--batch_size', type=int, - help="batch size for training") - parser.add_argument('--test_every', type=int, - help="test interval during training") - parser.add_argument('--save_every', type=int, + parser.add_argument( + '--seed', type=int, help="seed of random initialization for the model") + parser.add_argument( + '--batch_size', type=int, help="batch size for training") + parser.add_argument( + '--test_every', type=int, help="test interval during training") + parser.add_argument( + '--save_every', + type=int, help="checkpointing interval during training") - parser.add_argument('--max_iterations', type=int, - help="maximum training iterations") + parser.add_argument( + '--max_iterations', type=int, help="maximum training iterations") - parser.add_argument('--layers', type=int, - help="number of dilated convolution layers") - parser.add_argument('--kernel_width', type=int, - help="dilated convolution kernel width") - parser.add_argument('--dilation_block', type=list, - help="dilated convolution kernel width") + parser.add_argument( + '--layers', type=int, help="number of dilated convolution layers") + parser.add_argument( + '--kernel_width', type=int, help="dilated convolution kernel width") + parser.add_argument( + '--dilation_block', type=list, help="dilated convolution kernel width") parser.add_argument('--residual_channels', type=int) parser.add_argument('--skip_channels', type=int) - parser.add_argument('--loss_type', type=str, - help="mix-gaussian-pdf or softmax") - parser.add_argument('--num_channels', type=int, default=None, + parser.add_argument( + '--loss_type', type=str, help="mix-gaussian-pdf or softmax") + parser.add_argument( + '--num_channels', + type=int, + default=None, help="number of channels for softmax output") - parser.add_argument('--num_mixtures', type=int, default=None, + parser.add_argument( + '--num_mixtures', + type=int, + default=None, help="number of gaussian mixtures for gaussian output") - parser.add_argument('--log_scale_min', type=float, default=None, + parser.add_argument( + '--log_scale_min', + type=float, + default=None, help="minimum 
clip value of log variance of gaussian output") - parser.add_argument('--conditioner.filter_sizes', type=list, + parser.add_argument( + '--conditioner.filter_sizes', + type=list, help="conv2d tranpose op filter sizes for building conditioner") - parser.add_argument('--conditioner.upsample_factors', type=list, + parser.add_argument( + '--conditioner.upsample_factors', + type=list, help="list of upsample factors for building conditioner") parser.add_argument('--learning_rate', type=float) parser.add_argument('--gradient_max_norm', type=float) - parser.add_argument('--anneal.every', type=int, + parser.add_argument( + '--anneal.every', + type=int, help="step interval for annealing learning rate") parser.add_argument('--anneal.rate', type=float) @@ -113,8 +152,12 @@ def save_latest_checkpoint(checkpoint_dir, iteration): handle.write("model_checkpoint_path: step-{}".format(iteration)) -def load_parameters(checkpoint_dir, rank, model, optimizer=None, - iteration=None, file_path=None): +def load_parameters(checkpoint_dir, + rank, + model, + optimizer=None, + iteration=None, + file_path=None): if file_path is None: if iteration is None: iteration = load_latest_checkpoint(checkpoint_dir, rank) @@ -128,7 +171,7 @@ def load_parameters(checkpoint_dir, rank, model, optimizer=None, if optimizer and optimizer_dict: optimizer.set_dict(optimizer_dict) print("[checkpoint] Rank {}: loaded optimizer state from {}".format( - rank, file_path)) + rank, file_path)) def save_latest_parameters(checkpoint_dir, iteration, model, optimizer=None): diff --git a/parakeet/models/wavenet/wavenet.py b/parakeet/models/wavenet/wavenet.py index c636c4b..db7a06e 100644 --- a/parakeet/models/wavenet/wavenet.py +++ b/parakeet/models/wavenet/wavenet.py @@ -1,3 +1,17 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import itertools import os import time @@ -13,8 +27,13 @@ from wavenet_modules import WaveNetModule class WaveNet(): - def __init__(self, config, checkpoint_dir, parallel=False, rank=0, - nranks=1, tb_logger=None): + def __init__(self, + config, + checkpoint_dir, + parallel=False, + rank=0, + nranks=1, + tb_logger=None): # Process config to calculate the context size dilations = list( itertools.islice( @@ -29,12 +48,12 @@ class WaveNet(): def build(self, training=True): config = self.config - dataset = LJSpeech(config, self.nranks, self.rank) + dataset = LJSpeech(config, self.nranks, self.rank) self.trainloader = dataset.trainloader self.validloader = dataset.validloader wavenet = WaveNetModule("wavenet", config, self.rank) - + # Dry run once to create and initalize all necessary parameters. audio = dg.to_variable(np.random.randn(1, 20000).astype(np.float32)) mel = dg.to_variable( @@ -45,38 +64,44 @@ class WaveNet(): if training: # Create Learning rate scheduler. 
lr_scheduler = dg.ExponentialDecay( - learning_rate = config.learning_rate, - decay_steps = config.anneal.every, - decay_rate = config.anneal.rate, + learning_rate=config.learning_rate, + decay_steps=config.anneal.every, + decay_rate=config.anneal.rate, staircase=True) - + optimizer = fluid.optimizer.AdamOptimizer( learning_rate=lr_scheduler) - + clipper = fluid.dygraph_grad_clip.GradClipByGlobalNorm( config.gradient_max_norm) # Load parameters. - utils.load_parameters(self.checkpoint_dir, self.rank, - wavenet, optimizer, - iteration=config.iteration, - file_path=config.checkpoint) + utils.load_parameters( + self.checkpoint_dir, + self.rank, + wavenet, + optimizer, + iteration=config.iteration, + file_path=config.checkpoint) print("Rank {}: checkpoint loaded.".format(self.rank)) - + # Data parallelism. if self.parallel: strategy = dg.parallel.prepare_context() wavenet = dg.parallel.DataParallel(wavenet, strategy) - + self.wavenet = wavenet self.optimizer = optimizer self.clipper = clipper else: # Load parameters. - utils.load_parameters(self.checkpoint_dir, self.rank, wavenet, - iteration=config.iteration, - file_path=config.checkpoint) + utils.load_parameters( + self.checkpoint_dir, + self.rank, + wavenet, + iteration=config.iteration, + file_path=config.checkpoint) print("Rank {}: checkpoint loaded.".format(self.rank)) self.wavenet = wavenet @@ -104,7 +129,9 @@ class WaveNet(): else: current_lr = self.optimizer._learning_rate - self.optimizer.minimize(loss, grad_clip=self.clipper, + self.optimizer.minimize( + loss, + grad_clip=self.clipper, parameter_list=self.wavenet.parameters()) self.wavenet.clear_gradients() @@ -143,10 +170,16 @@ class WaveNet(): tb = self.tb_logger tb.add_scalar("Valid-Avg-Loss", loss_val, iteration) - tb.add_audio("Teacher-Forced-Audio-0", sample_audios[0].numpy(), - iteration, sample_rate=self.config.sample_rate) - tb.add_audio("Teacher-Forced-Audio-1", sample_audios[1].numpy(), - iteration, sample_rate=self.config.sample_rate) + tb.add_audio( + "Teacher-Forced-Audio-0", + sample_audios[0].numpy(), + iteration, + sample_rate=self.config.sample_rate) + tb.add_audio( + "Teacher-Forced-Audio-1", + sample_audios[1].numpy(), + iteration, + sample_rate=self.config.sample_rate) @dg.no_grad def infer(self, iteration): @@ -165,10 +198,9 @@ class WaveNet(): start_time = time.time() syn_audio = self.wavenet.synthesize(mels_list[sample]) syn_time = time.time() - start_time - print("audio shape {}, synthesis time {}".format( - syn_audio.shape, syn_time)) - librosa.output.write_wav(filename, syn_audio, - sr=config.sample_rate) + print("audio shape {}, synthesis time {}".format(syn_audio.shape, + syn_time)) + librosa.output.write_wav(filename, syn_audio, sr=config.sample_rate) def save(self, iteration): utils.save_latest_parameters(self.checkpoint_dir, iteration, diff --git a/parakeet/models/wavenet/wavenet_modules.py b/parakeet/models/wavenet/wavenet_modules.py index fbab741..2c62643 100644 --- a/parakeet/models/wavenet/wavenet_modules.py +++ b/parakeet/models/wavenet/wavenet_modules.py @@ -1,3 +1,17 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import itertools import numpy as np @@ -16,11 +30,11 @@ def get_padding(filter_size, stride, padding_type='same'): def extract_slices(x, audio_starts, audio_length, rank): slices = [] - for i in range(x.shape[0]): + for i in range(x.shape[0]): start = audio_starts.numpy()[i] end = start + audio_length slice = fluid.layers.slice( - x, axes=[0, 1], starts=[i, start], ends=[i+1, end]) + x, axes=[0, 1], starts=[i, start], ends=[i + 1, end]) slices.append(fluid.layers.squeeze(slice, [0])) x = fluid.layers.stack(slices, axis=0) @@ -50,7 +64,7 @@ class Conditioner(dg.Layer): # Register python list as parameters. for i, layer in enumerate(self.deconvs): self.add_sublayer("conv_transpose_{}".format(i), layer) - + def forward(self, x): x = fluid.layers.unsqueeze(x, 1) for layer in self.deconvs: @@ -62,7 +76,7 @@ class Conditioner(dg.Layer): class WaveNetModule(dg.Layer): def __init__(self, name_scope, config, rank): super(WaveNetModule, self).__init__(name_scope) - + self.rank = rank self.conditioner = Conditioner(self.full_name(), config) self.dilations = list( @@ -82,15 +96,13 @@ class WaveNetModule(dg.Layer): embed_dim=config.residual_channels, std=0.1) elif config.loss_type == "mix-gaussian-pdf": - self.embedding_fc = modules.FC( - self.full_name(), - in_features=1, - size=config.residual_channels, - num_flatten_dims=2, - relu=False) + self.embedding_fc = modules.FC(self.full_name(), + in_features=1, + size=config.residual_channels, + num_flatten_dims=2, + relu=False) else: - raise ValueError( - "loss_type {} is unsupported!".format(loss_type)) + raise ValueError("loss_type {} is unsupported!".format(loss_type)) self.dilated_causal_convs = [] for dilation in self.dilations: @@ -102,56 +114,49 @@ class WaveNetModule(dg.Layer): num_filters=config.residual_channels, filter_size=config.kernel_width, dilation=dilation, - causal=True - ) - ) + causal=True)) for i, layer in enumerate(self.dilated_causal_convs): - self.add_sublayer("dilated_causal_conv_{}".format(i), layer) + self.add_sublayer("dilated_causal_conv_{}".format(i), layer) - self.fc1 = modules.FC( - self.full_name(), - in_features=config.residual_channels, - size=config.skip_channels, - num_flatten_dims=2, - relu=True, - act="relu") + self.fc1 = modules.FC(self.full_name(), + in_features=config.residual_channels, + size=config.skip_channels, + num_flatten_dims=2, + relu=True, + act="relu") - self.fc2 = modules.FC( - self.full_name(), - in_features=config.skip_channels, - size=config.skip_channels, - num_flatten_dims=2, - relu=True, - act="relu") + self.fc2 = modules.FC(self.full_name(), + in_features=config.skip_channels, + size=config.skip_channels, + num_flatten_dims=2, + relu=True, + act="relu") if config.loss_type == "softmax": - self.fc3 = modules.FC( - self.full_name(), - in_features=config.skip_channels, - size=config.num_channels, - num_flatten_dims=2, - relu=False) + self.fc3 = modules.FC(self.full_name(), + in_features=config.skip_channels, + size=config.num_channels, + num_flatten_dims=2, + relu=False) elif config.loss_type == "mix-gaussian-pdf": - self.fc3 = modules.FC( - self.full_name(), - 
@@ -102,56 +114,49 @@ class WaveNetModule(dg.Layer):
                     num_filters=config.residual_channels,
                     filter_size=config.kernel_width,
                     dilation=dilation,
-                    causal=True
-                )
-            )
+                    causal=True))
 
         for i, layer in enumerate(self.dilated_causal_convs):
-            self.add_sublayer("dilated_causal_conv_{}".format(i), layer) 
+            self.add_sublayer("dilated_causal_conv_{}".format(i), layer)
 
-        self.fc1 = modules.FC(
-            self.full_name(),
-            in_features=config.residual_channels,
-            size=config.skip_channels,
-            num_flatten_dims=2,
-            relu=True,
-            act="relu")
+        self.fc1 = modules.FC(self.full_name(),
+                              in_features=config.residual_channels,
+                              size=config.skip_channels,
+                              num_flatten_dims=2,
+                              relu=True,
+                              act="relu")
 
-        self.fc2 = modules.FC(
-            self.full_name(),
-            in_features=config.skip_channels,
-            size=config.skip_channels,
-            num_flatten_dims=2,
-            relu=True,
-            act="relu")
+        self.fc2 = modules.FC(self.full_name(),
+                              in_features=config.skip_channels,
+                              size=config.skip_channels,
+                              num_flatten_dims=2,
+                              relu=True,
+                              act="relu")
 
         if config.loss_type == "softmax":
-            self.fc3 = modules.FC(
-                self.full_name(),
-                in_features=config.skip_channels,
-                size=config.num_channels,
-                num_flatten_dims=2,
-                relu=False)
+            self.fc3 = modules.FC(self.full_name(),
+                                  in_features=config.skip_channels,
+                                  size=config.num_channels,
+                                  num_flatten_dims=2,
+                                  relu=False)
         elif config.loss_type == "mix-gaussian-pdf":
-            self.fc3 = modules.FC(
-                self.full_name(),
-                in_features=config.skip_channels,
-                size=3 * config.num_mixtures,
-                num_flatten_dims=2,
-                relu=False)
+            self.fc3 = modules.FC(self.full_name(),
+                                  in_features=config.skip_channels,
+                                  size=3 * config.num_mixtures,
+                                  num_flatten_dims=2,
+                                  relu=False)
         else:
-            raise ValueError(
-                "loss_type {} is unsupported!".format(loss_type))
+            raise ValueError(
+                "loss_type {} is unsupported!".format(config.loss_type))
 
     def sample_softmax(self, mix_parameters):
         batch, length, hidden = mix_parameters.shape
         mix_param_2d = fluid.layers.reshape(mix_parameters,
-                                           [batch * length, hidden])
+                                            [batch * length, hidden])
         mix_param_2d = fluid.layers.softmax(mix_param_2d, axis=-1)
 
         # quantized: [batch * length]
-        quantized = fluid.layers.cast(fluid.layers.sampling_id(mix_param_2d),
-                                      dtype="float32")
+        quantized = fluid.layers.cast(
+            fluid.layers.sampling_id(mix_param_2d), dtype="float32")
         samples = (quantized + 0.5) * (2.0 / self.config.num_channels) - 1.0
 
         # samples: [batch * length]
@@ -162,23 +167,23 @@ class WaveNetModule(dg.Layer):
         # to [bs * len, 3 * num_mixtures].
         batch, length, hidden = mix_parameters.shape
         mix_param_2d = fluid.layers.reshape(mix_parameters,
-                                           [batch * length, hidden])
+                                            [batch * length, hidden])
         K = hidden // 3
 
         # Unpack the parameters of the mixture of gaussian.
-        logits_pi = mix_param_2d[:, 0 : K]
-        mu = mix_param_2d[:, K : 2*K]
-        log_s = mix_param_2d[:, 2*K : 3*K]
+        logits_pi = mix_param_2d[:, 0:K]
+        mu = mix_param_2d[:, K:2 * K]
+        log_s = mix_param_2d[:, 2 * K:3 * K]
 
         s = fluid.layers.exp(log_s)
         pi = fluid.layers.softmax(logits_pi, axis=-1)
         comp_samples = fluid.layers.sampling_id(pi)
-
+
         row_idx = dg.to_variable(np.arange(batch * length))
         comp_samples = fluid.layers.stack([row_idx, comp_samples], axis=-1)
 
         mu_comp = fluid.layers.gather_nd(mu, comp_samples)
-        s_comp = fluid.layers.gather_nd(s, comp_samples) 
+        s_comp = fluid.layers.gather_nd(s, comp_samples)
 
         # N(0, 1) normal sample.
         u = fluid.layers.gaussian_random(shape=[batch * length])
@@ -220,8 +225,9 @@ class WaveNetModule(dg.Layer):
 
         # Calculate gaussian loss.
         targets = fluid.layers.unsqueeze(targets, -1)
-        targets = fluid.layers.expand(targets, [1, 1, self.config.num_mixtures])
-        x_std = inv_s * (targets - mu)
+        targets = fluid.layers.expand(targets,
+                                      [1, 1, self.config.num_mixtures])
+        x_std = inv_s * (targets - mu)
         exponent = fluid.layers.exp(-0.5 * x_std * x_std)
         pdf_x = 1.0 / np.sqrt(2.0 * np.pi) * inv_s * exponent
         pdf_x = pi * pdf_x
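
Note: `sample_mix_gaussian` above draws one Gaussian component per time step from softmaxed logits, then samples `mu + s * u` with `u ~ N(0, 1)`. A numpy sketch of the same math, independent of the fluid ops (the function name is hypothetical):

```python
import numpy as np

def sample_mix_gaussian_np(mix_params, rng=np.random.default_rng(0)):
    # mix_params: [n, 3K] rows of (logits_pi | mu | log_s), as in the module.
    n, three_k = mix_params.shape
    K = three_k // 3
    logits_pi, mu, log_s = np.split(mix_params, 3, axis=1)
    pi = np.exp(logits_pi - logits_pi.max(axis=1, keepdims=True))
    pi /= pi.sum(axis=1, keepdims=True)                # softmax over components
    comp = np.array([rng.choice(K, p=p) for p in pi])  # one component per row
    u = rng.standard_normal(n)                         # N(0, 1) noise
    rows = np.arange(n)
    return mu[rows, comp] + np.exp(log_s[rows, comp]) * u

# With zero parameters, pi is uniform, mu = 0, s = 1: plain normal draws.
print(sample_mix_gaussian_np(np.zeros((4, 6))))  # K = 2
```
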
@@ -239,9 +245,9 @@ class WaveNetModule(dg.Layer):
 
         # Slice conditioners.
         audio_length = audios.shape[1]
-        conditioner = extract_slices(full_conditioner,
-                                     audio_starts, audio_length, self.rank)
-
+        conditioner = extract_slices(full_conditioner, audio_starts,
+                                     audio_length, self.rank)
+
         # input_audio, target_audio: [bs, len]
         input_audios = audios[:, :-1]
         target_audios = audios[:, 1:]
@@ -263,15 +269,16 @@ class WaveNetModule(dg.Layer):
             layer_input = self.embedding_fc(
                 fluid.layers.unsqueeze(input_audios, 2))
         else:
-            raise ValueError(
-                "loss_type {} is unsupported!".format(loss_type))
+            raise ValueError("loss_type {} is unsupported!".format(loss_type))
 
         # layer_input: [bs, res_channel, 1, len]
         layer_input = fluid.layers.unsqueeze(
-            fluid.layers.transpose(layer_input, perm=[0, 2, 1]), 2)
+            fluid.layers.transpose(
+                layer_input, perm=[0, 2, 1]), 2)
 
         # conditioner: [bs, mel_bands, 1, len]
         conditioner = fluid.layers.unsqueeze(
-            fluid.layers.transpose(conditioner, perm=[0, 2, 1]), 2)
+            fluid.layers.transpose(
+                conditioner, perm=[0, 2, 1]), 2)
 
         skip = None
         for i, layer in enumerate(self.dilated_causal_convs):
@@ -292,23 +299,22 @@ class WaveNetModule(dg.Layer):
             elif loss_type == "mix-gaussian-pdf":
                 sample_audios = self.sample_mix_gaussian(mix_parameters)
             else:
-                raise ValueError(
-                    "loss_type {} is unsupported!".format(loss_type))
+                raise ValueError("loss_type {} is unsupported!".format(
+                    loss_type))
 
         if loss_type == "softmax":
             loss = self.softmax_loss(target_audios, mix_parameters)
         elif loss_type == "mix-gaussian-pdf":
-            loss = self.mixture_density_loss(target_audios,
-                                             mix_parameters, self.log_scale_min)
+            loss = self.mixture_density_loss(target_audios, mix_parameters,
+                                             self.log_scale_min)
         else:
-            raise ValueError(
-                "loss_type {} is unsupported!".format(loss_type))
+            raise ValueError("loss_type {} is unsupported!".format(loss_type))
 
         return loss, sample_audios
 
     def synthesize(self, mels):
         self.start_new_sequence()
-        bs, n_frames, mel_bands = mels.shape 
+        bs, n_frames, mel_bands = mels.shape
         conditioner = self.conditioner(mels)
         time_steps = conditioner.shape[1]
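
Note: the forward pass above conditions on `audios[:, :-1]` to predict `audios[:, 1:]`, standard teacher forcing for an autoregressive model. In numpy terms:

```python
import numpy as np

audios = np.arange(8, dtype=np.float32).reshape(1, 8)  # [batch, time]
input_audios = audios[:, :-1]   # the model sees samples 0 .. T-2
target_audios = audios[:, 1:]   # and is trained to predict samples 1 .. T-1
assert input_audios.shape == target_audios.shape
print(input_audios[0], target_audios[0])
```
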
@@ -335,23 +341,24 @@ class WaveNetModule(dg.Layer):
             elif loss_type == "mix-gaussian-pdf":
                 audio_input = self.embedding_fc(current_sample)
             else:
-                raise ValueError(
-                    "loss_type {} is unsupported!".format(loss_type))
+                raise ValueError("loss_type {} is unsupported!".format(
+                    loss_type))
 
             # [bs, channel, 1, 1]
             audio_input = fluid.layers.unsqueeze(
-                fluid.layers.transpose(audio_input, perm=[0, 2, 1]), 2)
+                fluid.layers.transpose(
+                    audio_input, perm=[0, 2, 1]), 2)
 
             # [bs, mel_bands]
             cond_input = conditioner[:, i, :]
             # [bs, mel_bands, 1, 1]
-            cond_input = fluid.layers.reshape(
-                cond_input, cond_input.shape + [1, 1])
+            cond_input = fluid.layers.reshape(cond_input,
+                                              cond_input.shape + [1, 1])
 
             skip = None
             for layer in self.dilated_causal_convs:
-                audio_input, skip = layer.add_input(
-                    audio_input, skip, cond_input)
-
+                audio_input, skip = layer.add_input(audio_input, skip,
+                                                    cond_input)
+
             # [bs, 1, channel]
             skip = fluid.layers.transpose(
                 fluid.layers.squeeze(skip, [2]), perm=[0, 2, 1])
@@ -361,19 +368,19 @@ class WaveNetModule(dg.Layer):
             elif loss_type == "mix-gaussian-pdf":
                 sample = self.sample_mix_gaussian(mix_parameters)
             else:
-                raise ValueError(
-                    "loss_type {} is unsupported!".format(loss_type))
+                raise ValueError("loss_type {} is unsupported!".format(
+                    loss_type))
             audio_samples.append(sample)
 
             # [bs]
             current_sample = audio_samples[-1]
             # [bs, 1, 1]
-            current_sample = fluid.layers.reshape(current_sample,
-                                                  current_sample.shape + [1, 1])
+            current_sample = fluid.layers.reshape(
+                current_sample, current_sample.shape + [1, 1])
 
         # syn_audio: [num_samples]
         syn_audio = fluid.layers.concat(audio_samples, axis=0).numpy()
-        return syn_audio 
+        return syn_audio
 
     def start_new_sequence(self):
         for layer in self.sublayers():
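
Note: `synthesize` generates one sample at a time: each step embeds the previous sample, runs the causal stack incrementally via `add_input`, samples the next value from the predicted distribution, and feeds it back. A schematic numpy loop, where `predict_fn` is a stand-in for the network step (not a Parakeet API):

```python
import numpy as np

def synthesize_np(predict_fn, conditioner, rng=np.random.default_rng(0)):
    # conditioner: [time_steps, cond_dim]; one output sample per step.
    current = 0.0
    samples = []
    for cond in conditioner:                # step through upsampled frames
        params = predict_fn(current, cond)  # (prev sample, cond) -> dist params
        current = rng.normal(params["mu"], params["s"])  # draw next sample
        samples.append(current)
    return np.array(samples)

toy = synthesize_np(lambda prev, cond: {"mu": 0.5 * prev + cond[0], "s": 0.01},
                    np.zeros((5, 1)))
print(toy.shape)  # (5,)
```
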
""" + def __init__(self, num_channels, num_filters, @@ -72,17 +92,18 @@ class Conv1D(dg.Conv2D): use_cudnn=True, act=None, dtype='float32'): - super(Conv1D, self).__init__(num_channels, - num_filters, (1, filter_size), - stride=(1, stride), - padding=(0, padding), - dilation=(1, dilation), - groups=groups, - param_attr=param_attr, - bias_attr=bias_attr, - use_cudnn=use_cudnn, - act=act, - dtype=dtype) + super(Conv1D, self).__init__( + num_channels, + num_filters, (1, filter_size), + stride=(1, stride), + padding=(0, padding), + dilation=(1, dilation), + groups=groups, + param_attr=param_attr, + bias_attr=bias_attr, + use_cudnn=use_cudnn, + act=act, + dtype=dtype) def forward(self, x): x = F.unsqueeze(x, [2]) @@ -105,18 +126,19 @@ class Conv1DTranspose(dg.Conv2DTranspose): use_cudnn=True, act=None, dtype='float32'): - super(Conv1DTranspose, self).__init__(num_channels, - num_filters, (1, filter_size), - output_size=None, - padding=(0, padding), - stride=(1, stride), - dilation=(1, dilation), - groups=groups, - param_attr=param_attr, - bias_attr=bias_attr, - use_cudnn=use_cudnn, - act=act, - dtype=dtype) + super(Conv1DTranspose, self).__init__( + num_channels, + num_filters, (1, filter_size), + output_size=None, + padding=(0, padding), + stride=(1, stride), + dilation=(1, dilation), + groups=groups, + param_attr=param_attr, + bias_attr=bias_attr, + use_cudnn=use_cudnn, + act=act, + dtype=dtype) def forward(self, x): x = F.unsqueeze(x, [2]) @@ -134,6 +156,7 @@ class Conv1DCell(Conv1D): It is a cell that it acts like an RNN cell. It does not support stride > 1, and it ensures 1-to-1 mapping from input time steps to output timesteps. """ + def __init__(self, num_channels, num_filters, @@ -150,18 +173,19 @@ class Conv1DCell(Conv1D): padding = receptive_field - 1 if causal else receptive_field // 2 self._receptive_field = receptive_field self.causal = causal - super(Conv1DCell, self).__init__(num_channels, - num_filters, - filter_size, - stride=1, - padding=padding, - dilation=dilation, - groups=groups, - param_attr=param_attr, - bias_attr=bias_attr, - use_cudnn=use_cudnn, - act=act, - dtype=dtype) + super(Conv1DCell, self).__init__( + num_channels, + num_filters, + filter_size, + stride=1, + padding=padding, + dilation=dilation, + groups=groups, + param_attr=param_attr, + bias_attr=bias_attr, + use_cudnn=use_cudnn, + act=act, + dtype=dtype) def forward(self, x): # it ensures that ouput time steps == input time steps @@ -189,15 +213,16 @@ class Conv1DCell(Conv1D): def add_input(self, x_t): batch_size, c_in, _ = x_t.shape if self._buffer is None: - self._buffer = F.zeros((batch_size, c_in, self.receptive_field), - dtype=x_t.dtype) + self._buffer = F.zeros( + (batch_size, c_in, self.receptive_field), dtype=x_t.dtype) self._buffer = F.concat([self._buffer[:, :, 1:], x_t], -1) if self._dilation[1] > 1: - input = F.strided_slice(self._buffer, - axes=[2], - starts=[0], - ends=[self.receptive_field], - strides=[self._dilation[1]]) + input = F.strided_slice( + self._buffer, + axes=[2], + starts=[0], + ends=[self.receptive_field], + strides=[self._dilation[1]]) else: input = self._buffer input = F.reshape(input, (batch_size, -1)) diff --git a/parakeet/modules/dynamic_gru.py b/parakeet/modules/dynamic_gru.py index e84c598..3a6602e 100644 --- a/parakeet/modules/dynamic_gru.py +++ b/parakeet/modules/dynamic_gru.py @@ -1,6 +1,20 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 
diff --git a/parakeet/modules/dynamic_gru.py b/parakeet/modules/dynamic_gru.py
index e84c598..3a6602e 100644
--- a/parakeet/modules/dynamic_gru.py
+++ b/parakeet/modules/dynamic_gru.py
@@ -1,6 +1,20 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
 import paddle.fluid.dygraph as dg
 import paddle.fluid.layers as layers
 
+
 class DynamicGRU(dg.Layer):
     def __init__(self,
                  size,
@@ -49,4 +63,3 @@ class DynamicGRU(dg.Layer):
             res = res[::-1]
         res = layers.concat(res, axis=1)
         return res
-
diff --git a/parakeet/modules/ffn.py b/parakeet/modules/ffn.py
index dc413bf..3fa8c16 100644
--- a/parakeet/modules/ffn.py
+++ b/parakeet/modules/ffn.py
@@ -1,3 +1,16 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
 import paddle.fluid.dygraph as dg
 import paddle.fluid.layers as layers
 import paddle.fluid as fluid
@@ -7,28 +20,41 @@ from parakeet.modules.customized import Conv1D
 
 class PositionwiseFeedForward(dg.Layer):
     ''' A two-feed-forward-layer module '''
-    def __init__(self, d_in, num_hidden, filter_size, padding=0, use_cudnn=True, dropout=0.1):
+
+    def __init__(self,
+                 d_in,
+                 num_hidden,
+                 filter_size,
+                 padding=0,
+                 use_cudnn=True,
+                 dropout=0.1):
         super(PositionwiseFeedForward, self).__init__()
         self.num_hidden = num_hidden
         self.use_cudnn = use_cudnn
         self.dropout = dropout
 
         k = math.sqrt(1 / d_in)
-        self.w_1 = Conv1D(num_channels = d_in,
-                          num_filters = num_hidden,
-                          filter_size = filter_size,
-                          padding=padding,
-                          param_attr = fluid.ParamAttr(initializer=fluid.initializer.XavierInitializer()),
-                          bias_attr = fluid.ParamAttr(initializer=fluid.initializer.Uniform(low=-k, high=k)),
-                          use_cudnn = use_cudnn)
+        self.w_1 = Conv1D(
+            num_channels=d_in,
+            num_filters=num_hidden,
+            filter_size=filter_size,
+            padding=padding,
+            param_attr=fluid.ParamAttr(
+                initializer=fluid.initializer.XavierInitializer()),
+            bias_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform(
+                low=-k, high=k)),
+            use_cudnn=use_cudnn)
         k = math.sqrt(1 / num_hidden)
-        self.w_2 = Conv1D(num_channels = num_hidden,
-                          num_filters = d_in,
-                          filter_size = filter_size,
-                          padding=padding,
-                          param_attr = fluid.ParamAttr(initializer=fluid.initializer.XavierInitializer()),
-                          bias_attr = fluid.ParamAttr(initializer=fluid.initializer.Uniform(low=-k, high=k)),
-                          use_cudnn = use_cudnn)
+        self.w_2 = Conv1D(
+            num_channels=num_hidden,
+            num_filters=d_in,
+            filter_size=filter_size,
+            padding=padding,
+            param_attr=fluid.ParamAttr(
+                initializer=fluid.initializer.XavierInitializer()),
+            bias_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform(
+                low=-k, high=k)),
+            use_cudnn=use_cudnn)
         self.layer_norm = dg.LayerNorm(d_in)
 
     def forward(self, input):
@@ -40,18 +66,18 @@ class PositionwiseFeedForward(dg.Layer):
         Returns:
             output (Variable), Shape(B, T, C), the result after FFN.
         """
-        x = layers.transpose(input, [0,2,1])
+        x = layers.transpose(input, [0, 2, 1])
 
         # FFN network
         x = self.w_2(layers.relu(self.w_1(x)))
-
+
         # dropout
         x = layers.dropout(x, self.dropout)
 
-        x = layers.transpose(x, [0,2,1])
+        x = layers.transpose(x, [0, 2, 1])
         # residual connection
         x = x + input
-
+
         # layer normalization
         output = self.layer_norm(x)
-        return output
\ No newline at end of file
+        return output
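
Note: `PositionwiseFeedForward` is the transformer FFN with 1-D convolutions standing in for the two linear maps: `w_2(relu(w_1(x)))` plus dropout, a residual connection, and layer norm. A numpy sketch of the data flow under the assumption `filter_size=1`, where the convs reduce to per-position matmuls (function name hypothetical):

```python
import numpy as np

def positionwise_ffn_np(x, W1, W2, eps=1e-5):
    # x: [T, d_in]; W1: [d_in, d_hidden]; W2: [d_hidden, d_in].
    h = np.maximum(x @ W1, 0.0)              # first conv1x1 + relu
    y = h @ W2 + x                           # second conv1x1 + residual
    mean = y.mean(axis=-1, keepdims=True)
    var = y.var(axis=-1, keepdims=True)
    return (y - mean) / np.sqrt(var + eps)   # layer norm (affine omitted)

rng = np.random.default_rng(0)
out = positionwise_ffn_np(rng.normal(size=(5, 8)),
                          rng.normal(size=(8, 32)) * 0.1,
                          rng.normal(size=(32, 8)) * 0.1)
print(out.shape)  # (5, 8)
```
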
""" # Compute attention score - attention = layers.matmul(query, key, transpose_y=True) #transpose the last dim in y + attention = layers.matmul( + query, key, transpose_y=True) #transpose the last dim in y attention = attention / math.sqrt(self.d_key) # Mask key to ignore padding if mask is not None: attention = attention * mask - mask = (mask == 0).astype(np.float32) * (-2 ** 32 + 1) + mask = (mask == 0).astype(np.float32) * (-2**32 + 1) attention = attention + mask - + attention = layers.softmax(attention) attention = layers.dropout(attention, dropout) - + # Mask query to ignore padding if query_mask is not None: attention = attention * query_mask - + result = layers.matmul(attention, value) return result, attention + class MultiheadAttention(dg.Layer): - def __init__(self, num_hidden, d_k, d_q, num_head=4, is_bias=False, dropout=0.1, is_concat=True): + def __init__(self, + num_hidden, + d_k, + d_q, + num_head=4, + is_bias=False, + dropout=0.1, + is_concat=True): super(MultiheadAttention, self).__init__() self.num_hidden = num_hidden self.num_head = num_head @@ -109,30 +148,44 @@ class MultiheadAttention(dg.Layer): # repeat masks h times if query_mask is not None: - query_mask = layers.expand(query_mask, [self.num_head, 1, seq_len_key]) + query_mask = layers.expand(query_mask, + [self.num_head, 1, seq_len_key]) if mask is not None: mask = layers.expand(mask, (self.num_head, 1, 1)) - - + # Make multihead attention # key & value.shape = (batch_size, seq_len, feature)(feature = num_head * num_hidden_per_attn) - key = layers.reshape(self.key(key), [batch_size, seq_len_key, self.num_head, self.d_k]) - value = layers.reshape(self.value(value), [batch_size, seq_len_key, self.num_head, self.d_k]) - query = layers.reshape(self.query(query_input), [batch_size, seq_len_query, self.num_head, self.d_q]) + key = layers.reshape( + self.key(key), [batch_size, seq_len_key, self.num_head, self.d_k]) + value = layers.reshape( + self.value(value), + [batch_size, seq_len_key, self.num_head, self.d_k]) + query = layers.reshape( + self.query(query_input), + [batch_size, seq_len_query, self.num_head, self.d_q]) + + key = layers.reshape( + layers.transpose(key, [2, 0, 1, 3]), [-1, seq_len_key, self.d_k]) + value = layers.reshape( + layers.transpose(value, [2, 0, 1, 3]), + [-1, seq_len_key, self.d_k]) + query = layers.reshape( + layers.transpose(query, [2, 0, 1, 3]), + [-1, seq_len_query, self.d_q]) + + result, attention = self.scal_attn( + key, value, query, mask=mask, query_mask=query_mask) - key = layers.reshape(layers.transpose(key, [2, 0, 1, 3]), [-1, seq_len_key, self.d_k]) - value = layers.reshape(layers.transpose(value, [2, 0, 1, 3]), [-1, seq_len_key, self.d_k]) - query = layers.reshape(layers.transpose(query, [2, 0, 1, 3]), [-1, seq_len_query, self.d_q]) - - result, attention = self.scal_attn(key, value, query, mask=mask, query_mask=query_mask) - # concat all multihead result - result = layers.reshape(result, [self.num_head, batch_size, seq_len_query, self.d_q]) - result = layers.reshape(layers.transpose(result, [1,2,0,3]),[batch_size, seq_len_query, -1]) + result = layers.reshape( + result, [self.num_head, batch_size, seq_len_query, self.d_q]) + result = layers.reshape( + layers.transpose(result, [1, 2, 0, 3]), + [batch_size, seq_len_query, -1]) if self.is_concat: - result = layers.concat([query_input,result], axis=-1) + result = layers.concat([query_input, result], axis=-1) result = layers.dropout(self.fc(result), self.dropout) result = result + query_input - + result = self.layer_norm(result) - 
 
+
 class MultiheadAttention(dg.Layer):
-    def __init__(self, num_hidden, d_k, d_q, num_head=4, is_bias=False, dropout=0.1, is_concat=True):
+    def __init__(self,
+                 num_hidden,
+                 d_k,
+                 d_q,
+                 num_head=4,
+                 is_bias=False,
+                 dropout=0.1,
+                 is_concat=True):
         super(MultiheadAttention, self).__init__()
         self.num_hidden = num_hidden
         self.num_head = num_head
@@ -109,30 +148,44 @@ class MultiheadAttention(dg.Layer):
 
         # repeat masks h times
         if query_mask is not None:
-            query_mask = layers.expand(query_mask, [self.num_head, 1, seq_len_key])
+            query_mask = layers.expand(query_mask,
+                                       [self.num_head, 1, seq_len_key])
 
         if mask is not None:
             mask = layers.expand(mask, (self.num_head, 1, 1))
-
-
+
         # Make multihead attention
         # key & value.shape = (batch_size, seq_len, feature)(feature = num_head * num_hidden_per_attn)
-        key = layers.reshape(self.key(key), [batch_size, seq_len_key, self.num_head, self.d_k])
-        value = layers.reshape(self.value(value), [batch_size, seq_len_key, self.num_head, self.d_k])
-        query = layers.reshape(self.query(query_input), [batch_size, seq_len_query, self.num_head, self.d_q])
+        key = layers.reshape(
+            self.key(key), [batch_size, seq_len_key, self.num_head, self.d_k])
+        value = layers.reshape(
+            self.value(value),
+            [batch_size, seq_len_key, self.num_head, self.d_k])
+        query = layers.reshape(
+            self.query(query_input),
+            [batch_size, seq_len_query, self.num_head, self.d_q])
+
+        key = layers.reshape(
+            layers.transpose(key, [2, 0, 1, 3]), [-1, seq_len_key, self.d_k])
+        value = layers.reshape(
+            layers.transpose(value, [2, 0, 1, 3]),
+            [-1, seq_len_key, self.d_k])
+        query = layers.reshape(
+            layers.transpose(query, [2, 0, 1, 3]),
+            [-1, seq_len_query, self.d_q])
+
+        result, attention = self.scal_attn(
+            key, value, query, mask=mask, query_mask=query_mask)
 
-        key = layers.reshape(layers.transpose(key, [2, 0, 1, 3]), [-1, seq_len_key, self.d_k])
-        value = layers.reshape(layers.transpose(value, [2, 0, 1, 3]), [-1, seq_len_key, self.d_k])
-        query = layers.reshape(layers.transpose(query, [2, 0, 1, 3]), [-1, seq_len_query, self.d_q])
-
-        result, attention = self.scal_attn(key, value, query, mask=mask, query_mask=query_mask)
-
         # concat all multihead result
-        result = layers.reshape(result, [self.num_head, batch_size, seq_len_query, self.d_q])
-        result = layers.reshape(layers.transpose(result, [1,2,0,3]),[batch_size, seq_len_query, -1])
+        result = layers.reshape(
+            result, [self.num_head, batch_size, seq_len_query, self.d_q])
+        result = layers.reshape(
+            layers.transpose(result, [1, 2, 0, 3]),
+            [batch_size, seq_len_query, -1])
         if self.is_concat:
-            result = layers.concat([query_input,result], axis=-1)
+            result = layers.concat([query_input, result], axis=-1)
         result = layers.dropout(self.fc(result), self.dropout)
         result = result + query_input
-
+
         result = self.layer_norm(result)
-        return result, attention
\ No newline at end of file
+        return result, attention
diff --git a/parakeet/modules/weight_norm.py b/parakeet/modules/weight_norm.py
index 9e28792..92f1085 100644
--- a/parakeet/modules/weight_norm.py
+++ b/parakeet/modules/weight_norm.py
@@ -1,3 +1,17 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 import numpy as np
 from paddle import fluid
 import paddle.fluid.dygraph as dg
diff --git a/parakeet/utils/layer_tools.py b/parakeet/utils/layer_tools.py
index eaa9c9e..a045c78 100644
--- a/parakeet/utils/layer_tools.py
+++ b/parakeet/utils/layer_tools.py
@@ -1,3 +1,17 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 import numpy as np
 from torch import nn
 import paddle.fluid.dygraph as dg
@@ -10,8 +24,8 @@ def summary(layer):
         print("{}|{}|{}".format(name, param.shape, np.prod(param.shape)))
         num_elements += np.prod(param.shape)
         num_params += 1
-    print("layer has {} parameters, {} elements.".format(
-        num_params, num_elements))
+    print("layer has {} parameters, {} elements.".format(num_params,
+                                                         num_elements))
 
 
 def freeze(layer):
@@ -31,5 +45,5 @@ def torch_summary(layer):
         print("{}|{}|{}".format(name, param.shape, np.prod(param.shape)))
         num_elements += np.prod(param.shape)
         num_params += 1
-    print("layer has {} parameters, {} elements.".format(
-        num_params, num_elements))
+    print("layer has {} parameters, {} elements.".format(num_params,
+                                                         num_elements))
diff --git a/setup.py b/setup.py
index 1cd6e8a..2384837 100644
--- a/setup.py
+++ b/setup.py
@@ -1,13 +1,27 @@
-import os
-import io
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import io
 import re
 
 from setuptools import setup, find_packages
 
+
 def read(*names, **kwargs):
     with io.open(
-        os.path.join(os.path.dirname(__file__), *names),
-        encoding=kwargs.get("encoding", "utf8")
-    ) as fp:
+            os.path.join(os.path.dirname(__file__), *names),
+            encoding=kwargs.get("encoding", "utf8")) as fp:
         return fp.read()
 
@@ -19,6 +33,7 @@ def find_version(*file_paths):
         return version_match.group(1)
     raise RuntimeError("Unable to find version string.")
 
+
 VERSION = find_version('parakeet', '__init__.py')
 long_description = read('README.md')
 
@@ -32,17 +47,26 @@ setup_info = dict(
     description='Speech synthesis tools and models based on Paddlepaddle',
     long_description=long_description,
     license='Apache 2',
     install_requires=[
-        'numpy', 'nltk', 'inflect', 'librosa', 'unidecode', 'numba',
-        'tqdm', 'matplotlib', 'tensorboardX', 'tensorboard', 'scipy',
-        'ruamel.yaml', 'pandas', 'sox', 'soundfile',
+        'numpy',
+        'nltk',
+        'inflect',
+        'librosa',
+        'unidecode',
+        'numba',
+        'tqdm',
+        'matplotlib',
+        'tensorboardX',
+        'tensorboard',
+        'scipy',
+        'ruamel.yaml',
+        'pandas',
+        'sox',
+        'soundfile',
     ],
 
     # Package info
     packages=find_packages(exclude=('tests', 'tests.*')),
+    zip_safe=True, )
 
-    zip_safe=True,
-)
-
-setup(**setup_info)
\ No newline at end of file
+setup(**setup_info)
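
Note: `find_version` in setup.py scans a source file for a `__version__` assignment with a regex and raises if it is missing. The hunk does not show the pattern itself; the sketch below uses the conventional one this helper is normally written with, so treat the regex as an assumption:

```python
import re

def find_version_in(text):
    # Same idea as setup.py's find_version: match __version__ = "x.y.z".
    match = re.search(r"^__version__ = ['\"]([^'\"]*)['\"]", text, re.M)
    if match:
        return match.group(1)
    raise RuntimeError("Unable to find version string.")

print(find_version_in('__version__ = "0.1.0"\n'))  # 0.1.0
```
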
diff --git a/tests/test_ljspeech.py b/tests/test_ljspeech.py
index 34f5011..d6187e8 100644
--- a/tests/test_ljspeech.py
+++ b/tests/test_ljspeech.py
@@ -1,3 +1,17 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 from parakeet.datasets.ljspeech import LJSpeech
 from parakeet.data.datacargo import DataCargo
diff --git a/tests/test_vctk.py b/tests/test_vctk.py
index 3f7d61e..58ef0ca 100644
--- a/tests/test_vctk.py
+++ b/tests/test_vctk.py
@@ -1,11 +1,25 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 from parakeet.datasets import vctk
 from pathlib import Path
 from parakeet.data.datacargo import DataCargo
 
 root = Path("/workspace/datasets/VCTK-Corpus")
 vctk_dataset = vctk.VCTK(root)
-vctk_cargo = DataCargo(vctk_dataset, batch_size=16, shuffle=True, drop_last=True)
+vctk_cargo = DataCargo(
+    vctk_dataset, batch_size=16, shuffle=True, drop_last=True)
 
 for i, batch in enumerate(vctk_cargo):
     print(i)
-
diff --git a/tools/copyright.hook b/tools/copyright.hook
new file mode 100644
index 0000000..23aaf38
--- /dev/null
+++ b/tools/copyright.hook
@@ -0,0 +1,121 @@
+from __future__ import absolute_import
+from __future__ import print_function
+from __future__ import unicode_literals
+
+import argparse
+import io, re
+import sys, os
+import subprocess
+import platform
+
+COPYRIGHT = '''
+Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+'''
+
+LANG_COMMENT_MARK = None
+
+NEW_LINE_MARK = None
+
+COPYRIGHT_HEADER = None
+
+if platform.system() == "Windows":
+    NEW_LINE_MARK = "\r\n"
+else:
+    NEW_LINE_MARK = '\n'
+    COPYRIGHT_HEADER = COPYRIGHT.split(NEW_LINE_MARK)[1]
+    p = re.search('(\d{4})', COPYRIGHT_HEADER).group(0)
+    process = subprocess.Popen(["date", "+%Y"], stdout=subprocess.PIPE)
+    date, err = process.communicate()
+    date = date.decode("utf-8").rstrip("\n")
+    COPYRIGHT_HEADER = COPYRIGHT_HEADER.replace(p, date)
+
+
+def generate_copyright(template, lang='C'):
+    if lang == 'Python':
+        LANG_COMMENT_MARK = '#'
+    else:
+        LANG_COMMENT_MARK = "//"
+
+    lines = template.split(NEW_LINE_MARK)
+    BLANK = " "
+    ans = LANG_COMMENT_MARK + BLANK + COPYRIGHT_HEADER + NEW_LINE_MARK
+    for lino, line in enumerate(lines):
+        if lino == 0 or lino == 1 or lino == len(lines) - 1: continue
+        if len(line) == 0:
+            BLANK = ""
+        else:
+            BLANK = " "
+        ans += LANG_COMMENT_MARK + BLANK + line + NEW_LINE_MARK
+
+    return ans + "\n"
+
+
+def lang_type(filename):
+    if filename.endswith(".py"):
+        return "Python"
+    elif filename.endswith(".h"):
+        return "C"
+    elif filename.endswith(".c"):
+        return "C"
+    elif filename.endswith(".hpp"):
+        return "C"
+    elif filename.endswith(".cc"):
+        return "C"
+    elif filename.endswith(".cpp"):
+        return "C"
+    elif filename.endswith(".cu"):
+        return "C"
+    elif filename.endswith(".cuh"):
+        return "C"
+    elif filename.endswith(".go"):
+        return "C"
+    elif filename.endswith(".proto"):
+        return "C"
+    else:
+        print("Unsupported filetype %s" % filename)
+        exit(0)
+
+
+PYTHON_ENCODE = re.compile("^[ \t\v]*#.*?coding[:=][ \t]*([-_.a-zA-Z0-9]+)")
+
+
+def main(argv=None):
+    parser = argparse.ArgumentParser(
+        description='Checker for copyright declaration.')
+    parser.add_argument('filenames', nargs='*', help='Filenames to check')
+    args = parser.parse_args(argv)
+
+    retv = 0
+    for filename in args.filenames:
+        fd = io.open(filename, encoding="utf-8")
+        first_line = fd.readline()
+        second_line = fd.readline()
+        if "COPYRIGHT (C)" in first_line.upper(): continue
+        if first_line.startswith("#!") or PYTHON_ENCODE.match(
+                second_line) != None or PYTHON_ENCODE.match(first_line) != None:
+            continue
+        original_contents = io.open(filename, encoding="utf-8").read()
+        new_contents = generate_copyright(
+            COPYRIGHT, lang_type(filename)) + original_contents
+        print('Auto Insert Copyright Header {}'.format(filename))
+        retv = 1
+        with io.open(filename, 'w') as output_file:
+            output_file.write(new_contents)
+
+    return retv
+
+
+if __name__ == '__main__':
+    exit(main())
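
Note: for a Python file, `generate_copyright` prepends each license line with "# " (a bare "#" for blank lines) and skips the template's leading and trailing blanks. A small self-contained restatement of that prefixing rule (the real hook also rewrites the copyright year via `date +%Y`):

```python
# Hypothetical standalone illustration of the hook's comment-prefixing rule.
license_lines = [
    'Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.',
    '',
    'Licensed under the Apache License, Version 2.0 (the "License");',
]
header = "\n".join(
    "#" + (" " + line if line else "") for line in license_lines)
print(header)
# # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
# #
# # Licensed under the Apache License, Version 2.0 (the "License");
```
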