format code with pre-commit

chenfeiyu 2021-05-13 16:22:56 +08:00
parent 73ca693395
commit 6a1fb158d9
62 changed files with 1068 additions and 709 deletions

View File

@ -45,7 +45,7 @@ See [install](https://www.paddlepaddle.org.cn/install/quick) for more details. T
pip install -U paddle-parakeet
```
or
```bash
git clone https://github.com/PaddlePaddle/Parakeet
cd Parakeet

View File

@ -68,7 +68,6 @@ exclude_patterns = []
html_theme = "sphinx_rtd_theme"
# Add any paths that contain custom static files (such as style sheets) here,
# relative to this directory. They are copied after the builtin static files,
# so a file named "default.css" will overwrite the builtin "default.css".

View File

@ -127,6 +127,3 @@ python inference.py --input=<input> --output=<output> --checkpoint_path=<checkpo
1. [Generalized End-to-end Loss for Speaker Verification](https://arxiv.org/pdf/1710.10467.pdf)
2. [Transfer Learning from Speaker Verification to Multispeaker Text-To-Speech Synthesis](https://arxiv.org/pdf/1806.04558.pdf)

View File

@ -4,7 +4,7 @@
## Model
The model used in this experiment is the text-independent speaker encoder from [GENERALIZED END-TO-END LOSS FOR SPEAKER VERIFICATION](https://arxiv.org/pdf/1710.10467.pdf), trained with the GE2E softmax loss.
## Directory structure
@ -122,6 +122,3 @@ python inference.py --input=<input> --output=<output> --checkpoint_path=<checkpo
1. [GENERALIZED END-TO-END LOSS FOR SPEAKER VERIFICATION](https://arxiv.org/pdf/1710.10467.pdf)
2. [Transfer Learning from Speaker Verification to Multispeaker Text-To-Speech Synthesis](https://arxiv.org/pdf/1806.04558.pdf)
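The GE2E softmax loss referenced above can be summarized with a small NumPy sketch. This is an illustration of the loss from the paper, not the repository's Paddle implementation; the helper name, the scale/bias values `w` and `b`, and the toy shapes are made up for the example.
```python
import numpy as np

def ge2e_softmax_loss(embeds, w=10.0, b=-5.0):
    """embeds: [n_speakers, n_utterances, dim], rows assumed L2-normalized."""
    n_spk, n_utt, _ = embeds.shape
    centroids = embeds.mean(axis=1)
    centroids /= np.linalg.norm(centroids, axis=-1, keepdims=True)
    loss = 0.0
    for j in range(n_spk):
        for i in range(n_utt):
            e = embeds[j, i]
            # the centroid of the true speaker excludes the utterance itself
            own = (embeds[j].sum(axis=0) - e) / (n_utt - 1)
            own /= np.linalg.norm(own)
            sims = centroids @ e          # cosine similarity to every centroid
            sims[j] = own @ e
            sims = w * sims + b           # learned scale and bias
            loss += -sims[j] + np.log(np.exp(sims).sum())
    return loss / (n_spk * n_utt)

# toy usage with random, normalized embeddings
rng = np.random.default_rng(0)
x = rng.normal(size=(4, 5, 256))
x /= np.linalg.norm(x, axis=-1, keepdims=True)
print(ge2e_softmax_loss(x))
```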

View File

@ -1,3 +1,17 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from pathlib import Path
from warnings import warn
import struct
@ -30,16 +44,18 @@ def normalize_volume(wav,
if increase_only and decrease_only:
raise ValueError("Both increase only and decrease only are set")
dBFS_change = target_dBFS - 10 * np.log10(np.mean(wav**2))
if ((dBFS_change < 0 and increase_only)
or (dBFS_change > 0 and decrease_only)):
if ((dBFS_change < 0 and increase_only) or
(dBFS_change > 0 and decrease_only)):
return wav
gain = 10**(dBFS_change / 20)
return wav * gain
def trim_long_silences(wav, vad_window_length: int,
def trim_long_silences(wav,
vad_window_length: int,
vad_moving_average_width: int,
vad_max_silence_length: int, sampling_rate: int):
vad_max_silence_length: int,
sampling_rate: int):
"""
Ensures that segments without voice in the waveform remain no longer than a
threshold determined by the VAD parameters in params.py.
@ -63,14 +79,15 @@ def trim_long_silences(wav, vad_window_length: int,
for window_start in range(0, len(wav), samples_per_window):
window_end = window_start + samples_per_window
voice_flags.append(
vad.is_speech(pcm_wave[window_start * 2:window_end * 2],
sample_rate=sampling_rate))
vad.is_speech(
pcm_wave[window_start * 2:window_end * 2],
sample_rate=sampling_rate))
voice_flags = np.array(voice_flags)
# Smooth the voice detection with a moving average
def moving_average(array, width):
array_padded = np.concatenate((np.zeros(
(width - 1) // 2), array, np.zeros(width // 2)))
array_padded = np.concatenate((np.zeros((width - 1) // 2), array,
np.zeros(width // 2)))
ret = np.cumsum(array_padded, dtype=float)
ret[width:] = ret[width:] - ret[:-width]
return ret[width - 1:] / width
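As a quick sanity check of the smoothing above, here is the same centered moving average applied to a toy voice-flag sequence (values chosen only for illustration):
```python
import numpy as np

def moving_average(array, width):
    # same centered moving average as used in trim_long_silences above
    array_padded = np.concatenate((np.zeros((width - 1) // 2), array,
                                   np.zeros(width // 2)))
    ret = np.cumsum(array_padded, dtype=float)
    ret[width:] = ret[width:] - ret[:-width]
    return ret[width - 1:] / width

print(moving_average(np.array([0., 1., 1., 1., 0.]), 3))
# approximately [0.33, 0.67, 1.0, 0.67, 0.33]
```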
@ -89,8 +106,8 @@ def trim_long_silences(wav, vad_window_length: int,
def compute_partial_slices(n_samples: int,
partial_utterance_n_frames: int,
hop_length: int,
min_pad_coverage: float = 0.75,
overlap: float = 0.5):
min_pad_coverage: float=0.75,
overlap: float=0.5):
"""
Computes where to split an utterance waveform and its corresponding mel spectrogram to obtain
partial utterances of <partial_utterance_n_frames> each. Both the waveform and the mel
@ -121,8 +138,8 @@ def compute_partial_slices(n_samples: int,
# librosa's function to compute num_frames from num_samples
n_frames = int(np.ceil((n_samples + 1) / hop_length))
# frame shift between adjacent partials
frame_step = max(1,
int(np.round(partial_utterance_n_frames * (1 - overlap))))
frame_step = max(
1, int(np.round(partial_utterance_n_frames * (1 - overlap))))
# Compute the slices
wav_slices, mel_slices = [], []
@ -135,8 +152,8 @@ def compute_partial_slices(n_samples: int,
# Evaluate whether extra padding is warranted or not
last_wav_range = wav_slices[-1]
coverage = (n_samples - last_wav_range.start) / (last_wav_range.stop -
last_wav_range.start)
coverage = (n_samples - last_wav_range.start) / (
last_wav_range.stop - last_wav_range.start)
if coverage < min_pad_coverage and len(mel_slices) > 1:
mel_slices = mel_slices[:-1]
wav_slices = wav_slices[:-1]
@ -155,8 +172,8 @@ class SpeakerVerificationPreprocessor(object):
mel_window_step,
n_mels,
partial_n_frames: int,
min_pad_coverage: float = 0.75,
partial_overlap_ratio: float = 0.5):
min_pad_coverage: float=0.75,
partial_overlap_ratio: float=0.5):
self.sampling_rate = sampling_rate
self.audio_norm_target_dBFS = audio_norm_target_dBFS
@ -184,24 +201,23 @@ class SpeakerVerificationPreprocessor(object):
wav = librosa.resample(wav, source_sr, self.sampling_rate)
# loudness normalization
wav = normalize_volume(wav,
self.audio_norm_target_dBFS,
increase_only=True)
wav = normalize_volume(
wav, self.audio_norm_target_dBFS, increase_only=True)
# trim long silence
if webrtcvad:
wav = trim_long_silences(wav, self.vad_window_length,
self.vad_moving_average_width,
self.vad_max_silence_length,
self.sampling_rate)
wav = trim_long_silences(
wav, self.vad_window_length, self.vad_moving_average_width,
self.vad_max_silence_length, self.sampling_rate)
return wav
def melspectrogram(self, wav):
mel = librosa.feature.melspectrogram(wav,
sr=self.sampling_rate,
n_fft=self.n_fft,
hop_length=self.hop_length,
n_mels=self.n_mels)
mel = librosa.feature.melspectrogram(
wav,
sr=self.sampling_rate,
n_fft=self.n_fft,
hop_length=self.hop_length,
n_mels=self.n_mels)
mel = mel.astype(np.float32).T
return mel

View File

@ -1,3 +1,17 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from yacs.config import CfgNode
_C = CfgNode()

View File

@ -1,3 +1,17 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from functools import partial
from typing import List
from pathlib import Path
@ -29,7 +43,7 @@ def _process_speaker(speaker_dir: Path,
datasets_root: Path,
output_dir: Path,
pattern: str,
skip_existing: bool = False):
skip_existing: bool=False):
# datasets root: a reference path to compute speaker_name
# we prepend the dataset name to speaker_id because we are mixing several
# multispeaker datasets together
@ -67,24 +81,25 @@ def _process_dataset(processor: SpeakerVerificationPreprocessor,
dataset_name: str,
output_dir: Path,
pattern: str,
skip_existing: bool = False):
skip_existing: bool=False):
print(
f"{dataset_name}: Preprocessing data for {len(speaker_dirs)} speakers."
)
f"{dataset_name}: Preprocessing data for {len(speaker_dirs)} speakers.")
_func = partial(_process_speaker,
processor=processor,
datasets_root=datasets_root,
output_dir=output_dir,
pattern=pattern,
skip_existing=skip_existing)
_func = partial(
_process_speaker,
processor=processor,
datasets_root=datasets_root,
output_dir=output_dir,
pattern=pattern,
skip_existing=skip_existing)
with mp.Pool(16) as pool:
list(
tqdm(pool.imap(_func, speaker_dirs),
dataset_name,
len(speaker_dirs),
unit="speakers"))
tqdm(
pool.imap(_func, speaker_dirs),
dataset_name,
len(speaker_dirs),
unit="speakers"))
print(f"Done preprocessing {dataset_name}.")

View File

@ -1,3 +1,17 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
from pathlib import Path
@ -26,7 +40,9 @@ def embed_utterance(processor, model, fpath_or_wav):
return embed
def _process_utterance(ifpath: Path, input_dir: Path, output_dir: Path,
def _process_utterance(ifpath: Path,
input_dir: Path,
output_dir: Path,
processor: SpeakerVerificationPreprocessor,
model: LSTMSpeakerEncoder):
rel_path = ifpath.relative_to(input_dir)
@ -62,8 +78,7 @@ def main(config, args):
n_mels=c.n_mels,
partial_n_frames=c.partial_n_frames,
min_pad_coverage=c.min_pad_coverage,
partial_overlap_ratio=c.min_pad_coverage,
)
partial_overlap_ratio=c.min_pad_coverage, )
# input output preparation
input_dir = Path(args.input).expanduser()
@ -83,34 +98,34 @@ if __name__ == "__main__":
"--config",
metavar="FILE",
help="path of the config file to overwrite to default config with.")
parser.add_argument("--input",
type=str,
help="path of the audio_file folder.")
parser.add_argument("--pattern",
type=str,
default="*.wav",
help="pattern to filter audio files.")
parser.add_argument("--output",
metavar="OUTPUT_DIR",
help="path to save checkpoint and logs.")
parser.add_argument(
"--input", type=str, help="path of the audio_file folder.")
parser.add_argument(
"--pattern",
type=str,
default="*.wav",
help="pattern to filter audio files.")
parser.add_argument(
"--output",
metavar="OUTPUT_DIR",
help="path to save checkpoint and logs.")
# load from saved checkpoint
parser.add_argument("--checkpoint_path",
type=str,
help="path of the checkpoint to load")
parser.add_argument(
"--checkpoint_path", type=str, help="path of the checkpoint to load")
# running
parser.add_argument("--device",
type=str,
choices=["cpu", "gpu"],
help="device type to use, cpu and gpu are supported.")
parser.add_argument(
"--device",
type=str,
choices=["cpu", "gpu"],
help="device type to use, cpu and gpu are supported.")
# overwrite extra config and default config
parser.add_argument(
"--opts",
nargs=argparse.REMAINDER,
help=
"options to overwrite --config file and the default config, passing in KEY VALUE pairs"
help="options to overwrite --config file and the default config, passing in KEY VALUE pairs"
)
args = parser.parse_args()

View File

@ -1,3 +1,17 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
from pathlib import Path
from config import get_cfg_defaults
@ -12,25 +26,21 @@ if __name__ == "__main__":
parser.add_argument(
"--datasets_root",
type=Path,
help=
"Path to the directory containing your LibriSpeech, LibriTTS and VoxCeleb datasets."
help="Path to the directory containing your LibriSpeech, LibriTTS and VoxCeleb datasets."
)
parser.add_argument("--output_dir",
type=Path,
help="Path to save processed dataset.")
parser.add_argument(
"--output_dir", type=Path, help="Path to save processed dataset.")
parser.add_argument(
"--dataset_names",
type=str,
default="librispeech_other,voxceleb1,voxceleb2",
help=
"comma-separated list of names of the datasets you want to preprocess. only "
help="comma-separated list of names of the datasets you want to preprocess. only "
"the train set of these datastes will be used. Possible names: librispeech_other, "
"voxceleb1, voxceleb2, aidatatang_200zh, magicdata.")
parser.add_argument(
"--skip_existing",
action="store_true",
help=
"Whether to skip ouput files with the same name. Useful if this script was interrupted."
help="Whether to skip ouput files with the same name. Useful if this script was interrupted."
)
parser.add_argument(
"--no_trim",
@ -74,8 +84,7 @@ if __name__ == "__main__":
n_mels=c.n_mels,
partial_n_frames=c.partial_n_frames,
min_pad_coverage=c.min_pad_coverage,
partial_overlap_ratio=c.min_pad_coverage,
)
partial_overlap_ratio=c.min_pad_coverage, )
preprocess_func = {
"librispeech_other": process_librispeech,

View File

@ -1,3 +1,17 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import random

View File

@ -1,3 +1,17 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import random
from pathlib import Path
@ -22,6 +36,7 @@ class MultiSpeakerMelDataset(Dataset):
utterance2.npy
utterance3.npy
"""
def __init__(self, dataset_root: Path):
self.root = Path(dataset_root).expanduser()
speaker_dirs = [f for f in self.root.glob("*") if f.is_dir()]
@ -57,8 +72,11 @@ class MultiSpeakerSampler(BatchSampler):
First, N speakers from all speakers are sampled randomly. Then, for each
speaker, randomly sample M utterances from their corresponding utterances.
"""
def __init__(self, dataset: MultiSpeakerMelDataset,
speakers_per_batch: int, utterances_per_speaker: int):
def __init__(self,
dataset: MultiSpeakerMelDataset,
speakers_per_batch: int,
utterances_per_speaker: int):
self._speakers = list(dataset.speaker_dirs)
self._speaker_to_utterances = dataset.speaker_to_utterances
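The sampling rule in the docstring above (first N speakers, then M utterances per speaker) can be sketched without the BatchSampler machinery. The helper and its toy data below are hypothetical, for illustration only, and are not the repository's MultiSpeakerSampler code.
```python
import random

def sample_batch(speaker_to_utterances, speakers_per_batch, utterances_per_speaker):
    # pick N speakers at random, then M utterances from each chosen speaker
    speakers = random.sample(list(speaker_to_utterances), speakers_per_batch)
    batch = []
    for spk in speakers:
        batch.extend(random.sample(speaker_to_utterances[spk], utterances_per_speaker))
    return batch

# toy usage
utts = {"spk1": ["a.npy", "b.npy", "c.npy"], "spk2": ["d.npy", "e.npy", "f.npy"]}
print(sample_batch(utts, speakers_per_batch=2, utterances_per_speaker=2))
```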

View File

@ -1,3 +1,17 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import time
from paddle import distributed as dist
@ -22,9 +36,10 @@ class Ge2eExperiment(ExperimentBase):
model = LSTMSpeakerEncoder(config.data.n_mels, config.model.num_layers,
config.model.hidden_size,
config.model.embedding_size)
optimizer = Adam(config.training.learning_rate_init,
parameters=model.parameters(),
grad_clip=ClipGradByGlobalNorm(3))
optimizer = Adam(
config.training.learning_rate_init,
parameters=model.parameters(),
grad_clip=ClipGradByGlobalNorm(3))
self.model = DataParallel(model) if self.parallel else model
self.model_core = model
self.optimizer = optimizer
@ -35,11 +50,11 @@ class Ge2eExperiment(ExperimentBase):
sampler = MultiSpeakerSampler(train_dataset,
config.training.speakers_per_batch,
config.training.utterances_per_speaker)
train_loader = DataLoader(train_dataset,
batch_sampler=sampler,
collate_fn=Collate(
config.data.partial_n_frames),
num_workers=16)
train_loader = DataLoader(
train_dataset,
batch_sampler=sampler,
collate_fn=Collate(config.data.partial_n_frames),
num_workers=16)
self.train_dataset = train_dataset
self.train_loader = train_loader
@ -72,8 +87,8 @@ class Ge2eExperiment(ExperimentBase):
self.iteration)
self.visualizer.add_scalar("train/eer", eer, self.iteration)
self.visualizer.add_scalar(
"param/w", float(self.model_core.similarity_weight),
self.iteration)
"param/w",
float(self.model_core.similarity_weight), self.iteration)
self.visualizer.add_scalar("param/b",
float(self.model_core.similarity_bias),
self.iteration)

View File

@ -87,7 +87,6 @@ Pretrained Models can be downloaded from links below. We provide 2 models with d
2. This model does not have a stop token predictor. It uses the attention peak position to decide whether all the content has been uttered. Guided attention loss is also used to speed up training. This model is trained with `configs/alternative.yaml`. [tacotron2_ljspeech_ckpt_0.3_alternative.zip](https://paddlespeech.bj.bcebos.com/Parakeet/tacotron2_ljspeech_ckpt_0.3_alternative.zip)
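A minimal sketch of the stopping rule described above, assuming the decoder exposes its per-step alignment; the helper name and shapes are illustrative, not the repository's API:
```python
import numpy as np

def should_stop(attention_weights, encoder_length):
    """attention_weights: [T_enc] alignment for the current decoder step.
    Stop decoding once the attention peak reaches the last encoder position."""
    return int(np.argmax(attention_weights)) >= encoder_length - 1

# toy usage: the peak has moved onto the last input symbol -> stop
print(should_stop(np.array([0.01, 0.04, 0.15, 0.80]), encoder_length=4))  # True
```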
## Notebook: End-to-end TTS
See [synthesize.ipynb](./synthesize.ipynb) for details about end-to-end TTS with tacotron2 and waveflow.

View File

@ -32,16 +32,14 @@ _C.data = CN(
_C.model = CN(
dict(
vocab_size=37, # set this according to the frontend's vocab size
n_tones=None,
reduction_factor=1, # reduction factor
d_encoder=512, # embedding & encoder's internal size
encoder_conv_layers=3, # number of conv layer in tacotron2 encoder
encoder_kernel_size=5, # kernel size of conv layers in tacotron2 encoder
d_prenet=256, # hidden size of decoder prenet
d_attention_rnn=
1024, # hidden size of the first rnn layer in tacotron2 decoder
d_decoder_rnn=
1024, # hidden size of the second rnn layer in tacotron2 decoder
d_attention_rnn=1024, # hidden size of the first rnn layer in tacotron2 decoder
d_decoder_rnn=1024, # hidden size of the second rnn layer in tacotron2 decoder
d_attention=128, # hidden size of decoder location linear layer
attention_filters=32, # number of filter in decoder location conv layer
attention_kernel_size=31, # kernel size of decoder location conv layer
@ -50,14 +48,11 @@ _C.model = CN(
postnet_conv_layers=5, # number of conv layer in decoder postnet
p_encoder_dropout=0.5, # dropout probability in encoder
p_prenet_dropout=0.5, # dropout probability in decoder prenet
p_attention_dropout=
0.1, # dropout probability of first rnn layer in decoder
p_decoder_dropout=
0.1, # dropout probability of second rnn layer in decoder
p_attention_dropout=0.1, # dropout probability of first rnn layer in decoder
p_decoder_dropout=0.1, # dropout probability of second rnn layer in decoder
p_postnet_dropout=0.5, # dropout probability in decoder postnet
d_global_condition=None,
use_stop_token=
True, # whether to use binary classifier to predict when to stop
use_stop_token=True, # whether to use binary classifier to predict when to stop
use_guided_attention_loss=False, # whether to use guided attention loss
guided_attention_loss_sigma=0.2 # sigma in guided attention loss
))

View File

@ -23,6 +23,7 @@ from parakeet.data.batch import batch_spec, batch_text_id
class LJSpeech(Dataset):
"""A simple dataset adaptor for the processed ljspeech dataset."""
def __init__(self, root):
self.root = Path(root).expanduser()
records = []
@ -44,9 +45,8 @@ class LJSpeech(Dataset):
class LJSpeechCollector(object):
"""A simple callable to batch LJSpeech examples."""
def __init__(self,
padding_idx=0,
padding_value=0.,
def __init__(self, padding_idx=0, padding_value=0.,
padding_stop_token=1.0):
self.padding_idx = padding_idx
self.padding_value = padding_value
@ -68,16 +68,19 @@ class LJSpeechCollector(object):
# Sort by text_len in descending order
texts = [
i for i, _ in sorted(
i
for i, _ in sorted(
zip(texts, text_lens), key=lambda x: x[1], reverse=True)
]
mels = [
i for i, _ in sorted(
i
for i, _ in sorted(
zip(mels, text_lens), key=lambda x: x[1], reverse=True)
]
mel_lens = [
i for i, _ in sorted(
i
for i, _ in sorted(
zip(mel_lens, text_lens), key=lambda x: x[1], reverse=True)
]

View File

@ -35,13 +35,14 @@ def create_dataset(config, source_path, target_path, verbose=False):
meta_data = LJSpeechMetaData(source_path)
frontend = EnglishCharacter()
processor = AudioProcessor(sample_rate=config.data.sample_rate,
n_fft=config.data.n_fft,
n_mels=config.data.n_mels,
win_length=config.data.win_length,
hop_length=config.data.hop_length,
fmax=config.data.fmax,
fmin=config.data.fmin)
processor = AudioProcessor(
sample_rate=config.data.sample_rate,
n_fft=config.data.n_fft,
n_mels=config.data.n_mels,
win_length=config.data.win_length,
hop_length=config.data.hop_length,
fmax=config.data.fmax,
fmin=config.data.fmin)
normalizer = LogMagnitude()
records = []
@ -70,26 +71,22 @@ def create_dataset(config, source_path, target_path, verbose=False):
if __name__ == "__main__":
parser = argparse.ArgumentParser(description="create dataset")
parser.add_argument("--config",
type=str,
metavar="FILE",
help="extra config to overwrite the default config")
parser.add_argument("--input",
type=str,
help="path of the ljspeech dataset")
parser.add_argument("--output",
type=str,
help="path to save output dataset")
parser.add_argument(
"--config",
type=str,
metavar="FILE",
help="extra config to overwrite the default config")
parser.add_argument(
"--input", type=str, help="path of the ljspeech dataset")
parser.add_argument(
"--output", type=str, help="path to save output dataset")
parser.add_argument(
"--opts",
nargs=argparse.REMAINDER,
help=
"options to overwrite --config file and the default config, passing in KEY VALUE pairs"
help="options to overwrite --config file and the default config, passing in KEY VALUE pairs"
)
parser.add_argument("-v",
"--verbose",
action="store_true",
help="print msg")
parser.add_argument(
"-v", "--verbose", action="store_true", help="print msg")
config = get_cfg_defaults()
args = parser.parse_args()

View File

@ -65,29 +65,24 @@ if __name__ == "__main__":
parser = argparse.ArgumentParser(
description="generate mel spectrogram with TransformerTTS.")
parser.add_argument("--config",
type=str,
metavar="FILE",
help="extra config to overwrite the default config")
parser.add_argument("--checkpoint_path",
type=str,
help="path of the checkpoint to load.")
parser.add_argument(
"--config",
type=str,
metavar="FILE",
help="extra config to overwrite the default config")
parser.add_argument(
"--checkpoint_path", type=str, help="path of the checkpoint to load.")
parser.add_argument("--input", type=str, help="path of the text sentences")
parser.add_argument("--output", type=str, help="path to save outputs")
parser.add_argument("--device",
type=str,
default="cpu",
help="device type to use.")
parser.add_argument(
"--device", type=str, default="cpu", help="device type to use.")
parser.add_argument(
"--opts",
nargs=argparse.REMAINDER,
help=
"options to overwrite --config file and the default config, passing in KEY VALUE pairs"
help="options to overwrite --config file and the default config, passing in KEY VALUE pairs"
)
parser.add_argument("-v",
"--verbose",
action="store_true",
help="print msg")
parser.add_argument(
"-v", "--verbose", action="store_true", help="print msg")
args = parser.parse_args()
if args.config:

View File

@ -98,9 +98,8 @@ class Experiment(ExperimentBase):
display.plot_spectrogram(mels[0].numpy().T), self.iteration)
self.visualizer.add_figure(
f"valid_sentence_{i}_predicted_spectrogram",
display.plot_spectrogram(
outputs['mel_outputs_postnet'][0].numpy().T),
self.iteration)
display.plot_spectrogram(outputs['mel_outputs_postnet'][0]
.numpy().T), self.iteration)
# write visual log
valid_losses = {k: np.mean(v) for k, v in valid_losses.items()}
@ -169,26 +168,27 @@ class Experiment(ExperimentBase):
batch_fn = LJSpeechCollector(padding_idx=config.data.padding_idx)
if not self.parallel:
self.train_loader = DataLoader(train_set,
batch_size=config.data.batch_size,
shuffle=True,
drop_last=True,
collate_fn=batch_fn)
self.train_loader = DataLoader(
train_set,
batch_size=config.data.batch_size,
shuffle=True,
drop_last=True,
collate_fn=batch_fn)
else:
sampler = DistributedBatchSampler(
train_set,
batch_size=config.data.batch_size,
shuffle=True,
drop_last=True)
self.train_loader = DataLoader(train_set,
batch_sampler=sampler,
collate_fn=batch_fn)
self.train_loader = DataLoader(
train_set, batch_sampler=sampler, collate_fn=batch_fn)
self.valid_loader = DataLoader(valid_set,
batch_size=config.data.batch_size,
shuffle=False,
drop_last=False,
collate_fn=batch_fn)
self.valid_loader = DataLoader(
valid_set,
batch_size=config.data.batch_size,
shuffle=False,
drop_last=False,
collate_fn=batch_fn)
def main_sp(config, args):

View File

@ -80,7 +80,7 @@ `input` is the folder containing the processed audio, and `output` is where the output spectrograms are saved
Run the script to train.
```python
python train.py --data=<data> --output=<output> --device="gpu"
```
Our model removes the stop token prediction used in tacotron2. In practice, stop token prediction is a severely imbalanced classification problem: each sentence may contain hundreds of frames as negative examples and only a single frame as a positive example, and the prediction is very sensitive to how silence is trimmed from the audio. Instead, decoding stops once the attention peak reaches the last symbol on the encoder side.
@ -90,7 +90,7 @@ python train.py --data=<data> --output=<output> --device="gpu"
You can use visualdl to view the training logs.
```bash
visualdl --logdir=<output> --host=$HOSTNAME
```
Example training loss / validation loss curves are shown below.
@ -109,4 +109,4 @@ visualdl --logdir=<output> --host=$HOSTNAME
## Usage
This experiment includes a simple usage example: replace the reference audio and the text, then synthesize speech with the trained model. See the usage instructions in the [notebook](./voice_cloning.ipynb).

View File

@ -1,3 +1,17 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import pickle
from pathlib import Path
@ -16,6 +30,7 @@ print("vocab_tones:\n", voc_tones)
class AiShell3(Dataset):
"""Processed AiShell3 dataset."""
def __init__(self, root):
super().__init__()
self.root = Path(root).expanduser()
@ -31,10 +46,10 @@ class AiShell3(Dataset):
speaker_id = sentence_id[:7]
phones = metadatum["phones"]
tones = metadatum["tones"]
phones = np.array([voc_phones.lookup(item) for item in phones],
dtype=np.int64)
tones = np.array([voc_tones.lookup(item) for item in tones],
dtype=np.int64)
phones = np.array(
[voc_phones.lookup(item) for item in phones], dtype=np.int64)
tones = np.array(
[voc_tones.lookup(item) for item in tones], dtype=np.int64)
mel = np.load(str(self.mel_dir / speaker_id / (sentence_id + ".npy")))
embed = np.load(
str(self.embed_dir / speaker_id / (sentence_id + ".npy")))
@ -50,8 +65,8 @@ def collate_aishell3_examples(examples):
text_lengths = np.array([item.shape[0] for item in phones], dtype=np.int64)
spec_lengths = np.array([item.shape[1] for item in mel], dtype=np.int64)
T_dec = np.max(spec_lengths)
stop_tokens = (np.arange(T_dec) >= np.expand_dims(spec_lengths,
-1)).astype(np.float32)
stop_tokens = (np.arange(T_dec) >= np.expand_dims(spec_lengths, -1)
).astype(np.float32)
phones, _ = batch_text_id(phones)
tones, _ = batch_text_id(tones)
mel, _ = batch_spec(mel)
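For reference, a small worked example of the stop-token mask built in collate_aishell3_examples above, with illustrative spectrogram lengths:
```python
import numpy as np

# illustrative lengths: two utterances with 3 and 5 decoder frames
spec_lengths = np.array([3, 5], dtype=np.int64)
T_dec = np.max(spec_lengths)
stop_tokens = (np.arange(T_dec) >= np.expand_dims(spec_lengths, -1)).astype(np.float32)
print(stop_tokens)
# [[0. 0. 0. 1. 1.]
#  [0. 0. 0. 0. 0.]]
```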

View File

@ -1,3 +1,17 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import List, Tuple
from chinese_text_to_pinyin import convert_to_pinyin

View File

@ -1,3 +1,16 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
A pinyin-to-phone transcription system for Chinese.
Syllables are split into an initial and a final. 'er' is also treated as a special symbol.
@ -96,9 +109,8 @@ def convert(syllable):
syllable = syllable.replace("ing", "ieng").replace("in", "ien")
# expansion for un, ui, iu
syllable = syllable.replace("un",
"uen").replace("ui",
"uei").replace("iu", "iou")
syllable = syllable.replace("un", "uen").replace(
"ui", "uei").replace("iu", "iou")
# rule for variants of i
syllable = syllable.replace("zi", "zii").replace("ci", "cii").replace("si", "sii")\

View File

@ -1,3 +1,17 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import List
from pypinyin import lazy_pinyin, Style
@ -7,7 +21,6 @@ def convert_to_pinyin(text: str) -> List[str]:
"""convert text into list of syllables, other characters that are not chinese, thus
cannot be converted to pinyin are splited.
"""
syllables = lazy_pinyin(text,
style=Style.TONE3,
neutral_tone_with_five=True)
syllables = lazy_pinyin(
text, style=Style.TONE3, neutral_tone_with_five=True)
return syllables
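A quick usage illustration of the `lazy_pinyin` call above (assuming pypinyin is installed; the input string is arbitrary):
```python
from pypinyin import lazy_pinyin, Style

# tone numbers are appended to each syllable with Style.TONE3
print(lazy_pinyin("你好", style=Style.TONE3, neutral_tone_with_five=True))
# ['ni3', 'hao3']
```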

View File

@ -62,8 +62,7 @@ _C.model = CN(
# whether to use a classifier to predict stop probability
use_stop_token=False,
# whether to use guided attention loss in training
use_guided_attention_loss=True,
))
use_guided_attention_loss=True, ))
_C.training = CN(
dict(

View File

@ -1,3 +1,17 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import multiprocessing as mp
from functools import partial
@ -12,8 +26,11 @@ import tqdm
from config import get_cfg_defaults
def extract_mel(fname: Path, input_dir: Path, output_dir: Path,
p: AudioProcessor, n: NormalizerBase):
def extract_mel(fname: Path,
input_dir: Path,
output_dir: Path,
p: AudioProcessor,
n: NormalizerBase):
relative_path = fname.relative_to(input_dir)
out_path = (output_dir / relative_path).with_suffix(".npy")
out_path.parent.mkdir(parents=True, exist_ok=True)
@ -34,41 +51,37 @@ def extract_mel_multispeaker(config, input_dir, output_dir, extension=".wav"):
config.fmax)
n = LogMagnitude(1e-5)
func = partial(extract_mel,
input_dir=input_dir,
output_dir=output_dir,
p=p,
n=n)
func = partial(
extract_mel, input_dir=input_dir, output_dir=output_dir, p=p, n=n)
with mp.Pool(16) as pool:
list(
tqdm.tqdm(pool.imap(func, fnames),
total=len(fnames),
unit="utterance"))
tqdm.tqdm(
pool.imap(func, fnames), total=len(fnames), unit="utterance"))
if __name__ == "__main__":
parser = argparse.ArgumentParser(
description=
"Extract mel spectrogram from processed wav in AiShell3 training dataset."
description="Extract mel spectrogram from processed wav in AiShell3 training dataset."
)
parser.add_argument(
"--config",
type=str,
help="yaml config file to overwrite the default config")
parser.add_argument("--input",
type=str,
default="~/datasets/aishell3/train/normalized_wav",
help="path of the processed wav folder")
parser.add_argument("--output",
type=str,
default="~/datasets/aishell3/train/mel",
help="path of the folder to save mel spectrograms")
parser.add_argument(
"--input",
type=str,
default="~/datasets/aishell3/train/normalized_wav",
help="path of the processed wav folder")
parser.add_argument(
"--output",
type=str,
default="~/datasets/aishell3/train/mel",
help="path of the folder to save mel spectrograms")
parser.add_argument(
"--opts",
nargs=argparse.REMAINDER,
help=
"options to overwrite --config file and the default config, passing in KEY VALUE pairs"
help="options to overwrite --config file and the default config, passing in KEY VALUE pairs"
)
default_config = get_cfg_defaults()

View File

@ -1,3 +1,17 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
from pathlib import Path
import re
@ -107,9 +121,8 @@ def convert(syllable):
syllable = syllable.replace("ing", "ieng").replace("in", "ien")
# expansion for un, ui, iu
syllable = syllable.replace("un",
"uen").replace("ui",
"uei").replace("iu", "iou")
syllable = syllable.replace("un", "uen").replace(
"ui", "uei").replace("iu", "iou")
# rule for variants of i
syllable = syllable.replace("zi", "zii").replace("ci", "cii").replace("si", "sii")\
@ -218,18 +231,15 @@ def process_aishell3(dataset_root, output_dir):
pickle.dump(processed_records, f)
with open(output_dir / "metadata.yaml", 'wt', encoding="utf-8") as f:
yaml.safe_dump(processed_records,
f,
default_flow_style=None,
allow_unicode=True)
yaml.safe_dump(
processed_records, f, default_flow_style=None, allow_unicode=True)
print("metadata done!")
if __name__ == "__main__":
parser = argparse.ArgumentParser(
description=
"Preprocess transcription of AiShell3 and save them in a compact file(yaml and pickle)."
description="Preprocess transcription of AiShell3 and save them in a compact file(yaml and pickle)."
)
parser.add_argument(
"--input",

View File

@ -1,3 +1,17 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
from pathlib import Path
from multiprocessing import Pool
@ -47,34 +61,36 @@ def preprocess_aishell3(source_dir, target_dir, alignment_dir):
wav_paths = list(source_dir.rglob("*.wav"))
print(f"there are {len(wav_paths)} audio files in total")
fx = partial(process_utterance,
source_dir=source_dir,
target_dir=target_dir,
alignment_dir=alignment_dir)
fx = partial(
process_utterance,
source_dir=source_dir,
target_dir=target_dir,
alignment_dir=alignment_dir)
with Pool(16) as p:
list(
tqdm(p.imap(fx, wav_paths), total=len(wav_paths),
unit="utterance"))
tqdm(
p.imap(fx, wav_paths), total=len(wav_paths), unit="utterance"))
if __name__ == "__main__":
parser = argparse.ArgumentParser(
description=
"Process audio in AiShell3, trim silence according to the alignment "
description="Process audio in AiShell3, trim silence according to the alignment "
"files generated by MFA, and normalize volume by peak.")
parser.add_argument("--input",
type=str,
default="~/datasets/aishell3/train/wav",
help="path of the original audio folder in aishell3.")
parser.add_argument(
"--input",
type=str,
default="~/datasets/aishell3/train/wav",
help="path of the original audio folder in aishell3.")
parser.add_argument(
"--output",
type=str,
default="~/datasets/aishell3/train/normalized_wav",
help="path of the folder to save the processed audio files.")
parser.add_argument("--alignment",
type=str,
default="~/datasets/aishell3/train/alignment",
help="path of the alignment files.")
parser.add_argument(
"--alignment",
type=str,
default="~/datasets/aishell3/train/alignment",
help="path of the alignment files.")
args = parser.parse_args()
preprocess_aishell3(args.input, args.output, args.alignment)

View File

@ -53,12 +53,13 @@ class Experiment(ExperimentBase):
self.optimizer.clear_grad()
self.model.train()
texts, tones, mels, utterance_embeds, text_lens, output_lens, stop_tokens = batch
outputs = self.model(texts,
text_lens,
mels,
output_lens,
tones=tones,
global_condition=utterance_embeds)
outputs = self.model(
texts,
text_lens,
mels,
output_lens,
tones=tones,
global_condition=utterance_embeds)
losses = self.compute_losses(batch, outputs)
loss = losses["loss"]
loss.backward()
@ -86,12 +87,13 @@ class Experiment(ExperimentBase):
valid_losses = defaultdict(list)
for i, batch in enumerate(self.valid_loader):
texts, tones, mels, utterance_embeds, text_lens, output_lens, stop_tokens = batch
outputs = self.model(texts,
text_lens,
mels,
output_lens,
tones=tones,
global_condition=utterance_embeds)
outputs = self.model(
texts,
text_lens,
mels,
output_lens,
tones=tones,
global_condition=utterance_embeds)
losses = self.compute_losses(batch, outputs)
for key, value in losses.items():
valid_losses[key].append(float(value))
@ -132,9 +134,8 @@ class Experiment(ExperimentBase):
mel_dir.mkdir(parents=True, exist_ok=True)
for i, batch in enumerate(self.test_loader):
texts, tones, mels, utterance_embeds, *_ = batch
outputs = self.model.infer(texts,
tones=tones,
global_condition=utterance_embeds)
outputs = self.model.infer(
texts, tones=tones, global_condition=utterance_embeds)
display.plot_alignment(outputs["alignments"][0].numpy().T)
plt.savefig(mel_dir / f"sentence_{i}.png")
@ -168,8 +169,7 @@ class Experiment(ExperimentBase):
p_decoder_dropout=config.model.p_decoder_dropout,
p_postnet_dropout=config.model.p_postnet_dropout,
d_global_condition=config.model.d_global_condition,
use_stop_token=config.model.use_stop_token,
)
use_stop_token=config.model.use_stop_token, )
if self.parallel:
model = paddle.DataParallel(model)
@ -200,32 +200,34 @@ class Experiment(ExperimentBase):
batch_fn = collate_aishell3_examples
if not self.parallel:
self.train_loader = DataLoader(train_set,
batch_size=config.data.batch_size,
shuffle=True,
drop_last=True,
collate_fn=batch_fn)
self.train_loader = DataLoader(
train_set,
batch_size=config.data.batch_size,
shuffle=True,
drop_last=True,
collate_fn=batch_fn)
else:
sampler = DistributedBatchSampler(
train_set,
batch_size=config.data.batch_size,
shuffle=True,
drop_last=True)
self.train_loader = DataLoader(train_set,
batch_sampler=sampler,
collate_fn=batch_fn)
self.train_loader = DataLoader(
train_set, batch_sampler=sampler, collate_fn=batch_fn)
self.valid_loader = DataLoader(valid_set,
batch_size=config.data.batch_size,
shuffle=False,
drop_last=False,
collate_fn=batch_fn)
self.valid_loader = DataLoader(
valid_set,
batch_size=config.data.batch_size,
shuffle=False,
drop_last=False,
collate_fn=batch_fn)
self.test_loader = DataLoader(valid_set,
batch_size=1,
shuffle=False,
drop_last=False,
collate_fn=batch_fn)
self.test_loader = DataLoader(
valid_set,
batch_size=1,
shuffle=False,
drop_last=False,
collate_fn=batch_fn)
def main_sp(config, args):

View File

@ -14,7 +14,7 @@ wget https://data.keithito.com/data/speech/LJSpeech-1.1.tar.bz2
tar xjvf LJSpeech-1.1.tar.bz2
```
### Preprocess the dataset.
Assume the path to save the preprocessed dataset is `ljspeech_transformer_tts`. Run the command below to preprocess the dataset.
@ -49,4 +49,4 @@ python synthesize.py --input=sentence.txt --output=mels/ --checkpoint_path='step
## Pretrained Model
Pretrained model can be downloaded here. [transformer_tts_ljspeech_ckpt_0.3.zip](https://paddlespeech.bj.bcebos.com/Parakeet/transformer_tts_ljspeech_ckpt_0.3.zip).

View File

@ -23,6 +23,7 @@ from parakeet.data.batch import batch_spec, batch_text_id
class LJSpeech(Dataset):
"""A simple dataset adaptor for the processed ljspeech dataset."""
def __init__(self, root):
self.root = Path(root).expanduser()
records = []
@ -64,6 +65,7 @@ class Transform(object):
class LJSpeechCollector(object):
"""A simple callable to batch LJSpeech examples."""
def __init__(self, padding_idx=0, padding_value=0.):
self.padding_idx = padding_idx
self.padding_value = padding_value

View File

@ -35,13 +35,14 @@ def create_dataset(config, source_path, target_path, verbose=False):
meta_data = LJSpeechMetaData(source_path)
frontend = English()
processor = AudioProcessor(sample_rate=config.data.sample_rate,
n_fft=config.data.n_fft,
n_mels=config.data.d_mel,
win_length=config.data.win_length,
hop_length=config.data.hop_length,
fmax=config.data.fmax,
fmin=config.data.fmin)
processor = AudioProcessor(
sample_rate=config.data.sample_rate,
n_fft=config.data.n_fft,
n_mels=config.data.d_mel,
win_length=config.data.win_length,
hop_length=config.data.hop_length,
fmax=config.data.fmax,
fmin=config.data.fmin)
normalizer = LogMagnitude()
records = []
@ -80,26 +81,22 @@ def create_dataset(config, source_path, target_path, verbose=False):
if __name__ == "__main__":
parser = argparse.ArgumentParser(description="create dataset")
parser.add_argument("--config",
type=str,
metavar="FILE",
help="extra config to overwrite the default config")
parser.add_argument("--input",
type=str,
help="path of the ljspeech dataset")
parser.add_argument("--output",
type=str,
help="path to save output dataset")
parser.add_argument(
"--config",
type=str,
metavar="FILE",
help="extra config to overwrite the default config")
parser.add_argument(
"--input", type=str, help="path of the ljspeech dataset")
parser.add_argument(
"--output", type=str, help="path to save output dataset")
parser.add_argument(
"--opts",
nargs=argparse.REMAINDER,
help=
"options to overwrite --config file and the default config, passing in KEY VALUE pairs"
help="options to overwrite --config file and the default config, passing in KEY VALUE pairs"
)
parser.add_argument("-v",
"--verbose",
action="store_true",
help="print msg")
parser.add_argument(
"-v", "--verbose", action="store_true", help="print msg")
config = get_cfg_defaults()
args = parser.parse_args()

View File

@ -73,29 +73,24 @@ if __name__ == "__main__":
parser = argparse.ArgumentParser(
description="generate mel spectrogram with TransformerTTS.")
parser.add_argument("--config",
type=str,
metavar="FILE",
help="extra config to overwrite the default config")
parser.add_argument("--checkpoint_path",
type=str,
help="path of the checkpoint to load.")
parser.add_argument(
"--config",
type=str,
metavar="FILE",
help="extra config to overwrite the default config")
parser.add_argument(
"--checkpoint_path", type=str, help="path of the checkpoint to load.")
parser.add_argument("--input", type=str, help="path of the text sentences")
parser.add_argument("--output", type=str, help="path to save outputs")
parser.add_argument("--device",
type=str,
default="cpu",
help="device type to use.")
parser.add_argument(
"--device", type=str, default="cpu", help="device type to use.")
parser.add_argument(
"--opts",
nargs=argparse.REMAINDER,
help=
"options to overwrite --config file and the default config, passing in KEY VALUE pairs"
help="options to overwrite --config file and the default config, passing in KEY VALUE pairs"
)
parser.add_argument("-v",
"--verbose",
action="store_true",
help="print msg")
parser.add_argument(
"-v", "--verbose", action="store_true", help="print msg")
args = parser.parse_args()
if args.config:

View File

@ -53,11 +53,12 @@ class TransformerTTSExperiment(ExperimentBase):
dropout=config.model.dropout)
if self.parallel:
model = paddle.DataParallel(model)
optimizer = paddle.optimizer.Adam(learning_rate=config.training.lr,
beta1=0.9,
beta2=0.98,
epsilon=1e-9,
parameters=model.parameters())
optimizer = paddle.optimizer.Adam(
learning_rate=config.training.lr,
beta1=0.9,
beta2=0.98,
epsilon=1e-9,
parameters=model.parameters())
criterion = TransformerTTSLoss(config.model.stop_loss_scale)
drop_n_heads = scheduler.StepWise(config.training.drop_n_heads)
reduction_factor = scheduler.StepWise(config.training.reduction_factor)
@ -82,11 +83,12 @@ class TransformerTTSExperiment(ExperimentBase):
batch_fn = LJSpeechCollector(padding_idx=config.data.padding_idx)
if not self.parallel:
train_loader = DataLoader(train_set,
batch_size=config.data.batch_size,
shuffle=True,
drop_last=True,
collate_fn=batch_fn)
train_loader = DataLoader(
train_set,
batch_size=config.data.batch_size,
shuffle=True,
drop_last=True,
collate_fn=batch_fn)
else:
sampler = DistributedBatchSampler(
train_set,
@ -95,21 +97,20 @@ class TransformerTTSExperiment(ExperimentBase):
rank=dist.get_rank(),
shuffle=True,
drop_last=True)
train_loader = DataLoader(train_set,
batch_sampler=sampler,
collate_fn=batch_fn)
train_loader = DataLoader(
train_set, batch_sampler=sampler, collate_fn=batch_fn)
valid_loader = DataLoader(valid_set,
batch_size=config.data.batch_size,
collate_fn=batch_fn)
valid_loader = DataLoader(
valid_set, batch_size=config.data.batch_size, collate_fn=batch_fn)
self.train_loader = train_loader
self.valid_loader = valid_loader
def compute_outputs(self, text, mel):
model_core = self.model._layers if self.parallel else self.model
model_core.set_constants(self.reduction_factor(self.iteration),
self.drop_n_heads(self.iteration))
model_core.set_constants(
self.reduction_factor(self.iteration),
self.drop_n_heads(self.iteration))
mel_input = mel[:, :-1, :]
reduced_mel_input = mel_input[:, ::model_core.r, :]
@ -126,10 +127,9 @@ class TransformerTTSExperiment(ExperimentBase):
stop_logits = outputs["stop_logits"]
time_steps = mel_target.shape[1]
losses = self.criterion(mel_output[:, :time_steps, :],
mel_intermediate[:, :time_steps, :],
mel_target, stop_logits[:, :time_steps, :],
stop_label_target)
losses = self.criterion(
mel_output[:, :time_steps, :], mel_intermediate[:, :time_steps, :],
mel_target, stop_logits[:, :time_steps, :], stop_label_target)
return losses
def train_batch(self):

View File

@ -14,7 +14,7 @@ wget https://data.keithito.com/data/speech/LJSpeech-1.1.tar.bz2
tar xjvf LJSpeech-1.1.tar.bz2
```
### Preprocess the dataset.
Assume the path to save the preprocessed dataset is `ljspeech_waveflow`. Run the command below to preprocess the dataset.
@ -49,4 +49,4 @@ python synthesize.py --input=mels/ --output=wavs/ --checkpoint_path='step-200000
## Pretrained Model
A pretrained model with 128 residual channels can be downloaded here. [waveflow_ljspeech_ckpt_0.3.zip](https://paddlespeech.bj.bcebos.com/Parakeet/waveflow_ljspeech_ckpt_0.3.zip).

View File

@ -23,12 +23,14 @@ from parakeet.data.batch import batch_spec, batch_wav
class LJSpeech(Dataset):
"""A simple dataset adaptor for the processed ljspeech dataset."""
def __init__(self, root):
self.root = Path(root).expanduser()
meta_data = pandas.read_csv(str(self.root / "metadata.csv"),
sep="\t",
header=None,
names=["fname", "frames", "samples"])
meta_data = pandas.read_csv(
str(self.root / "metadata.csv"),
sep="\t",
header=None,
names=["fname", "frames", "samples"])
records = []
for row in meta_data.itertuples():
@ -49,6 +51,7 @@ class LJSpeech(Dataset):
class LJSpeechCollector(object):
"""A simple callable to batch LJSpeech examples."""
def __init__(self, padding_value=0.):
self.padding_value = padding_value

View File

@ -70,11 +70,12 @@ class Transform(object):
# Compute mel-spectrogram.
# Turn center to False to prevent internal padding.
spectrogram = librosa.core.stft(wav,
hop_length=hop_length,
win_length=win_length,
n_fft=n_fft,
center=False)
spectrogram = librosa.core.stft(
wav,
hop_length=hop_length,
win_length=win_length,
n_fft=n_fft,
center=False)
spectrogram_magnitude = np.abs(spectrogram)
# Compute mel-spectrograms.
@ -123,10 +124,8 @@ def create_dataset(config, input_dir, output_dir):
file_names.append((base_name, mel.shape[-1], audio.shape[-1]))
meta_data = pd.DataFrame.from_records(file_names)
meta_data.to_csv(str(output_dir / "metadata.csv"),
sep="\t",
index=None,
header=None)
meta_data.to_csv(
str(output_dir / "metadata.csv"), sep="\t", index=None, header=None)
print("saved meta data in to {}".format(
os.path.join(output_dir, "metadata.csv")))
@ -135,26 +134,22 @@ def create_dataset(config, input_dir, output_dir):
if __name__ == "__main__":
parser = argparse.ArgumentParser(description="create dataset")
parser.add_argument("--config",
type=str,
metavar="FILE",
help="extra config to overwrite the default config")
parser.add_argument("--input",
type=str,
help="path of the ljspeech dataset")
parser.add_argument("--output",
type=str,
help="path to save output dataset")
parser.add_argument(
"--config",
type=str,
metavar="FILE",
help="extra config to overwrite the default config")
parser.add_argument(
"--input", type=str, help="path of the ljspeech dataset")
parser.add_argument(
"--output", type=str, help="path to save output dataset")
parser.add_argument(
"--opts",
nargs=argparse.REMAINDER,
help=
"options to overwrite --config file and the default config, passing in KEY VALUE pairs"
help="options to overwrite --config file and the default config, passing in KEY VALUE pairs"
)
parser.add_argument("-v",
"--verbose",
action="store_true",
help="print msg")
parser.add_argument(
"-v", "--verbose", action="store_true", help="print msg")
config = get_cfg_defaults()
args = parser.parse_args()

View File

@ -39,8 +39,8 @@ def main(config, args):
mel = np.load(str(file_path))
with paddle.amp.auto_cast():
audio = model.predict(mel)
audio_path = output_dir / (os.path.splitext(file_path.name)[0] +
".wav")
audio_path = output_dir / (
os.path.splitext(file_path.name)[0] + ".wav")
sf.write(audio_path, audio, config.data.sample_rate)
print("[synthesize] {} -> {}".format(file_path, audio_path))
@ -50,32 +50,27 @@ if __name__ == "__main__":
parser = argparse.ArgumentParser(
description="generate mel spectrogram with TransformerTTS.")
parser.add_argument("--config",
type=str,
metavar="FILE",
help="extra config to overwrite the default config")
parser.add_argument("--checkpoint_path",
type=str,
help="path of the checkpoint to load.")
parser.add_argument(
"--config",
type=str,
metavar="FILE",
help="extra config to overwrite the default config")
parser.add_argument(
"--checkpoint_path", type=str, help="path of the checkpoint to load.")
parser.add_argument(
"--input",
type=str,
help="path of directory containing mel spectrogram (in .npy format)")
parser.add_argument("--output", type=str, help="path to save outputs")
parser.add_argument("--device",
type=str,
default="cpu",
help="device type to use.")
parser.add_argument(
"--device", type=str, default="cpu", help="device type to use.")
parser.add_argument(
"--opts",
nargs=argparse.REMAINDER,
help=
"options to overwrite --config file and the default config, passing in KEY VALUE pairs"
help="options to overwrite --config file and the default config, passing in KEY VALUE pairs"
)
parser.add_argument("-v",
"--verbose",
action="store_true",
help="print msg")
parser.add_argument(
"-v", "--verbose", action="store_true", help="print msg")
args = parser.parse_args()
if args.config:

View File

@ -43,8 +43,8 @@ class Experiment(ExperimentBase):
if self.parallel:
model = paddle.DataParallel(model)
optimizer = paddle.optimizer.Adam(config.training.lr,
parameters=model.parameters())
optimizer = paddle.optimizer.Adam(
config.training.lr, parameters=model.parameters())
criterion = WaveFlowLoss(sigma=config.model.sigma)
self.model = model
@ -63,11 +63,12 @@ class Experiment(ExperimentBase):
config.data.hop_length)
if not self.parallel:
train_loader = DataLoader(train_set,
batch_size=config.data.batch_size,
shuffle=True,
drop_last=True,
collate_fn=batch_fn)
train_loader = DataLoader(
train_set,
batch_size=config.data.batch_size,
shuffle=True,
drop_last=True,
collate_fn=batch_fn)
else:
sampler = DistributedBatchSampler(
train_set,
@ -76,14 +77,12 @@ class Experiment(ExperimentBase):
rank=dist.get_rank(),
shuffle=True,
drop_last=True)
train_loader = DataLoader(train_set,
batch_sampler=sampler,
collate_fn=batch_fn)
train_loader = DataLoader(
train_set, batch_sampler=sampler, collate_fn=batch_fn)
valid_batch_fn = LJSpeechCollector()
valid_loader = DataLoader(valid_set,
batch_size=1,
collate_fn=valid_batch_fn)
valid_loader = DataLoader(
valid_set, batch_size=1, collate_fn=valid_batch_fn)
self.train_loader = train_loader
self.valid_loader = valid_loader

View File

@ -25,9 +25,9 @@ class AudioProcessor(object):
n_fft: int,
win_length: int,
hop_length: int,
n_mels: int = 80,
fmin: int = 0,
fmax: int = None,
n_mels: int=80,
fmin: int=0,
fmax: int=None,
window="hann",
center=True,
pad_mode="reflect",
@ -73,21 +73,23 @@ class AudioProcessor(object):
sf.write(path, wav, samplerate=self.sample_rate)
def stft(self, wav):
D = librosa.core.stft(wav,
n_fft=self.n_fft,
hop_length=self.hop_length,
win_length=self.win_length,
window=self.window,
center=self.center,
pad_mode=self.pad_mode)
D = librosa.core.stft(
wav,
n_fft=self.n_fft,
hop_length=self.hop_length,
win_length=self.win_length,
window=self.window,
center=self.center,
pad_mode=self.pad_mode)
return D
def istft(self, D):
wav = librosa.core.istft(D,
hop_length=self.hop_length,
win_length=self.win_length,
window=self.window,
center=self.center)
wav = librosa.core.istft(
D,
hop_length=self.hop_length,
win_length=self.win_length,
window=self.window,
center=self.center)
return wav
def spectrogram(self, wav):

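The `stft`/`istft` pair reformatted above is a thin wrapper around librosa. A minimal round-trip sketch, with illustrative parameter values rather than values taken from any Parakeet config, would look like this:

```python
import numpy as np
import librosa

sr = 22050
# a 1-second sine wave standing in for real speech
wav = np.sin(2 * np.pi * 220 * np.arange(sr) / sr).astype(np.float32)

# forward STFT with the same keyword arguments AudioProcessor forwards to librosa
D = librosa.stft(wav, n_fft=1024, hop_length=256, win_length=1024,
                 window="hann", center=True, pad_mode="reflect")
# inverse STFT reconstructs the waveform up to edge effects
recon = librosa.istft(D, hop_length=256, win_length=1024, window="hann", center=True)
print(np.abs(wav[:len(recon)] - recon).max())  # close to 0
```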
View File

@@ -11,7 +11,6 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
This module contains normalizers for spectrogram magnitude.
Normalizers are invertible transformations. They can be used to process

View File

@@ -1,3 +1,16 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Parakeet's infrastructure for data processing.
"""
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.

View File

@@ -26,7 +26,11 @@ class AudioSegmentDataset(Dataset):
"""A simple dataset adaptor for audio files to train vocoders.
Read -> trim silence -> normalize -> extract a segment
"""
def __init__(self, file_paths: List[Path], sample_rate: int, length: int,
def __init__(self,
file_paths: List[Path],
sample_rate: int,
length: int,
top_db: float):
self.file_paths = file_paths
self.sr = sample_rate
@@ -56,10 +60,11 @@ class AudioDataset(Dataset):
"""A simple dataset adaptor for the audio files.
Read -> trim silence -> normalize
"""
def __init__(self,
file_paths: List[Path],
sample_rate: int,
top_db: float = 60):
top_db: float=60):
self.file_paths = file_paths
self.sr = sample_rate
self.top_db = top_db
@@ -78,12 +83,11 @@ class AudioDataset(Dataset):
class AudioFolderDataset(AudioDataset):
def __init__(
self,
root,
sample_rate,
top_db=60,
extension=".wav",
):
self,
root,
sample_rate,
top_db=60,
extension=".wav", ):
root = Path(root).expanduser()
file_paths = sorted(list(root.rglob("*{}".format(extension))))
super().__init__(file_paths, sample_rate, top_db)

View File

@@ -1,3 +1,17 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from parakeet.frontend.phonectic import Phonetics
"""
A phonology system with ARPABET symbols and limited punctuations. The G2P
@@ -10,7 +24,6 @@ from parakeet.frontend.vocab import Vocab
from g2p_en import G2p
class ARPABET(Phonetics):
"""A phonology for English that uses ARPABET as the phoneme vocabulary.
See http://www.speech.cs.cmu.edu/cgi-bin/cmudict for more details.
@@ -132,7 +145,9 @@ class ARPABET(Phonetics):
List[str]
The list of pronunciation sequence.
"""
phonemes = [self._remove_vowels(item) for item in self.backend(sentence)]
phonemes = [
self._remove_vowels(item) for item in self.backend(sentence)
]
if add_start_end:
start = self.vocab.start_symbol
end = self.vocab.end_symbol
@@ -184,7 +199,9 @@ class ARPABET(Phonetics):
List[str]
The list of pronunciation id sequence.
"""
return self.numericalize(self.phoneticize(sentence, add_start_end=add_start_end))
return self.numericalize(
self.phoneticize(
sentence, add_start_end=add_start_end))
@property
def vocab_size(self):
@@ -206,7 +223,7 @@ class ARPABETWithStress(Phonetics):
]
punctuations = [',', '.', '?', '!']
symbols = phonemes + punctuations
def __init__(self):
self.backend = G2p()
self.vocab = Vocab(self.phonemes + self.punctuations)
@@ -276,11 +293,13 @@ class ARPABETWithStress(Phonetics):
List[str]
The list of pronunciation id sequence.
"""
return self.numericalize(self.phoneticize(sentence, add_start_end=add_start_end))
return self.numericalize(
self.phoneticize(
sentence, add_start_end=add_start_end))
@property
def vocab_size(self):
""" Vocab size.
"""
# 77 = 69 phones + 4 punctuations + 4 special tokens
return len(self.vocab)
return len(self.vocab)

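For reference, the `g2p_en` backend that ARPABET wraps is simply callable on raw text. A small standalone sketch (the printed output is approximate):

```python
from g2p_en import G2p

g2p = G2p()
phones = g2p("How are you?")
# roughly: ['HH', 'AW1', ' ', 'AA1', 'R', ' ', 'Y', 'UW1', ' ', '?']
print(phones)
```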
View File

@@ -11,4 +11,3 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

View File

@@ -11,4 +11,3 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

View File

@@ -12,6 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
def full2half_width(ustr):
half = []
for u in ustr:

View File

@@ -1,3 +1,16 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
A Simple Chinese Phonology using pinyin symbols.
The G2P conversion converts pinyin string to symbols. Also it can handle string
@@ -32,6 +45,7 @@ _tones = ['0', '1', '2', '3', '4', '5']
_toned_finals = [final + tone for final, tone in product(_finals, _tones[1:])]
_toned_phonems = _initials + _toned_finals + _ernized_symbol + _punctuations
class ParakeetConverter(NeutralToneWith5Mixin, DefaultConverter):
pass
@@ -41,7 +55,7 @@ class ParakeetPinyin(Phonetics):
self.vocab_phonemes = Vocab(_phones)
self.vocab_tones = Vocab(_tones)
self.pinyin_backend = Pinyin(ParakeetConverter())
def convert_pypinyin_tone3(self, syllables, add_start_end=False):
phonemes, tones = _convert_to_parakeet_style_pinyin(syllables)
@@ -58,8 +72,7 @@ class ParakeetPinyin(Phonetics):
item for item in phonemes if item in self.vocab_phonemes.stoi
]
tones = [item for item in tones if item in self.vocab_tones.stoi]
return phonemes, tones
return phonemes, tones
def phoneticize(self, sentence, add_start_end=False):
""" Normalize the input text sequence and convert it into pronunciation sequence.
@@ -74,10 +87,10 @@ class ParakeetPinyin(Phonetics):
List[str]
The list of pronunciation sequence.
"""
syllables = self.pinyin_backend.lazy_pinyin(sentence,
style=Style.TONE3,
strict=True)
phonemes, tones = self.convert_pypinyin_tone3(syllables, add_start_end=add_start_end)
syllables = self.pinyin_backend.lazy_pinyin(
sentence, style=Style.TONE3, strict=True)
phonemes, tones = self.convert_pypinyin_tone3(
syllables, add_start_end=add_start_end)
return phonemes, tones
def numericalize(self, phonemes, tones):
@@ -110,8 +123,8 @@ class ParakeetPinyin(Phonetics):
List[str]
The list of pronunciation id sequence.
"""
phonemes, tones = self.phoneticize(sentence,
add_start_end=add_start_end)
phonemes, tones = self.phoneticize(
sentence, add_start_end=add_start_end)
phoneme_ids, tone_ids = self.numericalize(phonemes, tones)
return phoneme_ids, tone_ids
@@ -128,12 +141,11 @@ class ParakeetPinyin(Phonetics):
return len(self.vocab_tones)
class ParakeetPinyinWithTone(Phonetics):
def __init__(self):
self.vocab = Vocab(_toned_phonems)
self.pinyin_backend = Pinyin(ParakeetConverter())
def convert_pypinyin_tone3(self, syllables, add_start_end=False):
phonemes = _convert_to_parakeet_style_pinyin_with_tone(syllables)
@@ -142,11 +154,9 @@ class ParakeetPinyinWithTone(Phonetics):
end = self.vocab_phonemes.end_symbol
phonemes = [start] + phonemes + [end]
phonemes = [
item for item in phonemes if item in self.vocab.stoi
]
phonemes = [item for item in phonemes if item in self.vocab.stoi]
return phonemes
def phoneticize(self, sentence, add_start_end=False):
""" Normalize the input text sequence and convert it into pronunciation sequence.
@@ -160,10 +170,10 @@ class ParakeetPinyinWithTone(Phonetics):
List[str]
The list of pronunciation sequence.
"""
syllables = self.pinyin_backend.lazy_pinyin(sentence,
style=Style.TONE3,
strict=True)
phonemes = self.convert_pypinyin_tone3(syllables, add_start_end=add_start_end)
syllables = self.pinyin_backend.lazy_pinyin(
sentence, style=Style.TONE3, strict=True)
phonemes = self.convert_pypinyin_tone3(
syllables, add_start_end=add_start_end)
return phonemes
def numericalize(self, phonemes):
@@ -289,6 +299,7 @@ def _convert_to_parakeet_style_pinyin(syllables):
tones.extend(t)
return phones, tones
def _split_syllable_with_tone(syllable: str):
global _punctuations
@@ -311,10 +322,10 @@ def _split_syllable_with_tone(syllable: str):
phones.append(syllable)
return phones
def _convert_to_parakeet_style_pinyin_with_tone(syllables):
phones = []
for syllable in syllables:
p = _split_syllable_with_tone(syllable)
phones.extend(p)
return phones

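The pypinyin calls reformatted above use `Style.TONE3`, which appends the tone digit to each syllable. A minimal standalone sketch using the module-level helper (the diff itself goes through a customized `Pinyin(ParakeetConverter())` backend):

```python
from pypinyin import lazy_pinyin, Style

# "你好" -> ['ni3', 'hao3'] with TONE3; strict=True keeps standard initial/final splitting
print(lazy_pinyin("你好", style=Style.TONE3, strict=True))
```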
View File

@@ -1,3 +1,17 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
import paddle
from paddle import nn
@@ -23,9 +37,9 @@ class LSTMSpeakerEncoder(nn.Layer):
def forward(self, utterances, num_speakers, initial_states=None):
normalized_embeds = self.embed_sequences(utterances, initial_states)
embeds = normalized_embeds.reshape([num_speakers, -1, num_speakers])
loss, eer = self.loss(embeds)
loss, eer = self.loss(embeds)
return loss, eer
def embed_sequences(self, utterances, initial_states=None, reduce=False):
out, (h, c) = self.lstm(utterances, initial_states)
embeds = F.relu(self.linear(h[-1]))
@@ -35,7 +49,7 @@ class LSTMSpeakerEncoder(nn.Layer):
embed = F.normalize(embed, axis=0)
return embed
return normalized_embeds
def embed_utterance(self, utterances, initial_states=None):
# utterances: [B, T, C] -> embed [C']
embed = self.embed_sequences(utterances, initial_states, reduce=True)
@@ -47,37 +61,51 @@ class LSTMSpeakerEncoder(nn.Layer):
# Inclusive centroids (1 per speaker). Cloning is needed for reverse differentiation
centroids_incl = paddle.mean(embeds, axis=1)
centroids_incl_norm = paddle.norm(centroids_incl, p=2, axis=1, keepdim=True)
centroids_incl_norm = paddle.norm(
centroids_incl, p=2, axis=1, keepdim=True)
normalized_centroids_incl = centroids_incl / centroids_incl_norm
# Exclusive centroids (1 per utterance)
centroids_excl = paddle.broadcast_to(paddle.sum(embeds, axis=1, keepdim=True), embeds.shape) - embeds
centroids_excl = paddle.broadcast_to(
paddle.sum(embeds, axis=1, keepdim=True), embeds.shape) - embeds
centroids_excl /= (utterances_per_speaker - 1)
centroids_excl_norm = paddle.norm(centroids_excl, p=2, axis=2, keepdim=True)
centroids_excl_norm = paddle.norm(
centroids_excl, p=2, axis=2, keepdim=True)
normalized_centroids_excl = centroids_excl / centroids_excl_norm
p1 = paddle.matmul(embeds.reshape([-1, embed_dim]),
normalized_centroids_incl, transpose_y=True) # (NMN)
p1 = paddle.matmul(
embeds.reshape([-1, embed_dim]),
normalized_centroids_incl,
transpose_y=True) # (NMN)
p1 = p1.reshape([-1])
# print("p1: ", p1.shape)
p2 = paddle.bmm(embeds.reshape([-1, 1, embed_dim]),
normalized_centroids_excl.reshape([-1, embed_dim, 1])) # (NM, 1, 1)
p2 = p2.reshape([-1]) # NM)
p2 = paddle.bmm(
embeds.reshape([-1, 1, embed_dim]),
normalized_centroids_excl.reshape(
[-1, embed_dim, 1])) # (NM, 1, 1)
p2 = p2.reshape([-1]) # NM)
# begin: alternative implementation for scatter
with paddle.no_grad():
index = paddle.arange(0, speakers_per_batch * utterances_per_speaker, dtype="int64").reshape([speakers_per_batch, utterances_per_speaker])
index = index * speakers_per_batch + paddle.arange(0, speakers_per_batch, dtype="int64").unsqueeze(-1)
index = paddle.arange(
0, speakers_per_batch * utterances_per_speaker,
dtype="int64").reshape(
[speakers_per_batch, utterances_per_speaker])
index = index * speakers_per_batch + paddle.arange(
0, speakers_per_batch, dtype="int64").unsqueeze(-1)
index = paddle.reshape(index, [-1])
ones = paddle.ones([speakers_per_batch * utterances_per_speaker * speakers_per_batch])
ones = paddle.ones([
speakers_per_batch * utterances_per_speaker * speakers_per_batch
])
zeros = paddle.zeros_like(index, dtype=ones.dtype)
mask_p1 = paddle.scatter(ones, index, zeros)
p = p1 * mask_p1 + (1 - mask_p1) * paddle.scatter(ones, index, p2)
# end: alternative implementation for scatter
# p = paddle.scatter(p1, index, p2)
p = p * self.similarity_weight + self.similarity_bias # neg
p = p.reshape([speakers_per_batch * utterances_per_speaker, speakers_per_batch])
p = p * self.similarity_weight + self.similarity_bias # neg
p = p.reshape(
[speakers_per_batch * utterances_per_speaker, speakers_per_batch])
return p, p1, p2
def do_gradient_ops(self):
@@ -99,8 +127,10 @@ class LSTMSpeakerEncoder(nn.Layer):
sim_matrix, *_ = self.similarity_matrix(embeds)
sim_matrix = sim_matrix.reshape(
[speakers_per_batch * utterances_per_speaker, speakers_per_batch])
target = paddle.arange(0, speakers_per_batch, dtype="int64").unsqueeze(-1)
target = paddle.expand(target, [speakers_per_batch, utterances_per_speaker])
target = paddle.arange(
0, speakers_per_batch, dtype="int64").unsqueeze(-1)
target = paddle.expand(target,
[speakers_per_batch, utterances_per_speaker])
target = paddle.reshape(target, [-1])
loss = nn.CrossEntropyLoss()(sim_matrix, target)
@@ -113,9 +143,7 @@ class LSTMSpeakerEncoder(nn.Layer):
preds = sim_matrix.numpy()
# Snippet from https://yangcha.github.io/EER-ROC/
fpr, tpr, thresholds = roc_curve(labels.flatten(), preds.flatten())
fpr, tpr, thresholds = roc_curve(labels.flatten(), preds.flatten())
eer = brentq(lambda x: 1. - x - interp1d(fpr, tpr)(x), 0., 1.)
return loss, eer

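The similarity-matrix code reformatted above interleaves reshapes with a scatter trick, which obscures the underlying GE2E centroid arithmetic. A plain NumPy sketch of just that arithmetic (shapes and names are illustrative, not part of the repo):

```python
import numpy as np

N, M, D = 4, 5, 8                       # speakers, utterances per speaker, embedding dim
rng = np.random.default_rng(0)
embeds = rng.normal(size=(N, M, D))
embeds /= np.linalg.norm(embeds, axis=-1, keepdims=True)

# inclusive centroid: mean over all M utterances of a speaker
c_incl = embeds.mean(axis=1)                                      # (N, D)
# exclusive centroid: mean over the other M-1 utterances of the same speaker
c_excl = (embeds.sum(axis=1, keepdims=True) - embeds) / (M - 1)   # (N, M, D)

c_incl /= np.linalg.norm(c_incl, axis=-1, keepdims=True)
c_excl /= np.linalg.norm(c_excl, axis=-1, keepdims=True)

# p1: similarity of every utterance to every speaker's inclusive centroid -> (N*M, N)
p1 = embeds.reshape(-1, D) @ c_incl.T
# p2: similarity of every utterance to its own exclusive centroid -> (N*M,)
p2 = np.einsum("nmd,nmd->nm", embeds, c_excl).reshape(-1)
```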
View File

@@ -47,7 +47,11 @@ class DecoderPreNet(nn.Layer):
The dropout probability.
"""
def __init__(self, d_input: int, d_hidden: int, d_output: int,
def __init__(self,
d_input: int,
d_hidden: int,
d_output: int,
dropout_rate: float):
super().__init__()
@@ -70,12 +74,10 @@
"""
x = F.dropout(F.relu(self.linear1(x)),
self.dropout_rate,
training=True)
output = F.dropout(F.relu(self.linear2(x)),
self.dropout_rate,
training=True)
x = F.dropout(
F.relu(self.linear1(x)), self.dropout_rate, training=True)
output = F.dropout(
F.relu(self.linear2(x)), self.dropout_rate, training=True)
return output
@@ -100,8 +102,13 @@ class DecoderPostNet(nn.Layer):
The dropout probability.
"""
def __init__(self, d_mels: int, d_hidden: int, kernel_size: int,
num_layers: int, dropout: float):
def __init__(self,
d_mels: int,
d_hidden: int,
kernel_size: int,
num_layers: int,
dropout: float):
super().__init__()
self.dropout = dropout
self.num_layers = num_layers
@@ -111,31 +118,33 @@
self.conv_batchnorms = nn.LayerList()
k = math.sqrt(1.0 / (d_mels * kernel_size))
self.conv_batchnorms.append(
Conv1dBatchNorm(d_mels,
d_hidden,
kernel_size=kernel_size,
padding=padding,
bias_attr=I.Uniform(-k, k),
data_format='NLC'))
Conv1dBatchNorm(
d_mels,
d_hidden,
kernel_size=kernel_size,
padding=padding,
bias_attr=I.Uniform(-k, k),
data_format='NLC'))
k = math.sqrt(1.0 / (d_hidden * kernel_size))
self.conv_batchnorms.extend([
Conv1dBatchNorm(d_hidden,
d_hidden,
kernel_size=kernel_size,
padding=padding,
bias_attr=I.Uniform(-k, k),
data_format='NLC')
for i in range(1, num_layers - 1)
Conv1dBatchNorm(
d_hidden,
d_hidden,
kernel_size=kernel_size,
padding=padding,
bias_attr=I.Uniform(-k, k),
data_format='NLC') for i in range(1, num_layers - 1)
])
self.conv_batchnorms.append(
Conv1dBatchNorm(d_hidden,
d_mels,
kernel_size=kernel_size,
padding=padding,
bias_attr=I.Uniform(-k, k),
data_format='NLC'))
Conv1dBatchNorm(
d_hidden,
d_mels,
kernel_size=kernel_size,
padding=padding,
bias_attr=I.Uniform(-k, k),
data_format='NLC'))
def forward(self, x):
"""Calculate forward propagation.
@@ -153,12 +162,14 @@ class DecoderPostNet(nn.Layer):
"""
for i in range(len(self.conv_batchnorms) - 1):
x = F.dropout(F.tanh(self.conv_batchnorms[i](x)),
self.dropout,
training=self.training)
output = F.dropout(self.conv_batchnorms[self.num_layers - 1](x),
self.dropout,
training=self.training)
x = F.dropout(
F.tanh(self.conv_batchnorms[i](x)),
self.dropout,
training=self.training)
output = F.dropout(
self.conv_batchnorms[self.num_layers - 1](x),
self.dropout,
training=self.training)
return output
@@ -179,26 +190,30 @@ class Tacotron2Encoder(nn.Layer):
p_dropout: float
The dropout probability.
"""
def __init__(self, d_hidden: int, conv_layers: int, kernel_size: int,
def __init__(self,
d_hidden: int,
conv_layers: int,
kernel_size: int,
p_dropout: float):
super().__init__()
k = math.sqrt(1.0 / (d_hidden * kernel_size))
self.conv_batchnorms = paddle.nn.LayerList([
Conv1dBatchNorm(d_hidden,
d_hidden,
kernel_size,
stride=1,
padding=int((kernel_size - 1) / 2),
bias_attr=I.Uniform(-k, k),
data_format='NLC') for i in range(conv_layers)
Conv1dBatchNorm(
d_hidden,
d_hidden,
kernel_size,
stride=1,
padding=int((kernel_size - 1) / 2),
bias_attr=I.Uniform(-k, k),
data_format='NLC') for i in range(conv_layers)
])
self.p_dropout = p_dropout
self.hidden_size = int(d_hidden / 2)
self.lstm = nn.LSTM(d_hidden,
self.hidden_size,
direction="bidirectional")
self.lstm = nn.LSTM(
d_hidden, self.hidden_size, direction="bidirectional")
def forward(self, x, input_lens=None):
"""Calculate forward propagation of tacotron2 encoder.
@@ -218,9 +233,10 @@
"""
for conv_batchnorm in self.conv_batchnorms:
x = F.dropout(F.relu(conv_batchnorm(x)),
self.p_dropout,
training=self.training)
x = F.dropout(
F.relu(conv_batchnorm(x)),
self.p_dropout,
training=self.training)
output, _ = self.lstm(inputs=x, sequence_length=input_lens)
return output
@@ -271,6 +287,7 @@ class Tacotron2Decoder(nn.Layer):
Whether to use a binary classifier for stop token prediction.
Defaults to False
"""
def __init__(self,
d_mels: int,
reduction_factor: int,
@@ -284,7 +301,7 @@
p_prenet_dropout: float,
p_attention_dropout: float,
p_decoder_dropout: float,
use_stop_token: bool = False):
use_stop_token: bool=False):
super().__init__()
self.d_mels = d_mels
self.reduction_factor = reduction_factor
@@ -294,10 +311,11 @@
self.p_attention_dropout = p_attention_dropout
self.p_decoder_dropout = p_decoder_dropout
self.prenet = DecoderPreNet(d_mels * reduction_factor,
d_prenet,
d_prenet,
dropout_rate=p_prenet_dropout)
self.prenet = DecoderPreNet(
d_mels * reduction_factor,
d_prenet,
d_prenet,
dropout_rate=p_prenet_dropout)
# attention_rnn takes attention's context vector as an
# auxiliary input
@@ -367,9 +385,10 @@
# The first lstm layer (or spec encoder lstm)
_, (self.attention_hidden, self.attention_cell) = self.attention_rnn(
cell_input, (self.attention_hidden, self.attention_cell))
self.attention_hidden = F.dropout(self.attention_hidden,
self.p_attention_dropout,
training=self.training)
self.attention_hidden = F.dropout(
self.attention_hidden,
self.p_attention_dropout,
training=self.training)
# Location sensitive attention
attention_weights_cat = paddle.stack(
@@ -384,9 +403,10 @@
[self.attention_hidden, self.attention_context], axis=-1)
_, (self.decoder_hidden, self.decoder_cell) = self.decoder_rnn(
decoder_input, (self.decoder_hidden, self.decoder_cell))
self.decoder_hidden = F.dropout(self.decoder_hidden,
p=self.p_decoder_dropout,
training=self.training)
self.decoder_hidden = F.dropout(
self.decoder_hidden,
p=self.p_decoder_dropout,
training=self.training)
# decode output one step
decoder_hidden_attention_context = paddle.concat(
@@ -426,8 +446,8 @@
querys = paddle.reshape(
querys,
[querys.shape[0], querys.shape[1] // self.reduction_factor, -1])
start_step = paddle.zeros(shape=[querys.shape[0], 1, querys.shape[-1]],
dtype=querys.dtype)
start_step = paddle.zeros(
shape=[querys.shape[0], 1, querys.shape[-1]], dtype=querys.dtype)
querys = paddle.concat([start_step, querys], axis=1)
querys = self.prenet(querys)
@@ -604,43 +624,43 @@ class Tacotron2(nn.Layer):
outputs.
"""
def __init__(self,
vocab_size,
n_tones=None,
d_mels: int = 80,
d_encoder: int = 512,
encoder_conv_layers: int = 3,
encoder_kernel_size: int = 5,
d_prenet: int = 256,
d_attention_rnn: int = 1024,
d_decoder_rnn: int = 1024,
attention_filters: int = 32,
attention_kernel_size: int = 31,
d_attention: int = 128,
d_postnet: int = 512,
postnet_kernel_size: int = 5,
postnet_conv_layers: int = 5,
reduction_factor: int = 1,
p_encoder_dropout: float = 0.5,
p_prenet_dropout: float = 0.5,
p_attention_dropout: float = 0.1,
p_decoder_dropout: float = 0.1,
p_postnet_dropout: float = 0.5,
d_mels: int=80,
d_encoder: int=512,
encoder_conv_layers: int=3,
encoder_kernel_size: int=5,
d_prenet: int=256,
d_attention_rnn: int=1024,
d_decoder_rnn: int=1024,
attention_filters: int=32,
attention_kernel_size: int=31,
d_attention: int=128,
d_postnet: int=512,
postnet_kernel_size: int=5,
postnet_conv_layers: int=5,
reduction_factor: int=1,
p_encoder_dropout: float=0.5,
p_prenet_dropout: float=0.5,
p_attention_dropout: float=0.1,
p_decoder_dropout: float=0.1,
p_postnet_dropout: float=0.5,
d_global_condition=None,
use_stop_token=False):
super().__init__()
std = math.sqrt(2.0 / (vocab_size + d_encoder))
val = math.sqrt(3.0) * std # uniform bounds for std
self.embedding = nn.Embedding(vocab_size,
d_encoder,
weight_attr=I.Uniform(-val, val))
self.embedding = nn.Embedding(
vocab_size, d_encoder, weight_attr=I.Uniform(-val, val))
if n_tones:
self.embedding_tones = nn.Embedding(n_tones,
d_encoder,
padding_idx=0,
weight_attr=I.Uniform(
-0.1 * val, 0.1 * val))
self.embedding_tones = nn.Embedding(
n_tones,
d_encoder,
padding_idx=0,
weight_attr=I.Uniform(-0.1 * val, 0.1 * val))
self.toned = n_tones is not None
self.encoder = Tacotron2Encoder(d_encoder, encoder_conv_layers,
@@ -649,24 +669,26 @@ class Tacotron2(nn.Layer):
# input augmentation scheme: concat global condition to the encoder output
if d_global_condition is not None:
d_encoder += d_global_condition
self.decoder = Tacotron2Decoder(d_mels,
reduction_factor,
d_encoder,
d_prenet,
d_attention_rnn,
d_decoder_rnn,
d_attention,
attention_filters,
attention_kernel_size,
p_prenet_dropout,
p_attention_dropout,
p_decoder_dropout,
use_stop_token=use_stop_token)
self.postnet = DecoderPostNet(d_mels=d_mels * reduction_factor,
d_hidden=d_postnet,
kernel_size=postnet_kernel_size,
num_layers=postnet_conv_layers,
dropout=p_postnet_dropout)
self.decoder = Tacotron2Decoder(
d_mels,
reduction_factor,
d_encoder,
d_prenet,
d_attention_rnn,
d_decoder_rnn,
d_attention,
attention_filters,
attention_kernel_size,
p_prenet_dropout,
p_attention_dropout,
p_decoder_dropout,
use_stop_token=use_stop_token)
self.postnet = DecoderPostNet(
d_mels=d_mels * reduction_factor,
d_hidden=d_postnet,
kernel_size=postnet_kernel_size,
num_layers=postnet_conv_layers,
dropout=p_postnet_dropout)
def forward(self,
text_inputs,
@@ -729,15 +751,14 @@
[encoder_outputs, global_condition], -1)
# [B, T_enc, 1]
mask = sequence_mask(text_lens,
dtype=encoder_outputs.dtype).unsqueeze(-1)
mask = sequence_mask(
text_lens, dtype=encoder_outputs.dtype).unsqueeze(-1)
if self.decoder.use_stop_token:
mel_outputs, alignments, stop_logits = self.decoder(
encoder_outputs, mels, mask=mask)
else:
mel_outputs, alignments = self.decoder(encoder_outputs,
mels,
mask=mask)
mel_outputs, alignments = self.decoder(
encoder_outputs, mels, mask=mask)
mel_outputs_postnet = self.postnet(mel_outputs)
mel_outputs_postnet = mel_outputs + mel_outputs_postnet
@@ -863,6 +884,7 @@ class Tacotron2(nn.Layer):
class Tacotron2Loss(nn.Layer):
""" Tacotron2 Loss module
"""
def __init__(self,
use_stop_token_loss=True,
use_guided_attention_loss=False,

View File

@@ -321,10 +321,8 @@ class MLPPreNet(nn.Layer):
self.dropout = dropout
def forward(self, x, dropout):
l1 = F.dropout(
F.relu(self.lin1(x)), self.dropout, training=True)
l2 = F.dropout(
F.relu(self.lin2(l1)), self.dropout, training=True)
l1 = F.dropout(F.relu(self.lin1(x)), self.dropout, training=True)
l2 = F.dropout(F.relu(self.lin2(l1)), self.dropout, training=True)
l3 = self.lin3(l2)
return l3
@@ -403,7 +401,7 @@ class TransformerTTS(nn.Layer):
padding_idx=0,
weight_attr=I.Uniform(-0.005, 0.005))
else:
self.toned = False
self.toned = False
# position encoding matrix may be extended later
self.encoder_pe = pe.sinusoid_positional_encoding(0, 1000, d_encoder)
self.encoder_pe_scalar = self.create_parameter(
@@ -449,7 +447,8 @@
self.drop_n_heads = 0
def forward(self, text, mel, tones=None):
encoded, encoder_attention_weights, encoder_mask = self.encode(text, tones=tones)
encoded, encoder_attention_weights, encoder_mask = self.encode(
text, tones=tones)
mel_output, mel_intermediate, cross_attention_weights, stop_logits = self.decode(
encoded, mel, encoder_mask)
outputs = {
@@ -489,7 +488,8 @@
# twice its length if needed
if x.shape[1] * self.r > self.decoder_pe.shape[0]:
new_T = max(x.shape[1] * self.r, self.decoder_pe.shape[0] * 2)
self.decoder_pe = pe.sinusoid_positional_encoding(0, new_T, self.d_decoder)
self.decoder_pe = pe.sinusoid_positional_encoding(0, new_T,
self.d_decoder)
pos_enc = self.decoder_pe[:T_dec * self.r:self.r, :]
x = x.scale(math.sqrt(
self.d_decoder)) + pos_enc * self.decoder_pe_scalar

View File

@@ -78,6 +78,7 @@ class UpsampleNet(nn.LayerList):
---------
``librosa.core.stft``
"""
def __init__(self, upsample_factors):
super().__init__()
for factor in upsample_factors:
@@ -85,12 +86,13 @@
init = I.Uniform(-std, std)
self.append(
nn.utils.weight_norm(
nn.Conv2DTranspose(1,
1, (3, 2 * factor),
padding=(1, factor // 2),
stride=(1, factor),
weight_attr=init,
bias_attr=init)))
nn.Conv2DTranspose(
1,
1, (3, 2 * factor),
padding=(1, factor // 2),
stride=(1, factor),
weight_attr=init,
bias_attr=init)))
# upsample factors
self.upsample_factor = np.prod(upsample_factors)
@@ -149,6 +151,7 @@ class ResidualBlock(nn.Layer):
dilations : int
Dilations of the Convolution2d applied to the input.
"""
def __init__(self, channels, cond_channels, kernel_size, dilations):
super().__init__()
# input conv
@@ -159,13 +162,14 @@
]
rh, rw = receptive_field
paddings = [rh - 1, 0, rw // 2, (rw - 1) // 2] # causal & same
conv = nn.Conv2D(channels,
2 * channels,
kernel_size,
padding=paddings,
dilation=dilations,
weight_attr=init,
bias_attr=init)
conv = nn.Conv2D(
channels,
2 * channels,
kernel_size,
padding=paddings,
dilation=dilations,
weight_attr=init,
bias_attr=init)
self.conv = nn.utils.weight_norm(conv)
self.rh = rh
self.rw = rw
@@ -174,19 +178,18 @@
# condition projection
std = math.sqrt(1 / cond_channels)
init = I.Uniform(-std, std)
condition_proj = nn.Conv2D(cond_channels,
2 * channels, (1, 1),
weight_attr=init,
bias_attr=init)
condition_proj = nn.Conv2D(
cond_channels,
2 * channels, (1, 1),
weight_attr=init,
bias_attr=init)
self.condition_proj = nn.utils.weight_norm(condition_proj)
# parametric residual & skip connection
std = math.sqrt(1 / channels)
init = I.Uniform(-std, std)
out_proj = nn.Conv2D(channels,
2 * channels, (1, 1),
weight_attr=init,
bias_attr=init)
out_proj = nn.Conv2D(
channels, 2 * channels, (1, 1), weight_attr=init, bias_attr=init)
self.out_proj = nn.utils.weight_norm(out_proj)
def forward(self, x, condition):
@@ -265,11 +268,12 @@ class ResidualBlock(nn.Layer):
self._update_buffer(x_row)
rw = self.rw
x_row = F.conv2d(self._conv_buffer,
self.conv.weight,
self.conv.bias,
padding=[0, 0, rw // 2, (rw - 1) // 2],
dilation=self.dilations)
x_row = F.conv2d(
self._conv_buffer,
self.conv.weight,
self.conv.bias,
padding=[0, 0, rw // 2, (rw - 1) // 2],
dilation=self.dilations)
x_row += self.condition_proj(condition_row)
content, gate = paddle.chunk(x_row, 2, axis=1)
@@ -315,8 +319,12 @@ class ResidualNet(nn.LayerList):
ValueError
If the length of dilations_h does not equals n_layers.
"""
def __init__(self, n_layer: int, residual_channels: int,
condition_channels: int, kernel_size: Tuple[int],
def __init__(self,
n_layer: int,
residual_channels: int,
condition_channels: int,
kernel_size: Tuple[int],
dilations_h: List[int]):
if len(dilations_h) != n_layer:
raise ValueError(
@@ -421,20 +429,22 @@ class Flow(nn.Layer):
super().__init__()
# input projection
self.input_proj = nn.utils.weight_norm(
nn.Conv2D(1,
channels, (1, 1),
weight_attr=I.Uniform(-1., 1.),
bias_attr=I.Uniform(-1., 1.)))
nn.Conv2D(
1,
channels, (1, 1),
weight_attr=I.Uniform(-1., 1.),
bias_attr=I.Uniform(-1., 1.)))
# residual net
self.resnet = ResidualNet(n_layers, channels, mel_bands, kernel_size,
self.dilations_dict[n_group])
# output projection
self.output_proj = nn.Conv2D(channels,
2, (1, 1),
weight_attr=I.Constant(0.),
bias_attr=I.Constant(0.))
self.output_proj = nn.Conv2D(
channels,
2, (1, 1),
weight_attr=I.Constant(0.),
bias_attr=I.Constant(0.))
# specs
self.n_group = n_group
@@ -478,8 +488,8 @@ class Flow(nn.Layer):
transformation from x to z.
"""
# (B, C, H-1, W)
logs, b = self._predict_parameters(x[:, :, :-1, :], condition[:, :,
1:, :])
logs, b = self._predict_parameters(x[:, :, :-1, :],
condition[:, :, 1:, :])
z = self._transform(x, logs, b)
return z, (logs, b)
@@ -576,6 +586,7 @@ class WaveFlow(nn.LayerList):
kernel_size : Union[int, List[int]]
Kernel size of the convolution layer in each ResidualBlock.
"""
def __init__(self, n_flows, n_layers, n_group, channels, mel_bands,
kernel_size):
if n_group % 2 or n_flows % 2:
@@ -645,8 +656,8 @@ class WaveFlow(nn.LayerList):
# to (B, C, h, T//h) layout
x = paddle.unsqueeze(
paddle.transpose(fold(x, self.n_group), [0, 2, 1]), 1)
condition = paddle.transpose(fold(condition, self.n_group),
[0, 1, 3, 2])
condition = paddle.transpose(
fold(condition, self.n_group), [0, 1, 3, 2])
# flows
logs_list = []
@@ -689,8 +700,8 @@ class WaveFlow(nn.LayerList):
# to (B, C, h, T//h) layout
z = paddle.unsqueeze(
paddle.transpose(fold(z, self.n_group), [0, 2, 1]), 1)
condition = paddle.transpose(fold(condition, self.n_group),
[0, 1, 3, 2])
condition = paddle.transpose(
fold(condition, self.n_group), [0, 1, 3, 2])
# reverse it flow by flow
for i in reversed(range(self.n_flows)):
@@ -730,17 +741,24 @@ class ConditionalWaveFlow(nn.LayerList):
kernel_size : Union[int, List[int]]
Kernel size of the convolution layer in each ResidualBlock.
"""
def __init__(self, upsample_factors: List[int], n_flows: int,
n_layers: int, n_group: int, channels: int, n_mels: int,
def __init__(self,
upsample_factors: List[int],
n_flows: int,
n_layers: int,
n_group: int,
channels: int,
n_mels: int,
kernel_size: Union[int, List[int]]):
super().__init__()
self.encoder = UpsampleNet(upsample_factors)
self.decoder = WaveFlow(n_flows=n_flows,
n_layers=n_layers,
n_group=n_group,
channels=channels,
mel_bands=n_mels,
kernel_size=kernel_size)
self.decoder = WaveFlow(
n_flows=n_flows,
n_layers=n_layers,
n_group=n_group,
channels=channels,
mel_bands=n_mels,
kernel_size=kernel_size)
def forward(self, audio, mel):
"""Compute the transformed random variable z (x to z) and the log of
@@ -847,6 +865,7 @@ class WaveFlowLoss(nn.Layer):
The standard deviation of the gaussian noise used in WaveFlow, by
default 1.0.
"""
def __init__(self, sigma=1.0):
super().__init__()
self.sigma = sigma
@@ -870,7 +889,7 @@
Tensor [shape=(1,)]
The loss.
"""
loss = paddle.sum(
z * z) / (2 * self.sigma * self.sigma) - log_det_jacobian
loss = paddle.sum(z * z) / (2 * self.sigma * self.sigma
) - log_det_jacobian
loss = loss / np.prod(z.shape)
return loss + self.const

View File

@@ -143,9 +143,9 @@ class MonoheadAttention(nn.Layer):
def __init__(self,
model_dim: int,
dropout: float = 0.0,
k_dim: int = None,
v_dim: int = None):
dropout: float=0.0,
k_dim: int=None,
v_dim: int=None):
super(MonoheadAttention, self).__init__()
k_dim = k_dim or model_dim
v_dim = v_dim or model_dim
@@ -225,9 +225,9 @@ class MultiheadAttention(nn.Layer):
def __init__(self,
model_dim: int,
num_heads: int,
dropout: float = 0.0,
k_dim: int = None,
v_dim: int = None):
dropout: float=0.0,
k_dim: int=None,
v_dim: int=None):
super(MultiheadAttention, self).__init__()
if model_dim % num_heads != 0:
raise ValueError("model_dim must be divisible by num_heads")
@@ -318,7 +318,8 @@ class LocationSensitiveAttention(nn.Layer):
# Location Layer
self.location_conv = nn.Conv1D(
2, location_filters,
2,
location_filters,
kernel_size=location_kernel_size,
padding=int((location_kernel_size - 1) / 2),
bias_attr=False,

View File

@@ -116,16 +116,22 @@ class STFT(nn.Layer):
"""
def __init__(self, n_fft, hop_length=None, win_length=None, window="hanning", center=True, pad_mode="reflect"):
def __init__(self,
n_fft,
hop_length=None,
win_length=None,
window="hanning",
center=True,
pad_mode="reflect"):
super().__init__()
# By default, use the entire frame
if win_length is None:
win_length = n_fft
# Set the default hop, if it's not already specified
if hop_length is None:
hop_length = int(win_length // 4)
self.hop_length = hop_length
self.n_bin = 1 + n_fft // 2
self.n_fft = n_fft
@@ -134,7 +140,7 @@
# calculate window
window = signal.get_window(window, win_length, fftbins=True)
# pad window to n_fft size
if n_fft != win_length:
window = pad_center(window, n_fft, mode="constant")
@@ -146,11 +152,11 @@
#r = np.arange(0, n_fft)
#M = np.expand_dims(r, -1) * np.expand_dims(r, 0)
#w_real = np.reshape(window *
#np.cos(2 * np.pi * M / n_fft)[:self.n_bin],
#(self.n_bin, 1, self.n_fft))
#np.cos(2 * np.pi * M / n_fft)[:self.n_bin],
#(self.n_bin, 1, self.n_fft))
#w_imag = np.reshape(window *
#np.sin(-2 * np.pi * M / n_fft)[:self.n_bin],
#(self.n_bin, 1, self.n_fft))
#np.sin(-2 * np.pi * M / n_fft)[:self.n_bin],
#(self.n_bin, 1, self.n_fft))
weight = np.fft.fft(np.eye(n_fft))[:self.n_bin]
w_real = weight.real
w_imag = weight.imag
@@ -178,8 +184,9 @@
"""
x = paddle.unsqueeze(x, axis=1)
if self.center:
x = F.pad(x, [self.n_fft // 2, self.n_fft // 2],
data_format='NCL', mode=self.pad_mode)
x = F.pad(x, [self.n_fft // 2, self.n_fft // 2],
data_format='NCL',
mode=self.pad_mode)
# to BCT, C=1
out = F.conv1d(x, self.weight, stride=self.hop_length)
@@ -226,8 +233,8 @@ class MelScale(nn.Layer):
super().__init__()
mel_basis = librosa.filters.mel(sr, n_fft, n_mels, fmin, fmax)
self.weight = paddle.to_tensor(mel_basis)
def forward(self, spec):
# (n_mels, n_freq) * (batch_size, n_freq, n_frames)
mel = paddle.matmul(self.weight, spec)
return mel
return mel

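MelScale above simply caches a librosa mel filterbank and matrix-multiplies it with a magnitude spectrogram. A small sketch with illustrative parameter values:

```python
import numpy as np
import librosa

sr, n_fft, n_mels = 22050, 1024, 80
mel_basis = librosa.filters.mel(sr=sr, n_fft=n_fft, n_mels=n_mels, fmin=0, fmax=8000)
print(mel_basis.shape)                  # (80, 513) == (n_mels, 1 + n_fft // 2)

spec = np.abs(np.random.randn(1 + n_fft // 2, 100))  # toy magnitude spectrogram
mel = mel_basis @ spec                  # (n_mels, n_frames), the product MelScale computes
```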
View File

@@ -35,12 +35,12 @@ def attention_guide(dec_lens, enc_lens, N, T, g, dtype=None):
Tachibana, Hideyuki, Katsuya Uenoyama, and Shunsuke Aihara. 2017. Efficiently Trainable Text-to-Speech System Based on Deep Convolutional Networks with Guided Attention. ArXiv:1710.08969 [Cs, Eess], October. http://arxiv.org/abs/1710.08969.
"""
dtype = dtype or paddle.get_default_dtype()
dec_pos = paddle.arange(0, N).astype(
dtype) / dec_lens.unsqueeze(-1) # n/N # shape(B, T_dec)
enc_pos = paddle.arange(0, T).astype(
dtype) / enc_lens.unsqueeze(-1) # t/T # shape(B, T_enc)
W = 1 - paddle.exp(-(dec_pos.unsqueeze(-1) -
enc_pos.unsqueeze(1))**2 / (2 * g ** 2))
dec_pos = paddle.arange(0, N).astype(dtype) / dec_lens.unsqueeze(
-1) # n/N # shape(B, T_dec)
enc_pos = paddle.arange(0, T).astype(dtype) / enc_lens.unsqueeze(
-1) # t/T # shape(B, T_enc)
W = 1 - paddle.exp(-(dec_pos.unsqueeze(-1) - enc_pos.unsqueeze(1))**2 /
(2 * g**2))
dec_mask = sequence_mask(dec_lens, maxlen=N)
enc_mask = sequence_mask(enc_lens, maxlen=T)
@@ -57,8 +57,7 @@ def guided_attention_loss(attention_weight, dec_lens, enc_lens, g):
W = attention_guide(dec_lens, enc_lens, N, T, g, attention_weight.dtype)
total_tokens = (dec_lens * enc_lens).astype(W.dtype)
loss = paddle.mean(paddle.sum(
W * attention_weight, [1, 2]) / total_tokens)
loss = paddle.mean(paddle.sum(W * attention_weight, [1, 2]) / total_tokens)
return loss

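For a single utterance whose decoder and encoder lengths equal N and T, the attention guide computed above reduces to the following NumPy expression (the values of N, T and g are illustrative only):

```python
import numpy as np

N, T, g = 4, 6, 0.2                    # decoder steps, encoder steps, guide width
n = np.arange(N)[:, None] / N          # n / N, shape (N, 1)
t = np.arange(T)[None, :] / T          # t / T, shape (1, T)
W = 1 - np.exp(-((n - t) ** 2) / (2 * g ** 2))  # penalty is small near the diagonal
```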
View File

@@ -87,6 +87,7 @@ class ExperimentBase(object):
>>> else:
>>> main_sp(config, args)
"""
def __init__(self, config, args):
self.config = config
self.args = args

View File

@@ -1,7 +1,22 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import contextlib
OBSERVATIONS = None
@contextlib.contextmanager
def scope(observations):
# make `observation` the target to report to.
@@ -13,12 +28,14 @@ def scope(observations):
try:
yield
finally:
OBSERVATIONS = old
OBSERVATIONS = old
def get_observations():
global OBSERVATIONS
return OBSERVATIONS
def report(name, value):
# a simple function to report named value
# you can use it everywhere, it will get the default target and write to it

View File

@@ -1,3 +1,17 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from pathlib import Path
import tqdm
from dataclasses import dataclass
@@ -25,7 +39,7 @@ class Trainer(object):
self.stop_trigger = get_trigger(stop_trigger)
self.out = Path(out)
self.observation = {}
def setup(self):
pass
@@ -38,8 +52,8 @@
ordinal += 1
modified_name = f"{name}_{ordinal}"
self.extensions[modified_name] = ExtensionEntry(
extension, trigger, priority)
self.extensions[modified_name] = ExtensionEntry(extension, trigger,
priority)
def run(self):
# sort extensions by priorities once
@@ -61,7 +75,7 @@ class Trainer(object):
max_epoch = self.stop_trigger.period
else:
max_iteration = self.stop_trigger.period
while not stop_trigger(self):
self.observation = {}
# set observation as the report target
@@ -75,4 +89,3 @@ class Trainer(object):
for name, entry in extensions:
if entry.trigger(self):
entry.extension(self)

View File

@@ -1,10 +1,25 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
class IntervalTrigger(object):
def __init__(self, period: int , unit: str):
def __init__(self, period: int, unit: str):
if unit not in ("iteration", "epoch"):
raise ValueError("unit should be 'iteration' or 'epoch'")
self.period = period
self.unit = unit
def __call__(self, trainer):
state = trainer.updater.state
if self.unit == "epoch":
@@ -13,7 +28,7 @@ class IntervalTrigger(object):
fire = not (state.iteration % self.iteration)
return fire
def never_file_trigger(trainer):
return False
@@ -25,4 +40,4 @@ def get_trigger(trigger):
return trigger
else:
trigger = IntervalTrigger(*trigger)
return trigger
return trigger

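As a usage note, `get_trigger` promotes a `(period, unit)` tuple to an `IntervalTrigger`; only that call pattern comes from the code above, and the import path below is an assumption for illustration:

```python
# hypothetical import path; IntervalTrigger and get_trigger themselves appear in the diff above
from parakeet.training.triggers.interval_trigger import IntervalTrigger, get_trigger

trig = get_trigger((1000, "iteration"))  # tuple -> IntervalTrigger(1000, "iteration")
assert isinstance(trig, IntervalTrigger)
```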
View File

@@ -1,3 +1,17 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from dataclasses import dataclass
from typing import Optional
@@ -41,6 +55,7 @@ class UpdaterBase(object):
So the best practice is to define a model and define a updater for it.
"""
def update(self):
pass
@@ -52,13 +67,14 @@ class StandardUpdater(UpdaterBase):
"""An example of over-simplification. Things may not be that simple, but
you can subclass it to fit your need.
"""
def __init__(self,
model: Layer,
dataloader: DataLoader,
optimizer: Optimizer,
loss_func=None,
auto_new_epoch: bool = True,
init_state: Optional[UpdaterState] = None):
auto_new_epoch: bool=True,
init_state: Optional[UpdaterState]=None):
self.model = model
self.dataloader = dataloader
self.optimizer = optimizer

View File

@@ -31,10 +31,8 @@ __all__ = [
def plot_alignment(alignment, title=None):
# alignment: [encoder_steps, decoder_steps)
fig, ax = plt.subplots(figsize=(6, 4))
im = ax.imshow(alignment,
aspect='auto',
origin='lower',
interpolation='none')
im = ax.imshow(
alignment, aspect='auto', origin='lower', interpolation='none')
fig.colorbar(im, ax=ax)
xlabel = 'Decoder timestep'
if title is not None:
@@ -49,15 +47,14 @@ def plot_multihead_alignments(alignments, title=None):
# alignments: [N, encoder_steps, decoder_steps)
num_subplots = alignments.shape[0]
fig, axes = plt.subplots(figsize=(6 * num_subplots, 4),
ncols=num_subplots,
sharey=True,
squeeze=True)
fig, axes = plt.subplots(
figsize=(6 * num_subplots, 4),
ncols=num_subplots,
sharey=True,
squeeze=True)
for i, ax in enumerate(axes):
im = ax.imshow(alignments[i],
aspect='auto',
origin='lower',
interpolation='none')
im = ax.imshow(
alignments[i], aspect='auto', origin='lower', interpolation='none')
fig.colorbar(im, ax=ax)
xlabel = 'Decoder timestep'
if title is not None:
@@ -73,18 +70,20 @@ def plot_multilayer_multihead_alignments(alignments, title=None):
# alignments: [num_layers, num_heads, encoder_steps, decoder_steps)
num_layers, num_heads, *_ = alignments.shape
fig, axes = plt.subplots(figsize=(6 * num_heads, 4 * num_layers),
nrows=num_layers,
ncols=num_heads,
sharex=True,
sharey=True,
squeeze=True)
fig, axes = plt.subplots(
figsize=(6 * num_heads, 4 * num_layers),
nrows=num_layers,
ncols=num_heads,
sharex=True,
sharey=True,
squeeze=True)
for i, row in enumerate(axes):
for j, ax in enumerate(row):
im = ax.imshow(alignments[i, j],
aspect='auto',
origin='lower',
interpolation='none')
im = ax.imshow(
alignments[i, j],
aspect='auto',
origin='lower',
interpolation='none')
fig.colorbar(im, ax=ax)
xlabel = 'Decoder timestep'
if title is not None:

View File

@@ -20,7 +20,6 @@ __all__ = ["rank_zero_only"]
def rank_zero_only(func):
@wraps(func)
def wrapper(*args, **kwargs):
if dist.get_rank() != 0:

View File

@@ -20,8 +20,9 @@ from setuptools import setup, find_packages
def read(*names, **kwargs):
with io.open(os.path.join(os.path.dirname(__file__), *names),
encoding=kwargs.get("encoding", "utf8")) as fp:
with io.open(
os.path.join(os.path.dirname(__file__), *names),
encoding=kwargs.get("encoding", "utf8")) as fp:
return fp.read()
@@ -73,9 +74,7 @@ setup_info = dict(
'g2pM',
'praatio',
],
extras_require={
'doc': ["sphinx", "sphinx-rtd-theme", "numpydoc"],
},
extras_require={'doc': ["sphinx", "sphinx-rtd-theme", "numpydoc"], },
# Package info
packages=find_packages(exclude=('tests', 'tests.*')),
@@ -88,7 +87,6 @@ setup_info = dict(
'License :: OSI Approved :: Apache Software License',
'Programming Language :: Python :: 3.6',
'Programming Language :: Python :: 3.7',
],
)
], )
setup(**setup_info)