Merge pull request #141 from yt605155624/fix_pwg

fix docstrings and some bugs in pwg

commit 24c5b3c1a2
@@ -0,0 +1,28 @@
# This file is used by clang-format to autoformat paddle source code
#
# clang-format is part of the llvm toolchain.
# llvm and clang need to be installed to format source code style.
#
# The basic usage is,
#     clang-format -i -style=file PATH/TO/SOURCE/CODE
#
# -style=file implicitly uses the ".clang-format" file located in a
# parent directory.
# -i means in-place change.
#
# The documentation of clang-format is at
#     http://clang.llvm.org/docs/ClangFormat.html
#     http://clang.llvm.org/docs/ClangFormatStyleOptions.html
---
Language: Cpp
BasedOnStyle: Google
IndentWidth: 4
TabWidth: 4
ContinuationIndentWidth: 4
MaxEmptyLinesToKeep: 2
AccessModifierOffset: -2 # private/protected/public have no indent in class
Standard: Cpp11
AllowAllParametersOfDeclarationOnNextLine: true
BinPackParameters: false
BinPackArguments: false
...
@@ -0,0 +1,50 @@
[flake8]

########## OPTIONS ##########
# Set the maximum length that any line (with some exceptions) may be.
max-line-length = 120


################### FILE PATTERNS ##########################
# Provide a comma-separated list of glob patterns to exclude from checks.
exclude =
    # git folder
    .git,
    # python cache
    __pycache__,
    third_party/,
# Provide a comma-separated list of glob patterns to include for checks.
filename =
    *.py


########## RULES ##########

# ERROR CODES
#
# E/W - PEP8 errors/warnings (pycodestyle)
# F - linting errors (pyflakes)
# C - McCabe complexity error (mccabe)
#
# W503 - line break before binary operator

# Specify a list of codes to ignore.
ignore =
    W503
    E252,E262,E127,E265,E126,E266,E241,E261,E128,E125
    W291,W293,W605
    E203,E305,E402,E501,E721,E741,F403,F405,F821,F841,F999,W503,W504,C408,E302,W291,E303,
    # shebang has extra meaning in fbcode lints, so I think it's not worth trying
    # to line this up with executable bit
    EXE001,
    # these ignores are from flake8-bugbear; please fix!
    B007,B008,
    # these ignores are from flake8-comprehensions; please fix!
    C400,C401,C402,C403,C404,C405,C407,C411,C413,C414,C415

# Specify the list of error codes you wish Flake8 to report.
select =
    E,
    W,
    F,
    C
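Note: W503 ("line break before binary operator") appears in both ignore blocks above. For reference, a minimal hypothetical Python snippet that would trip it if it were not ignored (illustration only, not part of the diff):

# With W503 ignored, a continuation line may begin with the operator.
subtotal = 100
tax = 7
total = (subtotal
         + tax)  # flake8 would report W503 here if the rule were enforced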
@@ -1,11 +1,11 @@
repos:
- repo: https://github.com/PaddlePaddle/mirrors-yapf.git
  rev: 0d79c0c469bab64f7229c9aca2b1186ef47f0e37
- repo: https://github.com/pre-commit/mirrors-yapf.git
  sha: v0.16.0
  hooks:
  - id: yapf
    files: \.py$
    exclude: (?=third_party).*(\.py)$
- repo: https://github.com/pre-commit/pre-commit-hooks
  rev: a11d9314b22d8f8c7556443875b731ef05965464
  sha: a11d9314b22d8f8c7556443875b731ef05965464
  hooks:
  - id: check-merge-conflict
  - id: check-symlinks

@@ -15,8 +15,23 @@ repos:
    files: \.md$
  - id: trailing-whitespace
    files: \.md$
- repo: https://github.com/Lucas-C/pre-commit-hooks
  rev: v1.0.1
  - id: requirements-txt-fixer
    exclude: (?=third_party).*$
  - id: check-yaml
  - id: check-json
  - id: pretty-format-json
    args:
    - --no-sort-keys
    - --autofix
  - id: check-merge-conflict
  - id: flake8
    args:
    - --ignore=E501,E228,E226,E261,E266,E128,E402,W503
    - --builtins=G,request
    - --jobs=1
    exclude: (?=third_party).*(\.py)$
- repo: https://github.com/Lucas-C/pre-commit-hooks
  sha: v1.0.1
  hooks:
  - id: forbid-crlf
    files: \.md$

@@ -28,9 +43,15 @@ repos:
    files: \.md$
- repo: local
  hooks:
  - id: clang-format
    name: clang-format
    description: Format files with ClangFormat
    entry: bash .pre-commit-hooks/clang-format.hook -i
    language: system
    files: \.(c|cc|cxx|cpp|cu|h|hpp|hxx|cuh|proto)$
  - id: copyright_checker
    name: copyright_checker
    entry: python ./tools/copyright.hook
    entry: python .pre-commit-hooks/copyright-check.hook
    language: system
    files: \.(c|cc|cxx|cpp|cu|h|hpp|hxx|proto|py)$
    exclude: (?!.*third_party)^.*$ | (?!.*book)^.*$
    exclude: (?=third_party|pypinyin).*(\.cpp|\.h|\.py)$
@@ -0,0 +1,15 @@
#!/usr/bin/env bash
set -e

readonly VERSION="3.9"

version=$(clang-format -version)

# if ! [[ $version == *"$VERSION"* ]]; then
#     echo "clang-format version check failed."
#     echo "a version containing '$VERSION' is needed, but got '$version'"
#     echo "you can install the right version, and make a soft-link to the '\$PATH' env"
#     exit -1
# fi

clang-format $@
@@ -0,0 +1,133 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import argparse
import io
import os
import re
import sys
import subprocess
import platform

COPYRIGHT = '''
Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
'''

LANG_COMMENT_MARK = None

NEW_LINE_MARK = None

COPYRIGHT_HEADER = None

if platform.system() == "Windows":
    NEW_LINE_MARK = "\r\n"
else:
    NEW_LINE_MARK = '\n'
    COPYRIGHT_HEADER = COPYRIGHT.split(NEW_LINE_MARK)[1]
    p = re.search('(\d{4})', COPYRIGHT_HEADER).group(0)
    process = subprocess.Popen(["date", "+%Y"], stdout=subprocess.PIPE)
    date, err = process.communicate()
    date = date.decode("utf-8").rstrip("\n")
    COPYRIGHT_HEADER = COPYRIGHT_HEADER.replace(p, date)


def generate_copyright(template, lang='C'):
    if lang == 'Python':
        LANG_COMMENT_MARK = '#'
    else:
        LANG_COMMENT_MARK = "//"

    lines = template.split(NEW_LINE_MARK)
    BLANK = " "
    ans = LANG_COMMENT_MARK + BLANK + COPYRIGHT_HEADER + NEW_LINE_MARK
    for lino, line in enumerate(lines):
        if lino == 0 or lino == 1 or lino == len(lines) - 1: continue
        if len(line) == 0:
            BLANK = ""
        else:
            BLANK = " "
        ans += LANG_COMMENT_MARK + BLANK + line + NEW_LINE_MARK

    return ans + "\n"


def lang_type(filename):
    if filename.endswith(".py"):
        return "Python"
    elif filename.endswith(".h"):
        return "C"
    elif filename.endswith(".c"):
        return "C"
    elif filename.endswith(".hpp"):
        return "C"
    elif filename.endswith(".cc"):
        return "C"
    elif filename.endswith(".cpp"):
        return "C"
    elif filename.endswith(".cu"):
        return "C"
    elif filename.endswith(".cuh"):
        return "C"
    elif filename.endswith(".go"):
        return "C"
    elif filename.endswith(".proto"):
        return "C"
    else:
        print("Unsupported filetype %s" % filename)
        exit(0)


PYTHON_ENCODE = re.compile("^[ \t\v]*#.*?coding[:=][ \t]*([-_.a-zA-Z0-9]+)")


def main(argv=None):
    parser = argparse.ArgumentParser(
        description='Checker for copyright declaration.')
    parser.add_argument('filenames', nargs='*', help='Filenames to check')
    args = parser.parse_args(argv)

    retv = 0
    for filename in args.filenames:
        fd = io.open(filename, encoding="utf-8")
        first_line = fd.readline()
        second_line = fd.readline()
        if "COPYRIGHT (C)" in first_line.upper(): continue
        if first_line.startswith("#!") or PYTHON_ENCODE.match(
                second_line) != None or PYTHON_ENCODE.match(first_line) != None:
            continue
        original_contents = io.open(filename, encoding="utf-8").read()
        new_contents = generate_copyright(
            COPYRIGHT, lang_type(filename)) + original_contents
        print('Auto Insert Copyright Header {}'.format(filename))
        retv = 1
        with io.open(filename, 'w') as output_file:
            output_file.write(new_contents)

    return retv


if __name__ == '__main__':
    exit(main())
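For a quick sanity check of the hook above, the header generation can be exercised directly; a minimal sketch, assuming generate_copyright and COPYRIGHT from the hook are in scope (hypothetical usage, not part of the diff):

# Render the Apache-2.0 notice as '#'-prefixed Python comment lines.
header = generate_copyright(COPYRIGHT, lang='Python')
assert header.startswith("# Copyright")
print(header)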
@@ -0,0 +1,3 @@
[style]
based_on_style = pep8
column_limit = 80
@@ -11,15 +11,12 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Configuration file for the Sphinx documentation builder.
#
# This file only contains a selection of the most common options. For a full
# list see the documentation:
# https://www.sphinx-doc.org/en/master/usage/configuration.html

# -- Path setup --------------------------------------------------------------

# If extensions (or modules to document with autodoc) are in another directory,
# add these directories to sys.path here. If the directory is relative to the
# documentation root, use os.path.abspath to make it absolute, like shown here.
@@ -11,9 +11,9 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import numpy as np
import paddle

from parakeet.data.batch import batch_sequences


@@ -24,8 +24,7 @@ def collate_baker_examples(examples):
    pitch = [np.array(item["pitch"], dtype=np.float32) for item in examples]
    energy = [np.array(item["energy"], dtype=np.float32) for item in examples]
    durations = [
        np.array(
            item["durations"], dtype=np.int64) for item in examples
        np.array(item["durations"], dtype=np.int64) for item in examples
    ]
    text_lengths = np.array([item["text_lengths"] for item in examples])
    speech_lengths = np.array([item["speech_lengths"] for item in examples])

@@ -54,4 +53,4 @@ def collate_baker_examples(examples):
        "pitch": pitch,
        "energy": energy
    }
    return batch
    return batch
@@ -12,18 +12,17 @@
# See the License for the specific language governing permissions and
# limitations under the License.
"""Calculate statistics of feature files."""

import argparse
import logging
from pathlib import Path

import jsonlines
import numpy as np
from parakeet.datasets.data_table import DataTable
from config import get_cfg_default
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm

from config import get_cfg_default
from parakeet.datasets.data_table import DataTable


def main():

@@ -75,8 +74,8 @@ def main():

    # check directory existence
    if args.output is None:
        args.output = Path(args.metadata).parent.with_name(args.field_name +
                                                           "_stats.npy")
        args.output = Path(
            args.metadata).parent.with_name(args.field_name + "_stats.npy")
    else:
        args.output = Path(args.output)
    args.output.parent.mkdir(parents=True, exist_ok=True)
@@ -11,11 +11,10 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from pathlib import Path

from yacs.config import CfgNode as Configuration
import yaml
from yacs.config import CfgNode as Configuration

config_path = (Path(__file__).parent / "conf" / "default.yaml").resolve()
@@ -11,8 +11,7 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from parakeet.models.fastspeech2 import FastSpeech2, FastSpeech2Loss
from parakeet.models.fastspeech2 import FastSpeech2Loss
from parakeet.training.extensions.evaluator import StandardEvaluator
from parakeet.training.reporter import report
from parakeet.training.updaters.standard_updater import StandardUpdater
@@ -11,10 +11,11 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import re

import numpy as np
import paddle

from parakeet.frontend.cn_frontend import Frontend as cnFrontend


@@ -87,8 +88,7 @@ class Frontend():
            phones.append(phone)
        return phones, tones

    def get_input_ids(self, sentence, merge_sentences=True,
                      get_tone_ids=False):
    def get_input_ids(self, sentence, merge_sentences=True, get_tone_ids=False):
        phonemes = self.frontend.get_phonemes(
            sentence, merge_sentences=merge_sentences)
        result = {}
@@ -11,16 +11,14 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import argparse
import os
from pathlib import Path

import librosa
import numpy as np
from praatio import tgio

from config import get_cfg_default
from praatio import tgio


def readtg(config, tg_path):
@@ -50,10 +50,7 @@ def main():
        required=True,
        help="speech statistics file.")
    parser.add_argument(
        "--pitch-stats",
        type=str,
        required=True,
        help="pitch statistics file.")
        "--pitch-stats", type=str, required=True, help="pitch statistics file.")
    parser.add_argument(
        "--energy-stats",
        type=str,
@@ -21,10 +21,10 @@ from typing import List, Dict, Any
import jsonlines
import librosa
import numpy as np
from parakeet.data.get_feats import LogMelFBank, Energy, Pitch
import tqdm

from config import get_cfg_default
from get_feats import LogMelFBank, Energy, Pitch


def get_phn_dur(file_name):

@@ -262,10 +262,7 @@ def main():
    parser = argparse.ArgumentParser(
        description="Preprocess audio and then extract features.")
    parser.add_argument(
        "--rootdir",
        default=None,
        type=str,
        help="directory to baker dataset.")
        "--rootdir", default=None, type=str, help="directory to baker dataset.")
    parser.add_argument(
        "--dur-file",
        default=None,
@@ -67,8 +67,7 @@ def evaluate(args, fastspeech2_config, pwg_config):
    std = paddle.to_tensor(std)
    pwg_normalizer = ZScore(mu, std)

    fastspeech2_inferencce = FastSpeech2Inference(fastspeech2_normalizer,
                                                  model)
    fastspeech2_inferencce = FastSpeech2Inference(fastspeech2_normalizer, model)
    pwg_inference = PWGInference(pwg_normalizer, vocoder)

    output_dir = Path(args.output_dir)

@@ -94,7 +93,7 @@ def main():
    parser.add_argument(
        "--fastspeech2-config",
        type=str,
        help="config file to overwrite default config")
        help="config file to overwrite default config.")
    parser.add_argument(
        "--fastspeech2-checkpoint",
        type=str,

@@ -121,13 +120,13 @@ def main():
    parser.add_argument(
        "--phones-dict",
        type=str,
        default="phone_id_map.txt ",
        default="phone_id_map.txt",
        help="phone vocabulary file.")
    parser.add_argument("--test-metadata", type=str, help="test metadata")
    parser.add_argument("--output-dir", type=str, help="output dir")
    parser.add_argument("--test-metadata", type=str, help="test metadata.")
    parser.add_argument("--output-dir", type=str, help="output dir.")
    parser.add_argument(
        "--device", type=str, default="gpu", help="device type to use")
    parser.add_argument("--verbose", type=int, default=1, help="verbose")
        "--device", type=str, default="gpu", help="device type to use.")
    parser.add_argument("--verbose", type=int, default=1, help="verbose.")

    args = parser.parse_args()
    with open(args.fastspeech2_config) as f:
@@ -105,7 +105,7 @@ def main():
    parser.add_argument(
        "--fastspeech2-config",
        type=str,
        help="config file to overwrite default config")
        help="fastspeech2 config file to overwrite default config.")
    parser.add_argument(
        "--fastspeech2-checkpoint",
        type=str,

@@ -118,8 +118,7 @@ def main():
    parser.add_argument(
        "--pwg-config",
        type=str,
        help="mean and standard deviation used to normalize spectrogram when training parallel wavegan."
    )
        help="parallel wavegan config file to overwrite default config.")
    parser.add_argument(
        "--pwg-params",
        type=str,

@@ -132,16 +131,16 @@ def main():
    parser.add_argument(
        "--phones-dict",
        type=str,
        default="phone_id_map.txt ",
        default="phone_id_map.txt",
        help="phone vocabulary file.")
    parser.add_argument(
        "--text",
        type=str,
        help="text to synthesize, a 'utt_id sentence' pair per line")
    parser.add_argument("--output-dir", type=str, help="output dir")
        help="text to synthesize, a 'utt_id sentence' pair per line.")
    parser.add_argument("--output-dir", type=str, help="output dir.")
    parser.add_argument(
        "--device", type=str, default="gpu", help="device type to use")
    parser.add_argument("--verbose", type=int, default=1, help="verbose")
        "--device", type=str, default="gpu", help="device type to use.")
    parser.add_argument("--verbose", type=int, default=1, help="verbose.")

    args = parser.parse_args()
    with open(args.fastspeech2_config) as f:
@@ -154,8 +154,7 @@ def train_sp(args, config):
    output_dir = Path(args.output_dir)
    trainer = Trainer(updater, (config.max_epoch, 'epoch'), output_dir)

    evaluator = FastSpeech2Evaluator(model, dev_dataloader,
                                     **config["updater"])
    evaluator = FastSpeech2Evaluator(model, dev_dataloader, **config["updater"])

    if dist.get_rank() == 0:
        trainer.extend(evaluator, trigger=(1, "epoch"))

@@ -169,18 +168,18 @@ def train_sp(args, config):

def main():
    # parse args and config and redirect to train_sp
    parser = argparse.ArgumentParser(description="Train a ParallelWaveGAN "
    parser = argparse.ArgumentParser(description="Train a FastSpeech2 "
                                     "model with Baker Mandarin TTS dataset.")
    parser.add_argument(
        "--config", type=str, help="config file to overwrite default config")
    parser.add_argument("--train-metadata", type=str, help="training data")
    parser.add_argument("--dev-metadata", type=str, help="dev data")
    parser.add_argument("--output-dir", type=str, help="output dir")
        "--config", type=str, help="config file to overwrite default config.")
    parser.add_argument("--train-metadata", type=str, help="training data.")
    parser.add_argument("--dev-metadata", type=str, help="dev data.")
    parser.add_argument("--output-dir", type=str, help="output dir.")
    parser.add_argument(
        "--device", type=str, default="gpu", help="device type to use")
        "--device", type=str, default="gpu", help="device type to use.")
    parser.add_argument(
        "--nprocs", type=int, default=1, help="number of processes")
    parser.add_argument("--verbose", type=int, default=1, help="verbose")
        "--nprocs", type=int, default=1, help="number of processes.")
    parser.add_argument("--verbose", type=int, default=1, help="verbose.")
    parser.add_argument(
        "--phones-dict",
        type=str,
@@ -30,9 +30,7 @@ except ModuleNotFoundError:
INT16_MAX = (2**15) - 1


def normalize_volume(wav,
                     target_dBFS,
                     increase_only=False,
def normalize_volume(wav, target_dBFS, increase_only=False,
                     decrease_only=False):
    # this function implements loudness normalization, instead of peak
    # normalization, see https://en.wikipedia.org/wiki/Audio_normalization

@@ -44,8 +42,9 @@ def normalize_volume(wav,
    if increase_only and decrease_only:
        raise ValueError("Both increase only and decrease only are set")
    dBFS_change = target_dBFS - 10 * np.log10(np.mean(wav**2))
    if ((dBFS_change < 0 and increase_only) or
            (dBFS_change > 0 and decrease_only)):
    if dBFS_change < 0 and increase_only:
        return wav
    if dBFS_change > 0 and decrease_only:
        return wav
    gain = 10**(dBFS_change / 20)
    return wav * gain
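For intuition about the branch simplification above: the function converts the dBFS gap between the signal and the target into a linear amplitude gain. A minimal worked sketch with hypothetical numbers (not part of the diff):

import numpy as np

target_dBFS = -20.0
wav = 0.01 * np.random.randn(16000).astype(np.float32)      # quiet signal, roughly -40 dBFS
dBFS_change = target_dBFS - 10 * np.log10(np.mean(wav**2))  # ~ +20 dB needed
gain = 10**(dBFS_change / 20)                               # ~10x linear amplitude
louder = wav * gain                                         # now near the -20 dBFS target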
@@ -59,9 +58,14 @@ def trim_long_silences(wav,
    """
    Ensures that segments without voice in the waveform remain no longer than a
    threshold determined by the VAD parameters in params.py.

    :param wav: the raw waveform as a numpy array of floats
    :return: the same waveform with silences trimmed away (length <= original wav length)
    Parameters
    ----------
    wav : np.array
        the raw waveform as a numpy array of floats

    Returns
    ----------
    np.array
        the same waveform with silences trimmed away (length <= original wav length)
    """
    # Compute the voice detection window size
    samples_per_window = (vad_window_length * sampling_rate) // 1000
@@ -117,20 +121,25 @@ def compute_partial_slices(n_samples: int,

    The returned ranges may be indexing further than the length of the waveform. It is
    recommended that you pad the waveform with zeros up to wave_slices[-1].stop.

    Parameters
    ----------
    n_samples : int
        the number of samples in the waveform.
    partial_utterance_n_frames : int
        the number of mel spectrogram frames in each partial utterance.

    :param n_samples: the number of samples in the waveform
    :param partial_utterance_n_frames: the number of mel spectrogram frames in each partial
    utterance
    :param min_pad_coverage: when reaching the last partial utterance, it may or may not have
    enough frames. If at least <min_pad_coverage> of <partial_utterance_n_frames> are present,
    then the last partial utterance will be considered, as if we padded the audio. Otherwise,
    it will be discarded, as if we trimmed the audio. If there aren't enough frames for 1 partial
    utterance, this parameter is ignored so that the function always returns at least 1 slice.
    :param overlap: by how much the partial utterance should overlap. If set to 0, the partial
    utterances are entirely disjoint.
    :return: the waveform slices and mel spectrogram slices as lists of array slices. Index
    respectively the waveform and the mel spectrogram with these slices to obtain the partial
    utterances.
    min_pad_coverage : int
        when reaching the last partial utterance, it may or may not have enough frames.
        If at least <min_pad_coverage> of <partial_utterance_n_frames> are present,
        then the last partial utterance will be considered, as if we padded the audio. Otherwise,
        it will be discarded, as if we trimmed the audio. If there aren't enough frames for 1 partial
        utterance, this parameter is ignored so that the function always returns at least 1 slice.
    overlap : float
        by how much the partial utterance should overlap. If set to 0, the partial utterances are entirely disjoint.

    Returns
    ----------
    the waveform slices and mel spectrogram slices as lists of array slices.
    Index respectively the waveform and the mel spectrogram with these slices to obtain the partial utterances.
    """
    assert 0 <= overlap < 1
    assert 0 < min_pad_coverage <= 1

@@ -138,8 +147,8 @@ def compute_partial_slices(n_samples: int,
    # librosa's function to compute num_frames from num_samples
    n_frames = int(np.ceil((n_samples + 1) / hop_length))
    # frame shift between adjacent partials
    frame_step = max(
        1, int(np.round(partial_utterance_n_frames * (1 - overlap))))
    frame_step = max(1,
                     int(np.round(partial_utterance_n_frames * (1 - overlap))))

    # Compute the slices
    wav_slices, mel_slices = [], []
@@ -57,7 +57,7 @@ def _process_speaker(speaker_dir: Path,
        try:
            with sources_fpath.open("rt") as sources_file:
                existing_names = {line.split(",")[0] for line in sources_file}
        except:
        except Exception as e:
            existing_names = {}
    else:
        existing_names = {}

@@ -114,9 +114,7 @@ def process_librispeech(processor,
                    output_dir, "*.flac", skip_existing)


def process_voxceleb1(processor,
                      datasets_root,
                      output_dir,
def process_voxceleb1(processor, datasets_root, output_dir,
                      skip_existing=False):
    dataset_name = "VoxCeleb1"
    dataset_root = datasets_root / dataset_name

@@ -126,10 +124,7 @@ def process_voxceleb1(processor,
    metadata = [line.strip().split("\t") for line in metafile][1:]

    # speaker id -> nationality
    nationalities = {
        line[0]: line[3]
        for line in metadata if line[-1] == "dev"
    }
    nationalities = {line[0]: line[3] for line in metadata if line[-1] == "dev"}
    keep_speaker_ids = [
        speaker_id for speaker_id, nationality in nationalities.items()
        if nationality.lower() in anglophone_nationalites

@@ -147,9 +142,7 @@ def process_voxceleb1(processor,
                    output_dir, "*.wav", skip_existing)


def process_voxceleb2(processor,
                      datasets_root,
                      output_dir,
def process_voxceleb2(processor, datasets_root, output_dir,
                      skip_existing=False):
    dataset_name = "VoxCeleb2"
    dataset_root = datasets_root / dataset_name

@@ -171,9 +164,7 @@ def process_aidatatang_200zh(processor,
                    output_dir, "*.wav", skip_existing)


def process_magicdata(processor,
                      datasets_root,
                      output_dir,
def process_magicdata(processor, datasets_root, output_dir,
                      skip_existing=False):
    dataset_name = "magicdata/train"
    dataset_root = datasets_root / dataset_name
@ -52,7 +52,8 @@ if __name__ == "__main__":
|
|||
if not args.no_trim:
|
||||
try:
|
||||
import webrtcvad
|
||||
except:
|
||||
print(webrtcvad.__version__)
|
||||
except Exception as e:
|
||||
raise ModuleNotFoundError(
|
||||
"Package 'webrtcvad' not found. This package enables "
|
||||
"noise removal and is recommended. Please install and "
|
||||
|
@ -96,5 +97,5 @@ if __name__ == "__main__":
|
|||
|
||||
for dataset in args.datasets:
|
||||
print("Preprocessing %s" % dataset)
|
||||
preprocess_func[dataset](processor, args.datasets_root,
|
||||
args.output_dir, args.skip_existing)
|
||||
preprocess_func[dataset](processor, args.datasets_root, args.output_dir,
|
||||
args.skip_existing)
|
||||
|
|
|
@@ -83,12 +83,11 @@ class Ge2eExperiment(ExperimentBase):
        self.logger.info(msg)

        if dist.get_rank() == 0:
            self.visualizer.add_scalar("train/loss", loss_value,
                                       self.iteration)
            self.visualizer.add_scalar("train/loss", loss_value, self.iteration)
            self.visualizer.add_scalar("train/eer", eer, self.iteration)
            self.visualizer.add_scalar(
                "param/w",
                float(self.model_core.similarity_weight), self.iteration)
            self.visualizer.add_scalar("param/w",
                                       float(self.model_core.similarity_weight),
                                       self.iteration)
            self.visualizer.add_scalar("param/b",
                                       float(self.model_core.similarity_bias),
                                       self.iteration)
@@ -27,10 +27,14 @@ class Clip(object):
            aux_context_window=0, ):
        """Initialize customized collater for DataLoader.

        Args:
            batch_max_steps (int): The maximum length of input signal in batch.
            hop_size (int): Hop size of auxiliary features.
            aux_context_window (int): Context window size for auxiliary feature conv.
        Parameters
        ----------
        batch_max_steps : int
            The maximum length of input signal in batch.
        hop_size : int
            Hop size of auxiliary features.
        aux_context_window : int
            Context window size for auxiliary feature conv.

        """
        if batch_max_steps % hop_size != 0:

@@ -49,14 +53,18 @@ class Clip(object):
    def __call__(self, examples):
        """Convert into batch tensors.

        Args:
            batch (list): list of tuple of the pair of audio and features. Audio shape
                (T, ), features shape (T', C).
        Parameters
        ----------
        batch : list
            list of tuple of the pair of audio and features. Audio shape (T, ), features shape (T', C).

        Returns:
            Tensor: Auxiliary feature batch (B, C, T'), where
                T = (T' - 2 * aux_context_window) * hop_size.
            Tensor: Target signal batch (B, 1, T).
        Returns
        ----------
        Tensor
            Auxiliary feature batch (B, C, T'), where
            T = (T' - 2 * aux_context_window) * hop_size.
        Tensor
            Target signal batch (B, 1, T).

        """
        # check length

@@ -93,15 +101,15 @@ class Clip(object):
    def _adjust_length(self, x, c):
        """Adjust the audio and feature lengths.

        Note:
            Basically we assume that the length of x and c are adjusted
            through preprocessing stage, but if we use other library processed
            features, this process will be needed.
        Note
        -------
        Basically we assume that the length of x and c are adjusted
        through preprocessing stage, but if we use other library processed
        features, this process will be needed.

        """
        if len(x) < c.shape[1] * self.hop_size:
            x = np.pad(x, (0, c.shape[1] * self.hop_size - len(x)),
                       mode="edge")
            x = np.pad(x, (0, c.shape[1] * self.hop_size - len(x)), mode="edge")

        # check the length is valid
        assert len(x) == c.shape[
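The docstring changes in this hunk (and the similar ones above) convert Google-style "Args:/Returns:" blocks into NumPy-style sections. A minimal hypothetical function showing the target convention, matching the repo's own underline style (illustration only, not part of the diff):

def scale(x, factor=2):
    """Scale a value.

    Parameters
    ----------
    x : float
        Value to scale.
    factor : int
        Multiplier applied to x.

    Returns
    ----------
    float
        The scaled value.
    """
    return x * factor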
@@ -17,18 +17,12 @@ import argparse
import logging
import os

import numpy as np
import yaml
import json
import jsonlines

import numpy as np
from parakeet.datasets.data_table import DataTable
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm

from parakeet.datasets.data_table import DataTable
from parakeet.utils.h5_utils import read_hdf5
from parakeet.utils.h5_utils import write_hdf5

from config import get_cfg_default
@@ -82,7 +82,7 @@ lambda_adv: 4.0 # Loss balancing coefficient.
###########################################################
#                  DATA LOADER SETTING                    #
###########################################################
batch_size: 6          # Batch size.
batch_size: 8          # Batch size.
batch_max_steps: 25500 # Length of each audio in batch. Make sure divisible by hop_size.
pin_memory: true       # Whether to pin memory in Pytorch DataLoader.
num_workers: 4         # Number of workers in Pytorch DataLoader.
@@ -15,18 +15,15 @@

import argparse
import logging
import os
from operator import itemgetter
from pathlib import Path

import numpy as np
import yaml
import jsonlines
import numpy as np
from parakeet.datasets.data_table import DataTable
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm

from parakeet.datasets.data_table import DataTable

from config import get_cfg_default
@@ -12,95 +12,37 @@
# See the License for the specific language governing permissions and
# limitations under the License.

from typing import List, Dict, Any
import soundfile as sf
import librosa
import numpy as np
import argparse
import yaml
import json
import jsonlines
import concurrent.futures
from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor
from pathlib import Path
import tqdm
from operator import itemgetter
from praatio import tgio
from typing import Any
from typing import Dict
from typing import List

import argparse
import jsonlines
import librosa
import logging
import numpy as np
import tqdm
from concurrent.futures import ThreadPoolExecutor
from parakeet.data.get_feats import LogMelFBank
from pathlib import Path
from praatio import tgio

from config import get_cfg_default


def logmelfilterbank(audio,
                     sr,
                     n_fft=1024,
                     hop_length=256,
                     win_length=None,
                     window="hann",
                     n_mels=80,
                     fmin=None,
                     fmax=None,
                     eps=1e-10):
    """Compute log-Mel filterbank feature.

    Parameters
    ----------
    audio : ndarray
        Audio signal (T,).
    sr : int
        Sampling rate.
    n_fft : int
        FFT size. (Default value = 1024)
    hop_length : int
        Hop size. (Default value = 256)
    win_length : int
        Window length. If set to None, it will be the same as fft_size. (Default value = None)
    window : str
        Window function type. (Default value = "hann")
    n_mels : int
        Number of mel basis. (Default value = 80)
    fmin : int
        Minimum frequency in mel basis calculation. (Default value = None)
    fmax : int
        Maximum frequency in mel basis calculation. (Default value = None)
    eps : float
        Epsilon value to avoid inf in log calculation. (Default value = 1e-10)

    Returns
    -------
    np.ndarray
        Log Mel filterbank feature (#frames, num_mels).

    """
    # get amplitude spectrogram
    x_stft = librosa.stft(
        audio,
        n_fft=n_fft,
        hop_length=hop_length,
        win_length=win_length,
        window=window,
        pad_mode="reflect")
    spc = np.abs(x_stft)  # (#bins, #frames,)

    # get mel basis
    fmin = 0 if fmin is None else fmin
    fmax = sr / 2 if fmax is None else fmax
    mel_basis = librosa.filters.mel(sr, n_fft, n_mels, fmin, fmax)

    return np.log10(np.maximum(eps, np.dot(mel_basis, spc)))
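The removed logmelfilterbank helper can still be exercised standalone; a minimal sketch, assuming the function above is in scope and using a hypothetical test input (not part of the diff):

import numpy as np

sr = 24000
t = np.linspace(0, 1, sr, endpoint=False)
audio = (0.5 * np.sin(2 * np.pi * 440 * t)).astype(np.float32)  # 1 s test tone

mel = logmelfilterbank(audio, sr, n_fft=1024, hop_length=256, n_mels=80)
# Returns (n_mels, #frames); the old script saved logmel.T, i.e. (#frames, n_mels).
print(mel.shape)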
def process_sentence(config: Dict[str, Any],
                     fp: Path,
                     alignment_fp: Path,
                     output_dir: Path):
                     output_dir: Path,
                     mel_extractor=None):
    utt_id = fp.stem

    # reading
    y, sr = librosa.load(str(fp), sr=config.sr)  # resampling may occur
    assert len(y.shape) == 1, f"{utt_id} is not a mono-channel audio."
    assert np.abs(y).max(
    ) <= 1.0, f"{utt_id} seems to be different from 16 bit PCM."
    assert np.abs(
        y).max() <= 1.0, f"{utt_id} seems to be different from 16 bit PCM."
    duration = librosa.get_duration(y, sr=sr)

    # trim according to the alignment file
@@ -134,22 +76,14 @@ def process_sentence(config: Dict[str, Any],
        frame_length=config.trim_frame_length,
        hop_length=config.trim_hop_length)

    logmel = logmelfilterbank(
        y,
        sr=sr,
        n_fft=config.n_fft,
        window=config.window,
        win_length=config.win_length,
        hop_length=config.hop_length,
        n_mels=config.n_mels,
        fmin=config.fmin,
        fmax=config.fmax)
    # extract mel feats
    logmel = mel_extractor.get_log_mel_fbank(y)

    # adjust time to make num_samples == num_frames * hop_length
    num_frames = logmel.shape[1]
    num_frames = logmel.shape[0]
    if y.size < num_frames * config.hop_length:
        y = np.pad(y, (0, num_frames * config.hop_length - y.size),
                   mode="reflect")
        y = np.pad(
            y, (0, num_frames * config.hop_length - y.size), mode="reflect")
    else:
        y = y[:num_frames * config.hop_length]
    num_sample = y.shape[0]

@@ -157,7 +91,7 @@ def process_sentence(config: Dict[str, Any],
    mel_path = output_dir / (utt_id + "_feats.npy")
    wav_path = output_dir / (utt_id + "_wave.npy")
    np.save(wav_path, y)  # (num_samples, )
    np.save(mel_path, logmel.T)  # (num_frames, n_mels)
    np.save(mel_path, logmel)  # (num_frames, n_mels)
    record = {
        "utt_id": utt_id,
        "num_samples": num_sample,

@@ -172,19 +106,22 @@ def process_sentences(config,
                      fps: List[Path],
                      alignment_fps: List[Path],
                      output_dir: Path,
                      mel_extractor=None,
                      nprocs: int=1):
    if nprocs == 1:
        results = []
        for fp, alignment_fp in tqdm.tqdm(zip(fps, alignment_fps)):
            results.append(
                process_sentence(config, fp, alignment_fp, output_dir))
                process_sentence(config, fp, alignment_fp, output_dir,
                                 mel_extractor))
    else:
        with ThreadPoolExecutor(nprocs) as pool:
            futures = []
            with tqdm.tqdm(total=len(fps)) as progress:
                for fp, alignment_fp in zip(fps, alignment_fps):
                    future = pool.submit(process_sentence, config, fp,
                                         alignment_fp, output_dir)
                                         alignment_fp, output_dir,
                                         mel_extractor)
                    future.add_done_callback(lambda p: progress.update())
                    futures.append(future)

@@ -204,10 +141,7 @@ def main():
    parser = argparse.ArgumentParser(
        description="Preprocess audio and then extract features.")
    parser.add_argument(
        "--rootdir",
        default=None,
        type=str,
        help="directory to baker dataset.")
        "--rootdir", default=None, type=str, help="directory to baker dataset.")
    parser.add_argument(
        "--dumpdir",
        type=str,

@@ -260,24 +194,37 @@ def main():
    test_dump_dir = dumpdir / "test" / "raw"
    test_dump_dir.mkdir(parents=True, exist_ok=True)

    mel_extractor = LogMelFBank(
        sr=C.sr,
        n_fft=C.n_fft,
        hop_length=C.hop_length,
        win_length=C.win_length,
        window=C.window,
        n_mels=C.n_mels,
        fmin=C.fmin,
        fmax=C.fmax)

    # process for the 3 sections
    process_sentences(
        C,
        train_wav_files,
        train_alignment_files,
        train_dump_dir,
        mel_extractor=mel_extractor,
        nprocs=args.num_cpu)
    process_sentences(
        C,
        dev_wav_files,
        dev_alignment_files,
        dev_dump_dir,
        mel_extractor=mel_extractor,
        nprocs=args.num_cpu)
    process_sentences(
        C,
        test_wav_files,
        test_alignment_files,
        test_dump_dir,
        mel_extractor=mel_extractor,
        nprocs=args.num_cpu)
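The replacement path builds one LogMelFBank in main() and threads it through process_sentences to the workers. A minimal sketch of the same flow, with hypothetical config values (the parameter names match the constructor call above; "sample.wav" is a placeholder input):

import librosa
from parakeet.data.get_feats import LogMelFBank

mel_extractor = LogMelFBank(
    sr=24000, n_fft=2048, hop_length=300, win_length=1200,
    window="hann", n_mels=80, fmin=80, fmax=7600)  # hypothetical values

y, _ = librosa.load("sample.wav", sr=24000)   # hypothetical input file
logmel = mel_extractor.get_log_mel_fbank(y)   # (num_frames, n_mels), saved as-is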
@@ -20,17 +20,11 @@ from paddle.nn import Layer
from paddle.optimizer import Optimizer
from paddle.optimizer.lr import LRScheduler
from paddle.io import DataLoader
from paddle.io import DistributedBatchSampler
from timer import timer

from parakeet.datasets.data_table import DataTable
from parakeet.training.updaters.standard_updater import StandardUpdater, UpdaterState
from parakeet.training.extensions.evaluator import StandardEvaluator
from parakeet.training.trainer import Trainer
from parakeet.training.reporter import report
from parakeet.models.parallel_wavegan import PWGGenerator, PWGDiscriminator
from parakeet.modules.stft_loss import MultiResolutionSTFTLoss
from parakeet.utils.profile import synchronize


class PWGUpdater(StandardUpdater):
@@ -78,16 +72,17 @@ class PWGUpdater(StandardUpdater):
            wav_ = self.generator(noise, mel)
            logging.debug(f"Generator takes {t.elapse}s.")

        ## Multi-resolution stft loss
        # initialize
        gen_loss = 0.0

        ## Multi-resolution stft loss
        with timer() as t:
            sc_loss, mag_loss = self.criterion_stft(
                wav_.squeeze(1), wav.squeeze(1))
            sc_loss, mag_loss = self.criterion_stft(wav_, wav)
            logging.debug(f"Multi-resolution STFT loss takes {t.elapse}s.")

        report("train/spectral_convergence_loss", float(sc_loss))
        report("train/log_stft_magnitude_loss", float(mag_loss))
        gen_loss = sc_loss + mag_loss
        gen_loss += sc_loss + mag_loss

        ## Adversarial loss
        if self.state.iteration > self.discriminator_train_start_steps:

@@ -119,9 +114,9 @@ class PWGUpdater(StandardUpdater):
        p_ = self.discriminator(wav_.detach())
        real_loss = self.criterion_mse(p, paddle.ones_like(p))
        fake_loss = self.criterion_mse(p_, paddle.zeros_like(p_))
        dis_loss = real_loss + fake_loss
        report("train/real_loss", float(real_loss))
        report("train/fake_loss", float(fake_loss))
        dis_loss = real_loss + fake_loss
        report("train/discriminator_loss", float(dis_loss))

        self.optimizer_d.clear_grad()

@@ -164,8 +159,7 @@ class PWGEvaluator(StandardEvaluator):

        # stft loss
        with timer() as t:
            sc_loss, mag_loss = self.criterion_stft(
                wav_.squeeze(1), wav.squeeze(1))
            sc_loss, mag_loss = self.criterion_stft(wav_, wav)
            logging.debug(f"Multi-resolution STFT loss takes {t.elapse}s")

        report("eval/spectral_convergence_loss", float(sc_loss))

@@ -178,7 +172,7 @@ class PWGEvaluator(StandardEvaluator):
        p = self.discriminator(wav)
        real_loss = self.criterion_mse(p, paddle.ones_like(p))
        fake_loss = self.criterion_mse(p_, paddle.zeros_like(p_))
        dis_loss = real_loss + fake_loss
        report("eval/real_loss", float(real_loss))
        report("eval/fake_loss", float(fake_loss))
        dis_loss = real_loss + fake_loss
        report("eval/discriminator_loss", float(dis_loss))
@@ -12,34 +12,31 @@
# See the License for the specific language governing permissions and
# limitations under the License.

import os
import sys
from timer import timer
import logging
import argparse
import os
from pathlib import Path
from timer import timer

import yaml
import jsonlines
import paddle
import numpy as np
import paddle
import soundfile as sf
import yaml
from paddle import distributed as dist

from parakeet.datasets.data_table import DataTable
from parakeet.models.parallel_wavegan import PWGGenerator

from config import get_cfg_default

parser = argparse.ArgumentParser(
    description="synthesize with parallel wavegan.")
    description="Synthesize with parallel wavegan.")
parser.add_argument(
    "--config", type=str, help="config file to overwrite default config")
parser.add_argument("--checkpoint", type=str, help="snapshot to load")
parser.add_argument("--test-metadata", type=str, help="dev data")
parser.add_argument("--output-dir", type=str, help="output dir")
parser.add_argument("--device", type=str, default="gpu", help="device to run")
parser.add_argument("--verbose", type=int, default=1, help="verbose")
    "--config", type=str, help="config file to overwrite default config.")
parser.add_argument("--checkpoint", type=str, help="snapshot to load.")
parser.add_argument("--test-metadata", type=str, help="dev data.")
parser.add_argument("--output-dir", type=str, help="output dir.")
parser.add_argument("--device", type=str, default="gpu", help="device to run.")
parser.add_argument("--verbose", type=int, default=1, help="verbose.")

args = parser.parse_args()
config = get_cfg_default()

@@ -89,5 +86,5 @@ for example in test_dataset:
    print(
        f"{utt_id}, mel: {mel.shape}, wave: {wav.shape}, time: {t.elapse}s, Hz: {speed}, RTF: {config.sr / speed}."
    )
    sf.write(output_dir / (utt_id + ".wav"), wav, samplerate=config.sr)
    sf.write(str(output_dir / (utt_id + ".wav")), wav, samplerate=config.sr)
print(f"generation speed: {N / T}Hz, RTF: {config.sr / (N / T) }")
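The RTF (real-time factor) printed above is the ratio of the audio sample rate to the generation speed; RTF below 1 means faster than real time. A worked sketch of the same computation with hypothetical numbers (not part of the diff):

N = 72_000        # total samples generated
T = 2.0           # total wall-clock seconds spent generating
sr = 24000        # sample rate from the config
speed = N / T     # generation speed in Hz: 36,000 samples/s
rtf = sr / speed  # 24000 / 36000 ~= 0.67 -> about 1.5x faster than real time
print(f"generation speed: {speed}Hz, RTF: {rtf}")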
@@ -0,0 +1,5 @@
python3 synthesize.py \
    --config=conf/default.yaml \
    --checkpoint=exp/default/checkpoints/snapshot_iter_220000.pdz \
    --test-metadata=dump/test/norm/metadata.jsonl \
    --output-dir=exp/debug/test
@@ -0,0 +1,111 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import argparse
import os
import logging
from pathlib import Path

import librosa
import numpy as np
import paddle
import soundfile as sf
import yaml
from parakeet.data.get_feats import LogMelFBank
from parakeet.models.parallel_wavegan import PWGGenerator, PWGInference
from parakeet.modules.normalizer import ZScore

from config import get_cfg_default


def evaluate(args, config):
    # dataloader has been too verbose
    logging.getLogger("DataLoader").disabled = True

    vocoder = PWGGenerator(**config["generator_params"])
    state_dict = paddle.load(args.checkpoint)
    vocoder.set_state_dict(state_dict["generator_params"])
    vocoder.remove_weight_norm()
    vocoder.eval()
    print("model done!")

    stat = np.load(args.stat)
    mu, std = stat
    mu = paddle.to_tensor(mu)
    std = paddle.to_tensor(std)
    normalizer = ZScore(mu, std)

    pwg_inference = PWGInference(normalizer, vocoder)

    input_dir = Path(args.input_dir)
    output_dir = Path(args.output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    mel_extractor = LogMelFBank(
        sr=config.sr,
        n_fft=config.n_fft,
        hop_length=config.hop_length,
        win_length=config.win_length,
        window=config.window,
        n_mels=config.n_mels,
        fmin=config.fmin,
        fmax=config.fmax)

    for utt_name in os.listdir(input_dir):
        wav, _ = librosa.load(str(input_dir / utt_name), sr=config.sr)
        # extract mel feats
        mel = mel_extractor.get_log_mel_fbank(wav)
        mel = paddle.to_tensor(mel)
        gen_wav = pwg_inference(mel)
        sf.write(
            str(output_dir / ("gen_" + utt_name)),
            gen_wav.numpy(),
            samplerate=config.sr)
        print(f"{utt_name} done!")


def main():
    # parse args and config and redirect to evaluate
    parser = argparse.ArgumentParser(
        description="Synthesize with parallel wavegan.")

    parser.add_argument(
        "--config", type=str, help="config file to overwrite default config.")
    parser.add_argument("--checkpoint", type=str, help="snapshot to load.")
    parser.add_argument(
        "--stat",
        type=str,
        help="mean and standard deviation used to normalize spectrogram when training parallel wavegan."
    )
    parser.add_argument("--input-dir", type=str, help="input dir of wavs.")
    parser.add_argument("--output-dir", type=str, help="output dir.")
    parser.add_argument(
        "--device", type=str, default="gpu", help="device to run.")
    parser.add_argument("--verbose", type=int, default=1, help="verbose.")

    args = parser.parse_args()
    config = get_cfg_default()
    if args.config:
        config.merge_from_file(args.config)

    print("========Args========")
    print(yaml.safe_dump(vars(args)))
    print("========Config========")
    print(config)

    evaluate(args, config)


if __name__ == "__main__":
    main()
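The ZScore normalizer applies standard z-score normalization to the mel input using the training-set statistics stored in the --stat file. A minimal sketch of the idea in plain NumPy (the actual parakeet.modules.normalizer implementation may differ):

import numpy as np

def zscore(x, mu, std):
    # Standardize features to match training-time statistics.
    return (x - mu) / std

mel = np.random.rand(100, 80).astype(np.float32)      # hypothetical (frames, n_mels) features
stat = np.stack([mel.mean(axis=0), mel.std(axis=0)])  # what the stats file holds: mean, std
normalized = zscore(mel, stat[0], stat[1])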
@@ -12,36 +12,29 @@
# See the License for the specific language governing permissions and
# limitations under the License.

import os
import sys
import logging
import argparse
import dataclasses
from pathlib import Path
import os
import logging

import yaml
import jsonlines
import paddle
import numpy as np
from paddle import nn
from paddle.nn import functional as F
import paddle
import yaml
from paddle import DataParallel
from paddle import distributed as dist
from paddle import nn
from paddle.io import DataLoader, DistributedBatchSampler
from paddle.optimizer import Adam  # No RAdam
from paddle.optimizer.lr import StepDecay
from paddle import DataParallel
from visualdl import LogWriter

from parakeet.datasets.data_table import DataTable
from parakeet.training.updater import UpdaterBase
from parakeet.training.trainer import Trainer
from parakeet.training.reporter import report
from parakeet.training import extension
from parakeet.training.extensions.snapshot import Snapshot
from parakeet.training.extensions.visualizer import VisualDL
from parakeet.models.parallel_wavegan import PWGGenerator, PWGDiscriminator
from parakeet.modules.stft_loss import MultiResolutionSTFTLoss
from parakeet.training.extensions.snapshot import Snapshot
from parakeet.training.extensions.visualizer import VisualDL
from parakeet.training.seeding import seed_everything
from parakeet.training.trainer import Trainer
from pathlib import Path
from visualdl import LogWriter

from batch_fn import Clip
from config import get_cfg_default

@@ -137,8 +130,7 @@ def train_sp(args, config):
        parameters=generator.parameters(),
        **config["generator_optimizer_params"])
    lr_schedule_d = StepDecay(**config["discriminator_scheduler_params"])
    gradient_clip_d = nn.ClipGradByGlobalNorm(config[
        "discriminator_grad_norm"])
    gradient_clip_d = nn.ClipGradByGlobalNorm(config["discriminator_grad_norm"])
    optimizer_d = Adam(
        learning_rate=lr_schedule_d,
        grad_clip=gradient_clip_d,

@@ -191,8 +183,7 @@ def train_sp(args, config):
        stop_trigger=(config.train_max_steps, "iteration"),
        out=output_dir, )

    trainer.extend(
        evaluator, trigger=(config.eval_interval_steps, 'iteration'))
    trainer.extend(evaluator, trigger=(config.eval_interval_steps, 'iteration'))
    if dist.get_rank() == 0:
        writer = LogWriter(str(trainer.out))
        trainer.extend(VisualDL(writer), trigger=(1, 'iteration'))

@@ -210,15 +201,15 @@ def main():
    parser = argparse.ArgumentParser(description="Train a ParallelWaveGAN "
                                     "model with Baker Mandarin TTS dataset.")
    parser.add_argument(
        "--config", type=str, help="config file to overwrite default config")
    parser.add_argument("--train-metadata", type=str, help="training data")
    parser.add_argument("--dev-metadata", type=str, help="dev data")
    parser.add_argument("--output-dir", type=str, help="output dir")
        "--config", type=str, help="config file to overwrite default config.")
    parser.add_argument("--train-metadata", type=str, help="training data.")
    parser.add_argument("--dev-metadata", type=str, help="dev data.")
    parser.add_argument("--output-dir", type=str, help="output dir.")
    parser.add_argument(
        "--device", type=str, default="gpu", help="device type to use")
        "--device", type=str, default="gpu", help="device type to use.")
    parser.add_argument(
        "--nprocs", type=int, default=1, help="number of processes")
    parser.add_argument("--verbose", type=int, default=1, help="verbose")
        "--nprocs", type=int, default=1, help="number of processes.")
    parser.add_argument("--verbose", type=int, default=1, help="verbose.")

    args = parser.parse_args()
    if args.device == "cpu" and args.nprocs > 1:
@@ -22,8 +22,7 @@ def collate_baker_examples(examples):
    tones = [np.array(item["tones"], dtype=np.int64) for item in examples]
    feats = [np.array(item["feats"], dtype=np.float32) for item in examples]
    durations = [
        np.array(
            item["durations"], dtype=np.int64) for item in examples
        np.array(item["durations"], dtype=np.int64) for item in examples
    ]
    num_phones = np.array([item["num_phones"] for item in examples])
    num_frames = np.array([item["num_frames"] for item in examples])
@@ -15,21 +15,14 @@

import argparse
import logging
import os
from pathlib import Path

import numpy as np
import yaml
import json
import jsonlines

import numpy as np
from parakeet.datasets.data_table import DataTable
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm

from parakeet.datasets.data_table import DataTable
from parakeet.utils.h5_utils import read_hdf5
from parakeet.utils.h5_utils import write_hdf5

from config import get_cfg_default
@@ -17,7 +17,6 @@ from pathlib import Path

import numpy as np
import paddle
import pypinyin
from pypinyin import lazy_pinyin, Style
import jieba
import phkit
@@ -15,9 +15,8 @@
import argparse
from pathlib import Path

import numpy as np
from paddle import inference
import soundfile as sf
from paddle import inference

from frontend import text_analysis


@@ -73,8 +72,8 @@ def main():

    speedyspeech_predictor.run()
    output_names = speedyspeech_predictor.get_output_names()
    output_handle = speedyspeech_predictor.get_output_handle(output_names[
        0])
    output_handle = speedyspeech_predictor.get_output_handle(
        output_names[0])
    output_data = output_handle.copy_to_cpu()

    input_names = pwg_predictor.get_input_names()
@@ -15,19 +15,16 @@

import argparse
import logging
import os
from copy import copy
from operator import itemgetter
from pathlib import Path

import numpy as np
import yaml
import jsonlines
import numpy as np
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm

from parakeet.frontend.vocab import Vocab
from parakeet.datasets.data_table import DataTable
from parakeet.frontend.vocab import Vocab

from config import get_cfg_default
@@ -100,7 +97,10 @@ def main():
for item in metadata:
item["feats"] = str(metadata_dir / item["feats"])

dataset = DataTable(metadata, converters={'feats': np.load, })
dataset = DataTable(
metadata, converters={
'feats': np.load,
})
logging.info(f"The number of files = {len(dataset)}.")

# restore scaler
@@ -12,97 +12,39 @@
# See the License for the specific language governing permissions and
# limitations under the License.

from typing import List, Dict, Any
import soundfile as sf
import librosa
import numpy as np
import argparse
import yaml
import json
import re
import jsonlines
import concurrent.futures
from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor
from pathlib import Path
import tqdm
from operator import itemgetter
from praatio import tgio
from typing import Any
from typing import Dict
from typing import List

import argparse
import jsonlines
import librosa
import logging
import numpy as np
import re
import tqdm
from concurrent.futures import ThreadPoolExecutor
from parakeet.data.get_feats import LogMelFBank
from pathlib import Path
from praatio import tgio

from config import get_cfg_default
from tg_utils import validate_textgrid


def logmelfilterbank(audio,
sr,
n_fft=1024,
hop_length=256,
win_length=None,
window="hann",
n_mels=80,
fmin=None,
fmax=None,
eps=1e-10):
"""Compute log-Mel filterbank feature.

Parameters
----------
audio : ndarray
Audio signal (T,).
sr : int
Sampling rate.
n_fft : int
FFT size. (Default value = 1024)
hop_length : int
Hop size. (Default value = 256)
win_length : int
Window length. If set to None, it will be the same as fft_size. (Default value = None)
window : str
Window function type. (Default value = "hann")
n_mels : int
Number of mel basis. (Default value = 80)
fmin : int
Minimum frequency in mel basis calculation. (Default value = None)
fmax : int
Maximum frequency in mel basis calculation. (Default value = None)
eps : float
Epsilon value to avoid inf in log calculation. (Default value = 1e-10)

Returns
-------
np.ndarray
Log Mel filterbank feature (#frames, num_mels).

"""
# get amplitude spectrogram
x_stft = librosa.stft(
audio,
n_fft=n_fft,
hop_length=hop_length,
win_length=win_length,
window=window,
pad_mode="reflect")
spc = np.abs(x_stft)  # (#bins, #frames,)

# get mel basis
fmin = 0 if fmin is None else fmin
fmax = sr / 2 if fmax is None else fmax
mel_basis = librosa.filters.mel(sr, n_fft, n_mels, fmin, fmax)

return np.log10(np.maximum(eps, np.dot(mel_basis, spc)))


def process_sentence(config: Dict[str, Any],
fp: Path,
alignment_fp: Path,
output_dir: Path):
output_dir: Path,
mel_extractor=None):
utt_id = fp.stem

# reading
y, sr = librosa.load(fp, sr=config.sr)  # resampling may occur
y, sr = librosa.load(str(fp), sr=config.sr)  # resampling may occur
assert len(y.shape) == 1, f"{utt_id} is not a mono-channel audio."
assert np.abs(y).max(
) <= 1.0, f"{utt_id} seems to be different from 16 bit PCM."
assert np.abs(
y).max() <= 1.0, f"{utt_id} seems to be different from 16 bit PCM."
duration = librosa.get_duration(y, sr=sr)

# intervals with empty labels are ignored
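The hunk above retires the local `logmelfilterbank` helper in favor of the shared `LogMelFBank` extractor from `parakeet.data.get_feats`. For reference, a minimal self-contained sketch of the computation the old helper performed, using only librosa and numpy (the 24 kHz rate and the test tone are illustrative values, not repo config):

```python
import numpy as np
import librosa

def log_mel_fbank(y, sr, n_fft=1024, hop_length=256, n_mels=80, eps=1e-10):
    # amplitude spectrogram, shape (n_fft // 2 + 1, num_frames)
    spc = np.abs(
        librosa.stft(y, n_fft=n_fft, hop_length=hop_length,
                     pad_mode="reflect"))
    # mel basis, shape (n_mels, n_fft // 2 + 1)
    mel_basis = librosa.filters.mel(sr=sr, n_fft=n_fft, n_mels=n_mels)
    # clamp before log10 so silent frames do not produce -inf
    return np.log10(np.maximum(eps, np.dot(mel_basis, spc)))

sr = 24000  # illustrative sample rate
y = np.sin(2 * np.pi * 440 * np.arange(sr) / sr).astype(np.float32)
print(log_mel_fbank(y, sr).shape)  # (80, num_frames)
```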
@@ -125,16 +67,8 @@ def process_sentence(config: Dict[str, Any],
f" There is something wrong with the last interval {last} in utterance: {utt_id}"
)

logmel = logmelfilterbank(
y,
sr=sr,
n_fft=config.n_fft,
window=config.window,
win_length=config.win_length,
hop_length=config.hop_length,
n_mels=config.n_mels,
fmin=config.fmin,
fmax=config.fmax)
# extract mel feats
logmel = mel_extractor.get_log_mel_fbank(y)

# extract phone and duration
phones = []
@@ -162,7 +96,7 @@ def process_sentence(config: Dict[str, Any],
ends, sr=sr, hop_length=config.hop_length)
durations_frame = np.diff(frame_pos, prepend=0)

num_frames = logmel.shape[-1]  # number of frames of the spectrogram
num_frames = logmel.shape[0]  # number of frames of the spectrogram
extra = np.sum(durations_frame) - num_frames
assert extra <= 0, (
f"Number of frames inferred from alignment is "

@@ -173,7 +107,7 @@ def process_sentence(config: Dict[str, Any],
durations_frame = durations_frame.tolist()

mel_path = output_dir / (utt_id + "_feats.npy")
np.save(mel_path, logmel.T)  # (num_frames, n_mels)
np.save(mel_path, logmel)  # (num_frames, n_mels)
record = {
"utt_id": utt_id,
"phones": phones,
@@ -190,20 +124,23 @@ def process_sentences(config,
fps: List[Path],
alignment_fps: List[Path],
output_dir: Path,
mel_extractor=None,
nprocs: int=1):
if nprocs == 1:
results = []
for fp, alignment_fp in tqdm.tqdm(
zip(fps, alignment_fps), total=len(fps)):
results.append(
process_sentence(config, fp, alignment_fp, output_dir))
process_sentence(config, fp, alignment_fp, output_dir,
mel_extractor))
else:
with ThreadPoolExecutor(nprocs) as pool:
futures = []
with tqdm.tqdm(total=len(fps)) as progress:
for fp, alignment_fp in zip(fps, alignment_fps):
future = pool.submit(process_sentence, config, fp,
alignment_fp, output_dir)
alignment_fp, output_dir,
mel_extractor)
future.add_done_callback(lambda p: progress.update())
futures.append(future)
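The change threads `mel_extractor` through both branches of `process_sentences`. The thread-pool branch pairs `pool.submit` with a tqdm done-callback so the bar ticks as tasks finish; a minimal sketch of that pattern in isolation (the `work` function and its inputs are illustrative stand-ins, not repo code):

```python
import tqdm
from concurrent.futures import ThreadPoolExecutor

def work(item):
    return item * 2  # stand-in for per-utterance feature extraction

items = list(range(100))
with ThreadPoolExecutor(4) as pool:
    with tqdm.tqdm(total=len(items)) as progress:
        futures = []
        for item in items:
            future = pool.submit(work, item)
            # tick the bar whenever any task completes
            future.add_done_callback(lambda p: progress.update())
            futures.append(future)
        results = [f.result() for f in futures]
print(sum(results))
```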
@@ -227,10 +164,7 @@ def main():
parser = argparse.ArgumentParser(
description="Preprocess audio and then extract features.")
parser.add_argument(
"--rootdir",
default=None,
type=str,
help="directory to baker dataset.")
"--rootdir", default=None, type=str, help="directory to baker dataset.")
parser.add_argument(
"--dumpdir",
type=str,

@@ -288,24 +222,37 @@ def main():
test_dump_dir = dumpdir / "test" / "raw"
test_dump_dir.mkdir(parents=True, exist_ok=True)

mel_extractor = LogMelFBank(
sr=C.sr,
n_fft=C.n_fft,
hop_length=C.hop_length,
win_length=C.win_length,
window=C.window,
n_mels=C.n_mels,
fmin=C.fmin,
fmax=C.fmax)

# process for the 3 sections
process_sentences(
C,
train_wav_files,
train_alignment_files,
train_dump_dir,
mel_extractor=mel_extractor,
nprocs=args.num_cpu)
process_sentences(
C,
dev_wav_files,
dev_alignment_files,
dev_dump_dir,
mel_extractor=mel_extractor,
nprocs=args.num_cpu)
process_sentences(
C,
test_wav_files,
test_alignment_files,
test_dump_dir,
mel_extractor=mel_extractor,
nprocs=args.num_cpu)
@@ -13,15 +13,13 @@
# limitations under the License.

import paddle
from paddle.nn import functional as F
from paddle.fluid.layers import huber_loss

from parakeet.modules.ssim import ssim
from paddle.nn import functional as F
from parakeet.modules.losses import masked_l1_loss, weighted_mean
from parakeet.modules.ssim import ssim
from parakeet.training.extensions.evaluator import StandardEvaluator
from parakeet.training.reporter import report
from parakeet.training.updaters.standard_updater import StandardUpdater
from parakeet.training.extensions.evaluator import StandardEvaluator
from parakeet.models.speedyspeech import SpeedySpeech


class SpeedySpeechUpdater(StandardUpdater):
@@ -11,30 +11,25 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os
import sys
import logging
import argparse
import dataclasses
from pathlib import Path

import yaml
import jsonlines
import paddle
import numpy as np
import soundfile as sf
import paddle
from paddle import nn
from paddle.nn import functional as F
from paddle import distributed as dist
import yaml
from paddle import jit
from paddle.static import InputSpec
from yacs.config import CfgNode

from parakeet.datasets.data_table import DataTable
from parakeet.models.speedyspeech import SpeedySpeech, SpeedySpeechInference
from parakeet.models.parallel_wavegan import PWGGenerator, PWGInference
from parakeet.models.speedyspeech import SpeedySpeech
from parakeet.models.speedyspeech import SpeedySpeechInference
from parakeet.models.parallel_wavegan import PWGGenerator
from parakeet.models.parallel_wavegan import PWGInference
from parakeet.modules.normalizer import ZScore
@@ -79,9 +74,8 @@ def evaluate(args, speedyspeech_config, pwg_config):
speedyspeech_inference = jit.to_static(
speedyspeech_inference,
input_spec=[
InputSpec(
[-1], dtype=paddle.int64), InputSpec(
[-1], dtype=paddle.int64)
InputSpec([-1], dtype=paddle.int64), InputSpec(
[-1], dtype=paddle.int64)
])
paddle.jit.save(speedyspeech_inference,
os.path.join(args.inference_dir, "speedyspeech"))

@@ -91,9 +85,9 @@ def evaluate(args, speedyspeech_config, pwg_config):
pwg_inference = PWGInference(pwg_normalizer, vocoder)
pwg_inference.eval()
pwg_inference = jit.to_static(
pwg_inference,
input_spec=[InputSpec(
[-1, 80], dtype=paddle.float32), ])
pwg_inference, input_spec=[
InputSpec([-1, 80], dtype=paddle.float32),
])
paddle.jit.save(pwg_inference, os.path.join(args.inference_dir, "pwg"))
pwg_inference = paddle.jit.load(os.path.join(args.inference_dir, "pwg"))
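Both hunks only rewrap calls to `jit.to_static` before `paddle.jit.save`; the `InputSpec` shapes are unchanged ([-1] token ids for the acoustic model, [-1, 80] mel frames for the vocoder). A hedged sketch of the same export-and-reload round trip, with a toy layer standing in for `PWGInference` (`TinyNet` and the output path are made up for illustration):

```python
import paddle
from paddle import jit, nn
from paddle.static import InputSpec

class TinyNet(nn.Layer):
    # stand-in for PWGInference: maps 80-dim mel frames to samples
    def __init__(self):
        super().__init__()
        self.fc = nn.Linear(80, 1)

    def forward(self, mel):
        return self.fc(mel)

net = TinyNet()
net.eval()
# trace with a variable-length time axis and a fixed 80-bin mel axis
static_net = jit.to_static(
    net, input_spec=[InputSpec([-1, 80], dtype=paddle.float32)])
paddle.jit.save(static_net, "/tmp/tiny_pwg")
reloaded = paddle.jit.load("/tmp/tiny_pwg")
print(reloaded(paddle.randn([100, 80])).shape)  # [100, 1]
```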
@@ -119,9 +113,7 @@ def main():
parser = argparse.ArgumentParser(
description="Synthesize with speedyspeech & parallel wavegan.")
parser.add_argument(
"--speedyspeech-config",
type=str,
help="config file for speedyspeech.")
"--speedyspeech-config", type=str, help="config file for speedyspeech.")
parser.add_argument(
"--speedyspeech-checkpoint",
type=str,
@@ -1,6 +1,6 @@
python synthesize.py \
--speedyspeech-config=conf/default.yaml \
--speedyspeech-checkpoint=exp/debug/checkpoints/snapshot_iter_91800.pdz \
--speedyspeech-checkpoint=exp/default/checkpoints/snapshot_iter_91800.pdz \
--speedyspeech-stat=dump/train/stats.npy \
--pwg-config=../../parallelwave_gan/baker/conf/default.yaml \
--pwg-params=../../parallelwave_gan/baker/converted.pdparams \
@@ -13,28 +13,22 @@
# limitations under the License.

import os
import sys
import logging
import argparse
import dataclasses
from pathlib import Path

import yaml
import jsonlines
import paddle
import numpy as np
import soundfile as sf
import paddle
import yaml
from paddle import jit
from paddle.static import InputSpec
from paddle import nn
from paddle.nn import functional as F
from paddle import distributed as dist
from yacs.config import CfgNode

from parakeet.datasets.data_table import DataTable
from parakeet.models.speedyspeech import SpeedySpeech, SpeedySpeechInference
from parakeet.models.parallel_wavegan import PWGGenerator, PWGInference
from parakeet.models.speedyspeech import SpeedySpeech
from parakeet.models.speedyspeech import SpeedySpeechInference
from parakeet.models.parallel_wavegan import PWGGenerator
from parakeet.models.parallel_wavegan import PWGInference
from parakeet.modules.normalizer import ZScore

from frontend import text_analysis

@@ -57,8 +51,7 @@ def evaluate(args, speedyspeech_config, pwg_config):
model.eval()

vocoder = PWGGenerator(**pwg_config["generator_params"])
vocoder.set_state_dict(
paddle.load(args.pwg_checkpoint)["generator_params"])
vocoder.set_state_dict(paddle.load(args.pwg_checkpoint)["generator_params"])
vocoder.remove_weight_norm()
vocoder.eval()
print("model done!")
@@ -81,9 +74,8 @@ def evaluate(args, speedyspeech_config, pwg_config):
speedyspeech_inference = jit.to_static(
speedyspeech_inference,
input_spec=[
InputSpec(
[-1], dtype=paddle.int64), InputSpec(
[-1], dtype=paddle.int64)
InputSpec([-1], dtype=paddle.int64), InputSpec(
[-1], dtype=paddle.int64)
])
paddle.jit.save(speedyspeech_inference,
os.path.join(args.inference_dir, "speedyspeech"))

@@ -93,9 +85,9 @@ def evaluate(args, speedyspeech_config, pwg_config):
pwg_inference = PWGInference(pwg_normalizer, vocoder)
pwg_inference.eval()
pwg_inference = jit.to_static(
pwg_inference,
input_spec=[InputSpec(
[-1, 80], dtype=paddle.float32), ])
pwg_inference, input_spec=[
InputSpec([-1, 80], dtype=paddle.float32),
])
paddle.jit.save(pwg_inference, os.path.join(args.inference_dir, "pwg"))
pwg_inference = paddle.jit.load(os.path.join(args.inference_dir, "pwg"))

@@ -119,9 +111,7 @@ def main():
parser = argparse.ArgumentParser(
description="Synthesize with speedyspeech & parallel wavegan.")
parser.add_argument(
"--speedyspeech-config",
type=str,
help="config file for speedyspeech.")
"--speedyspeech-config", type=str, help="config file for speedyspeech.")
parser.add_argument(
"--speedyspeech-checkpoint",
type=str,
@@ -13,7 +13,6 @@
# limitations under the License.

import librosa
from praatio import tgio


def validate_textgrid(text_grid, num_samples, sr):
@@ -12,40 +12,31 @@
# See the License for the specific language governing permissions and
# limitations under the License.

import os
import sys
import logging
import argparse
import dataclasses
from pathlib import Path
import logging
import os

import yaml
import jsonlines
import paddle
import numpy as np
from paddle import nn
from paddle.nn import functional as F
import paddle
import yaml
from paddle import distributed as dist
from paddle import DataParallel
from paddle import nn
from paddle.io import DataLoader, DistributedBatchSampler
from paddle.optimizer import Adam  # No RAdam
from paddle.optimizer.lr import StepDecay
from paddle import DataParallel
from visualdl import LogWriter

from parakeet.datasets.data_table import DataTable
from parakeet.models.speedyspeech import SpeedySpeech

from parakeet.training.updater import UpdaterBase
from parakeet.training.trainer import Trainer
from parakeet.training.reporter import report
from parakeet.training import extension
from parakeet.training.extensions.snapshot import Snapshot
from parakeet.training.extensions.visualizer import VisualDL
from parakeet.training.seeding import seed_everything
from parakeet.training.trainer import Trainer
from pathlib import Path
from visualdl import LogWriter

from batch_fn import collate_baker_examples
from speedyspeech_updater import SpeedySpeechUpdater, SpeedySpeechEvaluator
from config import get_cfg_default
from speedyspeech_updater import SpeedySpeechUpdater, SpeedySpeechEvaluator


def train_sp(args, config):
@@ -81,7 +72,9 @@ def train_sp(args, config):
fields=[
"phones", "tones", "num_phones", "num_frames", "feats", "durations"
],
converters={"feats": np.load, }, )
converters={
"feats": np.load,
}, )
with jsonlines.open(args.dev_metadata, 'r') as reader:
dev_metadata = list(reader)
metadata_dir = Path(args.dev_metadata).parent

@@ -92,7 +85,9 @@ def train_sp(args, config):
fields=[
"phones", "tones", "num_phones", "num_frames", "feats", "durations"
],
converters={"feats": np.load, }, )
converters={
"feats": np.load,
}, )

# collate function and dataloader
train_sampler = DistributedBatchSampler(

@@ -100,10 +95,6 @@ def train_sp(args, config):
batch_size=config.batch_size,
shuffle=False,
drop_last=True)
# dev_sampler = DistributedBatchSampler(dev_dataset,
# batch_size=config.batch_size,
# shuffle=False,
# drop_last=False)
print("samplers done!")

train_dataloader = DataLoader(

@@ -123,7 +114,6 @@ def train_sp(args, config):
model = SpeedySpeech(**config["model"])
if world_size > 1:
model = DataParallel(model)  # TODO, do not use vocab size from config
# print(model)
print("model done!")
optimizer = Adam(
0.001,
@@ -154,15 +144,15 @@ def main():
parser = argparse.ArgumentParser(description="Train a Speedyspeech "
"model with Baker Mandarin TTS dataset.")
parser.add_argument(
"--config", type=str, help="config file to overwrite default config")
parser.add_argument("--train-metadata", type=str, help="training data")
parser.add_argument("--dev-metadata", type=str, help="dev data")
parser.add_argument("--output-dir", type=str, help="output dir")
"--config", type=str, help="config file to overwrite default config.")
parser.add_argument("--train-metadata", type=str, help="training data.")
parser.add_argument("--dev-metadata", type=str, help="dev data.")
parser.add_argument("--output-dir", type=str, help="output dir.")
parser.add_argument(
"--device", type=str, default="gpu", help="device type to use")
"--device", type=str, default="gpu", help="device type to use.")
parser.add_argument(
"--nprocs", type=int, default=1, help="number of processes")
parser.add_argument("--verbose", type=int, default=1, help="verbose")
"--nprocs", type=int, default=1, help="number of processes.")
parser.add_argument("--verbose", type=int, default=1, help="verbose.")

args, rest = parser.parse_known_args()
if args.device == "cpu" and args.nprocs > 1:
@@ -46,8 +46,7 @@ class LJSpeech(Dataset):
class LJSpeechCollector(object):
"""A simple callable to batch LJSpeech examples."""

def __init__(self, padding_idx=0, padding_value=0.,
padding_stop_token=1.0):
def __init__(self, padding_idx=0, padding_value=0., padding_stop_token=1.0):
self.padding_idx = padding_idx
self.padding_value = padding_value
self.padding_stop_token = padding_stop_token

@@ -63,8 +63,7 @@ def create_dataset(config, source_path, target_path, verbose=False):
with open(target_path / "metadata.pkl", 'wb') as f:
pickle.dump(records, f)
if verbose:
print("saved metadata into {}".format(target_path /
"metadata.pkl"))
print("saved metadata into {}".format(target_path / "metadata.pkl"))

print("Done.")
@@ -14,14 +14,13 @@

import time
from collections import defaultdict

import numpy as np

import paddle
from paddle.io import DataLoader
from paddle.io import DistributedBatchSampler
from paddle import distributed as dist
from paddle.io import DataLoader, DistributedBatchSampler

from parakeet.data import dataset
from parakeet.frontend import EnglishCharacter  # pylint: disable=unused-import
from parakeet.training.cli import default_argument_parser
from parakeet.training.experiment import ExperimentBase
from parakeet.utils import display, mp_tools

@@ -74,8 +73,7 @@ class Experiment(ExperimentBase):

if dist.get_rank() == 0:
for k, v in losses_np.items():
self.visualizer.add_scalar(f"train_loss/{k}", v,
self.iteration)
self.visualizer.add_scalar(f"train_loss/{k}", v, self.iteration)

@mp_tools.rank_zero_only
@paddle.no_grad()
@@ -65,8 +65,8 @@ def collate_aishell3_examples(examples):
text_lengths = np.array([item.shape[0] for item in phones], dtype=np.int64)
spec_lengths = np.array([item.shape[1] for item in mel], dtype=np.int64)
T_dec = np.max(spec_lengths)
stop_tokens = (np.arange(T_dec) >= np.expand_dims(spec_lengths, -1)
).astype(np.float32)
stop_tokens = (
np.arange(T_dec) >= np.expand_dims(spec_lengths, -1)).astype(np.float32)
phones, _ = batch_text_id(phones)
tones, _ = batch_text_id(tones)
mel, _ = batch_spec(mel)
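The reformatted `stop_tokens` expression builds the stop mask by broadcasting a frame index row against a column of per-utterance lengths. A small worked example of exactly that line:

```python
import numpy as np

spec_lengths = np.array([3, 5, 4], dtype=np.int64)
T_dec = np.max(spec_lengths)
# row i is 0.0 for valid frames and 1.0 from frame spec_lengths[i] onward
stop_tokens = (
    np.arange(T_dec) >= np.expand_dims(spec_lengths, -1)).astype(np.float32)
print(stop_tokens)
# [[0. 0. 0. 1. 1.]
#  [0. 0. 0. 0. 0.]
#  [0. 0. 0. 0. 1.]]
```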
@@ -121,8 +121,8 @@ def convert(syllable):
syllable = syllable.replace("ing", "ieng").replace("in", "ien")

# expansion for un, ui, iu
syllable = syllable.replace("un", "uen").replace(
"ui", "uei").replace("iu", "iou")
syllable = syllable.replace("un", "uen").replace("ui",
"uei").replace("iu", "iou")

# rule for variants of i
syllable = syllable.replace("zi", "zii").replace("ci", "cii").replace("si", "sii")\

@@ -68,8 +68,7 @@ def preprocess_aishell3(source_dir, target_dir, alignment_dir):
alignment_dir=alignment_dir)
with Pool(16) as p:
list(
tqdm(
p.imap(fx, wav_paths), total=len(wav_paths), unit="utterance"))
tqdm(p.imap(fx, wav_paths), total=len(wav_paths), unit="utterance"))


if __name__ == "__main__":
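The `convert` hunk only rewraps the chained `str.replace` calls; the transcription convention itself (un -> uen, ui -> uei, iu -> iou) is unchanged. A tiny runnable illustration (`expand_finals` is a made-up name for the excerpted step):

```python
def expand_finals(syllable):
    # expansion for un, ui, iu, as in the diff above
    syllable = syllable.replace("un", "uen").replace("ui",
                                                     "uei").replace("iu", "iou")
    return syllable

print(expand_finals("lun"), expand_finals("hui"), expand_finals("liu"))
# luen huei liou
```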
@@ -109,8 +109,7 @@ class Experiment(ExperimentBase):
mel_pred = outputs['mel_outputs_postnet']
self.visualizer.add_figure(
f"valid_sentence_{i}_predicted_spectrogram",
display.plot_spectrogram(mel_pred[0].numpy().T),
self.iteration)
display.plot_spectrogram(mel_pred[0].numpy().T), self.iteration)

# write visual log
valid_losses = {k: np.mean(v) for k, v in valid_losses.items()}

@@ -13,7 +13,6 @@
# limitations under the License.

import argparse
import re
from pathlib import Path
@@ -40,6 +40,7 @@ def get_avg_wer(raw_dict, ref_dict, frontend, output_dir):
raw_text = raw_dict[utt_id]
text = text_cleaner(raw_text)
g2p_phones = frontend.get_phonemes(text)
g2p_phones = sum(g2p_phones, [])
gt_phones = ref_dict[utt_id].split(" ")
# delete silence tokens in predicted phones and ground truth phones
g2p_phones = [phn for phn in g2p_phones if phn not in SILENCE_TOKENS]
@@ -53,10 +53,10 @@ class Transform(object):
ids, mel = example  # ids already have <s> and </s>
ids = np.array(ids, dtype=np.int64)
# add start and end frame
mel = np.pad(mel, [(0, 0), (1, 1)],
mode='constant',
constant_values=[(0, 0),
(self.start_value, self.end_value)])
mel = np.pad(
mel, [(0, 0), (1, 1)],
mode='constant',
constant_values=[(0, 0), (self.start_value, self.end_value)])
stop_labels = np.ones([mel.shape[1]], dtype=np.int64)
stop_labels[-1] = 2
# actually this thing can also be done within the model

@@ -64,8 +64,7 @@ def create_dataset(config, source_path, target_path, verbose=False):
with open(target_path / "metadata.pkl", 'wb') as f:
pickle.dump(records, f)
if verbose:
print("saved metadata into {}".format(target_path /
"metadata.pkl"))
print("saved metadata into {}".format(target_path / "metadata.pkl"))

# also save meta data into text format for inspection
with open(target_path / "metadata.txt", 'wt') as f:

@@ -73,8 +72,7 @@ def create_dataset(config, source_path, target_path, verbose=False):
phoneme_str = "|".join(phonemes)
f.write("{}\t{}\t{}\n".format(mel_name, text, phoneme_str))
if verbose:
print("saved metadata into {}".format(target_path /
"metadata.txt"))
print("saved metadata into {}".format(target_path / "metadata.txt"))

print("Done.")
@@ -60,7 +60,7 @@ def main(config, args):
display.plot_multilayer_multihead_alignments(attns)
plt.savefig(str(output_dir / f"sentence_{i}.png"))

mel_output = mel_output.T  #(C, T)
mel_output = mel_output.T  # (C, T)
np.save(str(output_dir / f"sentence_{i}"), mel_output)
if args.verbose:
print("spectrogram saved at {}".format(output_dir /

@@ -76,8 +76,7 @@ class TransformerTTSExperiment(ExperimentBase):
ljspeech_dataset = LJSpeech(args.data)
transform = Transform(config.data.mel_start_value,
config.data.mel_end_value)
ljspeech_dataset = dataset.TransformDataset(ljspeech_dataset,
transform)
ljspeech_dataset = dataset.TransformDataset(ljspeech_dataset, transform)
valid_set, train_set = dataset.split(ljspeech_dataset,
config.data.valid_size)
batch_fn = LJSpeechCollector(padding_idx=config.data.padding_idx)

@@ -159,8 +158,7 @@ class TransformerTTSExperiment(ExperimentBase):

if dist.get_rank() == 0:
for k, v in losses_np.items():
self.visualizer.add_scalar(f"train_loss/{k}", v,
self.iteration)
self.visualizer.add_scalar(f"train_loss/{k}", v, self.iteration)

@mp_tools.rank_zero_only
@paddle.no_grad()
@@ -90,8 +90,8 @@ def rule(C, V, R, T):
return None

# ua, uai, uang cannot be combined with d, t, n, l, r, z, c, s
if V in ['ua', 'uai', 'uang'
] and C in ['d', 't', 'n', 'l', 'r', 'z', 'c', 's']:
if V in ['ua', 'uai',
'uang'] and C in ['d', 't', 'n', 'l', 'r', 'z', 'c', 's']:
return None

# sh cannot be combined with ong
@@ -28,8 +28,8 @@ from config import get_cfg_defaults


class Transform(object):
def __init__(self, sample_rate, n_fft, win_length, hop_length, n_mels,
fmin, fmax):
def __init__(self, sample_rate, n_fft, win_length, hop_length, n_mels, fmin,
fmax):
self.sample_rate = sample_rate
self.n_fft = n_fft
self.win_length = win_length

@@ -79,11 +79,8 @@ class Transform(object):
spectrogram_magnitude = np.abs(spectrogram)

# Compute mel-spectrograms.
mel_filter_bank = librosa.filters.mel(sr=sr,
n_fft=n_fft,
n_mels=n_mels,
fmin=fmin,
fmax=fmax)
mel_filter_bank = librosa.filters.mel(
sr=sr, n_fft=n_fft, n_mels=n_mels, fmin=fmin, fmax=fmax)
mel_spectrogram = np.dot(mel_filter_bank, spectrogram_magnitude)

# log scale mel_spectrogram.
@@ -39,8 +39,7 @@ def main(config, args):
mel = np.load(str(file_path))
with paddle.amp.auto_cast():
audio = model.predict(mel)
audio_path = output_dir / (
os.path.splitext(file_path.name)[0] + ".wav")
audio_path = output_dir / (os.path.splitext(file_path.name)[0] + ".wav")
sf.write(audio_path, audio, config.data.sample_rate)
print("[synthesize] {} -> {}".format(file_path, audio_path))

@@ -114,8 +114,7 @@ class Experiment(ExperimentBase):
msg += "loss: {:>.6f}".format(loss_value)
self.logger.info(msg)
if dist.get_rank() == 0:
self.visualizer.add_scalar("train/loss", loss_value,
self.iteration)
self.visualizer.add_scalar("train/loss", loss_value, self.iteration)

@mp_tools.rank_zero_only
@paddle.no_grad()
@@ -13,6 +13,3 @@
# limitations under the License.

__version__ = "0.0.0"

import logging
from parakeet import audio, data, datasets, frontend, models, modules, training, utils

@@ -11,6 +11,3 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from .audio import AudioProcessor
from .spec_normalizer import NormalizerBase, LogMagnitude
@@ -11,10 +11,9 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import librosa
import soundfile as sf
import numpy as np
import soundfile as sf

__all__ = ["AudioProcessor"]
@@ -53,11 +52,12 @@ class AudioProcessor(object):
self.inv_mel_filter = np.linalg.pinv(self.mel_filter)

def _create_mel_filter(self):
mel_filter = librosa.filters.mel(self.sample_rate,
self.n_fft,
n_mels=self.n_mels,
fmin=self.fmin,
fmax=self.fmax)
mel_filter = librosa.filters.mel(
self.sample_rate,
self.n_fft,
n_mels=self.n_mels,
fmin=self.fmin,
fmax=self.fmax)
return mel_filter

def read_wav(self, filename):
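`AudioProcessor` keeps the mel filter alongside its Moore-Penrose pseudo-inverse (`np.linalg.pinv`) so mel spectrograms can be approximately mapped back to linear spectrograms. A minimal sketch of that pairing (the shapes and 22.05 kHz rate are illustrative):

```python
import numpy as np
import librosa

sr, n_fft, n_mels = 22050, 1024, 80
mel_filter = librosa.filters.mel(sr=sr, n_fft=n_fft, n_mels=n_mels)
# pseudo-inverse maps mel energies back to an approximate linear spectrogram
inv_mel_filter = np.linalg.pinv(mel_filter)

spec = np.abs(np.random.randn(n_fft // 2 + 1, 10))
approx = inv_mel_filter @ (mel_filter @ spec)
print(mel_filter.shape, inv_mel_filter.shape, approx.shape)
# (80, 513) (513, 80) (513, 10)
```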
@@ -13,19 +13,3 @@
# limitations under the License.
"""Parakeet's infrastructure for data processing.
"""
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from parakeet.data.dataset import *
from parakeet.data.batch import *
@@ -61,9 +61,10 @@ def batch_text_id(minibatch, pad_id=0, dtype=np.int64):
for example in minibatch:
pad_len = max_len - example.shape[0]
batch.append(
np.pad(example, [(0, pad_len)],
mode='constant',
constant_values=pad_id))
np.pad(
example, [(0, pad_len)],
mode='constant',
constant_values=pad_id))

return np.array(batch, dtype=dtype), np.array(lengths, dtype=np.int64)

@@ -103,9 +104,10 @@ def batch_wav(minibatch, pad_value=0., dtype=np.float32):
for example in minibatch:
pad_len = max_len - example.shape[-1]
batch.append(
np.pad(example, [(0, pad_len)],
mode='constant',
constant_values=pad_value))
np.pad(
example, [(0, pad_len)],
mode='constant',
constant_values=pad_value))
return np.array(batch, dtype=dtype), np.array(lengths, dtype=np.int64)

@@ -152,14 +154,16 @@ def batch_spec(minibatch, pad_value=0., time_major=False, dtype=np.float32):
pad_len = max_len - example.shape[time_idx]
if time_major:
batch.append(
np.pad(example, [(0, pad_len), (0, 0)],
mode='constant',
constant_values=pad_value))
np.pad(
example, [(0, pad_len), (0, 0)],
mode='constant',
constant_values=pad_value))
else:
batch.append(
np.pad(example, [(0, 0), (0, pad_len)],
mode='constant',
constant_values=pad_value))
np.pad(
example, [(0, 0), (0, pad_len)],
mode='constant',
constant_values=pad_value))
return np.array(batch, dtype=dtype), np.array(lengths, dtype=np.int64)
@@ -178,10 +182,8 @@ def batch_sequences(sequences, axis=0, pad_value=0):
for seq, length in zip(sequences, seq_lengths):
padding = [(0, 0)] * axis + [(0, max_length - length)] + [(0, 0)] * (
ndim - axis - 1)
padded_seq = np.pad(seq,
padding,
mode='constant',
constant_values=pad_value)
padded_seq = np.pad(
seq, padding, mode='constant', constant_values=pad_value)
padded_sequences.append(padded_seq)
batch = np.stack(padded_sequences)
return batch
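All four batching helpers above share one idea: right-pad every example to the batch maximum with `np.pad`, then stack. A condensed sketch of the 1-D id case (`pad_batch` is a made-up name, not a repo function):

```python
import numpy as np

def pad_batch(seqs, pad_id=0):
    # right-pad 1-D id sequences to the longest item in the batch
    lengths = [len(s) for s in seqs]
    max_len = max(lengths)
    batch = [
        np.pad(s, [(0, max_len - len(s))], mode='constant',
               constant_values=pad_id) for s in seqs
    ]
    return np.stack(batch), np.array(lengths, dtype=np.int64)

ids, lens = pad_batch([np.array([1, 2, 3]), np.array([4, 5])])
print(ids.tolist(), lens.tolist())  # [[1, 2, 3], [4, 5, 0]] [3, 2]
```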
@@ -11,9 +11,7 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import six
import paddle
from paddle.io import Dataset

__all__ = [
@@ -69,7 +67,7 @@ class CacheDataset(Dataset):
return len(self._dataset)

def __getitem__(self, i):
if not i in self._cache:
if i not in self._cache:
self._cache[i] = self._dataset[i]
return self._cache[i]

@@ -86,9 +84,8 @@ class TupleDataset(Dataset):
length = len(datasets[0])
for i, dataset in enumerate(datasets):
if len(dataset) != length:
raise ValueError(
"all the datasets should have the same length."
"dataset {} has a different length".format(i))
raise ValueError("all the datasets should have the same length."
"dataset {} has a different length".format(i))
self._datasets = datasets
self._length = length

@@ -115,7 +112,7 @@ class DictDataset(Dataset):
A compound dataset made from several datasets of the same length. An
example of the `DictDataset` is a dict of examples from the constituent
datasets.

WARNING: paddle does not have good support for DictDataset, because
every batch yielded from a DataLoader is a list, but it cannot be a dict.
So you have to provide a collate function because you cannot use the
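The `DictDataset` docstring warns that dict examples need an explicit collate function with paddle's DataLoader. A hedged minimal sketch of that arrangement (this toy dataset and `collate` are illustrations, not the repo classes):

```python
import numpy as np
from paddle.io import DataLoader, Dataset

class ToyDictDataset(Dataset):
    # zips several same-length arrays into dict examples
    def __init__(self, **arrays):
        self.arrays = arrays

    def __len__(self):
        return min(len(a) for a in self.arrays.values())

    def __getitem__(self, i):
        return {k: a[i] for k, a in self.arrays.items()}

def collate(examples):
    # turn a list of dicts back into a dict of batched arrays
    return {k: np.stack([e[k] for e in examples]) for k in examples[0]}

ds = ToyDictDataset(
    x=np.arange(8, dtype=np.float32), y=np.arange(8, dtype=np.int64))
loader = DataLoader(ds, batch_size=4, collate_fn=collate)
```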
@@ -11,14 +11,11 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import librosa
import numpy as np
import pyworld
from scipy.interpolate import interp1d

from config import get_cfg_default


class LogMelFBank():
def __init__(self,
@@ -42,17 +39,18 @@ class LogMelFBank():

# mel
self.n_mels = n_mels
self.fmin = fmin
self.fmax = fmax
self.fmin = 0 if fmin is None else fmin
self.fmax = sr / 2 if fmax is None else fmax

self.mel_filter = self._create_mel_filter()

def _create_mel_filter(self):
mel_filter = librosa.filters.mel(sr=self.sr,
n_fft=self.n_fft,
n_mels=self.n_mels,
fmin=self.fmin,
fmax=self.fmax)
mel_filter = librosa.filters.mel(
sr=self.sr,
n_fft=self.n_fft,
n_mels=self.n_mels,
fmin=self.fmin,
fmax=self.fmax)
return mel_filter

def _stft(self, wav):
@@ -123,11 +121,12 @@ class Pitch():
use_log_f0=True) -> np.array:
input = input.astype(np.float)
frame_period = 1000 * self.hop_length / self.sr
f0, timeaxis = pyworld.dio(input,
fs=self.sr,
f0_floor=self.f0min,
f0_ceil=self.f0max,
frame_period=frame_period)
f0, timeaxis = pyworld.dio(
input,
fs=self.sr,
f0_floor=self.f0min,
f0_ceil=self.f0max,
frame_period=frame_period)
f0 = pyworld.stonemask(input, f0, timeaxis, self.sr)
if use_continuous_f0:
f0 = self._convert_to_continuous_f0(f0)
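`Pitch` estimates F0 with pyworld's DIO algorithm and refines it with StoneMask; `frame_period` converts the hop size into milliseconds, which is what pyworld expects. A minimal sketch on a synthetic tone (sample rate, hop, and F0 bounds are illustrative; pyworld wants float64 input):

```python
import numpy as np
import pyworld

sr, hop_length = 24000, 300
x = np.sin(2 * np.pi * 220 * np.arange(sr) / sr)  # float64 by default
frame_period = 1000 * hop_length / sr  # hop size in milliseconds
f0, timeaxis = pyworld.dio(
    x, fs=sr, f0_floor=80.0, f0_ceil=400.0, frame_period=frame_period)
f0 = pyworld.stonemask(x, f0, timeaxis, sr)  # refine the coarse estimate
print(f0.shape, float(np.median(f0[f0 > 0])))  # median should be near 220 Hz
```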
@@ -197,8 +196,7 @@ class Energy():
input_power = np.abs(input_stft)**2
energy = np.sqrt(
np.clip(
np.sum(input_power, axis=0), a_min=1.0e-10, a_max=float(
'inf')))
np.sum(input_power, axis=0), a_min=1.0e-10, a_max=float('inf')))
return energy

def _average_by_duration(self, input: np.array, d: np.array) -> np.array:
@@ -217,41 +215,3 @@ class Energy():
if use_token_averaged_energy and duration is not None:
energy = self._average_by_duration(energy, duration)
return energy


if __name__ == "__main__":
C = get_cfg_default()
filename = "../raw_data/data/format.1/000001.flac"
wav, _ = librosa.load(filename, sr=C.fs)
mel_extractor = LogMelFBank(
sr=C.fs,
n_fft=C.n_fft,
hop_length=C.n_shift,
win_length=C.win_length,
window=C.window,
n_mels=C.n_mels,
fmin=C.fmin,
fmax=C.fmax, )
mel = mel_extractor.get_log_mel_fbank(wav)
print(mel)
print(mel.shape)

pitch_extractor = Pitch(
sr=C.fs, hop_length=C.n_shift, f0min=C.f0min, f0max=C.f0max)
duration = "2 8 8 8 12 11 10 13 11 10 18 9 12 10 12 11 5"
duration = np.array([int(x) for x in duration.split(" ")])
avg_f0 = pitch_extractor.get_pitch(wav, duration=duration)
print(avg_f0)
print(avg_f0.shape)

energy_extractor = Energy(
sr=C.fs,
n_fft=C.n_fft,
hop_length=C.n_shift,
win_length=C.win_length,
window=C.window)
duration = "2 8 8 8 12 11 10 13 11 10 18 9 12 10 12 11 5"
duration = np.array([int(x) for x in duration.split(" ")])
avg_energy = energy_extractor.get_energy(wav, duration=duration)
print(avg_energy)
print(avg_energy.sum())
@@ -11,6 +11,3 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from parakeet.datasets.common import *
from parakeet.datasets.ljspeech import *

@@ -11,14 +11,13 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from paddle.io import Dataset
import os
import librosa
from pathlib import Path
import numpy as np
from typing import List

import librosa
import numpy as np
from paddle.io import Dataset

__all__ = ["AudioSegmentDataset", "AudioDataset", "AudioFolderDataset"]
@@ -57,7 +56,7 @@ class AudioSegmentDataset(Dataset):


class AudioDataset(Dataset):
"""A simple dataset adaptor for the audio files.
Read -> trim silence -> normalize
"""
@@ -11,12 +11,12 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from typing import Union, Optional, Callable, Tuple, List, Dict, Any
from pathlib import Path
from multiprocessing import Manager
from typing import Any
from typing import Callable
from typing import Dict
from typing import List

import numpy as np
from paddle.io import Dataset
@@ -11,9 +11,9 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from pathlib import Path

from paddle.io import Dataset
from pathlib import Path

__all__ = ["LJSpeechMetaData"]
@@ -11,11 +11,3 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from parakeet.frontend.vocab import *
from parakeet.frontend.phonectic import *
from parakeet.frontend.punctuation import *
from parakeet.frontend.normalizer import *
from parakeet.frontend.cn_normalization import *
from parakeet.frontend.tone_sandhi import *
from parakeet.frontend.generate_lexicon import *
@@ -11,7 +11,6 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from parakeet.frontend.phonectic import Phonetics
"""
A phonology system with ARPABET symbols and limited punctuations. The G2P
@@ -200,8 +199,7 @@ class ARPABET(Phonetics):
The list of pronunciation id sequence.
"""
return self.numericalize(
self.phoneticize(
sentence, add_start_end=add_start_end))
self.phoneticize(sentence, add_start_end=add_start_end))

@property
def vocab_size(self):

@@ -217,9 +215,9 @@ class ARPABETWithStress(Phonetics):
'AO1', 'AO2', 'AW0', 'AW1', 'AW2', 'AY0', 'AY1', 'AY2', 'B', 'CH', 'D',
'DH', 'EH0', 'EH1', 'EH2', 'ER0', 'ER1', 'ER2', 'EY0', 'EY1', 'EY2',
'F', 'G', 'HH', 'IH0', 'IH1', 'IH2', 'IY0', 'IY1', 'IY2', 'JH', 'K',
'L', 'M', 'N', 'NG', 'OW0', 'OW1', 'OW2', 'OY0', 'OY1', 'OY2', 'P',
'R', 'S', 'SH', 'T', 'TH', 'UH0', 'UH1', 'UH2', 'UW0', 'UW1', 'UW2',
'V', 'W', 'Y', 'Z', 'ZH'
'L', 'M', 'N', 'NG', 'OW0', 'OW1', 'OW2', 'OY0', 'OY1', 'OY2', 'P', 'R',
'S', 'SH', 'T', 'TH', 'UH0', 'UH1', 'UH2', 'UW0', 'UW1', 'UW2', 'V',
'W', 'Y', 'Z', 'ZH'
]
punctuations = [',', '.', '?', '!']
symbols = phonemes + punctuations
@@ -294,8 +292,7 @@ class ARPABETWithStress(Phonetics):
The list of pronunciation id sequence.
"""
return self.numericalize(
self.phoneticize(
sentence, add_start_end=add_start_end))
self.phoneticize(sentence, add_start_end=add_start_end))

@property
def vocab_size(self):
@@ -11,17 +11,16 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import re

import jieba.posseg as psg
import numpy as np
import paddle
import re
from g2pM import G2pM
from parakeet.frontend.tone_sandhi import ToneSandhi
from parakeet.frontend.cn_normalization.text_normlization import TextNormalizer
from pypinyin import lazy_pinyin, Style
from pypinyin import lazy_pinyin
from pypinyin import Style

from parakeet.frontend.cn_normalization.text_normlization import TextNormalizer
from parakeet.frontend.generate_lexicon import generate_lexicon
from parakeet.frontend.tone_sandhi import ToneSandhi


class Frontend():
@@ -11,5 +11,3 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from parakeet.frontend.cn_normalization.text_normlization import *

@@ -11,10 +11,12 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import re

from .num import verbalize_cardinal, verbalize_digit, num2str, DIGITS
from .num import DIGITS
from .num import num2str
from .num import verbalize_cardinal
from .num import verbalize_digit


def _time_num2str(num_string: str) -> str:
@@ -11,9 +11,9 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import re
import string

from pypinyin.constants import SUPPORT_UCS4

# full-width / half-width conversion

@@ -32,10 +32,7 @@ F2H_DIGITS = {chr(ord(char) + 65248): char for char in string.digits}
H2F_DIGITS = {value: key for key, value in F2H_DIGITS.items()}

# full-width -> half-width punctuation mapping table (num: 32)
F2H_PUNCTUATIONS = {
chr(ord(char) + 65248): char
for char in string.punctuation
}
F2H_PUNCTUATIONS = {chr(ord(char) + 65248): char for char in string.punctuation}
# half-width -> full-width punctuation mapping table
H2F_PUNCTUATIONS = {value: key for key, value in F2H_PUNCTUATIONS.items()}
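The mapping tables above rely on full-width ASCII variants sitting exactly 0xFEE0 (65248) code points above their half-width counterparts. A two-line demonstration:

```python
import string

F2H_DIGITS = {chr(ord(c) + 65248): c for c in string.digits}
print(F2H_DIGITS['１'])  # -> '1'  (U+FF11 maps to U+0031)
print(''.join(F2H_DIGITS.get(c, c) for c in '２０２１年'))  # -> '2021年'
```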
@@ -15,7 +15,6 @@
Rules to verbalize numbers into Chinese characters.
https://zh.wikipedia.org/wiki/中文数字#現代中文
"""

import re
from collections import OrderedDict
from typing import List

@@ -11,7 +11,6 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import re

from .num import verbalize_digit
@@ -32,14 +31,12 @@ def phone2str(phone_string: str, mobile=True) -> str:
if mobile:
sp_parts = phone_string.strip('+').split()
result = ''.join(
[verbalize_digit(
part, alt_one=True) for part in sp_parts])
[verbalize_digit(part, alt_one=True) for part in sp_parts])
return result
else:
sil_parts = phone_string.split('-')
result = ''.join(
[verbalize_digit(
part, alt_one=True) for part in sil_parts])
[verbalize_digit(part, alt_one=True) for part in sil_parts])
return result

@@ -11,7 +11,6 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import re

from .num import num2str
@@ -11,16 +11,37 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import re
from typing import List

from .chronology import RE_TIME, RE_DATE, RE_DATE2
from .chronology import replace_time, replace_date, replace_date2
from .constants import F2H_ASCII_LETTERS, F2H_DIGITS, F2H_SPACE
from .num import RE_NUMBER, RE_FRAC, RE_PERCENTAGE, RE_RANGE, RE_INTEGER, RE_DEFAULT_NUM, RE_DECIMAL_NUM, RE_POSITIVE_QUANTIFIERS
from .num import replace_number, replace_frac, replace_percentage, replace_range, replace_default_num, replace_negative_num, replace_positive_quantifier
from .phonecode import RE_MOBILE_PHONE, RE_TELEPHONE, replace_phone, replace_mobile
from .chronology import RE_DATE
from .chronology import RE_DATE2
from .chronology import RE_TIME
from .chronology import replace_date
from .chronology import replace_date2
from .chronology import replace_time
from .constants import F2H_ASCII_LETTERS
from .constants import F2H_DIGITS
from .constants import F2H_SPACE
from .num import RE_DECIMAL_NUM
from .num import RE_DEFAULT_NUM
from .num import RE_FRAC
from .num import RE_INTEGER
from .num import RE_NUMBER
from .num import RE_PERCENTAGE
from .num import RE_POSITIVE_QUANTIFIERS
from .num import RE_RANGE
from .num import replace_default_num
from .num import replace_frac
from .num import replace_negative_num
from .num import replace_number
from .num import replace_percentage
from .num import replace_positive_quantifier
from .num import replace_range
from .phonecode import RE_MOBILE_PHONE
from .phonecode import RE_TELEPHONE
from .phonecode import replace_mobile
from .phonecode import replace_phone
from .quantifier import RE_TEMPERATURE
from .quantifier import replace_temperature
@@ -18,8 +18,6 @@ than words are used in transcriptions produced by `reorganize_baker.py`.
We make this choice to better leverage other software for chinese text to
pinyin tools like pypinyin. This is the convention for G2P in Chinese.
"""

import argparse
import re
from collections import OrderedDict

@@ -41,10 +39,10 @@ SPECIALS = ['sil', 'sp']
def rule(C, V, R, T):
"""Generate a syllable given the initial, the final, erhua indicator, and tone.
Orthographical rules for pinyin are applied. (special case for y, w, ui, un, iu)

Note that in this system, 'ü' is always written as 'v' when it appears in a phoneme, but converted to
'u' in syllables when certain conditions are satisfied.

'i' is distinguished when it appears in phonemes, and separated into 3 categories, 'i', 'ii' and 'iii'.
Erhua is possibly applied to every final, except for finals that already end with 'r'.
When a syllable is impossible or does not have any characters with this pronunciation, return None
@@ -86,8 +84,8 @@ def rule(C, V, R, T):
return None

# ua, uai, uang cannot be combined with d, t, n, l, r, z, c, s
if V in ['ua', 'uai', 'uang'
] and C in ['d', 't', 'n', 'l', 'r', 'z', 'c', 's']:
if V in ['ua', 'uai',
'uang'] and C in ['d', 't', 'n', 'l', 'r', 'z', 'c', 's']:
return None

# sh cannot be combined with ong
@@ -11,6 +11,3 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from parakeet.frontend.normalizer.normalizer import *
from parakeet.frontend.normalizer.numbers import *

@@ -11,10 +11,10 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import re
import unicodedata
from builtins import str as unicode

from parakeet.frontend.normalizer.numbers import normalize_numbers
@@ -11,11 +11,3 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# number expansion is not that easy
import inflect
import re

import inflect

_inflect = inflect.engine()
_comma_number_re = re.compile(r'([0-9][0-9\,]+[0-9])')
_decimal_number_re = re.compile(r'([0-9]+\.[0-9]+)')
@@ -11,16 +11,18 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from abc import ABC
from abc import abstractmethod

from abc import ABC, abstractmethod
from typing import Union
from g2p_en import G2p
from g2pM import G2pM

from parakeet.frontend import Vocab
from parakeet.frontend.normalizer.normalizer import normalize
from parakeet.frontend.punctuation import get_punctuations

# discard opencc until we find an easy solution to install it on windows
# from opencc import OpenCC
from parakeet.frontend.punctuation import get_punctuations
from parakeet.frontend.normalizer.normalizer import normalize

__all__ = ["Phonetics", "English", "EnglishCharacter", "Chinese"]
@@ -65,14 +67,14 @@ class English(Phonetics):
start = self.vocab.start_symbol
end = self.vocab.end_symbol
phonemes = ([] if start is None else [start]) \
+ self.backend(sentence) \
+ ([] if end is None else [end])
+ self.backend(sentence) \
+ ([] if end is None else [end])
phonemes = [item for item in phonemes if item in self.vocab.stoi]
return phonemes

def numericalize(self, phonemes):
""" Convert pronunciation sequence into pronunciation id sequence.

Parameters
-----------
phonemes: List[str]

@@ -91,7 +93,7 @@ class English(Phonetics):

def reverse(self, ids):
""" Reverse the list of pronunciation id sequence to a list of pronunciation sequence.

Parameters
-----------
ids: List[int]

@@ -183,7 +185,7 @@ class EnglishCharacter(Phonetics):
----------
str
The input text sequence.

"""
return [self.vocab.reverse(i) for i in ids]

@@ -244,8 +246,8 @@ class Chinese(Phonetics):
start = self.vocab.start_symbol
end = self.vocab.end_symbol
phonemes = ([] if start is None else [start]) \
+ phonemes \
+ ([] if end is None else [end])
+ phonemes \
+ ([] if end is None else [end])
return self._filter_symbols(phonemes)

def _filter_symbols(self, phonemes):

@@ -261,7 +263,7 @@ class Chinese(Phonetics):

def numericalize(self, phonemes):
""" Convert pronunciation sequence into pronunciation id sequence.

Parameters
-----------
phonemes: List[str]

@@ -298,7 +300,7 @@ class Chinese(Phonetics):

def reverse(self, ids):
""" Reverse the list of pronunciation id sequence to a list of pronunciation sequence.

Parameters
-----------
ids: List[int]
@@ -19,13 +19,15 @@ text -> pinyin to other part of a TTS system. Other NLP techniques may be used
(e.g. tokenization, tagging, NER...)
"""
import re
from itertools import product

from pypinyin.contrib.neutral_tone import NeutralToneWith5Mixin
from pypinyin.core import DefaultConverter
from pypinyin.core import Pinyin
from pypinyin.core import Style

from parakeet.frontend.phonectic import Phonetics
from parakeet.frontend.vocab import Vocab
import pypinyin
from pypinyin.core import Pinyin, Style
from pypinyin.core import DefaultConverter
from pypinyin.contrib.neutral_tone import NeutralToneWith5Mixin
from itertools import product

_punctuations = [',', '。', '?', '!']
_initials = [

@@ -33,10 +35,10 @@ _initials = [
'ch', 'sh', 'r', 'z', 'c', 's'
]
_finals = [
'ii', 'iii', 'a', 'o', 'e', 'ea', 'ai', 'ei', 'ao', 'ou', 'an', 'en',
'ang', 'eng', 'er', 'i', 'ia', 'io', 'ie', 'iai', 'iao', 'iou', 'ian',
'ien', 'iang', 'ieng', 'u', 'ua', 'uo', 'uai', 'uei', 'uan', 'uen', 'uang',
'ueng', 'v', 've', 'van', 'ven', 'veng'
'ii', 'iii', 'a', 'o', 'e', 'ea', 'ai', 'ei', 'ao', 'ou', 'an', 'en', 'ang',
'eng', 'er', 'i', 'ia', 'io', 'ie', 'iai', 'iao', 'iou', 'ian', 'ien',
'iang', 'ieng', 'u', 'ua', 'uo', 'uai', 'uei', 'uan', 'uen', 'uang', 'ueng',
'v', 've', 'van', 'ven', 'veng'
]
_ernized_symbol = ['&r']
_phones = _initials + _finals + _ernized_symbol + _punctuations
@ -76,12 +78,12 @@ class ParakeetPinyin(Phonetics):
|
|||
|
||||
def phoneticize(self, sentence, add_start_end=False):
|
||||
""" Normalize the input text sequence and convert it into pronunciation sequence.
|
||||
|
||||
|
||||
Parameters
|
||||
-----------
|
||||
sentence: str
|
||||
The input text sequence.
|
||||
|
||||
|
||||
Returns
|
||||
----------
|
||||
List[str]
|
||||
|
@ -95,12 +97,12 @@ class ParakeetPinyin(Phonetics):
|
|||
|
||||
def numericalize(self, phonemes, tones):
|
||||
""" Convert pronunciation sequence into pronunciation id sequence.
|
||||
|
||||
|
||||
Parameters
|
||||
-----------
|
||||
phonemes: List[str]
|
||||
The list of pronunciation sequence.
|
||||
|
||||
|
||||
Returns
|
||||
----------
|
||||
List[int]
|
||||
|
@ -112,12 +114,12 @@ class ParakeetPinyin(Phonetics):
|
|||
|
||||
def __call__(self, sentence, add_start_end=False):
|
||||
""" Convert the input text sequence into pronunciation id sequence.
|
||||
|
||||
|
||||
Parameters
|
||||
-----------
|
||||
sentence: str
|
||||
The input text sequence.
|
||||
|
||||
|
||||
Returns
|
||||
----------
|
||||
List[str]
|
||||
|
@ -159,12 +161,12 @@ class ParakeetPinyinWithTone(Phonetics):
|
|||
|
||||
def phoneticize(self, sentence, add_start_end=False):
|
||||
""" Normalize the input text sequence and convert it into pronunciation sequence.
|
||||
|
||||
|
||||
Parameters
|
||||
-----------
|
||||
sentence: str
|
||||
The input text sequence.
|
||||
|
||||
|
||||
Returns
|
||||
----------
|
||||
List[str]
|
||||
|
@ -178,12 +180,12 @@ class ParakeetPinyinWithTone(Phonetics):
|
|||
|
||||
def numericalize(self, phonemes):
|
||||
""" Convert pronunciation sequence into pronunciation id sequence.
|
||||
|
||||
|
||||
Parameters
|
||||
-----------
|
||||
phonemes: List[str]
|
||||
The list of pronunciation sequence.
|
||||
|
||||
|
||||
Returns
|
||||
----------
|
||||
List[int]
|
||||
|
@ -194,12 +196,12 @@ class ParakeetPinyinWithTone(Phonetics):
|
|||
|
||||
def __call__(self, sentence, add_start_end=False):
|
||||
""" Convert the input text sequence into pronunciation id sequence.
|
||||
|
||||
|
||||
Parameters
|
||||
-----------
|
||||
sentence: str
|
||||
The input text sequence.
|
||||
|
||||
|
||||
Returns
|
||||
----------
|
||||
List[str]
|
||||
|
@ -232,17 +234,17 @@ def _convert_to_parakeet_convension(syllable):
|
|||
syllable = syllable.replace("ing", "ieng").replace("in", "ien")
|
||||
|
||||
# expansion for un, ui, iu
|
||||
syllable = syllable.replace("un","uen")\
|
||||
.replace("ui", "uei")\
|
||||
syllable = syllable.replace("un", "uen") \
|
||||
.replace("ui", "uei") \
|
||||
.replace("iu", "iou")
|
||||
|
||||
# rule for variants of i
|
||||
syllable = syllable.replace("zi", "zii")\
|
||||
.replace("ci", "cii")\
|
||||
.replace("si", "sii")\
|
||||
.replace("zhi", "zhiii")\
|
||||
.replace("chi", "chiii")\
|
||||
.replace("shi", "shiii")\
|
||||
syllable = syllable.replace("zi", "zii") \
|
||||
.replace("ci", "cii") \
|
||||
.replace("si", "sii") \
|
||||
.replace("zhi", "zhiii") \
|
||||
.replace("chi", "chiii") \
|
||||
.replace("shi", "shiii") \
|
||||
.replace("ri", "riii")
|
||||
|
||||
# rule for y preceding i, u
|
||||
|
@ -252,8 +254,8 @@ def _convert_to_parakeet_convension(syllable):
|
|||
syllable = syllable.replace("wu", "u").replace("w", "u")
|
||||
|
||||
# rule for v following j, q, x
|
||||
syllable = syllable.replace("ju", "jv")\
|
||||
.replace("qu", "qv")\
|
||||
syllable = syllable.replace("ju", "jv") \
|
||||
.replace("qu", "qv") \
|
||||
.replace("xu", "xv")
|
||||
|
||||
return syllable + tone
|
||||
|
|
|
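The replace chains above are plain string rewrites; the following standalone illustration (re-implemented here, with the tone digit already split off as in the surrounding function) shows their effect on a few syllables:

def to_parakeet_syllable(syllable, tone):
    # same rewrite rules as in the hunks above, applied to a toneless syllable
    syllable = syllable.replace("un", "uen") \
        .replace("ui", "uei") \
        .replace("iu", "iou")
    syllable = syllable.replace("zi", "zii") \
        .replace("ci", "cii") \
        .replace("si", "sii") \
        .replace("zhi", "zhiii") \
        .replace("chi", "chiii") \
        .replace("shi", "shiii") \
        .replace("ri", "riii")
    syllable = syllable.replace("ju", "jv") \
        .replace("qu", "qv") \
        .replace("xu", "xv")
    return syllable + tone

print(to_parakeet_syllable("shi", "4"))  # shiii4
print(to_parakeet_syllable("liu", "2"))  # liou2
print(to_parakeet_syllable("qu", "4"))   # qv4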
@@ -12,9 +12,6 @@
# See the License for the specific language governing permissions and
# limitations under the License.

import abc
import string

__all__ = ["get_punctuations"]

EN_PUNCT = [
@@ -11,8 +11,8 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from typing import List, Tuple
from typing import List
from typing import Tuple

import jieba
from pypinyin import lazy_pinyin

@@ -76,8 +76,7 @@ class ToneSandhi():

        # reduplication words for n. and v. e.g. 奶奶, 试试, 旺旺
        for j, item in enumerate(word):
            if j - 1 >= 0 and item == word[j - 1] and pos[
                    0] in {"n", "v", "a"}:
            if j - 1 >= 0 and item == word[j - 1] and pos[0] in {"n", "v", "a"}:
                finals[j] = finals[j][:-1] + "5"
        ge_idx = word.find("个")
        if len(word) >= 1 and word[-1] in "吧呢哈啊呐噻嘛吖嗨呐哦哒额滴哩哟喽啰耶喔诶":

@@ -125,8 +124,8 @@ class ToneSandhi():
        else:
            for i, char in enumerate(word):
                # "不" before tone4 should be bu2, e.g. 不怕
                if char == "不" and i + 1 < len(word) and finals[i + 1][
                        -1] == "4":
                if char == "不" and i + 1 < len(word) and finals[i +
                                                                1][-1] == "4":
                    finals[i] = finals[i][:-1] + "2"
        return finals

@@ -266,12 +265,12 @@ class ToneSandhi():
        assert len(sub_finals_list) == len(seg)
        merge_last = [False] * len(seg)
        for i, (word, pos) in enumerate(seg):
            if i - 1 >= 0 and self._all_tone_three(sub_finals_list[
                    i - 1]) and self._all_tone_three(sub_finals_list[
                        i]) and not merge_last[i - 1]:
            if i - 1 >= 0 and self._all_tone_three(
                    sub_finals_list[i - 1]) and self._all_tone_three(
                        sub_finals_list[i]) and not merge_last[i - 1]:
                # if the last word is reduplication, don't merge, because reduplication needs to go through _neural_sandhi
                if not self._is_reduplication(seg[i - 1][0]) and len(seg[
                        i - 1][0]) + len(seg[i][0]) <= 3:
                if not self._is_reduplication(seg[i - 1][0]) and len(
                        seg[i - 1][0]) + len(seg[i][0]) <= 3:
                    new_seg[-1][0] = new_seg[-1][0] + seg[i][0]
                    merge_last[i] = True
                else:

@@ -299,8 +298,8 @@ class ToneSandhi():
        if i - 1 >= 0 and sub_finals_list[i - 1][-1][-1] == "3" and sub_finals_list[i][0][-1] == "3" and not \
                merge_last[i - 1]:
            # if the last word is reduplication, don't merge, because reduplication needs to go through _neural_sandhi
            if not self._is_reduplication(seg[i - 1][0]) and len(seg[
                    i - 1][0]) + len(seg[i][0]) <= 3:
            if not self._is_reduplication(seg[i - 1][0]) and len(
                    seg[i - 1][0]) + len(seg[i][0]) <= 3:
                new_seg[-1][0] = new_seg[-1][0] + seg[i][0]
                merge_last[i] = True
            else:
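The 不 rule shown in the hunk above is easy to check in isolation; a standalone sketch (not the class method itself):

def bu_sandhi(word, finals):
    # "不" before a tone-4 syllable is pronounced with tone 2, e.g. 不怕 -> bu2 pa4
    for i, char in enumerate(word):
        if char == "不" and i + 1 < len(word) and finals[i + 1][-1] == "4":
            finals[i] = finals[i][:-1] + "2"
    return finals

print(bu_sandhi("不怕", ["bu4", "pa4"]))  # ['bu2', 'pa4']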
@@ -11,9 +11,8 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from typing import Dict, Iterable, List
from collections import OrderedDict
from typing import Iterable

__all__ = ["Vocab"]

@@ -25,13 +24,13 @@ class Vocab(object):
    -----------
    symbols: Iterable[str]
        Common symbols.

    padding_symbol: str, optional
        Symbol for pad. Defaults to "<pad>".

    unk_symbol: str, optional
        Symbol for unknown. Defaults to "<unk>".

    start_symbol: str, optional
        Symbol for start. Defaults to "<s>".
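A minimal usage sketch of this class, based only on the docstring above and the `stoi`/`reverse` accesses seen earlier in phonectic.py (default special symbols assumed, not verified against this commit):

from parakeet.frontend.vocab import Vocab

vocab = Vocab(["a", "b", "c"])         # specials like <pad>, <unk>, <s> assumed prepended
print(vocab.stoi)                      # symbol -> id mapping
print(vocab.reverse(vocab.stoi["b"]))  # id -> symbol, prints "b"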
@@ -11,13 +11,3 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

#from parakeet.models.clarinet import *
from parakeet.models.waveflow import *
#from parakeet.models.wavenet import *

from parakeet.models.transformer_tts import *
#from parakeet.models.deepvoice3 import *
# from parakeet.models.fastspeech import *
from parakeet.models.tacotron2 import *
from parakeet.models.fastspeech2 import *
@@ -12,20 +12,24 @@
# See the License for the specific language governing permissions and
# limitations under the License.
"""Fastspeech2 related modules for paddle"""
from typing import Sequence
from typing import Tuple

from typing import Dict, Sequence, Tuple

import numpy as np
import paddle
from paddle import nn
from parakeet.modules.fastspeech2_predictor.duration_predictor import DurationPredictor, DurationPredictorLoss
from typeguard import check_argument_types

from parakeet.modules.fastspeech2_predictor.duration_predictor import DurationPredictor
from parakeet.modules.fastspeech2_predictor.duration_predictor import DurationPredictorLoss
from parakeet.modules.fastspeech2_predictor.length_regulator import LengthRegulator
from parakeet.modules.fastspeech2_predictor.postnet import Postnet
from parakeet.modules.fastspeech2_predictor.variance_predictor import VariancePredictor
from parakeet.modules.fastspeech2_transformer.embedding import PositionalEncoding, ScaledPositionalEncoding
from parakeet.modules.fastspeech2_transformer.embedding import PositionalEncoding
from parakeet.modules.fastspeech2_transformer.embedding import ScaledPositionalEncoding
from parakeet.modules.fastspeech2_transformer.encoder import Encoder as TransformerEncoder
from parakeet.modules.nets_utils import initialize, make_non_pad_mask, make_pad_mask
from typeguard import check_argument_types
from parakeet.modules.nets_utils import initialize
from parakeet.modules.nets_utils import make_non_pad_mask
from parakeet.modules.nets_utils import make_pad_mask


class FastSpeech2(nn.Layer):
@@ -252,36 +256,36 @@ class FastSpeech2(nn.Layer):

        Parameters
        ----------
        text : Tensor
            Batch of padded token ids (B, Tmax).
        text_lengths : Tensor
            Batch of lengths of each input (B,).
        speech : Tensor
            Batch of padded target features (B, Lmax, odim).
        speech_lengths : Tensor
            Batch of the lengths of each target (B,).
        durations : Tensor
            Batch of padded durations (B, Tmax).
        pitch : Tensor
            Batch of padded token-averaged pitch (B, Tmax, 1).
        energy : Tensor
            Batch of padded token-averaged energy (B, Tmax, 1).

        Returns
        ----------
        Tensor
            mel outs before postnet
        Tensor
            mel outs after postnet
        Tensor
            duration predictor's output
        Tensor
            pitch predictor's output
        Tensor
            energy predictor's output
        Tensor
            speech
        Tensor
            speech_lengths, modified if reduction_factor > 1
        """

        xs = text
@@ -294,9 +298,8 @@ class FastSpeech2(nn.Layer):
            xs, ilens, ys, olens, ds, ps, es, is_inference=False)
        # modify mod part of groundtruth
        if self.reduction_factor > 1:
            olens = paddle.to_tensor([
                olen - olen % self.reduction_factor for olen in olens.numpy()
            ])
            olens = paddle.to_tensor(
                [olen - olen % self.reduction_factor for olen in olens.numpy()])
            max_olen = max(olens)
            ys = ys[:, :max_olen]
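The two formattings above are equivalent; the arithmetic itself just truncates each target length down to a multiple of the reduction factor, e.g.:

reduction_factor = 2
olens = [7, 10, 5]
olens = [olen - olen % reduction_factor for olen in olens]
print(olens)  # [6, 10, 4]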
@@ -389,26 +392,26 @@ class FastSpeech2(nn.Layer):

        Parameters
        ----------
        text : Tensor
            Input sequence of characters (T,).
        speech : Tensor, optional
            Feature sequence to extract style (N, idim).
        durations : Tensor, optional
            Groundtruth of duration (T,).
        pitch : Tensor, optional
            Groundtruth of token-averaged pitch (T, 1).
        energy : Tensor, optional
            Groundtruth of token-averaged energy (T, 1).
        alpha : float, optional
            Alpha to control the speed.
        use_teacher_forcing : bool, optional
            Whether to use teacher forcing.
            If true, groundtruth of duration, pitch and energy will be used.

        Returns
        ----------
        Tensor
            Output sequence of features (L, odim).
        """
        x, y = text, speech
        d, p, e = durations, pitch, energy
@@ -448,21 +451,21 @@ class FastSpeech2(nn.Layer):

        Parameters
        ----------
        ilens : Tensor
            Batch of lengths (B,).

        Returns
        -------
        Tensor
            Mask tensor for self-attention.
            dtype=paddle.bool

        Examples
        -------
        >>> ilens = [5, 3]
        >>> self._source_mask(ilens)
        tensor([[[1, 1, 1, 1, 1],
                 [1, 1, 1, 0, 0]]]) bool

        """
        x_masks = make_non_pad_mask(ilens)
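The doctest above can be reproduced without Paddle; a numpy mimic of what `make_non_pad_mask` returns for those lengths (illustrative only):

import numpy as np

def non_pad_mask(ilens):
    max_len = max(ilens)
    # 1 for positions inside each sequence, 0 for padding
    return np.array([[t < l for t in range(max_len)] for l in ilens], dtype=int)

print(non_pad_mask([5, 3]))
# [[1 1 1 1 1]
#  [1 1 1 0 0]]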
@@ -502,17 +505,16 @@ class FastSpeech2Inference(nn.Layer):
class FastSpeech2Loss(nn.Layer):
    """Loss function module for FastSpeech2."""

    def __init__(self,
                 use_masking: bool=True,
    def __init__(self, use_masking: bool=True,
                 use_weighted_masking: bool=False):
        """Initialize feed-forward Transformer loss module.

        Parameters
        ----------
        use_masking : bool
            Whether to apply masking for padded part in loss calculation.
        use_weighted_masking : bool
            Whether to apply weighted masking in loss calculation.
        """
        assert check_argument_types()
        super().__init__()
@@ -539,45 +541,45 @@ class FastSpeech2Loss(nn.Layer):
                ps: paddle.Tensor,
                es: paddle.Tensor,
                ilens: paddle.Tensor,
                olens: paddle.Tensor, ) -> Tuple[paddle.Tensor, paddle.Tensor,
                                                 paddle.Tensor, paddle.Tensor]:
                olens: paddle.Tensor,
                ) -> Tuple[paddle.Tensor, paddle.Tensor, paddle.Tensor, paddle.Tensor]:
        """Calculate forward propagation.

        Parameters
        ----------
        after_outs : Tensor
            Batch of outputs after postnets (B, Lmax, odim).
        before_outs : Tensor
            Batch of outputs before postnets (B, Lmax, odim).
        d_outs : Tensor
            Batch of outputs of duration predictor (B, Tmax).
        p_outs : Tensor
            Batch of outputs of pitch predictor (B, Tmax, 1).
        e_outs : Tensor
            Batch of outputs of energy predictor (B, Tmax, 1).
        ys : Tensor
            Batch of target features (B, Lmax, odim).
        ds : Tensor
            Batch of durations (B, Tmax).
        ps : Tensor
            Batch of target token-averaged pitch (B, Tmax, 1).
        es : Tensor
            Batch of target token-averaged energy (B, Tmax, 1).
        ilens : Tensor
            Batch of the lengths of each input (B,).
        olens : Tensor
            Batch of the lengths of each target (B,).

        Returns
        ----------
        Tensor
            L1 loss value.
        Tensor
            Duration predictor loss value.
        Tensor
            Pitch predictor loss value.
        Tensor
            Energy predictor loss value.

        """
        # apply mask to remove padded part
@@ -612,9 +614,9 @@ class FastSpeech2Loss(nn.Layer):
        # make weighted mask and apply it
        if self.use_weighted_masking:
            out_masks = make_non_pad_mask(olens).unsqueeze(-1)
            out_weights = out_masks.cast(
                dtype=paddle.float32) / out_masks.cast(
                    dtype=paddle.float32).sum(axis=1, keepdim=True)
            out_weights = out_masks.cast(dtype=paddle.float32) / out_masks.cast(
                dtype=paddle.float32).sum(
                    axis=1, keepdim=True)
            out_weights /= ys.shape[0] * ys.shape[2]
            duration_masks = make_non_pad_mask(ilens)
            duration_weights = (duration_masks.cast(dtype=paddle.float32) /
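Both formattings compute the same weights: each valid frame is scaled by the reciprocal of its sequence's valid-frame count, so short and long targets contribute equally. In numpy terms (illustrative, not the Paddle code):

import numpy as np

mask = np.array([[1., 1., 1., 0.],
                 [1., 1., 1., 1.]])  # (B, Lmax) non-pad mask
weights = mask / mask.sum(axis=1, keepdims=True)
print(weights[0])  # [0.3333 0.3333 0.3333 0.    ]
print(weights[1])  # [0.25   0.25   0.25   0.25  ]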
@@ -11,17 +11,14 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import numpy as np
import paddle
from paddle import nn
from paddle.fluid.param_attr import ParamAttr
from paddle.nn import functional as F
from paddle.nn import initializer as I

from scipy.interpolate import interp1d
from sklearn.metrics import roc_curve
from scipy.optimize import brentq
from sklearn.metrics import roc_curve


class LSTMSpeakerEncoder(nn.Layer):

@@ -81,8 +78,7 @@ class LSTMSpeakerEncoder(nn.Layer):
        # print("p1: ", p1.shape)
        p2 = paddle.bmm(
            embeds.reshape([-1, 1, embed_dim]),
            normalized_centroids_excl.reshape(
                [-1, embed_dim, 1]))  # (NM, 1, 1)
            normalized_centroids_excl.reshape([-1, embed_dim, 1]))  # (NM, 1, 1)
        p2 = p2.reshape([-1])  # (NM)

        # begin: alternative implementation for scatter

@@ -94,9 +90,8 @@ class LSTMSpeakerEncoder(nn.Layer):
        index = index * speakers_per_batch + paddle.arange(
            0, speakers_per_batch, dtype="int64").unsqueeze(-1)
        index = paddle.reshape(index, [-1])
        ones = paddle.ones([
            speakers_per_batch * utterances_per_speaker * speakers_per_batch
        ])
        ones = paddle.ones(
            [speakers_per_batch * utterances_per_speaker * speakers_per_batch])
        zeros = paddle.zeros_like(index, dtype=ones.dtype)
        mask_p1 = paddle.scatter(ones, index, zeros)
        p = p1 * mask_p1 + (1 - mask_p1) * paddle.scatter(ones, index, p2)

@@ -113,6 +108,9 @@ class LSTMSpeakerEncoder(nn.Layer):
            g = p._grad_ivar()
            g[...] = g * 0.01

    def inv_argmax(self, i, num):
        return np.eye(1, num, i, dtype=np.int)[0]

    def loss(self, embeds):
        """
        Computes the softmax loss according to section 2.1 of GE2E.

@@ -138,8 +136,8 @@ class LSTMSpeakerEncoder(nn.Layer):
        # EER (not backpropagated)
        with paddle.no_grad():
            ground_truth = target.numpy()
            inv_argmax = lambda i: np.eye(1, speakers_per_batch, i, dtype=np.int)[0]
            labels = np.array([inv_argmax(i) for i in ground_truth])
            labels = np.array(
                [self.inv_argmax(i, speakers_per_batch) for i in ground_truth])
            preds = sim_matrix.numpy()

            # Snippet from https://yangcha.github.io/EER-ROC/
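The refactor above lifts the one-hot lambda into a reusable `inv_argmax` method. What it computes: `np.eye(1, num, i)` is a 1 x num matrix whose single 1 sits at column i, and `[0]` takes that row as a vector. A standalone check (using plain `int`, since `np.int` is deprecated in recent numpy):

import numpy as np

def inv_argmax(i, num):
    # one-hot row vector of length `num` with the 1 at position `i`
    return np.eye(1, num, i, dtype=int)[0]

print(inv_argmax(2, 5))  # [0 0 1 0 0]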
@@ -11,13 +11,14 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import math
from typing import List, Dict, Any, Union, Optional, Tuple
from typing import Any
from typing import Dict
from typing import List
from typing import Optional

import numpy as np
import paddle
from paddle import Tensor
from paddle import nn
from paddle.nn import functional as F
@@ -63,8 +64,8 @@ class Stretch2D(nn.Layer):


class UpsampleNet(nn.Layer):
    """A Layer to upsample spectrogram by applying consecutive stretch and
    convolutions.

    Parameters
    ----------

@@ -81,10 +82,10 @@ class UpsampleNet(nn.Layer):
    use_causal_conv : bool, optional
        Whether to use causal padding before convolution, by default False

        If True, causal padding is used along the time axis, i.e. padding
        amount is ``receptive field - 1`` and 0 for before and after,
        respectively.

        If False, "same" padding is used along the time axis.
    """
@@ -158,7 +159,7 @@ class ConvInUpsampleNet(nn.Layer):
    aux_context_window : int, optional
        Context window of the first 1D convolution applied to the input. It
        is related to the kernel size of the convolution, by default 0

        If causal convolution is used, the kernel size is ``window + 1``,
        else the kernel size is ``2 * window + 1``.
    use_causal_conv : bool, optional

@@ -167,7 +168,7 @@ class ConvInUpsampleNet(nn.Layer):
        If True, causal padding is used along the time axis, i.e. padding
        amount is ``receptive field - 1`` and 0 for before and after,
        respectively.

        If False, "same" padding is used along the time axis.
    """
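A quick check of the kernel-size rule quoted above (values are illustrative):

aux_context_window = 2
kernel_size_causal = aux_context_window + 1    # 3
kernel_size_same = 2 * aux_context_window + 1  # 5
print(kernel_size_causal, kernel_size_same)    # 3 5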
@@ -276,10 +277,7 @@ class ResidualBlock(nn.Layer):

        gate_out_channels = gate_channels // 2
        self.conv1x1_out = nn.Conv1D(
            gate_out_channels,
            residual_channels,
            kernel_size=1,
            bias_attr=bias)
            gate_out_channels, residual_channels, kernel_size=1, bias_attr=bias)
        self.conv1x1_skip = nn.Conv1D(
            gate_out_channels, skip_channels, kernel_size=1, bias_attr=bias)

@@ -428,13 +426,18 @@ class PWGGenerator(nn.Layer):
                use_causal_conv=use_causal_conv)
            self.conv_layers.append(conv)

        self.last_conv_layers = nn.Sequential(
            nn.ReLU(),
            nn.Conv1D(
                skip_channels, skip_channels, 1, bias_attr=True),
            nn.ReLU(),
            nn.Conv1D(
                skip_channels, out_channels, 1, bias_attr=True))
        self.last_conv_layers = nn.Sequential(nn.ReLU(),
                                              nn.Conv1D(
                                                  skip_channels,
                                                  skip_channels,
                                                  1,
                                                  bias_attr=True),
                                              nn.ReLU(),
                                              nn.Conv1D(
                                                  skip_channels,
                                                  out_channels,
                                                  1,
                                                  bias_attr=True))

        if use_weight_norm:
            self.apply_weight_norm()
@@ -548,18 +551,18 @@ class PWGDiscriminator(nn.Layer):
        by default True
    """

    def __init__(self,
                 in_channels: int=1,
                 out_channels: int=1,
                 kernel_size: int=3,
                 layers: int=10,
                 conv_channels: int=64,
                 dilation_factor: int=1,
                 nonlinear_activation: str="LeakyReLU",
                 nonlinear_activation_params: Dict[
                     str, Any]={"negative_slope": 0.2},
                 bias: bool=True,
                 use_weight_norm: bool=True):
    def __init__(
            self,
            in_channels: int=1,
            out_channels: int=1,
            kernel_size: int=3,
            layers: int=10,
            conv_channels: int=64,
            dilation_factor: int=1,
            nonlinear_activation: str="LeakyReLU",
            nonlinear_activation_params: Dict[str, Any]={"negative_slope": 0.2},
            bias: bool=True,
            use_weight_norm: bool=True):
        super().__init__()
        assert kernel_size % 2 == 1
        assert dilation_factor > 0
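For orientation, a smoke-test sketch of the signature above (the import path is assumed from the PR title and may differ in this commit; shapes follow the (B, C, T) convention of nn.Conv1D):

import paddle
from parakeet.models.parallel_wavegan import PWGDiscriminator  # path assumed

disc = PWGDiscriminator()          # defaults from the signature above
wav = paddle.randn([2, 1, 16000])  # a batch of two 1-channel waveforms
scores = disc(wav)                 # non-downsampling conv stack -> (2, 1, 16000)
print(scores.shape)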
@@ -693,8 +696,7 @@ class ResidualPWGDiscriminator(nn.Layer):
        layers_per_stack = layers // stacks

        self.first_conv = nn.Sequential(
            nn.Conv1D(
                in_channels, residual_channels, 1, bias_attr=True),
            nn.Conv1D(in_channels, residual_channels, 1, bias_attr=True),
            getattr(nn, nonlinear_activation)(**nonlinear_activation_params))

        self.conv_layers = nn.LayerList()

@@ -714,11 +716,9 @@ class ResidualPWGDiscriminator(nn.Layer):

        self.last_conv_layers = nn.Sequential(
            getattr(nn, nonlinear_activation)(**nonlinear_activation_params),
            nn.Conv1D(
                skip_channels, skip_channels, 1, bias_attr=True),
            nn.Conv1D(skip_channels, skip_channels, 1, bias_attr=True),
            getattr(nn, nonlinear_activation)(**nonlinear_activation_params),
            nn.Conv1D(
                skip_channels, out_channels, 1, bias_attr=True))
            nn.Conv1D(skip_channels, out_channels, 1, bias_attr=True))

        if use_weight_norm:
            self.apply_weight_norm()
@@ -11,18 +11,11 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import math

import numpy as np
import paddle
from paddle import Tensor
from paddle import nn
from paddle.nn import functional as F
from paddle.nn import initializer as I

from parakeet.modules.positional_encoding import sinusoid_position_encoding
from parakeet.modules.expansion import expand
from parakeet.modules.positional_encoding import sinusoid_position_encoding


class ResidualBlock(nn.Layer):

@@ -38,8 +31,7 @@ class ResidualBlock(nn.Layer):
                padding="same",
                data_format="NLC"),
            nn.ReLU(),
            nn.BatchNorm1D(
                channels, data_format="NLC"), ) for _ in range(n)
            nn.BatchNorm1D(channels, data_format="NLC"), ) for _ in range(n)
        ]
        self.blocks = nn.Sequential(*blocks)

@@ -95,16 +87,14 @@ class SpeedySpeechEncoder(nn.Layer):
            nn.Linear(hidden_size, hidden_size),
            nn.ReLU(), )
        res_blocks = [
            ResidualBlock(
                hidden_size, kernel_size, d, n=2) for d in dilations
            ResidualBlock(hidden_size, kernel_size, d, n=2) for d in dilations
        ]
        self.res_blocks = nn.Sequential(*res_blocks)

        self.postnet1 = nn.Sequential(nn.Linear(hidden_size, hidden_size))
        self.postnet2 = nn.Sequential(
            nn.ReLU(),
            nn.BatchNorm1D(
                hidden_size, data_format="NLC"),
            nn.BatchNorm1D(hidden_size, data_format="NLC"),
            nn.Linear(hidden_size, hidden_size), )

    def forward(self, text, tones):

@@ -120,13 +110,9 @@ class DurationPredictor(nn.Layer):
    def __init__(self, hidden_size):
        super().__init__()
        self.layers = nn.Sequential(
            ResidualBlock(
                hidden_size, 4, 1, n=1),
            ResidualBlock(
                hidden_size, 3, 1, n=1),
            ResidualBlock(
                hidden_size, 1, 1, n=1),
            nn.Linear(hidden_size, 1))
            ResidualBlock(hidden_size, 4, 1, n=1),
            ResidualBlock(hidden_size, 3, 1, n=1),
            ResidualBlock(hidden_size, 1, 1, n=1), nn.Linear(hidden_size, 1))

    def forward(self, x):
        return paddle.squeeze(self.layers(x), -1)

@@ -136,15 +122,13 @@ class SpeedySpeechDecoder(nn.Layer):
    def __init__(self, hidden_size, output_size, kernel_size, dilations):
        super().__init__()
        res_blocks = [
            ResidualBlock(
                hidden_size, kernel_size, d, n=2) for d in dilations
            ResidualBlock(hidden_size, kernel_size, d, n=2) for d in dilations
        ]
        self.res_blocks = nn.Sequential(*res_blocks)

        self.postnet1 = nn.Sequential(nn.Linear(hidden_size, hidden_size))
        self.postnet2 = nn.Sequential(
            ResidualBlock(
                hidden_size, kernel_size, 1, n=2),
            ResidualBlock(hidden_size, kernel_size, 1, n=2),
            nn.Linear(hidden_size, output_size))
    def forward(self, x):
Some files were not shown because too many files have changed in this diff.