diff --git a/.clang-format b/.clang-format new file mode 100644 index 0000000..7230df7 --- /dev/null +++ b/.clang-format @@ -0,0 +1,28 @@ +# This file is used by clang-format to autoformat paddle source code +# +# The clang-format is part of llvm toolchain. +# It need to install llvm and clang to format source code style. +# +# The basic usage is, +# clang-format -i -style=file PATH/TO/SOURCE/CODE +# +# The -style=file implicit use ".clang-format" file located in one of +# parent directory. +# The -i means inplace change. +# +# The document of clang-format is +# http://clang.llvm.org/docs/ClangFormat.html +# http://clang.llvm.org/docs/ClangFormatStyleOptions.html +--- +Language: Cpp +BasedOnStyle: Google +IndentWidth: 4 +TabWidth: 4 +ContinuationIndentWidth: 4 +MaxEmptyLinesToKeep: 2 +AccessModifierOffset: -2 # The private/protected/public has no indent in class +Standard: Cpp11 +AllowAllParametersOfDeclarationOnNextLine: true +BinPackParameters: false +BinPackArguments: false +... \ No newline at end of file diff --git a/.flake8 b/.flake8 new file mode 100644 index 0000000..3723df5 --- /dev/null +++ b/.flake8 @@ -0,0 +1,50 @@ +[flake8] + +########## OPTIONS ########## +# Set the maximum length that any line (with some exceptions) may be. +max-line-length = 120 + + +################### FILE PATTERNS ########################## +# Provide a comma-separated list of glob patterns to exclude from checks. +exclude = + # git folder + .git, + # python cache + __pycache__, + third_party/, +# Provide a comma-separate list of glob patterns to include for checks. +filename = + *.py + + +########## RULES ########## + +# ERROR CODES +# +# E/W - PEP8 errors/warnings (pycodestyle) +# F - linting errors (pyflakes) +# C - McCabe complexity error (mccabe) +# +# W503 - line break before binary operator + +# Specify a list of codes to ignore. +ignore = + W503 + E252,E262,E127,E265,E126,E266,E241,E261,E128,E125 + W291,W293,W605 + E203,E305,E402,E501,E721,E741,F403,F405,F821,F841,F999,W503,W504,C408,E302,W291,E303, + # shebang has extra meaning in fbcode lints, so I think it's not worth trying + # to line this up with executable bit + EXE001, + # these ignores are from flake8-bugbear; please fix! + B007,B008, + # these ignores are from flake8-comprehensions; please fix! + C400,C401,C402,C403,C404,C405,C407,C411,C413,C414,C415 + +# Specify the list of error codes you wish Flake8 to report. +select = + E, + W, + F, + C \ No newline at end of file diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 6f222bb..cde2cc0 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,11 +1,11 @@ -repos: -- repo: https://github.com/PaddlePaddle/mirrors-yapf.git - rev: 0d79c0c469bab64f7229c9aca2b1186ef47f0e37 +- repo: https://github.com/pre-commit/mirrors-yapf.git + sha: v0.16.0 hooks: - id: yapf files: \.py$ + exclude: (?=third_party).*(\.py)$ - repo: https://github.com/pre-commit/pre-commit-hooks - rev: a11d9314b22d8f8c7556443875b731ef05965464 + sha: a11d9314b22d8f8c7556443875b731ef05965464 hooks: - id: check-merge-conflict - id: check-symlinks @@ -15,8 +15,23 @@ repos: files: \.md$ - id: trailing-whitespace files: \.md$ -- repo: https://github.com/Lucas-C/pre-commit-hooks - rev: v1.0.1 + - id: requirements-txt-fixer + exclude: (?=third_party).*$ + - id: check-yaml + - id: check-json + - id: pretty-format-json + args: + - --no-sort-keys + - --autofix + - id: check-merge-conflict + - id: flake8 + aergs: + - --ignore=E501,E228,E226,E261,E266,E128,E402,W503 + - --builtins=G,request + - --jobs=1 + exclude: (?=third_party).*(\.py)$ +- repo : https://github.com/Lucas-C/pre-commit-hooks + sha: v1.0.1 hooks: - id: forbid-crlf files: \.md$ @@ -28,9 +43,15 @@ repos: files: \.md$ - repo: local hooks: + - id: clang-format + name: clang-format + description: Format files with ClangFormat + entry: bash .pre-commit-hooks/clang-format.hook -i + language: system + files: \.(c|cc|cxx|cpp|cu|h|hpp|hxx|cuh|proto)$ - id: copyright_checker name: copyright_checker - entry: python ./tools/copyright.hook + entry: python .pre-commit-hooks/copyright-check.hook language: system files: \.(c|cc|cxx|cpp|cu|h|hpp|hxx|proto|py)$ - exclude: (?!.*third_party)^.*$ | (?!.*book)^.*$ + exclude: (?=third_party|pypinyin).*(\.cpp|\.h|\.py)$ diff --git a/.pre-commit-hooks/clang-format.hook b/.pre-commit-hooks/clang-format.hook new file mode 100755 index 0000000..ceb4a7e --- /dev/null +++ b/.pre-commit-hooks/clang-format.hook @@ -0,0 +1,15 @@ +#!/usr/bin/env bash +set -e + +readonly VERSION="3.9" + +version=$(clang-format -version) + +# if ! [[ $version == *"$VERSION"* ]]; then +# echo "clang-format version check failed." +# echo "a version contains '$VERSION' is needed, but get '$version'" +# echo "you can install the right version, and make an soft-link to '\$PATH' env" +# exit -1 +# fi + +clang-format $@ diff --git a/.pre-commit-hooks/copyright-check.hook b/.pre-commit-hooks/copyright-check.hook new file mode 100644 index 0000000..80a5315 --- /dev/null +++ b/.pre-commit-hooks/copyright-check.hook @@ -0,0 +1,133 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import argparse +import io +import os +import re +import sys +import subprocess +import platform + +COPYRIGHT = ''' +Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +''' + +LANG_COMMENT_MARK = None + +NEW_LINE_MARK = None + +COPYRIGHT_HEADER = None + +if platform.system() == "Windows": + NEW_LINE_MARK = "\r\n" +else: + NEW_LINE_MARK = '\n' + COPYRIGHT_HEADER = COPYRIGHT.split(NEW_LINE_MARK)[1] + p = re.search('(\d{4})', COPYRIGHT_HEADER).group(0) + process = subprocess.Popen(["date", "+%Y"], stdout=subprocess.PIPE) + date, err = process.communicate() + date = date.decode("utf-8").rstrip("\n") + COPYRIGHT_HEADER = COPYRIGHT_HEADER.replace(p, date) + + +def generate_copyright(template, lang='C'): + if lang == 'Python': + LANG_COMMENT_MARK = '#' + else: + LANG_COMMENT_MARK = "//" + + lines = template.split(NEW_LINE_MARK) + BLANK = " " + ans = LANG_COMMENT_MARK + BLANK + COPYRIGHT_HEADER + NEW_LINE_MARK + for lino, line in enumerate(lines): + if lino == 0 or lino == 1 or lino == len(lines) - 1: continue + if len(line) == 0: + BLANK = "" + else: + BLANK = " " + ans += LANG_COMMENT_MARK + BLANK + line + NEW_LINE_MARK + + return ans + "\n" + + +def lang_type(filename): + if filename.endswith(".py"): + return "Python" + elif filename.endswith(".h"): + return "C" + elif filename.endswith(".c"): + return "C" + elif filename.endswith(".hpp"): + return "C" + elif filename.endswith(".cc"): + return "C" + elif filename.endswith(".cpp"): + return "C" + elif filename.endswith(".cu"): + return "C" + elif filename.endswith(".cuh"): + return "C" + elif filename.endswith(".go"): + return "C" + elif filename.endswith(".proto"): + return "C" + else: + print("Unsupported filetype %s", filename) + exit(0) + + +PYTHON_ENCODE = re.compile("^[ \t\v]*#.*?coding[:=][ \t]*([-_.a-zA-Z0-9]+)") + + +def main(argv=None): + parser = argparse.ArgumentParser( + description='Checker for copyright declaration.') + parser.add_argument('filenames', nargs='*', help='Filenames to check') + args = parser.parse_args(argv) + + retv = 0 + for filename in args.filenames: + fd = io.open(filename, encoding="utf-8") + first_line = fd.readline() + second_line = fd.readline() + if "COPYRIGHT (C)" in first_line.upper(): continue + if first_line.startswith("#!") or PYTHON_ENCODE.match( + second_line) != None or PYTHON_ENCODE.match(first_line) != None: + continue + original_contents = io.open(filename, encoding="utf-8").read() + new_contents = generate_copyright( + COPYRIGHT, lang_type(filename)) + original_contents + print('Auto Insert Copyright Header {}'.format(filename)) + retv = 1 + with io.open(filename, 'w') as output_file: + output_file.write(new_contents) + + return retv + + +if __name__ == '__main__': + exit(main()) diff --git a/.style.yapf b/.style.yapf new file mode 100644 index 0000000..b62febf --- /dev/null +++ b/.style.yapf @@ -0,0 +1,3 @@ +[style] +based_on_style = pep8 +column_limit = 80 \ No newline at end of file diff --git a/docs/source/conf.py b/docs/source/conf.py index bddd217..c7afa7a 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -11,15 +11,12 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - # Configuration file for the Sphinx documentation builder. # # This file only contains a selection of the most common options. For a full # list see the documentation: # https://www.sphinx-doc.org/en/master/usage/configuration.html - # -- Path setup -------------------------------------------------------------- - # If extensions (or modules to document with autodoc) are in another directory, # add these directories to sys.path here. If the directory is relative to the # documentation root, use os.path.abspath to make it absolute, like shown here. diff --git a/examples/fastspeech2/baker/batch_fn.py b/examples/fastspeech2/baker/batch_fn.py index 1bbab84..0dd93dd 100644 --- a/examples/fastspeech2/baker/batch_fn.py +++ b/examples/fastspeech2/baker/batch_fn.py @@ -11,9 +11,9 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - import numpy as np import paddle + from parakeet.data.batch import batch_sequences @@ -24,8 +24,7 @@ def collate_baker_examples(examples): pitch = [np.array(item["pitch"], dtype=np.float32) for item in examples] energy = [np.array(item["energy"], dtype=np.float32) for item in examples] durations = [ - np.array( - item["durations"], dtype=np.int64) for item in examples + np.array(item["durations"], dtype=np.int64) for item in examples ] text_lengths = np.array([item["text_lengths"] for item in examples]) speech_lengths = np.array([item["speech_lengths"] for item in examples]) @@ -54,4 +53,4 @@ def collate_baker_examples(examples): "pitch": pitch, "energy": energy } - return batch \ No newline at end of file + return batch diff --git a/examples/fastspeech2/baker/compute_statistics.py b/examples/fastspeech2/baker/compute_statistics.py index aa4bf4f..823223a 100644 --- a/examples/fastspeech2/baker/compute_statistics.py +++ b/examples/fastspeech2/baker/compute_statistics.py @@ -12,18 +12,17 @@ # See the License for the specific language governing permissions and # limitations under the License. """Calculate statistics of feature files.""" - import argparse import logging from pathlib import Path import jsonlines import numpy as np -from parakeet.datasets.data_table import DataTable +from config import get_cfg_default from sklearn.preprocessing import StandardScaler from tqdm import tqdm -from config import get_cfg_default +from parakeet.datasets.data_table import DataTable def main(): @@ -75,8 +74,8 @@ def main(): # check directory existence if args.output is None: - args.output = Path(args.metadata).parent.with_name(args.field_name + - "_stats.npy") + args.output = Path( + args.metadata).parent.with_name(args.field_name + "_stats.npy") else: args.output = Path(args.output) args.output.parent.mkdir(parents=True, exist_ok=True) diff --git a/examples/fastspeech2/baker/config.py b/examples/fastspeech2/baker/config.py index 7cf3d95..500f5bd 100644 --- a/examples/fastspeech2/baker/config.py +++ b/examples/fastspeech2/baker/config.py @@ -11,11 +11,10 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - from pathlib import Path -from yacs.config import CfgNode as Configuration import yaml +from yacs.config import CfgNode as Configuration config_path = (Path(__file__).parent / "conf" / "default.yaml").resolve() diff --git a/examples/fastspeech2/baker/fastspeech2_updater.py b/examples/fastspeech2/baker/fastspeech2_updater.py index 884efda..e10620b 100644 --- a/examples/fastspeech2/baker/fastspeech2_updater.py +++ b/examples/fastspeech2/baker/fastspeech2_updater.py @@ -11,8 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - -from parakeet.models.fastspeech2 import FastSpeech2, FastSpeech2Loss +from parakeet.models.fastspeech2 import FastSpeech2Loss from parakeet.training.extensions.evaluator import StandardEvaluator from parakeet.training.reporter import report from parakeet.training.updaters.standard_updater import StandardUpdater diff --git a/examples/fastspeech2/baker/frontend.py b/examples/fastspeech2/baker/frontend.py index 3ed7efb..4a2f2c6 100644 --- a/examples/fastspeech2/baker/frontend.py +++ b/examples/fastspeech2/baker/frontend.py @@ -11,10 +11,11 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - import re + import numpy as np import paddle + from parakeet.frontend.cn_frontend import Frontend as cnFrontend @@ -87,8 +88,7 @@ class Frontend(): phones.append(phone) return phones, tones - def get_input_ids(self, sentence, merge_sentences=True, - get_tone_ids=False): + def get_input_ids(self, sentence, merge_sentences=True, get_tone_ids=False): phonemes = self.frontend.get_phonemes( sentence, merge_sentences=merge_sentences) result = {} diff --git a/examples/fastspeech2/baker/gen_duration_from_textgrid.py b/examples/fastspeech2/baker/gen_duration_from_textgrid.py index b3a39d3..aaece61 100644 --- a/examples/fastspeech2/baker/gen_duration_from_textgrid.py +++ b/examples/fastspeech2/baker/gen_duration_from_textgrid.py @@ -11,16 +11,14 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - import argparse import os from pathlib import Path import librosa import numpy as np -from praatio import tgio - from config import get_cfg_default +from praatio import tgio def readtg(config, tg_path): diff --git a/examples/fastspeech2/baker/normalize.py b/examples/fastspeech2/baker/normalize.py index adaa0ab..6e59744 100644 --- a/examples/fastspeech2/baker/normalize.py +++ b/examples/fastspeech2/baker/normalize.py @@ -50,10 +50,7 @@ def main(): required=True, help="speech statistics file.") parser.add_argument( - "--pitch-stats", - type=str, - required=True, - help="pitch statistics file.") + "--pitch-stats", type=str, required=True, help="pitch statistics file.") parser.add_argument( "--energy-stats", type=str, diff --git a/examples/fastspeech2/baker/preprocess.py b/examples/fastspeech2/baker/preprocess.py index a8b597e..c28e280 100644 --- a/examples/fastspeech2/baker/preprocess.py +++ b/examples/fastspeech2/baker/preprocess.py @@ -262,10 +262,7 @@ def main(): parser = argparse.ArgumentParser( description="Preprocess audio and then extract features.") parser.add_argument( - "--rootdir", - default=None, - type=str, - help="directory to baker dataset.") + "--rootdir", default=None, type=str, help="directory to baker dataset.") parser.add_argument( "--dur-file", default=None, diff --git a/examples/fastspeech2/baker/synthesize.py b/examples/fastspeech2/baker/synthesize.py index 6770189..f6304eb 100644 --- a/examples/fastspeech2/baker/synthesize.py +++ b/examples/fastspeech2/baker/synthesize.py @@ -67,8 +67,7 @@ def evaluate(args, fastspeech2_config, pwg_config): std = paddle.to_tensor(std) pwg_normalizer = ZScore(mu, std) - fastspeech2_inferencce = FastSpeech2Inference(fastspeech2_normalizer, - model) + fastspeech2_inferencce = FastSpeech2Inference(fastspeech2_normalizer, model) pwg_inference = PWGInference(pwg_normalizer, vocoder) output_dir = Path(args.output_dir) diff --git a/examples/fastspeech2/baker/train.py b/examples/fastspeech2/baker/train.py index 741678b..39b6fbc 100644 --- a/examples/fastspeech2/baker/train.py +++ b/examples/fastspeech2/baker/train.py @@ -154,8 +154,7 @@ def train_sp(args, config): output_dir = Path(args.output_dir) trainer = Trainer(updater, (config.max_epoch, 'epoch'), output_dir) - evaluator = FastSpeech2Evaluator(model, dev_dataloader, - **config["updater"]) + evaluator = FastSpeech2Evaluator(model, dev_dataloader, **config["updater"]) if dist.get_rank() == 0: trainer.extend(evaluator, trigger=(1, "epoch")) diff --git a/examples/ge2e/audio_processor.py b/examples/ge2e/audio_processor.py index 65a6aee..921e999 100644 --- a/examples/ge2e/audio_processor.py +++ b/examples/ge2e/audio_processor.py @@ -30,9 +30,7 @@ except ModuleNotFoundError: INT16_MAX = (2**15) - 1 -def normalize_volume(wav, - target_dBFS, - increase_only=False, +def normalize_volume(wav, target_dBFS, increase_only=False, decrease_only=False): # this function implements Loudness normalization, instead of peak # normalization, See https://en.wikipedia.org/wiki/Audio_normalization @@ -44,8 +42,9 @@ def normalize_volume(wav, if increase_only and decrease_only: raise ValueError("Both increase only and decrease only are set") dBFS_change = target_dBFS - 10 * np.log10(np.mean(wav**2)) - if ((dBFS_change < 0 and increase_only) or - (dBFS_change > 0 and decrease_only)): + if dBFS_change < 0 and increase_only: + return wav + if dBFS_change > 0 and decrease_only: return wav gain = 10**(dBFS_change / 20) return wav * gain @@ -59,9 +58,14 @@ def trim_long_silences(wav, """ Ensures that segments without voice in the waveform remain no longer than a threshold determined by the VAD parameters in params.py. - - :param wav: the raw waveform as a numpy array of floats - :return: the same waveform with silences trimmed away (length <= original wav length) + Parameters + ---------- + wav : np.array + the raw waveform as a numpy array of floats + Returns + ---------- + np.array + the same waveform with silences trimmed away (length <= original wav length) """ # Compute the voice detection window size samples_per_window = (vad_window_length * sampling_rate) // 1000 @@ -117,20 +121,25 @@ def compute_partial_slices(n_samples: int, The returned ranges may be indexing further than the length of the waveform. It is recommended that you pad the waveform with zeros up to wave_slices[-1].stop. + Parameters + ---------- + n_samples : int + the number of samples in the waveform. + partial_utterance_n_frames : int + the number of mel spectrogram frames in each partial utterance. - :param n_samples: the number of samples in the waveform - :param partial_utterance_n_frames: the number of mel spectrogram frames in each partial - utterance - :param min_pad_coverage: when reaching the last partial utterance, it may or may not have - enough frames. If at least of are present, - then the last partial utterance will be considered, as if we padded the audio. Otherwise, - it will be discarded, as if we trimmed the audio. If there aren't enough frames for 1 partial - utterance, this parameter is ignored so that the function always returns at least 1 slice. - :param overlap: by how much the partial utterance should overlap. If set to 0, the partial - utterances are entirely disjoint. - :return: the waveform slices and mel spectrogram slices as lists of array slices. Index - respectively the waveform and the mel spectrogram with these slices to obtain the partial - utterances. + min_pad_coverage : int + when reaching the last partial utterance, it may or may not have enough frames. + If at least of are present, + then the last partial utterance will be considered, as if we padded the audio. Otherwise, + it will be discarded, as if we trimmed the audio. If there aren't enough frames for 1 partial + utterance, this parameter is ignored so that the function always returns at least 1 slice. + overlap : float + by how much the partial utterance should overlap. If set to 0, the partial utterances are entirely disjoint. + Returns + ---------- + the waveform slices and mel spectrogram slices as lists of array slices. + Index respectively the waveform and the mel spectrogram with these slices to obtain the partialutterances. """ assert 0 <= overlap < 1 assert 0 < min_pad_coverage <= 1 @@ -138,8 +147,8 @@ def compute_partial_slices(n_samples: int, # librosa's function to compute num_frames from num_samples n_frames = int(np.ceil((n_samples + 1) / hop_length)) # frame shift between ajacent partials - frame_step = max( - 1, int(np.round(partial_utterance_n_frames * (1 - overlap)))) + frame_step = max(1, + int(np.round(partial_utterance_n_frames * (1 - overlap)))) # Compute the slices wav_slices, mel_slices = [], [] diff --git a/examples/ge2e/dataset_processors.py b/examples/ge2e/dataset_processors.py index 5e4fed9..50a8f3e 100644 --- a/examples/ge2e/dataset_processors.py +++ b/examples/ge2e/dataset_processors.py @@ -57,7 +57,7 @@ def _process_speaker(speaker_dir: Path, try: with sources_fpath.open("rt") as sources_file: existing_names = {line.split(",")[0] for line in sources_file} - except: + except Exception as e: existing_names = {} else: existing_names = {} @@ -114,9 +114,7 @@ def process_librispeech(processor, output_dir, "*.flac", skip_existing) -def process_voxceleb1(processor, - datasets_root, - output_dir, +def process_voxceleb1(processor, datasets_root, output_dir, skip_existing=False): dataset_name = "VoxCeleb1" dataset_root = datasets_root / dataset_name @@ -126,10 +124,7 @@ def process_voxceleb1(processor, metadata = [line.strip().split("\t") for line in metafile][1:] # speaker id -> nationality - nationalities = { - line[0]: line[3] - for line in metadata if line[-1] == "dev" - } + nationalities = {line[0]: line[3] for line in metadata if line[-1] == "dev"} keep_speaker_ids = [ speaker_id for speaker_id, nationality in nationalities.items() if nationality.lower() in anglophone_nationalites @@ -147,9 +142,7 @@ def process_voxceleb1(processor, output_dir, "*.wav", skip_existing) -def process_voxceleb2(processor, - datasets_root, - output_dir, +def process_voxceleb2(processor, datasets_root, output_dir, skip_existing=False): dataset_name = "VoxCeleb2" dataset_root = datasets_root / dataset_name @@ -171,9 +164,7 @@ def process_aidatatang_200zh(processor, output_dir, "*.wav", skip_existing) -def process_magicdata(processor, - datasets_root, - output_dir, +def process_magicdata(processor, datasets_root, output_dir, skip_existing=False): dataset_name = "magicdata/train" dataset_root = datasets_root / dataset_name diff --git a/examples/ge2e/preprocess.py b/examples/ge2e/preprocess.py index 615a71e..b1e5946 100644 --- a/examples/ge2e/preprocess.py +++ b/examples/ge2e/preprocess.py @@ -52,7 +52,8 @@ if __name__ == "__main__": if not args.no_trim: try: import webrtcvad - except: + print(webrtcvad.__version__) + except Exception as e: raise ModuleNotFoundError( "Package 'webrtcvad' not found. This package enables " "noise removal and is recommended. Please install and " @@ -96,5 +97,5 @@ if __name__ == "__main__": for dataset in args.datasets: print("Preprocessing %s" % dataset) - preprocess_func[dataset](processor, args.datasets_root, - args.output_dir, args.skip_existing) + preprocess_func[dataset](processor, args.datasets_root, args.output_dir, + args.skip_existing) diff --git a/examples/ge2e/train.py b/examples/ge2e/train.py index f015472..950d486 100644 --- a/examples/ge2e/train.py +++ b/examples/ge2e/train.py @@ -83,12 +83,11 @@ class Ge2eExperiment(ExperimentBase): self.logger.info(msg) if dist.get_rank() == 0: - self.visualizer.add_scalar("train/loss", loss_value, - self.iteration) + self.visualizer.add_scalar("train/loss", loss_value, self.iteration) self.visualizer.add_scalar("train/eer", eer, self.iteration) - self.visualizer.add_scalar( - "param/w", - float(self.model_core.similarity_weight), self.iteration) + self.visualizer.add_scalar("param/w", + float(self.model_core.similarity_weight), + self.iteration) self.visualizer.add_scalar("param/b", float(self.model_core.similarity_bias), self.iteration) diff --git a/examples/parallelwave_gan/baker/batch_fn.py b/examples/parallelwave_gan/baker/batch_fn.py index 11a45c5..925303b 100644 --- a/examples/parallelwave_gan/baker/batch_fn.py +++ b/examples/parallelwave_gan/baker/batch_fn.py @@ -109,8 +109,7 @@ class Clip(object): """ if len(x) < c.shape[1] * self.hop_size: - x = np.pad(x, (0, c.shape[1] * self.hop_size - len(x)), - mode="edge") + x = np.pad(x, (0, c.shape[1] * self.hop_size - len(x)), mode="edge") # check the legnth is valid assert len(x) == c.shape[ diff --git a/examples/parallelwave_gan/baker/compute_statistics.py b/examples/parallelwave_gan/baker/compute_statistics.py index 06b9b65..2a0c458 100644 --- a/examples/parallelwave_gan/baker/compute_statistics.py +++ b/examples/parallelwave_gan/baker/compute_statistics.py @@ -17,18 +17,12 @@ import argparse import logging import os -import numpy as np -import yaml -import json import jsonlines - +import numpy as np +from parakeet.datasets.data_table import DataTable from sklearn.preprocessing import StandardScaler from tqdm import tqdm -from parakeet.datasets.data_table import DataTable -from parakeet.utils.h5_utils import read_hdf5 -from parakeet.utils.h5_utils import write_hdf5 - from config import get_cfg_default diff --git a/examples/parallelwave_gan/baker/normalize.py b/examples/parallelwave_gan/baker/normalize.py index 0cf2841..06a3dd2 100644 --- a/examples/parallelwave_gan/baker/normalize.py +++ b/examples/parallelwave_gan/baker/normalize.py @@ -15,18 +15,15 @@ import argparse import logging -import os from operator import itemgetter from pathlib import Path -import numpy as np -import yaml import jsonlines +import numpy as np +from parakeet.datasets.data_table import DataTable from sklearn.preprocessing import StandardScaler from tqdm import tqdm -from parakeet.datasets.data_table import DataTable - from config import get_cfg_default diff --git a/examples/parallelwave_gan/baker/preprocess.py b/examples/parallelwave_gan/baker/preprocess.py index 92021eb..83782c3 100644 --- a/examples/parallelwave_gan/baker/preprocess.py +++ b/examples/parallelwave_gan/baker/preprocess.py @@ -13,7 +13,9 @@ # limitations under the License. from operator import itemgetter -from typing import List, Dict, Any +from typing import Any +from typing import Dict +from typing import List import argparse import jsonlines @@ -39,8 +41,8 @@ def process_sentence(config: Dict[str, Any], # reading y, sr = librosa.load(str(fp), sr=config.sr) # resampling may occur assert len(y.shape) == 1, f"{utt_id} is not a mono-channel audio." - assert np.abs(y).max( - ) <= 1.0, f"{utt_id} is seems to be different that 16 bit PCM." + assert np.abs( + y).max() <= 1.0, f"{utt_id} is seems to be different that 16 bit PCM." duration = librosa.get_duration(y, sr=sr) # trim according to the alignment file @@ -80,8 +82,8 @@ def process_sentence(config: Dict[str, Any], # adjust time to make num_samples == num_frames * hop_length num_frames = logmel.shape[0] if y.size < num_frames * config.hop_length: - y = np.pad(y, (0, num_frames * config.hop_length - y.size), - mode="reflect") + y = np.pad( + y, (0, num_frames * config.hop_length - y.size), mode="reflect") else: y = y[:num_frames * config.hop_length] num_sample = y.shape[0] @@ -139,10 +141,7 @@ def main(): parser = argparse.ArgumentParser( description="Preprocess audio and then extract features .") parser.add_argument( - "--rootdir", - default=None, - type=str, - help="directory to baker dataset.") + "--rootdir", default=None, type=str, help="directory to baker dataset.") parser.add_argument( "--dumpdir", type=str, diff --git a/examples/parallelwave_gan/baker/pwg_updater.py b/examples/parallelwave_gan/baker/pwg_updater.py index 90cf655..6b47584 100644 --- a/examples/parallelwave_gan/baker/pwg_updater.py +++ b/examples/parallelwave_gan/baker/pwg_updater.py @@ -20,17 +20,11 @@ from paddle.nn import Layer from paddle.optimizer import Optimizer from paddle.optimizer.lr import LRScheduler from paddle.io import DataLoader -from paddle.io import DistributedBatchSampler from timer import timer -from parakeet.datasets.data_table import DataTable from parakeet.training.updaters.standard_updater import StandardUpdater, UpdaterState from parakeet.training.extensions.evaluator import StandardEvaluator -from parakeet.training.trainer import Trainer from parakeet.training.reporter import report -from parakeet.models.parallel_wavegan import PWGGenerator, PWGDiscriminator -from parakeet.modules.stft_loss import MultiResolutionSTFTLoss -from parakeet.utils.profile import synchronize class PWGUpdater(StandardUpdater): diff --git a/examples/parallelwave_gan/baker/synthesize.py b/examples/parallelwave_gan/baker/synthesize.py index 1216220..8a78ad5 100644 --- a/examples/parallelwave_gan/baker/synthesize.py +++ b/examples/parallelwave_gan/baker/synthesize.py @@ -12,20 +12,17 @@ # See the License for the specific language governing permissions and # limitations under the License. -import os -import sys -from timer import timer -import logging import argparse +import os from pathlib import Path +from timer import timer -import yaml import jsonlines -import paddle import numpy as np +import paddle import soundfile as sf +import yaml from paddle import distributed as dist - from parakeet.datasets.data_table import DataTable from parakeet.models.parallel_wavegan import PWGGenerator diff --git a/examples/parallelwave_gan/baker/train.py b/examples/parallelwave_gan/baker/train.py index 1bf0a90..7082494 100644 --- a/examples/parallelwave_gan/baker/train.py +++ b/examples/parallelwave_gan/baker/train.py @@ -130,8 +130,7 @@ def train_sp(args, config): parameters=generator.parameters(), **config["generator_optimizer_params"]) lr_schedule_d = StepDecay(**config["discriminator_scheduler_params"]) - gradient_clip_d = nn.ClipGradByGlobalNorm(config[ - "discriminator_grad_norm"]) + gradient_clip_d = nn.ClipGradByGlobalNorm(config["discriminator_grad_norm"]) optimizer_d = Adam( learning_rate=lr_schedule_d, grad_clip=gradient_clip_d, @@ -184,8 +183,7 @@ def train_sp(args, config): stop_trigger=(config.train_max_steps, "iteration"), out=output_dir, ) - trainer.extend( - evaluator, trigger=(config.eval_interval_steps, 'iteration')) + trainer.extend(evaluator, trigger=(config.eval_interval_steps, 'iteration')) if dist.get_rank() == 0: writer = LogWriter(str(trainer.out)) trainer.extend(VisualDL(writer), trigger=(1, 'iteration')) diff --git a/examples/speedyspeech/baker/batch_fn.py b/examples/speedyspeech/baker/batch_fn.py index 6bc0df7..e9089ed 100644 --- a/examples/speedyspeech/baker/batch_fn.py +++ b/examples/speedyspeech/baker/batch_fn.py @@ -22,8 +22,7 @@ def collate_baker_examples(examples): tones = [np.array(item["tones"], dtype=np.int64) for item in examples] feats = [np.array(item["feats"], dtype=np.float32) for item in examples] durations = [ - np.array( - item["durations"], dtype=np.int64) for item in examples + np.array(item["durations"], dtype=np.int64) for item in examples ] num_phones = np.array([item["num_phones"] for item in examples]) num_frames = np.array([item["num_frames"] for item in examples]) diff --git a/examples/speedyspeech/baker/compute_statistics.py b/examples/speedyspeech/baker/compute_statistics.py index 3d3dd5b..ab8767c 100644 --- a/examples/speedyspeech/baker/compute_statistics.py +++ b/examples/speedyspeech/baker/compute_statistics.py @@ -15,21 +15,14 @@ import argparse import logging -import os from pathlib import Path -import numpy as np -import yaml -import json import jsonlines - +import numpy as np +from parakeet.datasets.data_table import DataTable from sklearn.preprocessing import StandardScaler from tqdm import tqdm -from parakeet.datasets.data_table import DataTable -from parakeet.utils.h5_utils import read_hdf5 -from parakeet.utils.h5_utils import write_hdf5 - from config import get_cfg_default diff --git a/examples/speedyspeech/baker/frontend.py b/examples/speedyspeech/baker/frontend.py index e8869dd..85029ad 100644 --- a/examples/speedyspeech/baker/frontend.py +++ b/examples/speedyspeech/baker/frontend.py @@ -17,7 +17,6 @@ from pathlib import Path import numpy as np import paddle -import pypinyin from pypinyin import lazy_pinyin, Style import jieba import phkit diff --git a/examples/speedyspeech/baker/inference.py b/examples/speedyspeech/baker/inference.py index 3bd4384..2be9322 100644 --- a/examples/speedyspeech/baker/inference.py +++ b/examples/speedyspeech/baker/inference.py @@ -15,9 +15,8 @@ import argparse from pathlib import Path -import numpy as np -from paddle import inference import soundfile as sf +from paddle import inference from frontend import text_analysis @@ -73,8 +72,8 @@ def main(): speedyspeech_predictor.run() output_names = speedyspeech_predictor.get_output_names() - output_handle = speedyspeech_predictor.get_output_handle(output_names[ - 0]) + output_handle = speedyspeech_predictor.get_output_handle( + output_names[0]) output_data = output_handle.copy_to_cpu() input_names = pwg_predictor.get_input_names() diff --git a/examples/speedyspeech/baker/normalize.py b/examples/speedyspeech/baker/normalize.py index 2d1b028..f453898 100644 --- a/examples/speedyspeech/baker/normalize.py +++ b/examples/speedyspeech/baker/normalize.py @@ -15,19 +15,16 @@ import argparse import logging -import os -from copy import copy from operator import itemgetter from pathlib import Path -import numpy as np -import yaml import jsonlines +import numpy as np from sklearn.preprocessing import StandardScaler from tqdm import tqdm -from parakeet.frontend.vocab import Vocab from parakeet.datasets.data_table import DataTable +from parakeet.frontend.vocab import Vocab from config import get_cfg_default @@ -100,7 +97,10 @@ def main(): for item in metadata: item["feats"] = str(metadata_dir / item["feats"]) - dataset = DataTable(metadata, converters={'feats': np.load, }) + dataset = DataTable( + metadata, converters={ + 'feats': np.load, + }) logging.info(f"The number of files = {len(dataset)}.") # restore scaler diff --git a/examples/speedyspeech/baker/preprocess.py b/examples/speedyspeech/baker/preprocess.py index 2c720b7..1ec0ed9 100644 --- a/examples/speedyspeech/baker/preprocess.py +++ b/examples/speedyspeech/baker/preprocess.py @@ -13,7 +13,9 @@ # limitations under the License. from operator import itemgetter -from typing import List, Dict, Any +from typing import Any +from typing import Dict +from typing import List import argparse import jsonlines @@ -41,8 +43,8 @@ def process_sentence(config: Dict[str, Any], # reading y, sr = librosa.load(str(fp), sr=config.sr) # resampling may occur assert len(y.shape) == 1, f"{utt_id} is not a mono-channel audio." - assert np.abs(y).max( - ) <= 1.0, f"{utt_id} is seems to be different that 16 bit PCM." + assert np.abs( + y).max() <= 1.0, f"{utt_id} is seems to be different that 16 bit PCM." duration = librosa.get_duration(y, sr=sr) # intervals with empty lables are ignored @@ -162,10 +164,7 @@ def main(): parser = argparse.ArgumentParser( description="Preprocess audio and then extract features.") parser.add_argument( - "--rootdir", - default=None, - type=str, - help="directory to baker dataset.") + "--rootdir", default=None, type=str, help="directory to baker dataset.") parser.add_argument( "--dumpdir", type=str, diff --git a/examples/speedyspeech/baker/speedyspeech_updater.py b/examples/speedyspeech/baker/speedyspeech_updater.py index bbb65d7..daa0f57 100644 --- a/examples/speedyspeech/baker/speedyspeech_updater.py +++ b/examples/speedyspeech/baker/speedyspeech_updater.py @@ -13,15 +13,13 @@ # limitations under the License. import paddle -from paddle.nn import functional as F from paddle.fluid.layers import huber_loss - -from parakeet.modules.ssim import ssim +from paddle.nn import functional as F from parakeet.modules.losses import masked_l1_loss, weighted_mean +from parakeet.modules.ssim import ssim +from parakeet.training.extensions.evaluator import StandardEvaluator from parakeet.training.reporter import report from parakeet.training.updaters.standard_updater import StandardUpdater -from parakeet.training.extensions.evaluator import StandardEvaluator -from parakeet.models.speedyspeech import SpeedySpeech class SpeedySpeechUpdater(StandardUpdater): diff --git a/examples/speedyspeech/baker/synthesize.py b/examples/speedyspeech/baker/synthesize.py index 0cddf73..65fccb4 100644 --- a/examples/speedyspeech/baker/synthesize.py +++ b/examples/speedyspeech/baker/synthesize.py @@ -11,30 +11,25 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - import os -import sys import logging import argparse -import dataclasses from pathlib import Path -import yaml import jsonlines -import paddle import numpy as np import soundfile as sf import paddle -from paddle import nn -from paddle.nn import functional as F -from paddle import distributed as dist +import yaml from paddle import jit from paddle.static import InputSpec from yacs.config import CfgNode from parakeet.datasets.data_table import DataTable -from parakeet.models.speedyspeech import SpeedySpeech, SpeedySpeechInference -from parakeet.models.parallel_wavegan import PWGGenerator, PWGInference +from parakeet.models.speedyspeech import SpeedySpeech +from parakeet.models.speedyspeech import SpeedySpeechInference +from parakeet.models.parallel_wavegan import PWGGenerator +from parakeet.models.parallel_wavegan import PWGInference from parakeet.modules.normalizer import ZScore @@ -79,9 +74,8 @@ def evaluate(args, speedyspeech_config, pwg_config): speedyspeech_inference = jit.to_static( speedyspeech_inference, input_spec=[ - InputSpec( - [-1], dtype=paddle.int64), InputSpec( - [-1], dtype=paddle.int64) + InputSpec([-1], dtype=paddle.int64), InputSpec( + [-1], dtype=paddle.int64) ]) paddle.jit.save(speedyspeech_inference, os.path.join(args.inference_dir, "speedyspeech")) @@ -91,9 +85,9 @@ def evaluate(args, speedyspeech_config, pwg_config): pwg_inference = PWGInference(pwg_normalizer, vocoder) pwg_inference.eval() pwg_inference = jit.to_static( - pwg_inference, - input_spec=[InputSpec( - [-1, 80], dtype=paddle.float32), ]) + pwg_inference, input_spec=[ + InputSpec([-1, 80], dtype=paddle.float32), + ]) paddle.jit.save(pwg_inference, os.path.join(args.inference_dir, "pwg")) pwg_inference = paddle.jit.load(os.path.join(args.inference_dir, "pwg")) @@ -119,9 +113,7 @@ def main(): parser = argparse.ArgumentParser( description="Synthesize with speedyspeech & parallel wavegan.") parser.add_argument( - "--speedyspeech-config", - type=str, - help="config file for speedyspeech.") + "--speedyspeech-config", type=str, help="config file for speedyspeech.") parser.add_argument( "--speedyspeech-checkpoint", type=str, diff --git a/examples/speedyspeech/baker/synthesize.sh b/examples/speedyspeech/baker/synthesize.sh index 18f056d..2bca37c 100644 --- a/examples/speedyspeech/baker/synthesize.sh +++ b/examples/speedyspeech/baker/synthesize.sh @@ -1,6 +1,6 @@ python synthesize.py \ --speedyspeech-config=conf/default.yaml \ - --speedyspeech-checkpoint=exp/debug/checkpoints/snapshot_iter_91800.pdz \ + --speedyspeech-checkpoint=exp/default/checkpoints/snapshot_iter_91800.pdz \ --speedyspeech-stat=dump/train/stats.npy \ --pwg-config=../../parallelwave_gan/baker/conf/default.yaml \ --pwg-params=../../parallelwave_gan/baker/converted.pdparams \ diff --git a/examples/speedyspeech/baker/synthesize_e2e.py b/examples/speedyspeech/baker/synthesize_e2e.py index 8bf911b..f9ec33e 100644 --- a/examples/speedyspeech/baker/synthesize_e2e.py +++ b/examples/speedyspeech/baker/synthesize_e2e.py @@ -13,28 +13,22 @@ # limitations under the License. import os -import sys import logging import argparse -import dataclasses from pathlib import Path -import yaml -import jsonlines -import paddle import numpy as np import soundfile as sf import paddle +import yaml from paddle import jit from paddle.static import InputSpec -from paddle import nn -from paddle.nn import functional as F -from paddle import distributed as dist from yacs.config import CfgNode -from parakeet.datasets.data_table import DataTable -from parakeet.models.speedyspeech import SpeedySpeech, SpeedySpeechInference -from parakeet.models.parallel_wavegan import PWGGenerator, PWGInference +from parakeet.models.speedyspeech import SpeedySpeech +from parakeet.models.speedyspeech import SpeedySpeechInference +from parakeet.models.parallel_wavegan import PWGGenerator +from parakeet.models.parallel_wavegan import PWGInference from parakeet.modules.normalizer import ZScore from frontend import text_analysis @@ -57,8 +51,7 @@ def evaluate(args, speedyspeech_config, pwg_config): model.eval() vocoder = PWGGenerator(**pwg_config["generator_params"]) - vocoder.set_state_dict( - paddle.load(args.pwg_checkpoint)["generator_params"]) + vocoder.set_state_dict(paddle.load(args.pwg_checkpoint)["generator_params"]) vocoder.remove_weight_norm() vocoder.eval() print("model done!") @@ -81,9 +74,8 @@ def evaluate(args, speedyspeech_config, pwg_config): speedyspeech_inference = jit.to_static( speedyspeech_inference, input_spec=[ - InputSpec( - [-1], dtype=paddle.int64), InputSpec( - [-1], dtype=paddle.int64) + InputSpec([-1], dtype=paddle.int64), InputSpec( + [-1], dtype=paddle.int64) ]) paddle.jit.save(speedyspeech_inference, os.path.join(args.inference_dir, "speedyspeech")) @@ -93,9 +85,9 @@ def evaluate(args, speedyspeech_config, pwg_config): pwg_inference = PWGInference(pwg_normalizer, vocoder) pwg_inference.eval() pwg_inference = jit.to_static( - pwg_inference, - input_spec=[InputSpec( - [-1, 80], dtype=paddle.float32), ]) + pwg_inference, input_spec=[ + InputSpec([-1, 80], dtype=paddle.float32), + ]) paddle.jit.save(pwg_inference, os.path.join(args.inference_dir, "pwg")) pwg_inference = paddle.jit.load(os.path.join(args.inference_dir, "pwg")) @@ -119,9 +111,7 @@ def main(): parser = argparse.ArgumentParser( description="Synthesize with speedyspeech & parallel wavegan.") parser.add_argument( - "--speedyspeech-config", - type=str, - help="config file for speedyspeech.") + "--speedyspeech-config", type=str, help="config file for speedyspeech.") parser.add_argument( "--speedyspeech-checkpoint", type=str, diff --git a/examples/speedyspeech/baker/tg_utils.py b/examples/speedyspeech/baker/tg_utils.py index 18c0385..e6ea593 100644 --- a/examples/speedyspeech/baker/tg_utils.py +++ b/examples/speedyspeech/baker/tg_utils.py @@ -13,7 +13,6 @@ # limitations under the License. import librosa -from praatio import tgio def validate_textgrid(text_grid, num_samples, sr): diff --git a/examples/speedyspeech/baker/train.py b/examples/speedyspeech/baker/train.py index c4b5b20..6f063a6 100644 --- a/examples/speedyspeech/baker/train.py +++ b/examples/speedyspeech/baker/train.py @@ -72,7 +72,9 @@ def train_sp(args, config): fields=[ "phones", "tones", "num_phones", "num_frames", "feats", "durations" ], - converters={"feats": np.load, }, ) + converters={ + "feats": np.load, + }, ) with jsonlines.open(args.dev_metadata, 'r') as reader: dev_metadata = list(reader) metadata_dir = Path(args.dev_metadata).parent @@ -83,7 +85,9 @@ def train_sp(args, config): fields=[ "phones", "tones", "num_phones", "num_frames", "feats", "durations" ], - converters={"feats": np.load, }, ) + converters={ + "feats": np.load, + }, ) # collate function and dataloader train_sampler = DistributedBatchSampler( diff --git a/examples/tacotron2/ljspeech.py b/examples/tacotron2/ljspeech.py index a5054d4..76e4b3a 100644 --- a/examples/tacotron2/ljspeech.py +++ b/examples/tacotron2/ljspeech.py @@ -46,8 +46,7 @@ class LJSpeech(Dataset): class LJSpeechCollector(object): """A simple callable to batch LJSpeech examples.""" - def __init__(self, padding_idx=0, padding_value=0., - padding_stop_token=1.0): + def __init__(self, padding_idx=0, padding_value=0., padding_stop_token=1.0): self.padding_idx = padding_idx self.padding_value = padding_value self.padding_stop_token = padding_stop_token diff --git a/examples/tacotron2/preprocess.py b/examples/tacotron2/preprocess.py index f93aa46..aa7bf24 100644 --- a/examples/tacotron2/preprocess.py +++ b/examples/tacotron2/preprocess.py @@ -63,8 +63,7 @@ def create_dataset(config, source_path, target_path, verbose=False): with open(target_path / "metadata.pkl", 'wb') as f: pickle.dump(records, f) if verbose: - print("saved metadata into {}".format(target_path / - "metadata.pkl")) + print("saved metadata into {}".format(target_path / "metadata.pkl")) print("Done.") diff --git a/examples/tacotron2/train.py b/examples/tacotron2/train.py index 20fdd40..82dd4c3 100644 --- a/examples/tacotron2/train.py +++ b/examples/tacotron2/train.py @@ -14,14 +14,13 @@ import time from collections import defaultdict + import numpy as np - import paddle +from paddle.io import DataLoader +from paddle.io import DistributedBatchSampler from paddle import distributed as dist -from paddle.io import DataLoader, DistributedBatchSampler - from parakeet.data import dataset -from parakeet.frontend import EnglishCharacter # pylint: disable=unused-import from parakeet.training.cli import default_argument_parser from parakeet.training.experiment import ExperimentBase from parakeet.utils import display, mp_tools @@ -74,8 +73,7 @@ class Experiment(ExperimentBase): if dist.get_rank() == 0: for k, v in losses_np.items(): - self.visualizer.add_scalar(f"train_loss/{k}", v, - self.iteration) + self.visualizer.add_scalar(f"train_loss/{k}", v, self.iteration) @mp_tools.rank_zero_only @paddle.no_grad() diff --git a/examples/tacotron2_aishell3/aishell3.py b/examples/tacotron2_aishell3/aishell3.py index 66b4680..c53cf59 100644 --- a/examples/tacotron2_aishell3/aishell3.py +++ b/examples/tacotron2_aishell3/aishell3.py @@ -65,8 +65,8 @@ def collate_aishell3_examples(examples): text_lengths = np.array([item.shape[0] for item in phones], dtype=np.int64) spec_lengths = np.array([item.shape[1] for item in mel], dtype=np.int64) T_dec = np.max(spec_lengths) - stop_tokens = (np.arange(T_dec) >= np.expand_dims(spec_lengths, -1) - ).astype(np.float32) + stop_tokens = ( + np.arange(T_dec) >= np.expand_dims(spec_lengths, -1)).astype(np.float32) phones, _ = batch_text_id(phones) tones, _ = batch_text_id(tones) mel, _ = batch_spec(mel) diff --git a/examples/tacotron2_aishell3/preprocess_transcription.py b/examples/tacotron2_aishell3/preprocess_transcription.py index 2d4aa85..fa74331 100644 --- a/examples/tacotron2_aishell3/preprocess_transcription.py +++ b/examples/tacotron2_aishell3/preprocess_transcription.py @@ -121,8 +121,8 @@ def convert(syllable): syllable = syllable.replace("ing", "ieng").replace("in", "ien") # expansion for un, ui, iu - syllable = syllable.replace("un", "uen").replace( - "ui", "uei").replace("iu", "iou") + syllable = syllable.replace("un", "uen").replace("ui", + "uei").replace("iu", "iou") # rule for variants of i syllable = syllable.replace("zi", "zii").replace("ci", "cii").replace("si", "sii")\ diff --git a/examples/tacotron2_aishell3/process_wav.py b/examples/tacotron2_aishell3/process_wav.py index e3a1c73..34d4089 100644 --- a/examples/tacotron2_aishell3/process_wav.py +++ b/examples/tacotron2_aishell3/process_wav.py @@ -68,8 +68,7 @@ def preprocess_aishell3(source_dir, target_dir, alignment_dir): alignment_dir=alignment_dir) with Pool(16) as p: list( - tqdm( - p.imap(fx, wav_paths), total=len(wav_paths), unit="utterance")) + tqdm(p.imap(fx, wav_paths), total=len(wav_paths), unit="utterance")) if __name__ == "__main__": diff --git a/examples/tacotron2_aishell3/train.py b/examples/tacotron2_aishell3/train.py index 64b5166..de01811 100644 --- a/examples/tacotron2_aishell3/train.py +++ b/examples/tacotron2_aishell3/train.py @@ -109,8 +109,7 @@ class Experiment(ExperimentBase): mel_pred = outputs['mel_outputs_postnet'] self.visualizer.add_figure( f"valid_sentence_{i}_predicted_spectrogram", - display.plot_spectrogram(mel_pred[0].numpy().T), - self.iteration) + display.plot_spectrogram(mel_pred[0].numpy().T), self.iteration) # write visual log valid_losses = {k: np.mean(v) for k, v in valid_losses.items()} diff --git a/examples/text_frontend/get_textnorm_data.py b/examples/text_frontend/get_textnorm_data.py index 11c5c0f..8058e05 100644 --- a/examples/text_frontend/get_textnorm_data.py +++ b/examples/text_frontend/get_textnorm_data.py @@ -13,7 +13,6 @@ # limitations under the License. import argparse -import re from pathlib import Path diff --git a/examples/text_frontend/test_g2p.py b/examples/text_frontend/test_g2p.py index 90f7bf2..ba456e9 100644 --- a/examples/text_frontend/test_g2p.py +++ b/examples/text_frontend/test_g2p.py @@ -40,6 +40,7 @@ def get_avg_wer(raw_dict, ref_dict, frontend, output_dir): raw_text = raw_dict[utt_id] text = text_cleaner(raw_text) g2p_phones = frontend.get_phonemes(text) + g2p_phones = sum(g2p_phones, []) gt_phones = ref_dict[utt_id].split(" ") # delete silence tokens in predicted phones and ground truth phones g2p_phones = [phn for phn in g2p_phones if phn not in SILENCE_TOKENS] diff --git a/examples/transformer_tts/ljspeech.py b/examples/transformer_tts/ljspeech.py index f8fcc67..6397f3c 100644 --- a/examples/transformer_tts/ljspeech.py +++ b/examples/transformer_tts/ljspeech.py @@ -53,10 +53,10 @@ class Transform(object): ids, mel = example # ids already have and ids = np.array(ids, dtype=np.int64) # add start and end frame - mel = np.pad(mel, [(0, 0), (1, 1)], - mode='constant', - constant_values=[(0, 0), - (self.start_value, self.end_value)]) + mel = np.pad( + mel, [(0, 0), (1, 1)], + mode='constant', + constant_values=[(0, 0), (self.start_value, self.end_value)]) stop_labels = np.ones([mel.shape[1]], dtype=np.int64) stop_labels[-1] = 2 # actually this thing can also be done within the model diff --git a/examples/transformer_tts/preprocess.py b/examples/transformer_tts/preprocess.py index 23fbc7f..670227e 100644 --- a/examples/transformer_tts/preprocess.py +++ b/examples/transformer_tts/preprocess.py @@ -64,8 +64,7 @@ def create_dataset(config, source_path, target_path, verbose=False): with open(target_path / "metadata.pkl", 'wb') as f: pickle.dump(records, f) if verbose: - print("saved metadata into {}".format(target_path / - "metadata.pkl")) + print("saved metadata into {}".format(target_path / "metadata.pkl")) # also save meta data into text format for inspection with open(target_path / "metadata.txt", 'wt') as f: @@ -73,8 +72,7 @@ def create_dataset(config, source_path, target_path, verbose=False): phoneme_str = "|".join(phonemes) f.write("{}\t{}\t{}\n".format(mel_name, text, phoneme_str)) if verbose: - print("saved metadata into {}".format(target_path / - "metadata.txt")) + print("saved metadata into {}".format(target_path / "metadata.txt")) print("Done.") diff --git a/examples/transformer_tts/synthesize.py b/examples/transformer_tts/synthesize.py index 14bdfcb..6b49f3a 100644 --- a/examples/transformer_tts/synthesize.py +++ b/examples/transformer_tts/synthesize.py @@ -60,7 +60,7 @@ def main(config, args): display.plot_multilayer_multihead_alignments(attns) plt.savefig(str(output_dir / f"sentence_{i}.png")) - mel_output = mel_output.T #(C, T) + mel_output = mel_output.T # (C, T) np.save(str(output_dir / f"sentence_{i}"), mel_output) if args.verbose: print("spectrogram saved at {}".format(output_dir / diff --git a/examples/transformer_tts/train.py b/examples/transformer_tts/train.py index ff0c40f..e2da676 100644 --- a/examples/transformer_tts/train.py +++ b/examples/transformer_tts/train.py @@ -76,8 +76,7 @@ class TransformerTTSExperiment(ExperimentBase): ljspeech_dataset = LJSpeech(args.data) transform = Transform(config.data.mel_start_value, config.data.mel_end_value) - ljspeech_dataset = dataset.TransformDataset(ljspeech_dataset, - transform) + ljspeech_dataset = dataset.TransformDataset(ljspeech_dataset, transform) valid_set, train_set = dataset.split(ljspeech_dataset, config.data.valid_size) batch_fn = LJSpeechCollector(padding_idx=config.data.padding_idx) @@ -159,8 +158,7 @@ class TransformerTTSExperiment(ExperimentBase): if dist.get_rank() == 0: for k, v in losses_np.items(): - self.visualizer.add_scalar(f"train_loss/{k}", v, - self.iteration) + self.visualizer.add_scalar(f"train_loss/{k}", v, self.iteration) @mp_tools.rank_zero_only @paddle.no_grad() diff --git a/examples/use_mfa/local/generate_lexicon.py b/examples/use_mfa/local/generate_lexicon.py index 1791e7b..b6e594a 100644 --- a/examples/use_mfa/local/generate_lexicon.py +++ b/examples/use_mfa/local/generate_lexicon.py @@ -90,8 +90,8 @@ def rule(C, V, R, T): return None # ua, uai, uang 不能和 d, t, n, l, r, z, c, s 拼 - if V in ['ua', 'uai', 'uang' - ] and C in ['d', 't', 'n', 'l', 'r', 'z', 'c', 's']: + if V in ['ua', 'uai', + 'uang'] and C in ['d', 't', 'n', 'l', 'r', 'z', 'c', 's']: return None # sh 和 ong 不能拼 diff --git a/examples/waveflow/preprocess.py b/examples/waveflow/preprocess.py index 1d2ca25..199081c 100644 --- a/examples/waveflow/preprocess.py +++ b/examples/waveflow/preprocess.py @@ -28,8 +28,8 @@ from config import get_cfg_defaults class Transform(object): - def __init__(self, sample_rate, n_fft, win_length, hop_length, n_mels, - fmin, fmax): + def __init__(self, sample_rate, n_fft, win_length, hop_length, n_mels, fmin, + fmax): self.sample_rate = sample_rate self.n_fft = n_fft self.win_length = win_length @@ -79,11 +79,8 @@ class Transform(object): spectrogram_magnitude = np.abs(spectrogram) # Compute mel-spectrograms. - mel_filter_bank = librosa.filters.mel(sr=sr, - n_fft=n_fft, - n_mels=n_mels, - fmin=fmin, - fmax=fmax) + mel_filter_bank = librosa.filters.mel( + sr=sr, n_fft=n_fft, n_mels=n_mels, fmin=fmin, fmax=fmax) mel_spectrogram = np.dot(mel_filter_bank, spectrogram_magnitude) # log scale mel_spectrogram. diff --git a/examples/waveflow/synthesize.py b/examples/waveflow/synthesize.py index bb7c0ff..e25cec3 100644 --- a/examples/waveflow/synthesize.py +++ b/examples/waveflow/synthesize.py @@ -39,8 +39,7 @@ def main(config, args): mel = np.load(str(file_path)) with paddle.amp.auto_cast(): audio = model.predict(mel) - audio_path = output_dir / ( - os.path.splitext(file_path.name)[0] + ".wav") + audio_path = output_dir / (os.path.splitext(file_path.name)[0] + ".wav") sf.write(audio_path, audio, config.data.sample_rate) print("[synthesize] {} -> {}".format(file_path, audio_path)) diff --git a/examples/waveflow/train.py b/examples/waveflow/train.py index feddf20..359670f 100644 --- a/examples/waveflow/train.py +++ b/examples/waveflow/train.py @@ -114,8 +114,7 @@ class Experiment(ExperimentBase): msg += "loss: {:>.6f}".format(loss_value) self.logger.info(msg) if dist.get_rank() == 0: - self.visualizer.add_scalar("train/loss", loss_value, - self.iteration) + self.visualizer.add_scalar("train/loss", loss_value, self.iteration) @mp_tools.rank_zero_only @paddle.no_grad() diff --git a/parakeet/__init__.py b/parakeet/__init__.py index f08f907..67be25b 100644 --- a/parakeet/__init__.py +++ b/parakeet/__init__.py @@ -13,6 +13,3 @@ # limitations under the License. __version__ = "0.0.0" - -import logging -from parakeet import audio, data, datasets, frontend, models, modules, training, utils diff --git a/parakeet/audio/__init__.py b/parakeet/audio/__init__.py index 7fc437c..abf198b 100644 --- a/parakeet/audio/__init__.py +++ b/parakeet/audio/__init__.py @@ -11,6 +11,3 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - -from .audio import AudioProcessor -from .spec_normalizer import NormalizerBase, LogMagnitude \ No newline at end of file diff --git a/parakeet/audio/audio.py b/parakeet/audio/audio.py index c0d4c97..ab9a45d 100644 --- a/parakeet/audio/audio.py +++ b/parakeet/audio/audio.py @@ -11,10 +11,9 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - import librosa -import soundfile as sf import numpy as np +import soundfile as sf __all__ = ["AudioProcessor"] @@ -53,11 +52,12 @@ class AudioProcessor(object): self.inv_mel_filter = np.linalg.pinv(self.mel_filter) def _create_mel_filter(self): - mel_filter = librosa.filters.mel(self.sample_rate, - self.n_fft, - n_mels=self.n_mels, - fmin=self.fmin, - fmax=self.fmax) + mel_filter = librosa.filters.mel( + self.sample_rate, + self.n_fft, + n_mels=self.n_mels, + fmin=self.fmin, + fmax=self.fmax) return mel_filter def read_wav(self, filename): diff --git a/parakeet/data/__init__.py b/parakeet/data/__init__.py index 23476bc..2fed920 100644 --- a/parakeet/data/__init__.py +++ b/parakeet/data/__init__.py @@ -13,20 +13,3 @@ # limitations under the License. """Parakeet's infrastructure for data processing. """ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from parakeet.data.batch import * -from parakeet.data.dataset import * -from parakeet.data.get_feats import * diff --git a/parakeet/data/batch.py b/parakeet/data/batch.py index d5f5e91..515074d 100644 --- a/parakeet/data/batch.py +++ b/parakeet/data/batch.py @@ -61,9 +61,10 @@ def batch_text_id(minibatch, pad_id=0, dtype=np.int64): for example in minibatch: pad_len = max_len - example.shape[0] batch.append( - np.pad(example, [(0, pad_len)], - mode='constant', - constant_values=pad_id)) + np.pad( + example, [(0, pad_len)], + mode='constant', + constant_values=pad_id)) return np.array(batch, dtype=dtype), np.array(lengths, dtype=np.int64) @@ -103,9 +104,10 @@ def batch_wav(minibatch, pad_value=0., dtype=np.float32): for example in minibatch: pad_len = max_len - example.shape[-1] batch.append( - np.pad(example, [(0, pad_len)], - mode='constant', - constant_values=pad_value)) + np.pad( + example, [(0, pad_len)], + mode='constant', + constant_values=pad_value)) return np.array(batch, dtype=dtype), np.array(lengths, dtype=np.int64) @@ -152,14 +154,16 @@ def batch_spec(minibatch, pad_value=0., time_major=False, dtype=np.float32): pad_len = max_len - example.shape[time_idx] if time_major: batch.append( - np.pad(example, [(0, pad_len), (0, 0)], - mode='constant', - constant_values=pad_value)) + np.pad( + example, [(0, pad_len), (0, 0)], + mode='constant', + constant_values=pad_value)) else: batch.append( - np.pad(example, [(0, 0), (0, pad_len)], - mode='constant', - constant_values=pad_value)) + np.pad( + example, [(0, 0), (0, pad_len)], + mode='constant', + constant_values=pad_value)) return np.array(batch, dtype=dtype), np.array(lengths, dtype=np.int64) @@ -178,10 +182,8 @@ def batch_sequences(sequences, axis=0, pad_value=0): for seq, length in zip(sequences, seq_lengths): padding = [(0, 0)] * axis + [(0, max_length - length)] + [(0, 0)] * ( ndim - axis - 1) - padded_seq = np.pad(seq, - padding, - mode='constant', - constant_values=pad_value) + padded_seq = np.pad( + seq, padding, mode='constant', constant_values=pad_value) padded_sequences.append(padded_seq) batch = np.stack(padded_sequences) return batch diff --git a/parakeet/data/dataset.py b/parakeet/data/dataset.py index a188767..2d6c03c 100644 --- a/parakeet/data/dataset.py +++ b/parakeet/data/dataset.py @@ -11,9 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - import six -import paddle from paddle.io import Dataset __all__ = [ @@ -69,7 +67,7 @@ class CacheDataset(Dataset): return len(self._dataset) def __getitem__(self, i): - if not i in self._cache: + if i not in self._cache: self._cache[i] = self._dataset[i] return self._cache[i] @@ -86,9 +84,8 @@ class TupleDataset(Dataset): length = len(datasets[0]) for i, dataset in enumerate(datasets): if len(dataset) != length: - raise ValueError( - "all the datasets should have the same length." - "dataset {} has a different length".format(i)) + raise ValueError("all the datasets should have the same length." + "dataset {} has a different length".format(i)) self._datasets = datasets self._length = length @@ -115,7 +112,7 @@ class DictDataset(Dataset): A compound dataset made from several datasets of the same length. An example of the `DictDataset` is a dict of examples from the constituent datasets. - + WARNING: paddle does not have a good support for DictDataset, because every batch yield from a DataLoader is a list, but it cannot be a dict. So you have to provide a collate function because you cannot use the diff --git a/parakeet/data/get_feats.py b/parakeet/data/get_feats.py index 4027e9b..0acfd7f 100644 --- a/parakeet/data/get_feats.py +++ b/parakeet/data/get_feats.py @@ -11,7 +11,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - import librosa import numpy as np import pyworld @@ -46,11 +45,12 @@ class LogMelFBank(): self.mel_filter = self._create_mel_filter() def _create_mel_filter(self): - mel_filter = librosa.filters.mel(sr=self.sr, - n_fft=self.n_fft, - n_mels=self.n_mels, - fmin=self.fmin, - fmax=self.fmax) + mel_filter = librosa.filters.mel( + sr=self.sr, + n_fft=self.n_fft, + n_mels=self.n_mels, + fmin=self.fmin, + fmax=self.fmax) return mel_filter def _stft(self, wav): @@ -121,11 +121,12 @@ class Pitch(): use_log_f0=True) -> np.array: input = input.astype(np.float) frame_period = 1000 * self.hop_length / self.sr - f0, timeaxis = pyworld.dio(input, - fs=self.sr, - f0_floor=self.f0min, - f0_ceil=self.f0max, - frame_period=frame_period) + f0, timeaxis = pyworld.dio( + input, + fs=self.sr, + f0_floor=self.f0min, + f0_ceil=self.f0max, + frame_period=frame_period) f0 = pyworld.stonemask(input, f0, timeaxis, self.sr) if use_continuous_f0: f0 = self._convert_to_continuous_f0(f0) @@ -195,8 +196,7 @@ class Energy(): input_power = np.abs(input_stft)**2 energy = np.sqrt( np.clip( - np.sum(input_power, axis=0), a_min=1.0e-10, a_max=float( - 'inf'))) + np.sum(input_power, axis=0), a_min=1.0e-10, a_max=float('inf'))) return energy def _average_by_duration(self, input: np.array, d: np.array) -> np.array: diff --git a/parakeet/datasets/__init__.py b/parakeet/datasets/__init__.py index e75da0b..abf198b 100644 --- a/parakeet/datasets/__init__.py +++ b/parakeet/datasets/__init__.py @@ -11,6 +11,3 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - -from parakeet.datasets.common import * -from parakeet.datasets.ljspeech import * \ No newline at end of file diff --git a/parakeet/datasets/common.py b/parakeet/datasets/common.py index 61d0c93..d6fa3a8 100644 --- a/parakeet/datasets/common.py +++ b/parakeet/datasets/common.py @@ -11,14 +11,13 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - -from paddle.io import Dataset -import os -import librosa from pathlib import Path -import numpy as np from typing import List +import librosa +import numpy as np +from paddle.io import Dataset + __all__ = ["AudioSegmentDataset", "AudioDataset", "AudioFolderDataset"] @@ -57,7 +56,7 @@ class AudioSegmentDataset(Dataset): class AudioDataset(Dataset): - """A simple dataset adaptor for the audio files. + """A simple dataset adaptor for the audio files. Read -> trim silence -> normalize """ diff --git a/parakeet/datasets/data_table.py b/parakeet/datasets/data_table.py index 78a3608..b0e4c89 100644 --- a/parakeet/datasets/data_table.py +++ b/parakeet/datasets/data_table.py @@ -11,12 +11,12 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - -from typing import Union, Optional, Callable, Tuple, List, Dict, Any -from pathlib import Path from multiprocessing import Manager +from typing import Any +from typing import Callable +from typing import Dict +from typing import List -import numpy as np from paddle.io import Dataset diff --git a/parakeet/datasets/ljspeech.py b/parakeet/datasets/ljspeech.py index c34f52b..85cc3c1 100644 --- a/parakeet/datasets/ljspeech.py +++ b/parakeet/datasets/ljspeech.py @@ -11,9 +11,9 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +from pathlib import Path from paddle.io import Dataset -from pathlib import Path __all__ = ["LJSpeechMetaData"] diff --git a/parakeet/frontend/__init__.py b/parakeet/frontend/__init__.py index b7b5874..abf198b 100644 --- a/parakeet/frontend/__init__.py +++ b/parakeet/frontend/__init__.py @@ -11,11 +11,3 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - -from parakeet.frontend.vocab import * -from parakeet.frontend.phonectic import * -from parakeet.frontend.punctuation import * -from parakeet.frontend.normalizer import * -from parakeet.frontend.cn_normalization import * -from parakeet.frontend.tone_sandhi import * -from parakeet.frontend.generate_lexicon import * diff --git a/parakeet/frontend/arpabet.py b/parakeet/frontend/arpabet.py index e6f63b7..ae9212b 100644 --- a/parakeet/frontend/arpabet.py +++ b/parakeet/frontend/arpabet.py @@ -11,7 +11,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - from parakeet.frontend.phonectic import Phonetics """ A phonology system with ARPABET symbols and limited punctuations. The G2P @@ -200,8 +199,7 @@ class ARPABET(Phonetics): The list of pronunciation id sequence. """ return self.numericalize( - self.phoneticize( - sentence, add_start_end=add_start_end)) + self.phoneticize(sentence, add_start_end=add_start_end)) @property def vocab_size(self): @@ -217,9 +215,9 @@ class ARPABETWithStress(Phonetics): 'AO1', 'AO2', 'AW0', 'AW1', 'AW2', 'AY0', 'AY1', 'AY2', 'B', 'CH', 'D', 'DH', 'EH0', 'EH1', 'EH2', 'ER0', 'ER1', 'ER2', 'EY0', 'EY1', 'EY2', 'F', 'G', 'HH', 'IH0', 'IH1', 'IH2', 'IY0', 'IY1', 'IY2', 'JH', 'K', - 'L', 'M', 'N', 'NG', 'OW0', 'OW1', 'OW2', 'OY0', 'OY1', 'OY2', 'P', - 'R', 'S', 'SH', 'T', 'TH', 'UH0', 'UH1', 'UH2', 'UW0', 'UW1', 'UW2', - 'V', 'W', 'Y', 'Z', 'ZH' + 'L', 'M', 'N', 'NG', 'OW0', 'OW1', 'OW2', 'OY0', 'OY1', 'OY2', 'P', 'R', + 'S', 'SH', 'T', 'TH', 'UH0', 'UH1', 'UH2', 'UW0', 'UW1', 'UW2', 'V', + 'W', 'Y', 'Z', 'ZH' ] punctuations = [',', '.', '?', '!'] symbols = phonemes + punctuations @@ -294,8 +292,7 @@ class ARPABETWithStress(Phonetics): The list of pronunciation id sequence. """ return self.numericalize( - self.phoneticize( - sentence, add_start_end=add_start_end)) + self.phoneticize(sentence, add_start_end=add_start_end)) @property def vocab_size(self): diff --git a/parakeet/frontend/cn_frontend.py b/parakeet/frontend/cn_frontend.py index 62172f0..84903fc 100644 --- a/parakeet/frontend/cn_frontend.py +++ b/parakeet/frontend/cn_frontend.py @@ -11,17 +11,16 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +import re import jieba.posseg as psg -import numpy as np -import paddle -import re from g2pM import G2pM -from parakeet.frontend.tone_sandhi import ToneSandhi -from parakeet.frontend.cn_normalization.text_normlization import TextNormalizer -from pypinyin import lazy_pinyin, Style +from pypinyin import lazy_pinyin +from pypinyin import Style +from parakeet.frontend.cn_normalization.text_normlization import TextNormalizer from parakeet.frontend.generate_lexicon import generate_lexicon +from parakeet.frontend.tone_sandhi import ToneSandhi class Frontend(): diff --git a/parakeet/frontend/cn_normalization/__init__.py b/parakeet/frontend/cn_normalization/__init__.py index b1471d6..abf198b 100644 --- a/parakeet/frontend/cn_normalization/__init__.py +++ b/parakeet/frontend/cn_normalization/__init__.py @@ -11,5 +11,3 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - -from parakeet.frontend.cn_normalization.text_normlization import * \ No newline at end of file diff --git a/parakeet/frontend/cn_normalization/chronology.py b/parakeet/frontend/cn_normalization/chronology.py index 157d4ca..1d9520e 100644 --- a/parakeet/frontend/cn_normalization/chronology.py +++ b/parakeet/frontend/cn_normalization/chronology.py @@ -11,10 +11,12 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - import re -from .num import verbalize_cardinal, verbalize_digit, num2str, DIGITS +from .num import DIGITS +from .num import num2str +from .num import verbalize_cardinal +from .num import verbalize_digit def _time_num2str(num_string: str) -> str: diff --git a/parakeet/frontend/cn_normalization/constants.py b/parakeet/frontend/cn_normalization/constants.py index d1ae42b..5d2b0b3 100644 --- a/parakeet/frontend/cn_normalization/constants.py +++ b/parakeet/frontend/cn_normalization/constants.py @@ -11,9 +11,9 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - import re import string + from pypinyin.constants import SUPPORT_UCS4 # 全角半角转换 @@ -32,10 +32,7 @@ F2H_DIGITS = {chr(ord(char) + 65248): char for char in string.digits} H2F_DIGITS = {value: key for key, value in F2H_DIGITS.items()} # 标点符号全角 -> 半角映射表 (num: 32) -F2H_PUNCTUATIONS = { - chr(ord(char) + 65248): char - for char in string.punctuation -} +F2H_PUNCTUATIONS = {chr(ord(char) + 65248): char for char in string.punctuation} # 标点符号半角 -> 全角映射表 H2F_PUNCTUATIONS = {value: key for key, value in F2H_PUNCTUATIONS.items()} diff --git a/parakeet/frontend/cn_normalization/num.py b/parakeet/frontend/cn_normalization/num.py index ca6ee80..7cc36d7 100644 --- a/parakeet/frontend/cn_normalization/num.py +++ b/parakeet/frontend/cn_normalization/num.py @@ -15,7 +15,6 @@ Rules to verbalize numbers into Chinese characters. https://zh.wikipedia.org/wiki/中文数字#現代中文 """ - import re from collections import OrderedDict from typing import List diff --git a/parakeet/frontend/cn_normalization/phonecode.py b/parakeet/frontend/cn_normalization/phonecode.py index 354e463..437348c 100644 --- a/parakeet/frontend/cn_normalization/phonecode.py +++ b/parakeet/frontend/cn_normalization/phonecode.py @@ -11,7 +11,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - import re from .num import verbalize_digit @@ -32,14 +31,12 @@ def phone2str(phone_string: str, mobile=True) -> str: if mobile: sp_parts = phone_string.strip('+').split() result = ''.join( - [verbalize_digit( - part, alt_one=True) for part in sp_parts]) + [verbalize_digit(part, alt_one=True) for part in sp_parts]) return result else: sil_parts = phone_string.split('-') result = ''.join( - [verbalize_digit( - part, alt_one=True) for part in sil_parts]) + [verbalize_digit(part, alt_one=True) for part in sil_parts]) return result diff --git a/parakeet/frontend/cn_normalization/quantifier.py b/parakeet/frontend/cn_normalization/quantifier.py index 0a4bcaf..f40867f 100644 --- a/parakeet/frontend/cn_normalization/quantifier.py +++ b/parakeet/frontend/cn_normalization/quantifier.py @@ -11,7 +11,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - import re from .num import num2str diff --git a/parakeet/frontend/cn_normalization/text_normlization.py b/parakeet/frontend/cn_normalization/text_normlization.py index d55f00a..9bb7fc0 100644 --- a/parakeet/frontend/cn_normalization/text_normlization.py +++ b/parakeet/frontend/cn_normalization/text_normlization.py @@ -11,16 +11,37 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - import re from typing import List -from .chronology import RE_TIME, RE_DATE, RE_DATE2 -from .chronology import replace_time, replace_date, replace_date2 -from .constants import F2H_ASCII_LETTERS, F2H_DIGITS, F2H_SPACE -from .num import RE_NUMBER, RE_FRAC, RE_PERCENTAGE, RE_RANGE, RE_INTEGER, RE_DEFAULT_NUM, RE_DECIMAL_NUM, RE_POSITIVE_QUANTIFIERS -from .num import replace_number, replace_frac, replace_percentage, replace_range, replace_default_num, replace_negative_num, replace_positive_quantifier -from .phonecode import RE_MOBILE_PHONE, RE_TELEPHONE, replace_phone, replace_mobile +from .chronology import RE_DATE +from .chronology import RE_DATE2 +from .chronology import RE_TIME +from .chronology import replace_date +from .chronology import replace_date2 +from .chronology import replace_time +from .constants import F2H_ASCII_LETTERS +from .constants import F2H_DIGITS +from .constants import F2H_SPACE +from .num import RE_DECIMAL_NUM +from .num import RE_DEFAULT_NUM +from .num import RE_FRAC +from .num import RE_INTEGER +from .num import RE_NUMBER +from .num import RE_PERCENTAGE +from .num import RE_POSITIVE_QUANTIFIERS +from .num import RE_RANGE +from .num import replace_default_num +from .num import replace_frac +from .num import replace_negative_num +from .num import replace_number +from .num import replace_percentage +from .num import replace_positive_quantifier +from .num import replace_range +from .phonecode import RE_MOBILE_PHONE +from .phonecode import RE_TELEPHONE +from .phonecode import replace_mobile +from .phonecode import replace_phone from .quantifier import RE_TEMPERATURE from .quantifier import replace_temperature diff --git a/parakeet/frontend/generate_lexicon.py b/parakeet/frontend/generate_lexicon.py index eae2fde..155e159 100644 --- a/parakeet/frontend/generate_lexicon.py +++ b/parakeet/frontend/generate_lexicon.py @@ -18,8 +18,6 @@ than words are used in transcriptions produced by `reorganize_baker.py`. We make this choice to better leverage other software for chinese text to pinyin tools like pypinyin. This is the convention for G2P in Chinese. """ - -import argparse import re from collections import OrderedDict @@ -41,10 +39,10 @@ SPECIALS = ['sil', 'sp'] def rule(C, V, R, T): """Generate a syllable given the initial, the final, erhua indicator, and tone. Orthographical rules for pinyin are applied. (special case for y, w, ui, un, iu) - + Note that in this system, 'ü' is alway written as 'v' when appeared in phoneme, but converted to 'u' in syllables when certain conditions are satisfied. - + 'i' is distinguished when appeared in phonemes, and separated into 3 categories, 'i', 'ii' and 'iii'. Erhua is is possibly applied to every finals, except for finals that already ends with 'r'. When a syllable is impossible or does not have any characters with this pronunciation, return None @@ -86,8 +84,8 @@ def rule(C, V, R, T): return None # ua, uai, uang 不能和 d, t, n, l, r, z, c, s 拼 - if V in ['ua', 'uai', 'uang' - ] and C in ['d', 't', 'n', 'l', 'r', 'z', 'c', 's']: + if V in ['ua', 'uai', + 'uang'] and C in ['d', 't', 'n', 'l', 'r', 'z', 'c', 's']: return None # sh 和 ong 不能拼 diff --git a/parakeet/frontend/normalizer/__init__.py b/parakeet/frontend/normalizer/__init__.py index 37fd580..abf198b 100644 --- a/parakeet/frontend/normalizer/__init__.py +++ b/parakeet/frontend/normalizer/__init__.py @@ -11,6 +11,3 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - -from parakeet.frontend.normalizer.normalizer import * -from parakeet.frontend.normalizer.numbers import * diff --git a/parakeet/frontend/normalizer/normalizer.py b/parakeet/frontend/normalizer/normalizer.py index 6f8f5ce..795607e 100644 --- a/parakeet/frontend/normalizer/normalizer.py +++ b/parakeet/frontend/normalizer/normalizer.py @@ -11,10 +11,10 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - import re import unicodedata from builtins import str as unicode + from parakeet.frontend.normalizer.numbers import normalize_numbers diff --git a/parakeet/frontend/normalizer/numbers.py b/parakeet/frontend/normalizer/numbers.py index e693691..564fb9b 100644 --- a/parakeet/frontend/normalizer/numbers.py +++ b/parakeet/frontend/normalizer/numbers.py @@ -11,11 +11,11 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - # number expansion is not that easy -import inflect import re +import inflect + _inflect = inflect.engine() _comma_number_re = re.compile(r'([0-9][0-9\,]+[0-9])') _decimal_number_re = re.compile(r'([0-9]+\.[0-9]+)') diff --git a/parakeet/frontend/phonectic.py b/parakeet/frontend/phonectic.py index 1ac0912..81674bf 100644 --- a/parakeet/frontend/phonectic.py +++ b/parakeet/frontend/phonectic.py @@ -11,16 +11,18 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +from abc import ABC +from abc import abstractmethod -from abc import ABC, abstractmethod -from typing import Union from g2p_en import G2p from g2pM import G2pM + from parakeet.frontend import Vocab +from parakeet.frontend.normalizer.normalizer import normalize +from parakeet.frontend.punctuation import get_punctuations + # discard opencc untill we find an easy solution to install it on windows # from opencc import OpenCC -from parakeet.frontend.punctuation import get_punctuations -from parakeet.frontend.normalizer.normalizer import normalize __all__ = ["Phonetics", "English", "EnglishCharacter", "Chinese"] @@ -65,14 +67,14 @@ class English(Phonetics): start = self.vocab.start_symbol end = self.vocab.end_symbol phonemes = ([] if start is None else [start]) \ - + self.backend(sentence) \ - + ([] if end is None else [end]) + + self.backend(sentence) \ + + ([] if end is None else [end]) phonemes = [item for item in phonemes if item in self.vocab.stoi] return phonemes def numericalize(self, phonemes): """ Convert pronunciation sequence into pronunciation id sequence. - + Parameters ----------- phonemes: List[str] @@ -91,7 +93,7 @@ class English(Phonetics): def reverse(self, ids): """ Reverse the list of pronunciation id sequence to a list of pronunciation sequence. - + Parameters ----------- ids: List[int] @@ -183,7 +185,7 @@ class EnglishCharacter(Phonetics): ---------- str The input text sequence. - + """ return [self.vocab.reverse(i) for i in ids] @@ -244,8 +246,8 @@ class Chinese(Phonetics): start = self.vocab.start_symbol end = self.vocab.end_symbol phonemes = ([] if start is None else [start]) \ - + phonemes \ - + ([] if end is None else [end]) + + phonemes \ + + ([] if end is None else [end]) return self._filter_symbols(phonemes) def _filter_symbols(self, phonemes): @@ -261,7 +263,7 @@ class Chinese(Phonetics): def numericalize(self, phonemes): """ Convert pronunciation sequence into pronunciation id sequence. - + Parameters ----------- phonemes: List[str] @@ -298,7 +300,7 @@ class Chinese(Phonetics): def reverse(self, ids): """ Reverse the list of pronunciation id sequence to a list of pronunciation sequence. - + Parameters ----------- ids: List[int] diff --git a/parakeet/frontend/pinyin.py b/parakeet/frontend/pinyin.py index 958fd88..503bfd5 100644 --- a/parakeet/frontend/pinyin.py +++ b/parakeet/frontend/pinyin.py @@ -19,13 +19,15 @@ text -> pinyin to other part of a TTS system. Other NLP techniques may be used (e.g. tokenization, tagging, NER...) """ import re +from itertools import product + +from pypinyin.contrib.neutral_tone import NeutralToneWith5Mixin +from pypinyin.core import DefaultConverter +from pypinyin.core import Pinyin +from pypinyin.core import Style + from parakeet.frontend.phonectic import Phonetics from parakeet.frontend.vocab import Vocab -import pypinyin -from pypinyin.core import Pinyin, Style -from pypinyin.core import DefaultConverter -from pypinyin.contrib.neutral_tone import NeutralToneWith5Mixin -from itertools import product _punctuations = [',', '。', '?', '!'] _initials = [ @@ -33,10 +35,10 @@ _initials = [ 'ch', 'sh', 'r', 'z', 'c', 's' ] _finals = [ - 'ii', 'iii', 'a', 'o', 'e', 'ea', 'ai', 'ei', 'ao', 'ou', 'an', 'en', - 'ang', 'eng', 'er', 'i', 'ia', 'io', 'ie', 'iai', 'iao', 'iou', 'ian', - 'ien', 'iang', 'ieng', 'u', 'ua', 'uo', 'uai', 'uei', 'uan', 'uen', 'uang', - 'ueng', 'v', 've', 'van', 'ven', 'veng' + 'ii', 'iii', 'a', 'o', 'e', 'ea', 'ai', 'ei', 'ao', 'ou', 'an', 'en', 'ang', + 'eng', 'er', 'i', 'ia', 'io', 'ie', 'iai', 'iao', 'iou', 'ian', 'ien', + 'iang', 'ieng', 'u', 'ua', 'uo', 'uai', 'uei', 'uan', 'uen', 'uang', 'ueng', + 'v', 've', 'van', 'ven', 'veng' ] _ernized_symbol = ['&r'] _phones = _initials + _finals + _ernized_symbol + _punctuations @@ -76,12 +78,12 @@ class ParakeetPinyin(Phonetics): def phoneticize(self, sentence, add_start_end=False): """ Normalize the input text sequence and convert it into pronunciation sequence. - + Parameters ----------- sentence: str The input text sequence. - + Returns ---------- List[str] @@ -95,12 +97,12 @@ class ParakeetPinyin(Phonetics): def numericalize(self, phonemes, tones): """ Convert pronunciation sequence into pronunciation id sequence. - + Parameters ----------- phonemes: List[str] The list of pronunciation sequence. - + Returns ---------- List[int] @@ -112,12 +114,12 @@ class ParakeetPinyin(Phonetics): def __call__(self, sentence, add_start_end=False): """ Convert the input text sequence into pronunciation id sequence. - + Parameters ----------- sentence: str The input text sequence. - + Returns ---------- List[str] @@ -159,12 +161,12 @@ class ParakeetPinyinWithTone(Phonetics): def phoneticize(self, sentence, add_start_end=False): """ Normalize the input text sequence and convert it into pronunciation sequence. - + Parameters ----------- sentence: str The input text sequence. - + Returns ---------- List[str] @@ -178,12 +180,12 @@ class ParakeetPinyinWithTone(Phonetics): def numericalize(self, phonemes): """ Convert pronunciation sequence into pronunciation id sequence. - + Parameters ----------- phonemes: List[str] The list of pronunciation sequence. - + Returns ---------- List[int] @@ -194,12 +196,12 @@ class ParakeetPinyinWithTone(Phonetics): def __call__(self, sentence, add_start_end=False): """ Convert the input text sequence into pronunciation id sequence. - + Parameters ----------- sentence: str The input text sequence. - + Returns ---------- List[str] @@ -232,17 +234,17 @@ def _convert_to_parakeet_convension(syllable): syllable = syllable.replace("ing", "ieng").replace("in", "ien") # expansion for un, ui, iu - syllable = syllable.replace("un","uen")\ - .replace("ui", "uei")\ + syllable = syllable.replace("un", "uen") \ + .replace("ui", "uei") \ .replace("iu", "iou") # rule for variants of i - syllable = syllable.replace("zi", "zii")\ - .replace("ci", "cii")\ - .replace("si", "sii")\ - .replace("zhi", "zhiii")\ - .replace("chi", "chiii")\ - .replace("shi", "shiii")\ + syllable = syllable.replace("zi", "zii") \ + .replace("ci", "cii") \ + .replace("si", "sii") \ + .replace("zhi", "zhiii") \ + .replace("chi", "chiii") \ + .replace("shi", "shiii") \ .replace("ri", "riii") # rule for y preceding i, u @@ -252,8 +254,8 @@ def _convert_to_parakeet_convension(syllable): syllable = syllable.replace("wu", "u").replace("w", "u") # rule for v following j, q, x - syllable = syllable.replace("ju", "jv")\ - .replace("qu", "qv")\ + syllable = syllable.replace("ju", "jv") \ + .replace("qu", "qv") \ .replace("xu", "xv") return syllable + tone diff --git a/parakeet/frontend/punctuation.py b/parakeet/frontend/punctuation.py index 099e759..23636dc 100644 --- a/parakeet/frontend/punctuation.py +++ b/parakeet/frontend/punctuation.py @@ -12,9 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -import abc -import string - __all__ = ["get_punctuations"] EN_PUNCT = [ diff --git a/parakeet/frontend/tone_sandhi.py b/parakeet/frontend/tone_sandhi.py index 9dc3917..268a160 100644 --- a/parakeet/frontend/tone_sandhi.py +++ b/parakeet/frontend/tone_sandhi.py @@ -11,8 +11,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - -from typing import List, Tuple +from typing import List +from typing import Tuple import jieba from pypinyin import lazy_pinyin @@ -76,8 +76,7 @@ class ToneSandhi(): # reduplication words for n. and v. e.g. 奶奶, 试试, 旺旺 for j, item in enumerate(word): - if j - 1 >= 0 and item == word[j - 1] and pos[ - 0] in {"n", "v", "a"}: + if j - 1 >= 0 and item == word[j - 1] and pos[0] in {"n", "v", "a"}: finals[j] = finals[j][:-1] + "5" ge_idx = word.find("个") if len(word) >= 1 and word[-1] in "吧呢哈啊呐噻嘛吖嗨呐哦哒额滴哩哟喽啰耶喔诶": @@ -125,8 +124,8 @@ class ToneSandhi(): else: for i, char in enumerate(word): # "不" before tone4 should be bu2, e.g. 不怕 - if char == "不" and i + 1 < len(word) and finals[i + 1][ - -1] == "4": + if char == "不" and i + 1 < len(word) and finals[i + + 1][-1] == "4": finals[i] = finals[i][:-1] + "2" return finals @@ -266,12 +265,12 @@ class ToneSandhi(): assert len(sub_finals_list) == len(seg) merge_last = [False] * len(seg) for i, (word, pos) in enumerate(seg): - if i - 1 >= 0 and self._all_tone_three(sub_finals_list[ - i - 1]) and self._all_tone_three(sub_finals_list[ - i]) and not merge_last[i - 1]: + if i - 1 >= 0 and self._all_tone_three( + sub_finals_list[i - 1]) and self._all_tone_three( + sub_finals_list[i]) and not merge_last[i - 1]: # if the last word is reduplication, not merge, because reduplication need to be _neural_sandhi - if not self._is_reduplication(seg[i - 1][0]) and len(seg[ - i - 1][0]) + len(seg[i][0]) <= 3: + if not self._is_reduplication(seg[i - 1][0]) and len( + seg[i - 1][0]) + len(seg[i][0]) <= 3: new_seg[-1][0] = new_seg[-1][0] + seg[i][0] merge_last[i] = True else: @@ -299,8 +298,8 @@ class ToneSandhi(): if i - 1 >= 0 and sub_finals_list[i - 1][-1][-1] == "3" and sub_finals_list[i][0][-1] == "3" and not \ merge_last[i - 1]: # if the last word is reduplication, not merge, because reduplication need to be _neural_sandhi - if not self._is_reduplication(seg[i - 1][0]) and len(seg[ - i - 1][0]) + len(seg[i][0]) <= 3: + if not self._is_reduplication(seg[i - 1][0]) and len( + seg[i - 1][0]) + len(seg[i][0]) <= 3: new_seg[-1][0] = new_seg[-1][0] + seg[i][0] merge_last[i] = True else: diff --git a/parakeet/frontend/vocab.py b/parakeet/frontend/vocab.py index a56cfb8..9ef6b13 100644 --- a/parakeet/frontend/vocab.py +++ b/parakeet/frontend/vocab.py @@ -11,9 +11,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - -from typing import Dict, Iterable, List from collections import OrderedDict +from typing import Iterable __all__ = ["Vocab"] @@ -25,13 +24,13 @@ class Vocab(object): ----------- symbols: Iterable[str] Common symbols. - + padding_symbol: str, optional Symbol for pad. Defaults to "". unk_symbol: str, optional Symbol for unknow. Defaults to "" - + start_symbol: str, optional Symbol for start. Defaults to "" diff --git a/parakeet/models/__init__.py b/parakeet/models/__init__.py index 6cf65ec..abf198b 100644 --- a/parakeet/models/__init__.py +++ b/parakeet/models/__init__.py @@ -11,13 +11,3 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - -#from parakeet.models.clarinet import * -from parakeet.models.waveflow import * -#from parakeet.models.wavenet import * - -from parakeet.models.transformer_tts import * -#from parakeet.models.deepvoice3 import * -# from parakeet.models.fastspeech import * -from parakeet.models.tacotron2 import * -from parakeet.models.fastspeech2 import * diff --git a/parakeet/models/fastspeech2.py b/parakeet/models/fastspeech2.py index c351e92..daaba74 100644 --- a/parakeet/models/fastspeech2.py +++ b/parakeet/models/fastspeech2.py @@ -12,19 +12,24 @@ # See the License for the specific language governing permissions and # limitations under the License. """Fastspeech2 related modules for paddle""" - -from typing import Dict, Sequence, Tuple +from typing import Sequence +from typing import Tuple import paddle from paddle import nn -from parakeet.modules.fastspeech2_predictor.duration_predictor import DurationPredictor, DurationPredictorLoss +from typeguard import check_argument_types + +from parakeet.modules.fastspeech2_predictor.duration_predictor import DurationPredictor +from parakeet.modules.fastspeech2_predictor.duration_predictor import DurationPredictorLoss from parakeet.modules.fastspeech2_predictor.length_regulator import LengthRegulator from parakeet.modules.fastspeech2_predictor.postnet import Postnet from parakeet.modules.fastspeech2_predictor.variance_predictor import VariancePredictor -from parakeet.modules.fastspeech2_transformer.embedding import PositionalEncoding, ScaledPositionalEncoding +from parakeet.modules.fastspeech2_transformer.embedding import PositionalEncoding +from parakeet.modules.fastspeech2_transformer.embedding import ScaledPositionalEncoding from parakeet.modules.fastspeech2_transformer.encoder import Encoder as TransformerEncoder -from parakeet.modules.nets_utils import initialize, make_non_pad_mask, make_pad_mask -from typeguard import check_argument_types +from parakeet.modules.nets_utils import initialize +from parakeet.modules.nets_utils import make_non_pad_mask +from parakeet.modules.nets_utils import make_pad_mask class FastSpeech2(nn.Layer): @@ -293,9 +298,8 @@ class FastSpeech2(nn.Layer): xs, ilens, ys, olens, ds, ps, es, is_inference=False) # modify mod part of groundtruth if self.reduction_factor > 1: - olens = paddle.to_tensor([ - olen - olen % self.reduction_factor for olen in olens.numpy() - ]) + olens = paddle.to_tensor( + [olen - olen % self.reduction_factor for olen in olens.numpy()]) max_olen = max(olens) ys = ys[:, :max_olen] @@ -501,8 +505,7 @@ class FastSpeech2Inference(nn.Layer): class FastSpeech2Loss(nn.Layer): """Loss function module for FastSpeech2.""" - def __init__(self, - use_masking: bool=True, + def __init__(self, use_masking: bool=True, use_weighted_masking: bool=False): """Initialize feed-forward Transformer loss module. @@ -538,8 +541,8 @@ class FastSpeech2Loss(nn.Layer): ps: paddle.Tensor, es: paddle.Tensor, ilens: paddle.Tensor, - olens: paddle.Tensor, ) -> Tuple[paddle.Tensor, paddle.Tensor, - paddle.Tensor, paddle.Tensor]: + olens: paddle.Tensor, + ) -> Tuple[paddle.Tensor, paddle.Tensor, paddle.Tensor, paddle.Tensor]: """Calculate forward propagation. Parameters @@ -611,9 +614,9 @@ class FastSpeech2Loss(nn.Layer): # make weighted mask and apply it if self.use_weighted_masking: out_masks = make_non_pad_mask(olens).unsqueeze(-1) - out_weights = out_masks.cast( - dtype=paddle.float32) / out_masks.cast( - dtype=paddle.float32).sum(axis=1, keepdim=True) + out_weights = out_masks.cast(dtype=paddle.float32) / out_masks.cast( + dtype=paddle.float32).sum( + axis=1, keepdim=True) out_weights /= ys.shape[0] * ys.shape[2] duration_masks = make_non_pad_mask(ilens) duration_weights = (duration_masks.cast(dtype=paddle.float32) / diff --git a/parakeet/models/lstm_speaker_encoder.py b/parakeet/models/lstm_speaker_encoder.py index 529f27b..3372b21 100644 --- a/parakeet/models/lstm_speaker_encoder.py +++ b/parakeet/models/lstm_speaker_encoder.py @@ -11,17 +11,14 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - import numpy as np import paddle from paddle import nn -from paddle.fluid.param_attr import ParamAttr from paddle.nn import functional as F from paddle.nn import initializer as I - from scipy.interpolate import interp1d -from sklearn.metrics import roc_curve from scipy.optimize import brentq +from sklearn.metrics import roc_curve class LSTMSpeakerEncoder(nn.Layer): @@ -81,8 +78,7 @@ class LSTMSpeakerEncoder(nn.Layer): # print("p1: ", p1.shape) p2 = paddle.bmm( embeds.reshape([-1, 1, embed_dim]), - normalized_centroids_excl.reshape( - [-1, embed_dim, 1])) # (NM, 1, 1) + normalized_centroids_excl.reshape([-1, embed_dim, 1])) # (NM, 1, 1) p2 = p2.reshape([-1]) # (NM) # begin: alternative implementation for scatter @@ -94,9 +90,8 @@ class LSTMSpeakerEncoder(nn.Layer): index = index * speakers_per_batch + paddle.arange( 0, speakers_per_batch, dtype="int64").unsqueeze(-1) index = paddle.reshape(index, [-1]) - ones = paddle.ones([ - speakers_per_batch * utterances_per_speaker * speakers_per_batch - ]) + ones = paddle.ones( + [speakers_per_batch * utterances_per_speaker * speakers_per_batch]) zeros = paddle.zeros_like(index, dtype=ones.dtype) mask_p1 = paddle.scatter(ones, index, zeros) p = p1 * mask_p1 + (1 - mask_p1) * paddle.scatter(ones, index, p2) @@ -113,6 +108,9 @@ class LSTMSpeakerEncoder(nn.Layer): g = p._grad_ivar() g[...] = g * 0.01 + def inv_argmax(self, i, num): + return np.eye(1, num, i, dtype=np.int)[0] + def loss(self, embeds): """ Computes the softmax loss according the section 2.1 of GE2E. @@ -138,8 +136,8 @@ class LSTMSpeakerEncoder(nn.Layer): # EER (not backpropagated) with paddle.no_grad(): ground_truth = target.numpy() - inv_argmax = lambda i: np.eye(1, speakers_per_batch, i, dtype=np.int)[0] - labels = np.array([inv_argmax(i) for i in ground_truth]) + labels = np.array( + [self.inv_argmax(i, speakers_per_batch) for i in ground_truth]) preds = sim_matrix.numpy() # Snippet from https://yangcha.github.io/EER-ROC/ diff --git a/parakeet/models/parallel_wavegan.py b/parakeet/models/parallel_wavegan.py index cd4539f..bb21465 100644 --- a/parakeet/models/parallel_wavegan.py +++ b/parakeet/models/parallel_wavegan.py @@ -11,13 +11,14 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - import math -from typing import List, Dict, Any, Union, Optional, Tuple +from typing import Any +from typing import Dict +from typing import List +from typing import Optional import numpy as np import paddle -from paddle import Tensor from paddle import nn from paddle.nn import functional as F @@ -63,8 +64,8 @@ class Stretch2D(nn.Layer): class UpsampleNet(nn.Layer): - """A Layer to upsample spectrogram by applying consecutive stretch and - convolutions. + """A Layer to upsample spectrogram by applying consecutive stretch and + convolutions. Parameters ---------- @@ -81,10 +82,10 @@ class UpsampleNet(nn.Layer): use_causal_conv : bool, optional Whether to use causal padding before convolution, by default False - If True, Causal padding is used along the time axis, i.e. padding - amount is ``receptive field - 1`` and 0 for before and after, + If True, Causal padding is used along the time axis, i.e. padding + amount is ``receptive field - 1`` and 0 for before and after, respectively. - + If False, "same" padding is used along the time axis. """ @@ -158,7 +159,7 @@ class ConvInUpsampleNet(nn.Layer): aux_context_window : int, optional Context window of the first 1D convolution applied to the input. It related to the kernel size of the convolution, by default 0 - + If use causal convolution, the kernel size is ``window + 1``, else the kernel size is ``2 * window + 1``. use_causal_conv : bool, optional @@ -167,7 +168,7 @@ class ConvInUpsampleNet(nn.Layer): If True, Causal padding is used along the time axis, i.e. padding amount is ``receptive field - 1`` and 0 for before and after, respectively. - + If False, "same" padding is used along the time axis. """ @@ -276,10 +277,7 @@ class ResidualBlock(nn.Layer): gate_out_channels = gate_channels // 2 self.conv1x1_out = nn.Conv1D( - gate_out_channels, - residual_channels, - kernel_size=1, - bias_attr=bias) + gate_out_channels, residual_channels, kernel_size=1, bias_attr=bias) self.conv1x1_skip = nn.Conv1D( gate_out_channels, skip_channels, kernel_size=1, bias_attr=bias) @@ -428,13 +426,18 @@ class PWGGenerator(nn.Layer): use_causal_conv=use_causal_conv) self.conv_layers.append(conv) - self.last_conv_layers = nn.Sequential( - nn.ReLU(), - nn.Conv1D( - skip_channels, skip_channels, 1, bias_attr=True), - nn.ReLU(), - nn.Conv1D( - skip_channels, out_channels, 1, bias_attr=True)) + self.last_conv_layers = nn.Sequential(nn.ReLU(), + nn.Conv1D( + skip_channels, + skip_channels, + 1, + bias_attr=True), + nn.ReLU(), + nn.Conv1D( + skip_channels, + out_channels, + 1, + bias_attr=True)) if use_weight_norm: self.apply_weight_norm() @@ -548,18 +551,18 @@ class PWGDiscriminator(nn.Layer): by default True """ - def __init__(self, - in_channels: int=1, - out_channels: int=1, - kernel_size: int=3, - layers: int=10, - conv_channels: int=64, - dilation_factor: int=1, - nonlinear_activation: str="LeakyReLU", - nonlinear_activation_params: Dict[ - str, Any]={"negative_slope": 0.2}, - bias: bool=True, - use_weight_norm: bool=True): + def __init__( + self, + in_channels: int=1, + out_channels: int=1, + kernel_size: int=3, + layers: int=10, + conv_channels: int=64, + dilation_factor: int=1, + nonlinear_activation: str="LeakyReLU", + nonlinear_activation_params: Dict[str, Any]={"negative_slope": 0.2}, + bias: bool=True, + use_weight_norm: bool=True): super().__init__() assert kernel_size % 2 == 1 assert dilation_factor > 0 @@ -693,8 +696,7 @@ class ResidualPWGDiscriminator(nn.Layer): layers_per_stack = layers // stacks self.first_conv = nn.Sequential( - nn.Conv1D( - in_channels, residual_channels, 1, bias_attr=True), + nn.Conv1D(in_channels, residual_channels, 1, bias_attr=True), getattr(nn, nonlinear_activation)(**nonlinear_activation_params)) self.conv_layers = nn.LayerList() @@ -714,11 +716,9 @@ class ResidualPWGDiscriminator(nn.Layer): self.last_conv_layers = nn.Sequential( getattr(nn, nonlinear_activation)(**nonlinear_activation_params), - nn.Conv1D( - skip_channels, skip_channels, 1, bias_attr=True), + nn.Conv1D(skip_channels, skip_channels, 1, bias_attr=True), getattr(nn, nonlinear_activation)(**nonlinear_activation_params), - nn.Conv1D( - skip_channels, out_channels, 1, bias_attr=True)) + nn.Conv1D(skip_channels, out_channels, 1, bias_attr=True)) if use_weight_norm: self.apply_weight_norm() diff --git a/parakeet/models/speedyspeech.py b/parakeet/models/speedyspeech.py index bd7055b..e98e633 100644 --- a/parakeet/models/speedyspeech.py +++ b/parakeet/models/speedyspeech.py @@ -11,18 +11,11 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - -import math - -import numpy as np import paddle -from paddle import Tensor from paddle import nn -from paddle.nn import functional as F -from paddle.nn import initializer as I -from parakeet.modules.positional_encoding import sinusoid_position_encoding from parakeet.modules.expansion import expand +from parakeet.modules.positional_encoding import sinusoid_position_encoding class ResidualBlock(nn.Layer): @@ -38,8 +31,7 @@ class ResidualBlock(nn.Layer): padding="same", data_format="NLC"), nn.ReLU(), - nn.BatchNorm1D( - channels, data_format="NLC"), ) for _ in range(n) + nn.BatchNorm1D(channels, data_format="NLC"), ) for _ in range(n) ] self.blocks = nn.Sequential(*blocks) @@ -95,16 +87,14 @@ class SpeedySpeechEncoder(nn.Layer): nn.Linear(hidden_size, hidden_size), nn.ReLU(), ) res_blocks = [ - ResidualBlock( - hidden_size, kernel_size, d, n=2) for d in dilations + ResidualBlock(hidden_size, kernel_size, d, n=2) for d in dilations ] self.res_blocks = nn.Sequential(*res_blocks) self.postnet1 = nn.Sequential(nn.Linear(hidden_size, hidden_size)) self.postnet2 = nn.Sequential( nn.ReLU(), - nn.BatchNorm1D( - hidden_size, data_format="NLC"), + nn.BatchNorm1D(hidden_size, data_format="NLC"), nn.Linear(hidden_size, hidden_size), ) def forward(self, text, tones): @@ -120,13 +110,9 @@ class DurationPredictor(nn.Layer): def __init__(self, hidden_size): super().__init__() self.layers = nn.Sequential( - ResidualBlock( - hidden_size, 4, 1, n=1), - ResidualBlock( - hidden_size, 3, 1, n=1), - ResidualBlock( - hidden_size, 1, 1, n=1), - nn.Linear(hidden_size, 1)) + ResidualBlock(hidden_size, 4, 1, n=1), + ResidualBlock(hidden_size, 3, 1, n=1), + ResidualBlock(hidden_size, 1, 1, n=1), nn.Linear(hidden_size, 1)) def forward(self, x): return paddle.squeeze(self.layers(x), -1) @@ -136,15 +122,13 @@ class SpeedySpeechDecoder(nn.Layer): def __init__(self, hidden_size, output_size, kernel_size, dilations): super().__init__() res_blocks = [ - ResidualBlock( - hidden_size, kernel_size, d, n=2) for d in dilations + ResidualBlock(hidden_size, kernel_size, d, n=2) for d in dilations ] self.res_blocks = nn.Sequential(*res_blocks) self.postnet1 = nn.Sequential(nn.Linear(hidden_size, hidden_size)) self.postnet2 = nn.Sequential( - ResidualBlock( - hidden_size, kernel_size, 1, n=2), + ResidualBlock(hidden_size, kernel_size, 1, n=2), nn.Linear(hidden_size, output_size)) def forward(self, x): diff --git a/parakeet/models/tacotron2.py b/parakeet/models/tacotron2.py index 5b18aab..ab94faf 100644 --- a/parakeet/models/tacotron2.py +++ b/parakeet/models/tacotron2.py @@ -11,20 +11,19 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - import math import paddle from paddle import nn +from paddle.fluid.layers import sequence_mask from paddle.nn import functional as F from paddle.nn import initializer as I -from paddle.fluid.layers import sequence_mask +from tqdm import trange -from parakeet.modules.conv import Conv1dBatchNorm from parakeet.modules.attention import LocationSensitiveAttention +from parakeet.modules.conv import Conv1dBatchNorm from parakeet.modules.losses import guided_attention_loss from parakeet.utils import checkpoint -from tqdm import trange __all__ = ["Tacotron2", "Tacotron2Loss"] @@ -74,8 +73,7 @@ class DecoderPreNet(nn.Layer): """ - x = F.dropout( - F.relu(self.linear1(x)), self.dropout_rate, training=True) + x = F.dropout(F.relu(self.linear1(x)), self.dropout_rate, training=True) output = F.dropout( F.relu(self.linear2(x)), self.dropout_rate, training=True) return output @@ -745,10 +743,10 @@ class Tacotron2(nn.Layer): if global_condition is not None: global_condition = global_condition.unsqueeze(1) - global_condition = paddle.expand( - global_condition, [-1, encoder_outputs.shape[1], -1]) - encoder_outputs = paddle.concat( - [encoder_outputs, global_condition], -1) + global_condition = paddle.expand(global_condition, + [-1, encoder_outputs.shape[1], -1]) + encoder_outputs = paddle.concat([encoder_outputs, global_condition], + -1) # [B, T_enc, 1] mask = sequence_mask( @@ -813,10 +811,10 @@ class Tacotron2(nn.Layer): if global_condition is not None: global_condition = global_condition.unsqueeze(1) - global_condition = paddle.expand( - global_condition, [-1, encoder_outputs.shape[1], -1]) - encoder_outputs = paddle.concat( - [encoder_outputs, global_condition], -1) + global_condition = paddle.expand(global_condition, + [-1, encoder_outputs.shape[1], -1]) + encoder_outputs = paddle.concat([encoder_outputs, global_condition], + -1) if self.decoder.use_stop_token: mel_outputs, alignments, stop_logits = self.decoder.infer( encoder_outputs, max_decoder_steps=max_decoder_steps) diff --git a/parakeet/models/transformer_tts.py b/parakeet/models/transformer_tts.py index eed1fbe..4ec943a 100644 --- a/parakeet/models/transformer_tts.py +++ b/parakeet/models/transformer_tts.py @@ -11,22 +11,26 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - import math -from tqdm import trange + import paddle from paddle import nn from paddle.nn import functional as F from paddle.nn import initializer as I +from tqdm import trange import parakeet -from parakeet.modules.attention import _split_heads, _concat_heads, drop_head, scaled_dot_product_attention -from parakeet.modules.transformer import PositionwiseFFN -from parakeet.modules import masking -from parakeet.modules.conv import Conv1dBatchNorm -from parakeet.modules import positional_encoding as pe from parakeet.modules import losses as L -from parakeet.utils import checkpoint, scheduler +from parakeet.modules import masking +from parakeet.modules import positional_encoding as pe +from parakeet.modules.attention import _concat_heads +from parakeet.modules.attention import _split_heads +from parakeet.modules.attention import drop_head +from parakeet.modules.attention import scaled_dot_product_attention +from parakeet.modules.conv import Conv1dBatchNorm +from parakeet.modules.transformer import PositionwiseFFN +from parakeet.utils import checkpoint +from parakeet.utils import scheduler __all__ = ["TransformerTTS", "TransformerTTSLoss"] @@ -404,16 +408,14 @@ class TransformerTTS(nn.Layer): self.toned = False # position encoding matrix may be extended later self.encoder_pe = pe.sinusoid_position_encoding(1000, d_encoder) - self.encoder_pe_scalar = self.create_parameter( - [1], attr=I.Constant(1.)) + self.encoder_pe_scalar = self.create_parameter([1], attr=I.Constant(1.)) self.encoder = TransformerEncoder(d_encoder, n_heads, d_ffn, encoder_layers, dropout) # decoder self.decoder_prenet = MLPPreNet(d_mel, d_prenet, d_decoder, dropout) self.decoder_pe = pe.sinusoid_position_encoding(1000, d_decoder) - self.decoder_pe_scalar = self.create_parameter( - [1], attr=I.Constant(1.)) + self.decoder_pe_scalar = self.create_parameter([1], attr=I.Constant(1.)) self.decoder = TransformerDecoder( d_decoder, n_heads, @@ -470,14 +472,13 @@ class TransformerTTS(nn.Layer): self.encoder_pe = pe.sinusoid_position_encoding(new_T, self.d_encoder) pos_enc = self.encoder_pe[:T_enc, :] # (T, C) - x = embed.scale(math.sqrt( - self.d_encoder)) + pos_enc * self.encoder_pe_scalar + x = embed.scale( + math.sqrt(self.d_encoder)) + pos_enc * self.encoder_pe_scalar x = F.dropout(x, self.dropout, training=self.training) # TODO(chenfeiyu): unsqueeze a decoder_time_steps=1 for the mask encoder_padding_mask = paddle.unsqueeze( - masking.id_mask( - text, self.padding_idx, dtype=x.dtype), 1) + masking.id_mask(text, self.padding_idx, dtype=x.dtype), 1) x, attention_weights = self.encoder(x, encoder_padding_mask, self.drop_n_heads) return x, attention_weights, encoder_padding_mask @@ -492,8 +493,8 @@ class TransformerTTS(nn.Layer): self.decoder_pe = pe.sinusoid_position_encoding(new_T, self.d_decoder) pos_enc = self.decoder_pe[:T_dec * self.r:self.r, :] - x = x.scale(math.sqrt( - self.d_decoder)) + pos_enc * self.decoder_pe_scalar + x = x.scale( + math.sqrt(self.d_decoder)) + pos_enc * self.decoder_pe_scalar x = F.dropout(x, self.dropout, training=self.training) no_future_mask = masking.future_mask(T_dec, dtype=input.dtype) @@ -547,9 +548,8 @@ class TransformerTTS(nn.Layer): # stop condition: (if any ouput frame of the output multiframes hits the stop condition) # import pdb; pdb.set_trace() if paddle.any( - paddle.argmax( - stop_logits[0, -self.r:, :], axis=-1) == - self.stop_prob_index): + paddle.argmax(stop_logits[0, -self.r:, :], + axis=-1) == self.stop_prob_index): if verbose: print("Hits stop condition.") break @@ -602,8 +602,7 @@ class TransformerTTSLoss(nn.Layer): def forward(self, mel_output, mel_intermediate, mel_target, stop_logits, stop_probs): - mask = masking.feature_mask( - mel_target, axis=-1, dtype=mel_target.dtype) + mask = masking.feature_mask(mel_target, axis=-1, dtype=mel_target.dtype) mask1 = paddle.unsqueeze(mask, -1) mel_loss1 = L.masked_l1_loss(mel_output, mel_target, mask1) mel_loss2 = L.masked_l1_loss(mel_intermediate, mel_target, mask1) diff --git a/parakeet/models/waveflow.py b/parakeet/models/waveflow.py index e274cef..b6317bf 100644 --- a/parakeet/models/waveflow.py +++ b/parakeet/models/waveflow.py @@ -11,10 +11,11 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - -import time import math -from typing import List, Union, Tuple +import time +from typing import List +from typing import Tuple +from typing import Union import numpy as np import paddle @@ -22,8 +23,8 @@ from paddle import nn from paddle.nn import functional as F from paddle.nn import initializer as I -from parakeet.utils import checkpoint from parakeet.modules import geometry as geo +from parakeet.utils import checkpoint __all__ = ["WaveFlow", "ConditionalWaveFlow", "WaveFlowLoss"] @@ -120,7 +121,7 @@ class UpsampleNet(nn.LayerList): If trim_conv_artifact is ``True``, the output time steps is less than ``time_steps \* upsample_factors``. """ - x = paddle.unsqueeze(x, 1) #(B, C, T) -> (B, 1, C, T) + x = paddle.unsqueeze(x, 1) # (B, C, T) -> (B, 1, C, T) for layer in self: x = layer(x) if trim_conv_artifact: @@ -795,7 +796,7 @@ class ConditionalWaveFlow(nn.LayerList): The synthesized audio, where``T <= T_mel \* upsample_factors``. """ start = time.time() - condition = self.encoder(mel, trim_conv_artifact=True) #(B, C, T) + condition = self.encoder(mel, trim_conv_artifact=True) # (B, C, T) batch_size, _, time_steps = condition.shape z = paddle.randn([batch_size, time_steps], dtype=mel.dtype) x = self.decoder.inverse(z, condition) @@ -893,12 +894,12 @@ class WaveFlowLoss(nn.Layer): class ConditionalWaveFlow2Infer(ConditionalWaveFlow): def forward(self, mel): """Generate raw audio given mel spectrogram. - + Parameters ---------- mel : np.ndarray [shape=(C_mel, T_mel)] - Mel spectrogram of an utterance(in log-magnitude). - + Mel spectrogram of an utterance(in log-magnitude). + Returns ------- np.ndarray [shape=(T,)] diff --git a/parakeet/modules/__init__.py b/parakeet/modules/__init__.py index 327cb7b..abf198b 100644 --- a/parakeet/modules/__init__.py +++ b/parakeet/modules/__init__.py @@ -11,11 +11,3 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - -from parakeet.modules.attention import * -from parakeet.modules.conv import * -from parakeet.modules.geometry import * -from parakeet.modules.losses import * -from parakeet.modules.masking import * -from parakeet.modules.positional_encoding import * -from parakeet.modules.transformer import * diff --git a/parakeet/modules/attention.py b/parakeet/modules/attention.py index e91ea74..154625c 100644 --- a/parakeet/modules/attention.py +++ b/parakeet/modules/attention.py @@ -11,19 +11,15 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - import math + import numpy as np import paddle from paddle import nn from paddle.nn import functional as F -def scaled_dot_product_attention(q, - k, - v, - mask=None, - dropout=0.0, +def scaled_dot_product_attention(q, k, v, mask=None, dropout=0.0, training=True): r"""Scaled dot product attention with masking. @@ -33,24 +29,19 @@ def scaled_dot_product_attention(q, Parameters ----------- - q : Tensor [shape=(\*, T_q, d)] the query tensor. - k : Tensor [shape=(\*, T_k, d)] the key tensor. - v : Tensor [shape=(\*, T_k, d_v)] the value tensor. - mask : Tensor, [shape=(\*, T_q, T_k) or broadcastable shape], optional the mask tensor, zeros correspond to paddings. Defaults to None. - + Returns ---------- - out : Tensor [shape=(\*, T_q, d_v)] + out : Tensor [shape=(\*, T_q, d_v)] the context vector. - attn_weights : Tensor [shape=(\*, T_q, T_k)] the attention weights. """ @@ -74,10 +65,8 @@ def drop_head(x, drop_n_heads, training=True): ---------- x : Tensor [shape=(batch_size, num_heads, time_steps, channels)] The input, multiple context vectors. - drop_n_heads : int [0<= drop_n_heads <= num_heads] Number of vectors to drop. - training : bool A flag indicating whether it is in training. If `False`, no dropout is applied. @@ -127,17 +116,14 @@ class MonoheadAttention(nn.Layer): ---------- model_dim : int Feature size of the query. - dropout : float, optional - Dropout probability of scaled dot product attention and final context + Dropout probability of scaled dot product attention and final context vector. Defaults to 0.0. - k_dim : int, optional - Feature size of the key of each scaled dot product attention. If not + Feature size of the key of each scaled dot product attention. If not provided, it is set to `model_dim / num_heads`. Defaults to None. - v_dim : int, optional - Feature size of the key of each scaled dot product attention. If not + Feature size of the key of each scaled dot product attention. If not provided, it is set to `model_dim / num_heads`. Defaults to None. """ @@ -162,23 +148,19 @@ class MonoheadAttention(nn.Layer): Parameters ----------- - q : Tensor [shape=(batch_size, time_steps_q, model_dim)] + q : Tensor [shape=(batch_size, time_steps_q, model_dim)] The queries. - - k : Tensor [shape=(batch_size, time_steps_k, model_dim)] + k : Tensor [shape=(batch_size, time_steps_k, model_dim)] The keys. - - v : Tensor [shape=(batch_size, time_steps_k, model_dim)] + v : Tensor [shape=(batch_size, time_steps_k, model_dim)] The values. - mask : Tensor [shape=(batch_size, times_steps_q, time_steps_k] or broadcastable shape The mask. Returns ---------- - out : Tensor [shape=(batch_size, time_steps_q, model_dim)] + out : Tensor [shape=(batch_size, time_steps_q, model_dim)] The context vector. - attention_weights : Tensor [shape=(batch_size, times_steps_q, time_steps_k)] The attention weights. """ @@ -200,20 +182,16 @@ class MultiheadAttention(nn.Layer): ----------- model_dim: int The feature size of query. - num_heads : int The number of attention heads. - dropout : float, optional - Dropout probability of scaled dot product attention and final context + Dropout probability of scaled dot product attention and final context vector. Defaults to 0.0. - k_dim : int, optional - Feature size of the key of each scaled dot product attention. If not + Feature size of the key of each scaled dot product attention. If not provided, it is set to ``model_dim / num_heads``. Defaults to None. - v_dim : int, optional - Feature size of the key of each scaled dot product attention. If not + Feature size of the key of each scaled dot product attention. If not provided, it is set to ``model_dim / num_heads``. Defaults to None. Raises @@ -248,23 +226,19 @@ class MultiheadAttention(nn.Layer): Parameters ----------- - q : Tensor [shape=(batch_size, time_steps_q, model_dim)] + q : Tensor [shape=(batch_size, time_steps_q, model_dim)] The queries. - - k : Tensor [shape=(batch_size, time_steps_k, model_dim)] + k : Tensor [shape=(batch_size, time_steps_k, model_dim)] The keys. - - v : Tensor [shape=(batch_size, time_steps_k, model_dim)] + v : Tensor [shape=(batch_size, time_steps_k, model_dim)] The values. - mask : Tensor [shape=(batch_size, times_steps_q, time_steps_k] or broadcastable shape The mask. Returns ---------- - out : Tensor [shape=(batch_size, time_steps_q, model_dim)] + out : Tensor [shape=(batch_size, time_steps_q, model_dim)] The context vector. - attention_weights : Tensor [shape=(batch_size, times_steps_q, time_steps_k)] The attention weights. """ @@ -290,16 +264,12 @@ class LocationSensitiveAttention(nn.Layer): ----------- d_query: int The feature size of query. - d_key : int The feature size of key. - d_attention : int - The feature size of dimension. - + The feature size of dimension. location_filters : int Filter size of attention convolution. - location_kernel_size : int Kernel size of attention convolution. """ @@ -337,27 +307,22 @@ class LocationSensitiveAttention(nn.Layer): Parameters ----------- - query : Tensor [shape=(batch_size, d_query)] + query : Tensor [shape=(batch_size, d_query)] The queries. - - processed_key : Tensor [shape=(batch_size, time_steps_k, d_attention)] + processed_key : Tensor [shape=(batch_size, time_steps_k, d_attention)] The keys after linear layer. - - value : Tensor [shape=(batch_size, time_steps_k, d_key)] + value : Tensor [shape=(batch_size, time_steps_k, d_key)] The values. - attention_weights_cat : Tensor [shape=(batch_size, time_step_k, 2)] Attention weights concat. - mask : Tensor, optional The mask. Shape should be (batch_size, times_steps_k, 1). Defaults to None. Returns ---------- - attention_context : Tensor [shape=(batch_size, d_attention)] + attention_context : Tensor [shape=(batch_size, d_attention)] The context vector. - attention_weights : Tensor [shape=(batch_size, time_steps_k)] The attention weights. """ diff --git a/parakeet/modules/audio.py b/parakeet/modules/audio.py index c44aa66..926ce8f 100644 --- a/parakeet/modules/audio.py +++ b/parakeet/modules/audio.py @@ -11,20 +11,19 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - +import librosa +import numpy as np import paddle +from librosa.util import pad_center from paddle import nn from paddle.nn import functional as F from scipy import signal -import librosa -from librosa.util import pad_center -import numpy as np __all__ = ["quantize", "dequantize", "STFT", "MelScale"] def quantize(values, n_bands): - """Linearlly quantize a float Tensor in [-1, 1) to an interger Tensor in + """Linearlly quantize a float Tensor in [-1, 1) to an interger Tensor in [0, n_bands). Parameters @@ -33,7 +32,7 @@ def quantize(values, n_bands): The floating point value. n_bands : int - The number of bands. The output integer Tensor's value is in the range + The number of bands. The output integer Tensor's value is in the range [0, n_bans). Returns @@ -46,7 +45,7 @@ def quantize(values, n_bands): def dequantize(quantized, n_bands, dtype=None): - """Linearlly dequantize an integer Tensor into a float Tensor in the range + """Linearlly dequantize an integer Tensor into a float Tensor in the range [-1, 1). Parameters @@ -55,7 +54,7 @@ def dequantize(quantized, n_bands, dtype=None): The quantized value in the range [0, n_bands). n_bands : int - Number of bands. The input integer Tensor's value is in the range + Number of bands. The input integer Tensor's value is in the range [0, n_bans). dtype : str, optional @@ -73,43 +72,36 @@ def dequantize(quantized, n_bands, dtype=None): class STFT(nn.Layer): - """A module for computing stft transformation in a differentiable way. + """A module for computing stft transformation in a differentiable way. Parameters ------------ n_fft : int Number of samples in a frame. - hop_length : int Number of samples shifted between adjacent frames. - win_length : int Length of the window. - window : str, optional - Name of window function, see `scipy.signal.get_window` for more + Name of window function, see `scipy.signal.get_window` for more details. Defaults to "hanning". - center : bool If True, the signal y is padded so that frame D[:, t] is centered at y[t * hop_length]. If False, then D[:, t] begins at y[t * hop_length]. Defaults to True. - pad_mode : string or function - If center=True, this argument is passed to np.pad for padding the edges - of the signal y. By default (pad_mode="reflect"), y is padded on both - sides with its own reflection, mirrored around its first and last + If center=True, this argument is passed to np.pad for padding the edges + of the signal y. By default (pad_mode="reflect"), y is padded on both + sides with its own reflection, mirrored around its first and last sample respectively. If center=False, this argument is ignored. - - Notes ----------- - It behaves like ``librosa.core.stft``. See ``librosa.core.stft`` for more + It behaves like ``librosa.core.stft``. See ``librosa.core.stft`` for more details. - Given a audio which ``T`` samples, it the STFT transformation outputs a - spectrum with (C, frames) and complex dtype, where ``C = 1 + n_fft / 2`` + Given a audio which ``T`` samples, it the STFT transformation outputs a + spectrum with (C, frames) and complex dtype, where ``C = 1 + n_fft / 2`` and ``frames = 1 + T // hop_lenghth``. Ony ``center`` and ``reflect`` padding is supported now. @@ -144,19 +136,19 @@ class STFT(nn.Layer): # pad window to n_fft size if n_fft != win_length: window = pad_center(window, n_fft, mode="constant") - #lpad = (n_fft - win_length) // 2 - #rpad = n_fft - win_length - lpad - #window = np.pad(window, ((lpad, pad), ), 'constant') + # lpad = (n_fft - win_length) // 2 + # rpad = n_fft - win_length - lpad + # window = np.pad(window, ((lpad, pad), ), 'constant') # calculate weights - #r = np.arange(0, n_fft) - #M = np.expand_dims(r, -1) * np.expand_dims(r, 0) - #w_real = np.reshape(window * - #np.cos(2 * np.pi * M / n_fft)[:self.n_bin], - #(self.n_bin, 1, self.n_fft)) - #w_imag = np.reshape(window * - #np.sin(-2 * np.pi * M / n_fft)[:self.n_bin], - #(self.n_bin, 1, self.n_fft)) + # r = np.arange(0, n_fft) + # M = np.expand_dims(r, -1) * np.expand_dims(r, 0) + # w_real = np.reshape(window * + # np.cos(2 * np.pi * M / n_fft)[:self.n_bin], + # (self.n_bin, 1, self.n_fft)) + # w_imag = np.reshape(window * + # np.sin(-2 * np.pi * M / n_fft)[:self.n_bin], + # (self.n_bin, 1, self.n_fft)) weight = np.fft.fft(np.eye(n_fft))[:self.n_bin] w_real = weight.real w_imag = weight.imag @@ -174,17 +166,18 @@ class STFT(nn.Layer): The input waveform. Returns ------------ - real : Tensor [shape=(B, C, frames)] + real : Tensor [shape=(B, C, frames)] The real part of the spectrogram. - imag : Tensor [shape=(B, C, frames)] + imag : Tensor [shape=(B, C, frames)] The image part of the spectrogram. """ x = paddle.unsqueeze(x, axis=1) if self.center: - x = F.pad(x, [self.n_fft // 2, self.n_fft // 2], - data_format='NCL', - mode=self.pad_mode) + x = F.pad( + x, [self.n_fft // 2, self.n_fft // 2], + data_format='NCL', + mode=self.pad_mode) # to BCT, C=1 out = F.conv1d(x, self.weight, stride=self.hop_length) @@ -199,7 +192,7 @@ class STFT(nn.Layer): The input waveform. Returns ------------ - Tensor [shape=(B, C, T)] + Tensor [shape=(B, C, T)] The power spectrum. """ real, imag = self.forward(x) @@ -214,7 +207,7 @@ class STFT(nn.Layer): The input waveform. Returns ------------ - Tensor [shape=(B, C, T)] + Tensor [shape=(B, C, T)] The magnitude of the spectrum. """ power = self.power(x) diff --git a/parakeet/modules/conv.py b/parakeet/modules/conv.py index d984605..d9bd98d 100644 --- a/parakeet/modules/conv.py +++ b/parakeet/modules/conv.py @@ -11,7 +11,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - import paddle from paddle import nn @@ -22,48 +21,40 @@ __all__ = [ class Conv1dCell(nn.Conv1D): - """A subclass of Conv1D layer, which can be used in an autoregressive + """A subclass of Conv1D layer, which can be used in an autoregressive decoder like an RNN cell. - When used in autoregressive decoding, it performs causal temporal - convolution incrementally. At each time step, it takes a step input and - returns a step output. + When used in autoregressive decoding, it performs causal temporal + convolution incrementally. At each time step, it takes a step input and + returns a step output. Notes ------ - It is done by caching an internal buffer of length ``receptive_file - 1``. - when adding a step input, the buffer is shited by one step, the latest - input is added to be buffer and the oldest step is discarded. And it - returns a step output. For single step case, convolution is equivalent to a + It is done by caching an internal buffer of length ``receptive_file - 1``. + when adding a step input, the buffer is shited by one step, the latest + input is added to be buffer and the oldest step is discarded. And it + returns a step output. For single step case, convolution is equivalent to a linear transformation. - That it can be used as a cell depends on several restrictions: - 1. stride must be 1; 2. padding must be a causal padding (recpetive_field - 1, 0). - - Thus, these arguments are removed from the ``__init__`` method of this + Thus, these arguments are removed from the ``__init__`` method of this class. Parameters ---------- in_channels: int The feature size of the input. - out_channels: int The feature size of the output. - kernel_size: int or Tuple[int] The size of the kernel. - dilation: int or Tuple[int] The dilation of the convolution, by default 1 - weight_attr: ParamAttr, Initializer, str or bool, optional The parameter attribute of the convolution kernel, by default None. - bias_attr: ParamAttr, Initializer, str or bool, optional - The parameter attribute of the bias. If ``False``, this layer does not + The parameter attribute of the bias. If ``False``, this layer does not have a bias, by default None. Examples @@ -114,7 +105,7 @@ class Conv1dCell(nn.Conv1D): Warnings --------- - This method should be called before a sequence of calls to + This method should be called before a sequence of calls to ``add_input``. Raises @@ -165,12 +156,12 @@ class Conv1dCell(nn.Conv1D): Parameters ----------- - x_t : Tensor [shape=(batch_size, in_channels)] + x_t : Tensor [shape=(batch_size, in_channels)] The step input. Returns ------- - y_t :Tensor [shape=(batch_size, out_channels)] + y_t :Tensor [shape=(batch_size, out_channels)] The step output. """ batch_size = x_t.shape[0] @@ -199,36 +190,27 @@ class Conv1dBatchNorm(nn.Layer): ---------- in_channels : int The feature size of the input. - out_channels : int The feature size of the output. - kernel_size : int The size of the convolution kernel. - stride : int, optional The stride of the convolution, by default 1. - padding : int, str or Tuple[int], optional - The padding of the convolution. + The padding of the convolution. If int, a symmetrical padding is applied before convolution; If str, it should be "same" or "valid"; - If Tuple[int], its length should be 2, meaning + If Tuple[int], its length should be 2, meaning ``(pad_before, pad_after)``, by default 0. - weight_attr : ParamAttr, Initializer, str or bool, optional The parameter attribute of the convolution kernel, by default None. - bias_attr : ParamAttr, Initializer, str or bool, optional - The parameter attribute of the bias of the convolution, by default + The parameter attribute of the bias of the convolution, by default None. - data_format : str ["NCL" or "NLC"], optional The data layout of the input, by default "NCL" - momentum : float, optional The momentum of the BatchNorm1D layer, by default 0.9 - epsilon : [type], optional The epsilon of the BatchNorm1D layer, by default 1e-05 """ diff --git a/parakeet/modules/expansion.py b/parakeet/modules/expansion.py index d136ada..e9d4b6f 100644 --- a/parakeet/modules/expansion.py +++ b/parakeet/modules/expansion.py @@ -11,9 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - import numpy as np - import paddle from paddle import Tensor diff --git a/parakeet/modules/fastspeech2_predictor/duration_predictor.py b/parakeet/modules/fastspeech2_predictor/duration_predictor.py index 10e3f38..d0f5262 100644 --- a/parakeet/modules/fastspeech2_predictor/duration_predictor.py +++ b/parakeet/modules/fastspeech2_predictor/duration_predictor.py @@ -12,9 +12,9 @@ # See the License for the specific language governing permissions and # limitations under the License. """Duration predictor related modules.""" - import paddle from paddle import nn + from parakeet.modules.layer_norm import LayerNorm from parakeet.modules.masked_fill import masked_fill @@ -78,8 +78,7 @@ class DurationPredictor(nn.Layer): stride=1, padding=(kernel_size - 1) // 2, ), nn.ReLU(), - LayerNorm( - n_chans, dim=1), + LayerNorm(n_chans, dim=1), nn.Dropout(dropout_rate), )) self.linear = nn.Linear(n_chans, 1, bias_attr=True) diff --git a/parakeet/modules/fastspeech2_predictor/length_regulator.py b/parakeet/modules/fastspeech2_predictor/length_regulator.py index 0e6233c..86f9ebb 100644 --- a/parakeet/modules/fastspeech2_predictor/length_regulator.py +++ b/parakeet/modules/fastspeech2_predictor/length_regulator.py @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. """Length regulator related modules.""" - import numpy as np import paddle from paddle import nn diff --git a/parakeet/modules/fastspeech2_predictor/postnet.py b/parakeet/modules/fastspeech2_predictor/postnet.py index 50b849e..885ecda 100644 --- a/parakeet/modules/fastspeech2_predictor/postnet.py +++ b/parakeet/modules/fastspeech2_predictor/postnet.py @@ -11,9 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - import six -import paddle from paddle import nn diff --git a/parakeet/modules/fastspeech2_predictor/variance_predictor.py b/parakeet/modules/fastspeech2_predictor/variance_predictor.py index 92136a2..0a980dd 100644 --- a/parakeet/modules/fastspeech2_predictor/variance_predictor.py +++ b/parakeet/modules/fastspeech2_predictor/variance_predictor.py @@ -12,12 +12,12 @@ # See the License for the specific language governing permissions and # limitations under the License. """Variance predictor related modules.""" - import paddle from paddle import nn +from typeguard import check_argument_types + from parakeet.modules.layer_norm import LayerNorm from parakeet.modules.masked_fill import masked_fill -from typeguard import check_argument_types class VariancePredictor(nn.Layer): @@ -69,8 +69,7 @@ class VariancePredictor(nn.Layer): padding=(kernel_size - 1) // 2, bias_attr=True, ), nn.ReLU(), - LayerNorm( - n_chans, dim=1), + LayerNorm(n_chans, dim=1), nn.Dropout(dropout_rate), )) self.linear = nn.Linear(n_chans, 1, bias_attr=True) diff --git a/parakeet/modules/fastspeech2_transformer/attention.py b/parakeet/modules/fastspeech2_transformer/attention.py index 9cb6001..ae941a7 100644 --- a/parakeet/modules/fastspeech2_transformer/attention.py +++ b/parakeet/modules/fastspeech2_transformer/attention.py @@ -12,12 +12,12 @@ # See the License for the specific language governing permissions and # limitations under the License. """Multi-Head Attention layer definition.""" - import math import numpy import paddle from paddle import nn + from parakeet.modules.masked_fill import masked_fill @@ -111,8 +111,7 @@ class MultiHeadedAttention(nn.Layer): mask = paddle.logical_not(mask) min_value = float( numpy.finfo( - paddle.to_tensor( - 0, dtype=scores.dtype).numpy().dtype).min) + paddle.to_tensor(0, dtype=scores.dtype).numpy().dtype).min) scores = masked_fill(scores, mask, min_value) # (batch, head, time1, time2) diff --git a/parakeet/modules/fastspeech2_transformer/embedding.py b/parakeet/modules/fastspeech2_transformer/embedding.py index 9767193..71160a6 100644 --- a/parakeet/modules/fastspeech2_transformer/embedding.py +++ b/parakeet/modules/fastspeech2_transformer/embedding.py @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. """Positional Encoding Module.""" - import math import paddle @@ -55,9 +54,8 @@ class PositionalEncoding(nn.Layer): position = paddle.arange( 0, x.shape[1], dtype=paddle.float32).unsqueeze(1) div_term = paddle.exp( - paddle.arange( - 0, self.d_model, 2, - dtype=paddle.float32) * -(math.log(10000.0) / self.d_model)) + paddle.arange(0, self.d_model, 2, dtype=paddle.float32) * + -(math.log(10000.0) / self.d_model)) pe[:, 0::2] = paddle.sin(position * div_term) pe[:, 1::2] = paddle.cos(position * div_term) pe = pe.unsqueeze(0) diff --git a/parakeet/modules/fastspeech2_transformer/encoder.py b/parakeet/modules/fastspeech2_transformer/encoder.py index 84a6142..630b50f 100644 --- a/parakeet/modules/fastspeech2_transformer/encoder.py +++ b/parakeet/modules/fastspeech2_transformer/encoder.py @@ -11,16 +11,16 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - import logging from paddle import nn -from parakeet.modules.fastspeech2_transformer.embedding import PositionalEncoding + from parakeet.modules.fastspeech2_transformer.attention import MultiHeadedAttention +from parakeet.modules.fastspeech2_transformer.embedding import PositionalEncoding +from parakeet.modules.fastspeech2_transformer.encoder_layer import EncoderLayer from parakeet.modules.fastspeech2_transformer.multi_layer_conv import Conv1dLinear from parakeet.modules.fastspeech2_transformer.multi_layer_conv import MultiLayeredConv1d from parakeet.modules.fastspeech2_transformer.positionwise_feed_forward import PositionwiseFeedForward -from parakeet.modules.fastspeech2_transformer.encoder_layer import EncoderLayer from parakeet.modules.fastspeech2_transformer.repeat import repeat @@ -90,16 +90,14 @@ class Encoder(nn.Layer): self.conv_subsampling_factor = 1 if input_layer == "linear": self.embed = nn.Sequential( - nn.Linear( - idim, attention_dim, bias_attr=True), + nn.Linear(idim, attention_dim, bias_attr=True), nn.LayerNorm(attention_dim), nn.Dropout(dropout_rate), nn.ReLU(), pos_enc_class(attention_dim, positional_dropout_rate), ) elif input_layer == "embed": self.embed = nn.Sequential( - nn.Embedding( - idim, attention_dim, padding_idx=padding_idx), + nn.Embedding(idim, attention_dim, padding_idx=padding_idx), pos_enc_class(attention_dim, positional_dropout_rate), ) elif isinstance(input_layer, nn.Layer): self.embed = nn.Sequential( @@ -125,10 +123,9 @@ class Encoder(nn.Layer): ]: logging.info("encoder self-attention layer type = self-attention") encoder_selfattn_layer = MultiHeadedAttention - encoder_selfattn_layer_args = [( - attention_heads, - attention_dim, - attention_dropout_rate, )] * num_blocks + encoder_selfattn_layer_args = [ + (attention_heads, attention_dim, attention_dropout_rate, ) + ] * num_blocks else: raise NotImplementedError(selfattention_layer_type) @@ -159,18 +156,14 @@ class Encoder(nn.Layer): dropout_rate) elif positionwise_layer_type == "conv1d": positionwise_layer = MultiLayeredConv1d - positionwise_layer_args = ( - attention_dim, - linear_units, - positionwise_conv_kernel_size, - dropout_rate, ) + positionwise_layer_args = (attention_dim, linear_units, + positionwise_conv_kernel_size, + dropout_rate, ) elif positionwise_layer_type == "conv1d-linear": positionwise_layer = Conv1dLinear - positionwise_layer_args = ( - attention_dim, - linear_units, - positionwise_conv_kernel_size, - dropout_rate, ) + positionwise_layer_args = (attention_dim, linear_units, + positionwise_conv_kernel_size, + dropout_rate, ) else: raise NotImplementedError("Support only linear or conv1d.") return positionwise_layer, positionwise_layer_args @@ -229,4 +222,4 @@ class Encoder(nn.Layer): new_cache.append(xs) if self.normalize_before: xs = self.after_norm(xs) - return xs, masks, new_cache \ No newline at end of file + return xs, masks, new_cache diff --git a/parakeet/modules/fastspeech2_transformer/encoder_layer.py b/parakeet/modules/fastspeech2_transformer/encoder_layer.py index 00d551e..d8f89d6 100644 --- a/parakeet/modules/fastspeech2_transformer/encoder_layer.py +++ b/parakeet/modules/fastspeech2_transformer/encoder_layer.py @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. """Encoder self-attention layer definition.""" - import paddle from paddle import nn diff --git a/parakeet/modules/fastspeech2_transformer/multi_layer_conv.py b/parakeet/modules/fastspeech2_transformer/multi_layer_conv.py index 273d8d0..8845b2a 100644 --- a/parakeet/modules/fastspeech2_transformer/multi_layer_conv.py +++ b/parakeet/modules/fastspeech2_transformer/multi_layer_conv.py @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. """Layer modules for FFT block in FastSpeech (Feed-forward Transformer).""" - import paddle diff --git a/parakeet/modules/fastspeech2_transformer/positionwise_feed_forward.py b/parakeet/modules/fastspeech2_transformer/positionwise_feed_forward.py index c57fba6..39c06eb 100644 --- a/parakeet/modules/fastspeech2_transformer/positionwise_feed_forward.py +++ b/parakeet/modules/fastspeech2_transformer/positionwise_feed_forward.py @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. """Positionwise feed forward layer definition.""" - import paddle diff --git a/parakeet/modules/fastspeech2_transformer/repeat.py b/parakeet/modules/fastspeech2_transformer/repeat.py index 250a3a4..3c62298 100644 --- a/parakeet/modules/fastspeech2_transformer/repeat.py +++ b/parakeet/modules/fastspeech2_transformer/repeat.py @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. """Repeat the same layer definition.""" - import paddle diff --git a/parakeet/modules/geometry.py b/parakeet/modules/geometry.py index 05a5931..a3d56f7 100644 --- a/parakeet/modules/geometry.py +++ b/parakeet/modules/geometry.py @@ -11,7 +11,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - import numpy as np import paddle @@ -23,15 +22,13 @@ def shuffle_dim(x, axis, perm=None): ---------- x : Tensor The input tensor. - axis : int The axis to shuffle. - perm : List[int], ndarray, optional The order to reorder the tensor along the ``axis``-th dimension. - It is a permutation of ``[0, d)``, where d is the size of the - ``axis``-th dimension of the input tensor. If not provided, + It is a permutation of ``[0, d)``, where d is the size of the + ``axis``-th dimension of the input tensor. If not provided, a random permutation is used. Defaults to None. Returns diff --git a/parakeet/modules/layer_norm.py b/parakeet/modules/layer_norm.py index 2ff91b8..3bab823 100644 --- a/parakeet/modules/layer_norm.py +++ b/parakeet/modules/layer_norm.py @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. """Layer normalization module.""" - import paddle @@ -55,8 +54,9 @@ class LayerNorm(paddle.nn.LayerNorm): orig_perm = list(range(len_dim)) new_perm = orig_perm[:] - new_perm[self.dim], new_perm[len_dim - 1] = new_perm[ - len_dim - 1], new_perm[self.dim] + new_perm[self.dim], new_perm[len_dim - + 1] = new_perm[len_dim - + 1], new_perm[self.dim] return paddle.transpose( super(LayerNorm, self).forward(paddle.transpose(x, new_perm)), diff --git a/parakeet/modules/losses.py b/parakeet/modules/losses.py index 9c34cc7..ece9e04 100644 --- a/parakeet/modules/losses.py +++ b/parakeet/modules/losses.py @@ -11,13 +11,9 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - -import numba -import numpy as np import paddle -from paddle import nn -from paddle.nn import functional as F from paddle.fluid.layers import sequence_mask +from paddle.nn import functional as F __all__ = [ "guided_attention_loss", @@ -30,7 +26,7 @@ __all__ = [ def attention_guide(dec_lens, enc_lens, N, T, g, dtype=None): """Build that W matrix. shape(B, T_dec, T_enc) W[i, n, t] = 1 - exp(-(n/dec_lens[i] - t/enc_lens[i])**2 / (2g**2)) - + See also: Tachibana, Hideyuki, Katsuya Uenoyama, and Shunsuke Aihara. 2017. “Efficiently Trainable Text-to-Speech System Based on Deep Convolutional Networks with Guided Attention.” ArXiv:1710.08969 [Cs, Eess], October. http://arxiv.org/abs/1710.08969. """ @@ -88,12 +84,10 @@ def masked_l1_loss(prediction, target, mask): ---------- prediction : Tensor The prediction. - target : Tensor The target. The shape should be broadcastable to ``prediction``. - mask : Tensor - The mask. The shape should be broadcatable to the broadcasted shape of + The mask. The shape should be broadcatable to the broadcasted shape of ``prediction`` and ``target``. Returns @@ -113,13 +107,10 @@ def masked_softmax_with_cross_entropy(logits, label, mask, axis=-1): ---------- logits : Tensor The logits. The ``axis``-th axis is the class dimension. - label : Tensor [dtype: int] The label. The size of the ``axis``-th axis should be 1. - mask : Tensor The mask. The shape should be broadcastable to ``label``. - axis : int, optional The index of the class dimension in the shape of ``logits``, by default -1. diff --git a/parakeet/modules/masked_fill.py b/parakeet/modules/masked_fill.py index 4ca9826..34230f1 100644 --- a/parakeet/modules/masked_fill.py +++ b/parakeet/modules/masked_fill.py @@ -11,9 +11,9 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +from typing import Union import paddle -from typing import Union def is_broadcastable(shp1, shp2): @@ -34,4 +34,4 @@ def masked_fill(xs: paddle.Tensor, trues = paddle.ones_like(xs) * value mask = mask.cast(dtype=paddle.bool) xs = paddle.where(mask, trues, xs) - return xs \ No newline at end of file + return xs diff --git a/parakeet/modules/masking.py b/parakeet/modules/masking.py index 96871a9..7cf3704 100644 --- a/parakeet/modules/masking.py +++ b/parakeet/modules/masking.py @@ -11,9 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - import paddle -from paddle.fluid.layers import sequence_mask __all__ = [ "id_mask", @@ -25,7 +23,7 @@ __all__ = [ def id_mask(input, padding_index=0, dtype="bool"): """Generate mask with input ids. - + Those positions where the value equals ``padding_index`` correspond to 0 or ``False``, otherwise, 1 or ``True``. @@ -33,10 +31,8 @@ def id_mask(input, padding_index=0, dtype="bool"): ---------- input : Tensor [dtype: int] The input tensor. It represents the ids. - padding_index : int, optional The id which represents padding, by default 0. - dtype : str, optional Data type of the returned mask, by default "bool". @@ -50,7 +46,7 @@ def id_mask(input, padding_index=0, dtype="bool"): def feature_mask(input, axis, dtype="bool"): """Compute mask from input features. - + For a input features, represented as batched feature vectors, those vectors which all zeros are considerd padding vectors. @@ -58,19 +54,16 @@ def feature_mask(input, axis, dtype="bool"): ---------- input : Tensor [dtype: float] The input tensor which represents featues. - axis : int The index of the feature dimension in ``input``. Other dimensions are considered ``spatial`` dimensions. - dtype : str, optional Data type of the generated mask, by default "bool" - Returns ------- Tensor The geenrated mask with ``spatial`` shape as mentioned above. - + It has one less dimension than ``input`` does. """ feature_sum = paddle.sum(paddle.abs(input), axis) @@ -83,22 +76,20 @@ def combine_mask(mask1, mask2): Parameters ----------- mask1 : Tensor - The first mask. - + The first mask. mask2 : Tensor The second mask with broadcastable shape with ``mask1``. - Returns -------- Tensor Combined mask. - + Notes ------ - It is mainly used to combine the padding mask and no future mask for + It is mainly used to combine the padding mask and no future mask for transformer decoder. - - Padding mask is used to mask padding positions of the decoder inputs and + + Padding mask is used to mask padding positions of the decoder inputs and no future mask is used to prevent the decoder to see future information. """ if mask1.dtype == paddle.fluid.core.VarDesc.VarType.BOOL: @@ -109,8 +100,8 @@ def combine_mask(mask1, mask2): def future_mask(time_steps, dtype="bool"): """Generate lower triangular mask. - - It is used at transformer decoder to prevent the decoder to see future + + It is used at transformer decoder to prevent the decoder to see future information. Parameters diff --git a/parakeet/modules/nets_utils.py b/parakeet/modules/nets_utils.py index 5997873..47eae65 100644 --- a/parakeet/modules/nets_utils.py +++ b/parakeet/modules/nets_utils.py @@ -11,7 +11,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - import paddle from paddle import nn from typeguard import check_argument_types diff --git a/parakeet/modules/normalizer.py b/parakeet/modules/normalizer.py index 176741b..a4fc598 100644 --- a/parakeet/modules/normalizer.py +++ b/parakeet/modules/normalizer.py @@ -11,7 +11,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - import paddle from paddle import nn diff --git a/parakeet/modules/positional_encoding.py b/parakeet/modules/positional_encoding.py index 919af10..7c368c3 100644 --- a/parakeet/modules/positional_encoding.py +++ b/parakeet/modules/positional_encoding.py @@ -11,13 +11,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - -import math -import numpy as np - import paddle from paddle import Tensor -from paddle.nn import functional as F __all__ = ["sinusoid_position_encoding", "scaled_position_encoding"] diff --git a/parakeet/modules/ssim.py b/parakeet/modules/ssim.py index 3e4b20d..c9899cd 100644 --- a/parakeet/modules/ssim.py +++ b/parakeet/modules/ssim.py @@ -11,13 +11,11 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - from math import exp -import numpy as np import paddle -from paddle import nn import paddle.nn.functional as F +from paddle import nn def gaussian(window_size, sigma): @@ -30,9 +28,8 @@ def gaussian(window_size, sigma): def create_window(window_size, channel): _1D_window = gaussian(window_size, 1.5).unsqueeze(1) - _2D_window = paddle.matmul(_1D_window, - paddle.transpose(_1D_window, - [1, 0])).unsqueeze([0, 1]) + _2D_window = paddle.matmul(_1D_window, paddle.transpose( + _1D_window, [1, 0])).unsqueeze([0, 1]) window = paddle.expand(_2D_window, [channel, 1, window_size, window_size]) return window @@ -50,8 +47,7 @@ def _ssim(img1, img2, window, window_size, channel, size_average=True): sigma2_sq = F.conv2d( img2 * img2, window, padding=window_size // 2, groups=channel) - mu2_sq sigma12 = F.conv2d( - img1 * img2, window, padding=window_size // 2, - groups=channel) - mu1_mu2 + img1 * img2, window, padding=window_size // 2, groups=channel) - mu1_mu2 C1 = 0.01**2 C2 = 0.03**2 @@ -81,4 +77,4 @@ class SSIM(nn.Layer): def ssim(img1, img2, window_size=11, size_average=True): (_, channel, _, _) = img1.shape window = create_window(window_size, channel) - return _ssim(img1, img2, window, window_size, channel, size_average) \ No newline at end of file + return _ssim(img1, img2, window, window_size, channel, size_average) diff --git a/parakeet/modules/stft_loss.py b/parakeet/modules/stft_loss.py index 7c3779c..16382d6 100644 --- a/parakeet/modules/stft_loss.py +++ b/parakeet/modules/stft_loss.py @@ -11,7 +11,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - import paddle from paddle import nn from paddle.nn import functional as F @@ -28,16 +27,20 @@ class SpectralConvergenceLoss(nn.Layer): def forward(self, x_mag, y_mag): """Calculate forward propagation. - Args: - x_mag (Tensor): Magnitude spectrogram of predicted signal (B, #frames, #freq_bins). - y_mag (Tensor): Magnitude spectrogram of groundtruth signal (B, #frames, #freq_bins). - Returns: - Tensor: Spectral convergence loss value. + Parameters + ---------- + x_mag : Tensor + Magnitude spectrogram of predicted signal (B, #frames, #freq_bins). + y_mag : Tensor) + Magnitude spectrogram of groundtruth signal (B, #frames, #freq_bins). + Returns + ---------- + Tensor + Spectral convergence loss value. """ return paddle.norm( y_mag - x_mag, p="fro") / paddle.clip( - paddle.norm( - y_mag, p="fro"), min=1e-10) + paddle.norm(y_mag, p="fro"), min=1e-10) class LogSTFTMagnitudeLoss(nn.Layer): @@ -62,10 +65,8 @@ class LogSTFTMagnitudeLoss(nn.Layer): Log STFT magnitude loss value. """ return F.l1_loss( - paddle.log(paddle.clip( - y_mag, min=self.epsilon)), - paddle.log(paddle.clip( - x_mag, min=self.epsilon))) + paddle.log(paddle.clip(y_mag, min=self.epsilon)), + paddle.log(paddle.clip(x_mag, min=self.epsilon))) class STFTLoss(nn.Layer): diff --git a/parakeet/modules/transformer.py b/parakeet/modules/transformer.py index e857990..696b12b 100644 --- a/parakeet/modules/transformer.py +++ b/parakeet/modules/transformer.py @@ -11,14 +11,9 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - -import math -import paddle from paddle import nn -from paddle.nn import functional as F - from parakeet.modules import attention as attn -from parakeet.modules.masking import combine_mask +from paddle.nn import functional as F __all__ = [ "PositionwiseFFN", @@ -31,18 +26,16 @@ class PositionwiseFFN(nn.Layer): """A faithful implementation of Position-wise Feed-Forward Network in `Attention is All You Need `_. It is basically a 2-layer MLP, with relu actication and dropout in between. - + Parameters ---------- input_size: int - The feature size of the intput. It is also the feature size of the + The feature size of the intput. It is also the feature size of the output. - hidden_size: int The hidden size. - dropout: float - The probability of the Dropout applied to the output of the first + The probability of the Dropout applied to the output of the first layer, by default 0. """ @@ -74,30 +67,27 @@ class PositionwiseFFN(nn.Layer): class TransformerEncoderLayer(nn.Layer): - """A faithful implementation of Transformer encoder layer in + """A faithful implementation of Transformer encoder layer in `Attention is All You Need `_. - + Parameters ---------- d_model :int - The feature size of the input. It is also the feature size of the + The feature size of the input. It is also the feature size of the output. - n_heads : int - The number of heads of self attention (a ``MultiheadAttention`` + The number of heads of self attention (a ``MultiheadAttention`` layer). - d_ffn : int - The hidden size of the positional feed forward network (a + The hidden size of the positional feed forward network (a ``PositionwiseFFN`` layer). - dropout : float, optional - The probability of the dropout in MultiHeadAttention and + The probability of the dropout in MultiHeadAttention and PositionwiseFFN, by default 0. - + Notes ------ - It uses the PostLN (post layer norm) scheme. + It uses the PostLN (post layer norm) scheme. """ def __init__(self, d_model, n_heads, d_ffn, dropout=0.): @@ -112,60 +102,54 @@ class TransformerEncoderLayer(nn.Layer): def forward(self, x, mask): """Forward pass of TransformerEncoderLayer. - + Parameters ---------- x : Tensor [shape=(batch_size, time_steps, d_model)] The input. - mask : Tensor - The padding mask. The shape is (batch_size, time_steps, + The padding mask. The shape is (batch_size, time_steps, time_steps) or broadcastable shape. - + Returns ------- x :Tensor [shape=(batch_size, time_steps, d_model)] The encoded output. - + attn_weights : Tensor [shape=(batch_size, n_heads, time_steps, time_steps)] The attention weights of the self attention. """ context_vector, attn_weights = self.self_mha(x, x, x, mask) x = self.layer_norm1( - F.dropout( - x + context_vector, self.dropout, training=self.training)) + F.dropout(x + context_vector, self.dropout, training=self.training)) x = self.layer_norm2( - F.dropout( - x + self.ffn(x), self.dropout, training=self.training)) + F.dropout(x + self.ffn(x), self.dropout, training=self.training)) return x, attn_weights class TransformerDecoderLayer(nn.Layer): """A faithful implementation of Transformer decoder layer in `Attention is All You Need `_. - + Parameters ---------- d_model :int - The feature size of the input. It is also the feature size of the + The feature size of the input. It is also the feature size of the output. - n_heads : int - The number of heads of attentions (``MultiheadAttention`` + The number of heads of attentions (``MultiheadAttention`` layers). - d_ffn : int - The hidden size of the positional feed forward network (a + The hidden size of the positional feed forward network (a ``PositionwiseFFN`` layer). - dropout : float, optional - The probability of the dropout in MultiHeadAttention and + The probability of the dropout in MultiHeadAttention and PositionwiseFFN, by default 0. - + Notes ------ - It uses the PostLN (post layer norm) scheme. + It uses the PostLN (post layer norm) scheme. """ def __init__(self, d_model, n_heads, d_ffn, dropout=0.): @@ -183,46 +167,41 @@ class TransformerDecoderLayer(nn.Layer): def forward(self, q, k, v, encoder_mask, decoder_mask): """Forward pass of TransformerEncoderLayer. - + Parameters ---------- - q : Tensor [shape=(batch_size, time_steps_q, d_model)] + q : Tensor [shape=(batch_size, time_steps_q, d_model)] The decoder input. - k : Tensor [shape=(batch_size, time_steps_k, d_model)] + k : Tensor [shape=(batch_size, time_steps_k, d_model)] The keys. v : Tensor [shape=(batch_size, time_steps_k, d_model)] The values encoder_mask : Tensor - Encoder padding mask, shape is ``(batch_size, time_steps_k, + Encoder padding mask, shape is ``(batch_size, time_steps_k, time_steps_k)`` or broadcastable shape. decoder_mask : Tensor Decoder mask, shape is ``(batch_size, time_steps_q, time_steps_k)`` or broadcastable shape. - + Returns -------- q : Tensor [shape=(batch_size, time_steps_q, d_model)] The decoder output. - self_attn_weights : Tensor [shape=(batch_size, n_heads, time_steps_q, time_steps_q)] Decoder self attention. - - cross_attn_weights : Tensor [shape=(batch_size, n_heads, time_steps_q, time_steps_k)] + + cross_attn_weights : Tensor [shape=(batch_size, n_heads, time_steps_q, time_steps_k)] Decoder-encoder cross attention. """ - context_vector, self_attn_weights = self.self_mha(q, q, q, - decoder_mask) + context_vector, self_attn_weights = self.self_mha(q, q, q, decoder_mask) q = self.layer_norm1( - F.dropout( - q + context_vector, self.dropout, training=self.training)) + F.dropout(q + context_vector, self.dropout, training=self.training)) context_vector, cross_attn_weights = self.cross_mha(q, k, v, encoder_mask) q = self.layer_norm2( - F.dropout( - q + context_vector, self.dropout, training=self.training)) + F.dropout(q + context_vector, self.dropout, training=self.training)) q = self.layer_norm3( - F.dropout( - q + self.ffn(q), self.dropout, training=self.training)) + F.dropout(q + self.ffn(q), self.dropout, training=self.training)) return q, self_attn_weights, cross_attn_weights diff --git a/parakeet/training/__init__.py b/parakeet/training/__init__.py index aec401c..abf198b 100644 --- a/parakeet/training/__init__.py +++ b/parakeet/training/__init__.py @@ -11,6 +11,3 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - -from parakeet.training.cli import * -from parakeet.training.experiment import * diff --git a/parakeet/training/cli.py b/parakeet/training/cli.py index a3cfbda..a630994 100644 --- a/parakeet/training/cli.py +++ b/parakeet/training/cli.py @@ -11,7 +11,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - import argparse @@ -33,7 +32,6 @@ def default_argument_parser(): The ``--device`` and ``--nprocs`` specifies how to run the training. - See Also -------- parakeet.training.experiment diff --git a/parakeet/training/default_config.py b/parakeet/training/default_config.py index 583f6e6..7deb795 100644 --- a/parakeet/training/default_config.py +++ b/parakeet/training/default_config.py @@ -11,7 +11,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - from yacs.config import CfgNode _C = CfgNode( diff --git a/parakeet/training/experiment.py b/parakeet/training/experiment.py index 5daaf08..892e810 100644 --- a/parakeet/training/experiment.py +++ b/parakeet/training/experiment.py @@ -11,9 +11,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - -import sys import logging +import sys from pathlib import Path import paddle @@ -21,7 +20,8 @@ from paddle import distributed as dist from paddle.io import DistributedBatchSampler from visualdl import LogWriter -from parakeet.utils import checkpoint, mp_tools +from parakeet.utils import checkpoint +from parakeet.utils import mp_tools __all__ = ["ExperimentBase"] diff --git a/parakeet/training/extension.py b/parakeet/training/extension.py index 57c4f29..07e9269 100644 --- a/parakeet/training/extension.py +++ b/parakeet/training/extension.py @@ -11,7 +11,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - from typing import Callable PRIORITY_WRITER = 300 diff --git a/parakeet/training/extensions/evaluator.py b/parakeet/training/extensions/evaluator.py index 6ebaae6..47b3527 100644 --- a/parakeet/training/extensions/evaluator.py +++ b/parakeet/training/extensions/evaluator.py @@ -11,18 +11,16 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +from typing import Dict -from collections import defaultdict -from typing import Optional, Callable, Dict - -from tqdm import tqdm import paddle -from paddle import Tensor -from paddle.nn import Layer from paddle.io import DataLoader +from paddle.nn import Layer -from parakeet.training.reporter import scope, report, DictSummary from parakeet.training import extension +from parakeet.training.reporter import DictSummary +from parakeet.training.reporter import report +from parakeet.training.reporter import scope class StandardEvaluator(extension.Extension): diff --git a/parakeet/training/extensions/snapshot.py b/parakeet/training/extensions/snapshot.py index 92d74ef..7806dd6 100644 --- a/parakeet/training/extensions/snapshot.py +++ b/parakeet/training/extensions/snapshot.py @@ -11,18 +11,19 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - -import os import logging -from pathlib import Path +import os from datetime import datetime -from typing import List, Dict, Any +from pathlib import Path +from typing import Any +from typing import Dict +from typing import List import jsonlines -from parakeet.utils.mp_tools import rank_zero_only -from parakeet.training.trainer import Trainer from parakeet.training import extension +from parakeet.training.trainer import Trainer +from parakeet.utils.mp_tools import rank_zero_only def load_records(records_fp): @@ -56,7 +57,7 @@ class Snapshot(extension.Extension): self.max_size = max_size self._snapshot_on_error = snapshot_on_error self._save_all = (max_size == -1) - self.checkpoint_dir =... + self.checkpoint_dir = None def initialize(self, trainer: Trainer): """Setting up this extention.""" @@ -107,4 +108,4 @@ class Snapshot(extension.Extension): with jsonlines.open(record_path, 'w') as writer: for record in self.records: # jsonlines.open may return a Writer or a Reader - writer.write(record) # pylint: disable=no-member + writer.write(record) # pylint: disable=no-member diff --git a/parakeet/training/extensions/visualizer.py b/parakeet/training/extensions/visualizer.py index 138bf1e..1c66ad8 100644 --- a/parakeet/training/extensions/visualizer.py +++ b/parakeet/training/extensions/visualizer.py @@ -11,11 +11,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - -from visualdl import LogWriter - -from parakeet.training.trainer import Trainer from parakeet.training import extension +from parakeet.training.trainer import Trainer class VisualDL(extension.Extension): diff --git a/parakeet/training/reporter.py b/parakeet/training/reporter.py index c2f171c..013c754 100644 --- a/parakeet/training/reporter.py +++ b/parakeet/training/reporter.py @@ -11,9 +11,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - -import math import contextlib +import math from collections import defaultdict OBSERVATIONS = None diff --git a/parakeet/training/seeding.py b/parakeet/training/seeding.py index 1663d2d..8ca30fd 100644 --- a/parakeet/training/seeding.py +++ b/parakeet/training/seeding.py @@ -11,12 +11,11 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - -import random import logging +import random -import paddle import numpy as np +import paddle def seed_everything(seed: int): diff --git a/parakeet/training/trainer.py b/parakeet/training/trainer.py index d0a1494..65e2f5e 100644 --- a/parakeet/training/trainer.py +++ b/parakeet/training/trainer.py @@ -11,20 +11,23 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - import sys -import six import traceback -from pathlib import Path from collections import OrderedDict -from typing import Callable, Union, List +from pathlib import Path +from typing import Callable +from typing import List +from typing import Union +import six import tqdm -from parakeet.training.trigger import get_trigger, IntervalTrigger, LimitTrigger -from parakeet.training.updater import UpdaterBase +from parakeet.training.extension import Extension +from parakeet.training.extension import PRIORITY_READER from parakeet.training.reporter import scope -from parakeet.training.extension import Extension, PRIORITY_READER +from parakeet.training.trigger import get_trigger +from parakeet.training.triggers.limit_trigger import LimitTrigger +from parakeet.training.updater import UpdaterBase class _ExtensionEntry(object): @@ -44,7 +47,7 @@ class Trainer(object): self.extensions = OrderedDict() self.stop_trigger = LimitTrigger(*stop_trigger) self.out = Path(out) - self.observation =... + self.observation = None self._done = False if extensions: @@ -70,8 +73,7 @@ class Trainer(object): if name is None: name = getattr(extension, '__name__', None) if name is None: - raise ValueError( - "Name is not given for the extension.") + raise ValueError("Name is not given for the extension.") if name == 'training': raise ValueError("training is a reserved name.") @@ -112,8 +114,7 @@ class Trainer(object): self.extensions.keys(), key=lambda name: self.extensions[name].priority, reverse=True) - extensions = [(name, self.extensions[name]) - for name in extension_order] + extensions = [(name, self.extensions[name]) for name in extension_order] # initializing all extensions for name, entry in extensions: @@ -126,7 +127,7 @@ class Trainer(object): # display only one progress bar max_iteration = None if isinstance(stop_trigger, LimitTrigger): - if stop_trigger.unit is 'epoch': + if stop_trigger.unit == 'epoch': max_epoch = self.stop_trigger.limit updates_per_epoch = getattr(self.updater, "updates_per_epoch", None) @@ -134,8 +135,7 @@ class Trainer(object): else: max_iteration = self.stop_trigger.limit - p = tqdm.tqdm( - initial=self.updater.state.iteration, total=max_iteration) + p = tqdm.tqdm(initial=self.updater.state.iteration, total=max_iteration) try: while not stop_trigger(self): diff --git a/parakeet/training/trigger.py b/parakeet/training/trigger.py index b588512..f5724c8 100644 --- a/parakeet/training/trigger.py +++ b/parakeet/training/trigger.py @@ -11,10 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - from parakeet.training.triggers.interval_trigger import IntervalTrigger -from parakeet.training.triggers.limit_trigger import LimitTrigger -from parakeet.training.triggers.time_trigger import TimeTrigger def never_file_trigger(trainer): diff --git a/parakeet/training/triggers/interval_trigger.py b/parakeet/training/triggers/interval_trigger.py index e21afdd..98c0368 100644 --- a/parakeet/training/triggers/interval_trigger.py +++ b/parakeet/training/triggers/interval_trigger.py @@ -12,8 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -from copy import deepcopy - class IntervalTrigger(object): """A Predicate to do something every N cycle.""" diff --git a/parakeet/training/updater.py b/parakeet/training/updater.py index 5ec5eec..1db3d5f 100644 --- a/parakeet/training/updater.py +++ b/parakeet/training/updater.py @@ -11,22 +11,10 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - import logging from dataclasses import dataclass -from typing import Optional -from typing import Dict -from typing import Union -from timer import timer import paddle -from paddle import Tensor -from paddle.nn import Layer -from paddle.optimizer import Optimizer -from paddle.io import DataLoader -from paddle.io import DistributedBatchSampler - -from parakeet.training.reporter import report @dataclass diff --git a/parakeet/training/updaters/standard_updater.py b/parakeet/training/updaters/standard_updater.py index 62751cf..2725bb3 100644 --- a/parakeet/training/updaters/standard_updater.py +++ b/parakeet/training/updaters/standard_updater.py @@ -11,23 +11,20 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - import logging -from dataclasses import dataclass -from typing import Optional from typing import Dict -from typing import Union +from typing import Optional -from timer import timer -import paddle from paddle import Tensor -from paddle.nn import Layer -from paddle.optimizer import Optimizer from paddle.io import DataLoader from paddle.io import DistributedBatchSampler +from paddle.nn import Layer +from paddle.optimizer import Optimizer +from timer import timer from parakeet.training.reporter import report -from parakeet.training.updater import UpdaterBase, UpdaterState +from parakeet.training.updater import UpdaterBase +from parakeet.training.updater import UpdaterState class StandardUpdater(UpdaterBase): @@ -65,34 +62,34 @@ class StandardUpdater(UpdaterBase): # We increase the iteration index after updating and before extension. # Here are the reasons. - # 0. Snapshotting(as well as other extensions, like visualizer) is + # 0. Snapshotting(as well as other extensions, like visualizer) is # executed after a step of updating; - # 1. We decide to increase the iteration index after updating and + # 1. We decide to increase the iteration index after updating and # before any all extension is executed. - # 3. We do not increase the iteration after extension because we - # prefer a consistent resume behavior, when load from a - # `snapshot_iter_100.pdz` then the next step to train is `101`, - # naturally. But if iteration is increased increased after - # extension(including snapshot), then, a `snapshot_iter_99` is - # loaded. You would need a extra increasing of the iteration idex - # before training to avoid another iteration `99`, which has been + # 3. We do not increase the iteration after extension because we + # prefer a consistent resume behavior, when load from a + # `snapshot_iter_100.pdz` then the next step to train is `101`, + # naturally. But if iteration is increased increased after + # extension(including snapshot), then, a `snapshot_iter_99` is + # loaded. You would need a extra increasing of the iteration idex + # before training to avoid another iteration `99`, which has been # done before snapshotting. - # 4. Thus iteration index represrnts "currently how mant epochs has + # 4. Thus iteration index represrnts "currently how mant epochs has # been done." - # NOTE: use report to capture the correctly value. If you want to + # NOTE: use report to capture the correctly value. If you want to # report the learning rate used for a step, you must report it before - # the learning rate scheduler's step() has been called. In paddle's + # the learning rate scheduler's step() has been called. In paddle's # convention, we do not use an extension to change the learning rate. # so if you want to report it, do it in the updater. - # Then here comes the next question. When is the proper time to - # increase the epoch index? Since all extensions are executed after - # updating, it is the time that after updating is the proper time to - # increase epoch index. + # Then here comes the next question. When is the proper time to + # increase the epoch index? Since all extensions are executed after + # updating, it is the time that after updating is the proper time to + # increase epoch index. # 1. If we increase the epoch index before updating, then an extension - # based ot epoch would miss the correct timing. It could only be + # based ot epoch would miss the correct timing. It could only be # triggerd after an extra updating. - # 2. Theoretically, when an epoch is done, the epoch index should be + # 2. Theoretically, when an epoch is done, the epoch index should be # increased. So it would be increase after updating. # 3. Thus, eppoch index represents "currently how many epochs has been # done." So it starts from 0. @@ -140,7 +137,7 @@ class StandardUpdater(UpdaterBase): @property def updates_per_epoch(self): - """Number of updater per epoch, determined by the length of the + """Number of updater per epoch, determined by the length of the dataloader.""" length_of_dataloader = None try: diff --git a/parakeet/utils/__init__.py b/parakeet/utils/__init__.py index a3bd0dc..abf198b 100644 --- a/parakeet/utils/__init__.py +++ b/parakeet/utils/__init__.py @@ -11,5 +11,3 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - -from . import checkpoint, layer_tools, scheduler, display, mp_tools diff --git a/parakeet/utils/checkpoint.py b/parakeet/utils/checkpoint.py index 0d2a2e2..8df791b 100644 --- a/parakeet/utils/checkpoint.py +++ b/parakeet/utils/checkpoint.py @@ -11,14 +11,10 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - import os -import time -import numpy as np + import paddle from paddle import distributed as dist -from paddle.nn import Layer -from paddle.optimizer import Optimizer from parakeet.utils import mp_tools @@ -66,7 +62,7 @@ def load_parameters(model, optimizer=None, checkpoint_dir=None, checkpoint_path=None): - """Load a specific model checkpoint from disk. + """Load a specific model checkpoint from disk. Args: model (Layer): model to load parameters. @@ -74,8 +70,8 @@ def load_parameters(model, Defaults to None. checkpoint_dir (str, optional): the directory where checkpoint is saved. checkpoint_path (str, optional): if specified, load the checkpoint - stored in the checkpoint_path and the argument 'checkpoint_dir' will - be ignored. Defaults to None. + stored in the checkpoint_path and the argument 'checkpoint_dir' will + be ignored. Defaults to None. Returns: iteration (int): number of iterations that the loaded checkpoint has @@ -137,7 +133,6 @@ def save_parameters(checkpoint_dir, iteration, model, optimizer=None): opt_dict = optimizer.state_dict() optimizer_path = checkpoint_path + ".pdopt" paddle.save(opt_dict, optimizer_path) - print("[checkpoint] Saved optimzier state to {}".format( - optimizer_path)) + print("[checkpoint] Saved optimzier state to {}".format(optimizer_path)) _save_checkpoint(checkpoint_dir, iteration) diff --git a/parakeet/utils/display.py b/parakeet/utils/display.py index faf27e7..af7d44e 100644 --- a/parakeet/utils/display.py +++ b/parakeet/utils/display.py @@ -11,13 +11,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - -import numpy as np -import matplotlib -import librosa import librosa.display import matplotlib.pylab as plt -from matplotlib import cm, pyplot __all__ = [ "plot_alignment", diff --git a/parakeet/utils/h5_utils.py b/parakeet/utils/h5_utils.py index cd0c670..d0e277d 100644 --- a/parakeet/utils/h5_utils.py +++ b/parakeet/utils/h5_utils.py @@ -11,11 +11,12 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - -from pathlib import Path -from typing import Union, Any -import sys import logging +import sys +from pathlib import Path +from typing import Any +from typing import Union + import h5py import numpy as np @@ -44,8 +45,7 @@ def read_hdf5(filename: Union[Path, str], dataset_name: str) -> Any: hdf5_file = h5py.File(filename, "r") if dataset_name not in hdf5_file: - logging.error( - f"There is no such a data in hdf5 file. ({dataset_name})") + logging.error(f"There is no such a data in hdf5 file. ({dataset_name})") sys.exit(1) # [()]: a special syntax of h5py to get the dataset as-is diff --git a/parakeet/utils/internals.py b/parakeet/utils/internals.py index 968a604..6c10bd2 100644 --- a/parakeet/utils/internals.py +++ b/parakeet/utils/internals.py @@ -11,7 +11,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - import numpy as np from paddle.framework import core diff --git a/parakeet/utils/layer_tools.py b/parakeet/utils/layer_tools.py index fcda44f..6e971f9 100644 --- a/parakeet/utils/layer_tools.py +++ b/parakeet/utils/layer_tools.py @@ -11,7 +11,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - import numpy as np from paddle import nn @@ -42,7 +41,7 @@ def recursively_remove_weight_norm(layer: nn.Layer): for layer in layer.sublayers(): try: nn.utils.remove_weight_norm(layer) - except: + except Exception as e: # ther is not weight norm hoom in this layer pass diff --git a/parakeet/utils/mp_tools.py b/parakeet/utils/mp_tools.py index edc1845..ed8c83e 100644 --- a/parakeet/utils/mp_tools.py +++ b/parakeet/utils/mp_tools.py @@ -11,11 +11,10 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - -import paddle -from paddle import distributed as dist from functools import wraps +from paddle import distributed as dist + __all__ = ["rank_zero_only"] diff --git a/parakeet/utils/profile.py b/parakeet/utils/profile.py index cfffb4b..5f9b495 100644 --- a/parakeet/utils/profile.py +++ b/parakeet/utils/profile.py @@ -11,11 +11,11 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +from contextlib import contextmanager import paddle from paddle.framework import core from paddle.framework import CUDAPlace -from contextlib import contextmanager def synchronize(): diff --git a/parakeet/utils/scheduler.py b/parakeet/utils/scheduler.py index 4d41aca..9338995 100644 --- a/parakeet/utils/scheduler.py +++ b/parakeet/utils/scheduler.py @@ -12,8 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -import math - __all__ = ["SchedulerBase", "Constant", "PieceWise", "StepWise"] diff --git a/parakeet/utils/timeline.py b/parakeet/utils/timeline.py index 119a2e9..0a5509d 100644 --- a/parakeet/utils/timeline.py +++ b/parakeet/utils/timeline.py @@ -11,15 +11,11 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - import argparse import json -import six -import sys -import unittest -import google.protobuf.text_format as text_format import paddle.fluid.proto.profiler.profiler_pb2 as profiler_pb2 +import six parser = argparse.ArgumentParser(description=__doc__) parser.add_argument( @@ -167,19 +163,19 @@ class Timeline(object): if (k, mevent.device_id, "GPU") not in self._mem_devices: pid = self._allocate_pid() self._mem_devices[(k, mevent.device_id, "GPU")] = pid - self._chrome_trace.emit_pid("memory usage on %s:gpu:%d" - % (k, mevent.device_id), - pid) + self._chrome_trace.emit_pid( + "memory usage on %s:gpu:%d" % (k, mevent.device_id), + pid) elif mevent.place == profiler_pb2.MemEvent.CPUPlace: if (k, mevent.device_id, "CPU") not in self._mem_devices: pid = self._allocate_pid() self._mem_devices[(k, mevent.device_id, "CPU")] = pid - self._chrome_trace.emit_pid("memory usage on %s:cpu:%d" - % (k, mevent.device_id), - pid) + self._chrome_trace.emit_pid( + "memory usage on %s:cpu:%d" % (k, mevent.device_id), + pid) elif mevent.place == profiler_pb2.MemEvent.CUDAPinnedPlace: - if (k, mevent.device_id, "CUDAPinnedPlace" - ) not in self._mem_devices: + if (k, mevent.device_id, + "CUDAPinnedPlace") not in self._mem_devices: pid = self._allocate_pid() self._mem_devices[(k, mevent.device_id, "CUDAPinnedPlace")] = pid @@ -190,9 +186,9 @@ class Timeline(object): if (k, mevent.device_id, "NPU") not in self._mem_devices: pid = self._allocate_pid() self._mem_devices[(k, mevent.device_id, "NPU")] = pid - self._chrome_trace.emit_pid("memory usage on %s:npu:%d" - % (k, mevent.device_id), - pid) + self._chrome_trace.emit_pid( + "memory usage on %s:npu:%d" % (k, mevent.device_id), + pid) if (k, 0, "CPU") not in self._mem_devices: pid = self._allocate_pid() self._mem_devices[(k, 0, "CPU")] = pid @@ -273,14 +269,14 @@ class Timeline(object): total_size = 0 while i < len(mem_list): total_size += mem_list[i]['size'] - while i < len(mem_list) - 1 and mem_list[i][ - 'time'] == mem_list[i + 1]['time']: + while i < len(mem_list) - 1 and mem_list[i]['time'] == mem_list[ + i + 1]['time']: total_size += mem_list[i + 1]['size'] i += 1 self._chrome_trace.emit_counter( - "Memory", "Memory", mem_list[i]['pid'], - mem_list[i]['time'], 0, total_size) + "Memory", "Memory", mem_list[i]['pid'], mem_list[i]['time'], + 0, total_size) i += 1 def generate_chrome_trace(self): diff --git a/setup.py b/setup.py index 7408415..4123f3f 100644 --- a/setup.py +++ b/setup.py @@ -11,12 +11,12 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - -import os import io +import os import re -import sys -from setuptools import setup, find_packages + +from setuptools import find_packages +from setuptools import setup def read(*names, **kwargs): @@ -80,7 +80,9 @@ setup_info = dict( 'jieba', "phkit", ], - extras_require={'doc': ["sphinx", "sphinx-rtd-theme", "numpydoc"], }, + extras_require={ + 'doc': ["sphinx", "sphinx-rtd-theme", "numpydoc"], + }, # Package info packages=find_packages(exclude=('tests', 'tests.*')), diff --git a/tests/unit/test_data_table.py b/tests/unit/test_data_table.py index aca0605..3664ea3 100644 --- a/tests/unit/test_data_table.py +++ b/tests/unit/test_data_table.py @@ -11,7 +11,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - from parakeet.datasets.data_tabel import DataTable diff --git a/tests/unit/test_expansion.py b/tests/unit/test_expansion.py index d548993..418e9ba 100644 --- a/tests/unit/test_expansion.py +++ b/tests/unit/test_expansion.py @@ -11,8 +11,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - import paddle + from parakeet.modules import expansion diff --git a/tests/unit/test_optimizer.py b/tests/unit/test_optimizer.py index bdb3d96..74f5036 100644 --- a/tests/unit/test_optimizer.py +++ b/tests/unit/test_optimizer.py @@ -11,7 +11,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - import shutil from pathlib import Path diff --git a/tests/unit/test_pwg.py b/tests/unit/test_pwg.py index 0978714..2f07a4a 100644 --- a/tests/unit/test_pwg.py +++ b/tests/unit/test_pwg.py @@ -11,18 +11,21 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - import paddle import torch -from timer import timer -from parallel_wavegan.layers import upsample, residual_block +from parallel_wavegan.layers import residual_block +from parallel_wavegan.layers import upsample from parallel_wavegan.models import parallel_wavegan as pwgan +from timer import timer + +from parakeet.models.parallel_wavegan import ConvInUpsampleNet +from parakeet.models.parallel_wavegan import PWGDiscriminator +from parakeet.models.parallel_wavegan import PWGGenerator +from parakeet.models.parallel_wavegan import ResidualBlock +from parakeet.models.parallel_wavegan import ResidualPWGDiscriminator from parakeet.utils.layer_tools import summary from parakeet.utils.profile import synchronize -from parakeet.models.parallel_wavegan import ConvInUpsampleNet, ResidualBlock -from parakeet.models.parallel_wavegan import PWGGenerator, PWGDiscriminator, ResidualPWGDiscriminator - paddle.set_device("gpu:0") device = torch.device("cuda:0") diff --git a/tests/unit/test_raise.py b/tests/unit/test_raise.py index a4a5e70..7abdadf 100644 --- a/tests/unit/test_raise.py +++ b/tests/unit/test_raise.py @@ -11,14 +11,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - -import math -import numpy as np - import paddle -from paddle import Tensor from paddle.static import InputSpec -from paddle.nn import functional as F def sinusoid_position_encoding(num_positions: int, @@ -52,6 +46,5 @@ def call_it(x): call_it(paddle.randn([8, 32])) m = paddle.jit.to_static( - call_it, input_spec=[InputSpec( - [-1, -1], dtype=paddle.int32)]) + call_it, input_spec=[InputSpec([-1, -1], dtype=paddle.int32)]) m(paddle.randn([8, 32]).astype(paddle.int32)) diff --git a/tests/unit/test_reporter.py b/tests/unit/test_reporter.py index cd40364..bba81d6 100644 --- a/tests/unit/test_reporter.py +++ b/tests/unit/test_reporter.py @@ -11,10 +11,11 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - import numpy as np -from parakeet.training.reporter import report, scope -from parakeet.training.reporter import Summary, DictSummary + +from parakeet.training.reporter import report +from parakeet.training.reporter import scope +from parakeet.training.reporter import Summary def test_reporter_scope(): diff --git a/tests/unit/test_snapshot.py b/tests/unit/test_snapshot.py index 71e422c..e940a81 100644 --- a/tests/unit/test_snapshot.py +++ b/tests/unit/test_snapshot.py @@ -11,19 +11,15 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - -from pathlib import Path import shutil - -import numpy as np -import paddle -from paddle import nn -from paddle.optimizer import Adam from itertools import count -from parakeet.training.updater import StandardUpdater -from parakeet.training.trainer import Trainer +from paddle import nn +from paddle.optimizer import Adam + from parakeet.training.extensions.snapshot import Snapshot +from parakeet.training.trainer import Trainer +from parakeet.training.updater import StandardUpdater def test_snapshot(): diff --git a/tests/unit/test_stft.py b/tests/unit/test_stft.py index c985235..8e6ce47 100644 --- a/tests/unit/test_stft.py +++ b/tests/unit/test_stft.py @@ -11,15 +11,16 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - -import paddle -import torch import librosa import numpy as np -from parakeet.modules.stft_loss import STFT, MultiResolutionSTFTLoss +import paddle +import torch from parallel_wavegan.losses import stft_loss as sl from scipy import signal +from parakeet.modules.stft_loss import MultiResolutionSTFTLoss +from parakeet.modules.stft_loss import STFT + def test_stft(): stft = STFT(n_fft=1024, hop_length=256, win_length=1024) @@ -34,8 +35,7 @@ def test_stft(): window=torch.as_tensor(window)) S2 = (D2**2).sum(-1).sqrt() S3 = np.abs( - librosa.stft( - x.numpy()[0], n_fft=1024, hop_length=256, win_length=1024)) + librosa.stft(x.numpy()[0], n_fft=1024, hop_length=256, win_length=1024)) print(S2.shape) print(S.numpy()[0]) print(S2.data.cpu().numpy()[0]) diff --git a/tests/unit/test_to_static.py b/tests/unit/test_to_static.py index 251d492..b8ff300 100644 --- a/tests/unit/test_to_static.py +++ b/tests/unit/test_to_static.py @@ -11,12 +11,11 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - import math import paddle from paddle import nn -from paddle.jit import to_static, save +from paddle.jit import to_static from paddle.static import InputSpec