Merge pull request #141 from yt605155624/fix_pwg

fix docstrings and some bugs in pwg

commit 24c5b3c1a2
@@ -0,0 +1,28 @@
# This file is used by clang-format to autoformat paddle source code
#
# clang-format is part of the llvm toolchain.
# llvm and clang need to be installed to format source code style.
#
# The basic usage is,
#     clang-format -i -style=file PATH/TO/SOURCE/CODE
#
# -style=file implicitly uses the ".clang-format" file located in a
# parent directory.
# -i means in-place change.
#
# The documentation of clang-format is at
#     http://clang.llvm.org/docs/ClangFormat.html
#     http://clang.llvm.org/docs/ClangFormatStyleOptions.html
---
Language: Cpp
BasedOnStyle: Google
IndentWidth: 4
TabWidth: 4
ContinuationIndentWidth: 4
MaxEmptyLinesToKeep: 2
AccessModifierOffset: -2 # private/protected/public have no indent in class
Standard: Cpp11
AllowAllParametersOfDeclarationOnNextLine: true
BinPackParameters: false
BinPackArguments: false
...
@@ -0,0 +1,50 @@
[flake8]

########## OPTIONS ##########
# Set the maximum length that any line (with some exceptions) may be.
max-line-length = 120


################### FILE PATTERNS ##########################
# Provide a comma-separated list of glob patterns to exclude from checks.
exclude =
    # git folder
    .git,
    # python cache
    __pycache__,
    third_party/,
# Provide a comma-separated list of glob patterns to include for checks.
filename =
    *.py


########## RULES ##########

# ERROR CODES
#
# E/W - PEP8 errors/warnings (pycodestyle)
# F - linting errors (pyflakes)
# C - McCabe complexity error (mccabe)
#
# W503 - line break before binary operator

# Specify a list of codes to ignore.
ignore =
    W503
    E252,E262,E127,E265,E126,E266,E241,E261,E128,E125
    W291,W293,W605
    E203,E305,E402,E501,E721,E741,F403,F405,F821,F841,F999,W503,W504,C408,E302,W291,E303,
    # shebang has extra meaning in fbcode lints, so I think it's not worth trying
    # to line this up with executable bit
    EXE001,
    # these ignores are from flake8-bugbear; please fix!
    B007,B008,
    # these ignores are from flake8-comprehensions; please fix!
    C400,C401,C402,C403,C404,C405,C407,C411,C413,C414,C415

# Specify the list of error codes you wish Flake8 to report.
select =
    E,
    W,
    F,
    C
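Note: W503 ("line break before binary operator") appears in both ignore blocks above. For reference, a minimal hypothetical Python snippet that would trip it if it were not ignored (illustration only, not part of the diff):

# With W503 ignored, a continuation line may begin with the operator.
subtotal = 100
tax = 7
total = (subtotal
         + tax)  # flake8 would report W503 here if the rule were enforced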
@@ -1,11 +1,11 @@
repos:
- repo: https://github.com/PaddlePaddle/mirrors-yapf.git
  rev: 0d79c0c469bab64f7229c9aca2b1186ef47f0e37
- repo: https://github.com/pre-commit/mirrors-yapf.git
  sha: v0.16.0
  hooks:
  - id: yapf
    files: \.py$
    exclude: (?=third_party).*(\.py)$
- repo: https://github.com/pre-commit/pre-commit-hooks
  rev: a11d9314b22d8f8c7556443875b731ef05965464
  sha: a11d9314b22d8f8c7556443875b731ef05965464
  hooks:
  - id: check-merge-conflict
  - id: check-symlinks

@@ -15,8 +15,23 @@ repos:
    files: \.md$
  - id: trailing-whitespace
    files: \.md$
- repo: https://github.com/Lucas-C/pre-commit-hooks
  rev: v1.0.1
  - id: requirements-txt-fixer
    exclude: (?=third_party).*$
  - id: check-yaml
  - id: check-json
  - id: pretty-format-json
    args:
    - --no-sort-keys
    - --autofix
  - id: check-merge-conflict
  - id: flake8
    args:
    - --ignore=E501,E228,E226,E261,E266,E128,E402,W503
    - --builtins=G,request
    - --jobs=1
    exclude: (?=third_party).*(\.py)$
- repo: https://github.com/Lucas-C/pre-commit-hooks
  sha: v1.0.1
  hooks:
  - id: forbid-crlf
    files: \.md$

@@ -28,9 +43,15 @@ repos:
    files: \.md$
- repo: local
  hooks:
  - id: clang-format
    name: clang-format
    description: Format files with ClangFormat
    entry: bash .pre-commit-hooks/clang-format.hook -i
    language: system
    files: \.(c|cc|cxx|cpp|cu|h|hpp|hxx|cuh|proto)$
  - id: copyright_checker
    name: copyright_checker
    entry: python ./tools/copyright.hook
    entry: python .pre-commit-hooks/copyright-check.hook
    language: system
    files: \.(c|cc|cxx|cpp|cu|h|hpp|hxx|proto|py)$
    exclude: (?!.*third_party)^.*$ | (?!.*book)^.*$
    exclude: (?=third_party|pypinyin).*(\.cpp|\.h|\.py)$
@@ -0,0 +1,15 @@
#!/usr/bin/env bash
set -e

readonly VERSION="3.9"

version=$(clang-format -version)

# if ! [[ $version == *"$VERSION"* ]]; then
#     echo "clang-format version check failed."
#     echo "a version containing '$VERSION' is needed, but got '$version'"
#     echo "you can install the right version, and make a soft-link to the '\$PATH' env"
#     exit -1
# fi

clang-format $@
@@ -0,0 +1,133 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import argparse
import io
import os
import re
import sys
import subprocess
import platform

COPYRIGHT = '''
Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
'''

LANG_COMMENT_MARK = None

NEW_LINE_MARK = None

COPYRIGHT_HEADER = None

if platform.system() == "Windows":
    NEW_LINE_MARK = "\r\n"
else:
    NEW_LINE_MARK = '\n'
    COPYRIGHT_HEADER = COPYRIGHT.split(NEW_LINE_MARK)[1]
    p = re.search('(\d{4})', COPYRIGHT_HEADER).group(0)
    process = subprocess.Popen(["date", "+%Y"], stdout=subprocess.PIPE)
    date, err = process.communicate()
    date = date.decode("utf-8").rstrip("\n")
    COPYRIGHT_HEADER = COPYRIGHT_HEADER.replace(p, date)


def generate_copyright(template, lang='C'):
    if lang == 'Python':
        LANG_COMMENT_MARK = '#'
    else:
        LANG_COMMENT_MARK = "//"

    lines = template.split(NEW_LINE_MARK)
    BLANK = " "
    ans = LANG_COMMENT_MARK + BLANK + COPYRIGHT_HEADER + NEW_LINE_MARK
    for lino, line in enumerate(lines):
        if lino == 0 or lino == 1 or lino == len(lines) - 1: continue
        if len(line) == 0:
            BLANK = ""
        else:
            BLANK = " "
        ans += LANG_COMMENT_MARK + BLANK + line + NEW_LINE_MARK

    return ans + "\n"


def lang_type(filename):
    if filename.endswith(".py"):
        return "Python"
    elif filename.endswith(".h"):
        return "C"
    elif filename.endswith(".c"):
        return "C"
    elif filename.endswith(".hpp"):
        return "C"
    elif filename.endswith(".cc"):
        return "C"
    elif filename.endswith(".cpp"):
        return "C"
    elif filename.endswith(".cu"):
        return "C"
    elif filename.endswith(".cuh"):
        return "C"
    elif filename.endswith(".go"):
        return "C"
    elif filename.endswith(".proto"):
        return "C"
    else:
        print("Unsupported filetype %s" % filename)
        exit(0)


PYTHON_ENCODE = re.compile("^[ \t\v]*#.*?coding[:=][ \t]*([-_.a-zA-Z0-9]+)")


def main(argv=None):
    parser = argparse.ArgumentParser(
        description='Checker for copyright declaration.')
    parser.add_argument('filenames', nargs='*', help='Filenames to check')
    args = parser.parse_args(argv)

    retv = 0
    for filename in args.filenames:
        fd = io.open(filename, encoding="utf-8")
        first_line = fd.readline()
        second_line = fd.readline()
        if "COPYRIGHT (C)" in first_line.upper(): continue
        if first_line.startswith("#!") or PYTHON_ENCODE.match(
                second_line) != None or PYTHON_ENCODE.match(first_line) != None:
            continue
        original_contents = io.open(filename, encoding="utf-8").read()
        new_contents = generate_copyright(
            COPYRIGHT, lang_type(filename)) + original_contents
        print('Auto Insert Copyright Header {}'.format(filename))
        retv = 1
        with io.open(filename, 'w') as output_file:
            output_file.write(new_contents)

    return retv


if __name__ == '__main__':
    exit(main())
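For a quick sanity check of the hook above, the header generation can be exercised directly; a minimal sketch, assuming generate_copyright and COPYRIGHT from the hook are in scope (hypothetical usage, not part of the diff):

# Render the Apache-2.0 notice as '#'-prefixed Python comment lines.
header = generate_copyright(COPYRIGHT, lang='Python')
assert header.startswith("# Copyright")
print(header)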
@@ -0,0 +1,3 @@
[style]
based_on_style = pep8
column_limit = 80
@@ -11,15 +11,12 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Configuration file for the Sphinx documentation builder.
#
# This file only contains a selection of the most common options. For a full
# list see the documentation:
# https://www.sphinx-doc.org/en/master/usage/configuration.html

# -- Path setup --------------------------------------------------------------

# If extensions (or modules to document with autodoc) are in another directory,
# add these directories to sys.path here. If the directory is relative to the
# documentation root, use os.path.abspath to make it absolute, like shown here.
@@ -11,9 +11,9 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import numpy as np
import paddle

from parakeet.data.batch import batch_sequences


@@ -24,8 +24,7 @@ def collate_baker_examples(examples):
    pitch = [np.array(item["pitch"], dtype=np.float32) for item in examples]
    energy = [np.array(item["energy"], dtype=np.float32) for item in examples]
    durations = [
        np.array(
            item["durations"], dtype=np.int64) for item in examples
        np.array(item["durations"], dtype=np.int64) for item in examples
    ]
    text_lengths = np.array([item["text_lengths"] for item in examples])
    speech_lengths = np.array([item["speech_lengths"] for item in examples])

@@ -54,4 +53,4 @@ def collate_baker_examples(examples):
        "pitch": pitch,
        "energy": energy
    }
    return batch
    return batch
@@ -12,18 +12,17 @@
# See the License for the specific language governing permissions and
# limitations under the License.
"""Calculate statistics of feature files."""

import argparse
import logging
from pathlib import Path

import jsonlines
import numpy as np
from parakeet.datasets.data_table import DataTable
from config import get_cfg_default
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm

from config import get_cfg_default
from parakeet.datasets.data_table import DataTable


def main():

@@ -75,8 +74,8 @@ def main():

    # check directory existence
    if args.output is None:
        args.output = Path(args.metadata).parent.with_name(args.field_name +
                                                           "_stats.npy")
        args.output = Path(
            args.metadata).parent.with_name(args.field_name + "_stats.npy")
    else:
        args.output = Path(args.output)
    args.output.parent.mkdir(parents=True, exist_ok=True)
@@ -11,11 +11,10 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from pathlib import Path

from yacs.config import CfgNode as Configuration
import yaml
from yacs.config import CfgNode as Configuration

config_path = (Path(__file__).parent / "conf" / "default.yaml").resolve()
@@ -11,8 +11,7 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from parakeet.models.fastspeech2 import FastSpeech2, FastSpeech2Loss
from parakeet.models.fastspeech2 import FastSpeech2Loss
from parakeet.training.extensions.evaluator import StandardEvaluator
from parakeet.training.reporter import report
from parakeet.training.updaters.standard_updater import StandardUpdater
@@ -11,10 +11,11 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import re

import numpy as np
import paddle

from parakeet.frontend.cn_frontend import Frontend as cnFrontend


@@ -87,8 +88,7 @@ class Frontend():
            phones.append(phone)
        return phones, tones

    def get_input_ids(self, sentence, merge_sentences=True,
                      get_tone_ids=False):
    def get_input_ids(self, sentence, merge_sentences=True, get_tone_ids=False):
        phonemes = self.frontend.get_phonemes(
            sentence, merge_sentences=merge_sentences)
        result = {}
@@ -11,16 +11,14 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import argparse
import os
from pathlib import Path

import librosa
import numpy as np
from praatio import tgio

from config import get_cfg_default
from praatio import tgio


def readtg(config, tg_path):
@@ -50,10 +50,7 @@ def main():
        required=True,
        help="speech statistics file.")
    parser.add_argument(
        "--pitch-stats",
        type=str,
        required=True,
        help="pitch statistics file.")
        "--pitch-stats", type=str, required=True, help="pitch statistics file.")
    parser.add_argument(
        "--energy-stats",
        type=str,
@@ -21,10 +21,10 @@ from typing import List, Dict, Any
import jsonlines
import librosa
import numpy as np
from parakeet.data.get_feats import LogMelFBank, Energy, Pitch
import tqdm

from config import get_cfg_default
from get_feats import LogMelFBank, Energy, Pitch


def get_phn_dur(file_name):

@@ -262,10 +262,7 @@ def main():
    parser = argparse.ArgumentParser(
        description="Preprocess audio and then extract features.")
    parser.add_argument(
        "--rootdir",
        default=None,
        type=str,
        help="directory to baker dataset.")
        "--rootdir", default=None, type=str, help="directory to baker dataset.")
    parser.add_argument(
        "--dur-file",
        default=None,
@@ -67,8 +67,7 @@ def evaluate(args, fastspeech2_config, pwg_config):
    std = paddle.to_tensor(std)
    pwg_normalizer = ZScore(mu, std)

    fastspeech2_inferencce = FastSpeech2Inference(fastspeech2_normalizer,
                                                  model)
    fastspeech2_inferencce = FastSpeech2Inference(fastspeech2_normalizer, model)
    pwg_inference = PWGInference(pwg_normalizer, vocoder)

    output_dir = Path(args.output_dir)

@@ -94,7 +93,7 @@ def main():
    parser.add_argument(
        "--fastspeech2-config",
        type=str,
        help="config file to overwrite default config")
        help="config file to overwrite default config.")
    parser.add_argument(
        "--fastspeech2-checkpoint",
        type=str,

@@ -121,13 +120,13 @@ def main():
    parser.add_argument(
        "--phones-dict",
        type=str,
        default="phone_id_map.txt ",
        default="phone_id_map.txt",
        help="phone vocabulary file.")
    parser.add_argument("--test-metadata", type=str, help="test metadata")
    parser.add_argument("--output-dir", type=str, help="output dir")
    parser.add_argument("--test-metadata", type=str, help="test metadata.")
    parser.add_argument("--output-dir", type=str, help="output dir.")
    parser.add_argument(
        "--device", type=str, default="gpu", help="device type to use")
    parser.add_argument("--verbose", type=int, default=1, help="verbose")
        "--device", type=str, default="gpu", help="device type to use.")
    parser.add_argument("--verbose", type=int, default=1, help="verbose.")

    args = parser.parse_args()
    with open(args.fastspeech2_config) as f:
@@ -105,7 +105,7 @@ def main():
    parser.add_argument(
        "--fastspeech2-config",
        type=str,
        help="config file to overwrite default config")
        help="fastspeech2 config file to overwrite default config.")
    parser.add_argument(
        "--fastspeech2-checkpoint",
        type=str,

@@ -118,8 +118,7 @@ def main():
    parser.add_argument(
        "--pwg-config",
        type=str,
        help="mean and standard deviation used to normalize spectrogram when training parallel wavegan."
    )
        help="parallel wavegan config file to overwrite default config.")
    parser.add_argument(
        "--pwg-params",
        type=str,

@@ -132,16 +131,16 @@ def main():
    parser.add_argument(
        "--phones-dict",
        type=str,
        default="phone_id_map.txt ",
        default="phone_id_map.txt",
        help="phone vocabulary file.")
    parser.add_argument(
        "--text",
        type=str,
        help="text to synthesize, a 'utt_id sentence' pair per line")
    parser.add_argument("--output-dir", type=str, help="output dir")
        help="text to synthesize, a 'utt_id sentence' pair per line.")
    parser.add_argument("--output-dir", type=str, help="output dir.")
    parser.add_argument(
        "--device", type=str, default="gpu", help="device type to use")
    parser.add_argument("--verbose", type=int, default=1, help="verbose")
        "--device", type=str, default="gpu", help="device type to use.")
    parser.add_argument("--verbose", type=int, default=1, help="verbose.")

    args = parser.parse_args()
    with open(args.fastspeech2_config) as f:
@@ -154,8 +154,7 @@ def train_sp(args, config):
    output_dir = Path(args.output_dir)
    trainer = Trainer(updater, (config.max_epoch, 'epoch'), output_dir)

    evaluator = FastSpeech2Evaluator(model, dev_dataloader,
                                     **config["updater"])
    evaluator = FastSpeech2Evaluator(model, dev_dataloader, **config["updater"])

    if dist.get_rank() == 0:
        trainer.extend(evaluator, trigger=(1, "epoch"))

@@ -169,18 +168,18 @@ def train_sp(args, config):

def main():
    # parse args and config and redirect to train_sp
    parser = argparse.ArgumentParser(description="Train a ParallelWaveGAN "
    parser = argparse.ArgumentParser(description="Train a FastSpeech2 "
                                     "model with Baker Mandarin TTS dataset.")
    parser.add_argument(
        "--config", type=str, help="config file to overwrite default config")
    parser.add_argument("--train-metadata", type=str, help="training data")
    parser.add_argument("--dev-metadata", type=str, help="dev data")
    parser.add_argument("--output-dir", type=str, help="output dir")
        "--config", type=str, help="config file to overwrite default config.")
    parser.add_argument("--train-metadata", type=str, help="training data.")
    parser.add_argument("--dev-metadata", type=str, help="dev data.")
    parser.add_argument("--output-dir", type=str, help="output dir.")
    parser.add_argument(
        "--device", type=str, default="gpu", help="device type to use")
        "--device", type=str, default="gpu", help="device type to use.")
    parser.add_argument(
        "--nprocs", type=int, default=1, help="number of processes")
    parser.add_argument("--verbose", type=int, default=1, help="verbose")
        "--nprocs", type=int, default=1, help="number of processes.")
    parser.add_argument("--verbose", type=int, default=1, help="verbose.")
    parser.add_argument(
        "--phones-dict",
        type=str,
@@ -30,9 +30,7 @@ except ModuleNotFoundError:
INT16_MAX = (2**15) - 1


def normalize_volume(wav,
                     target_dBFS,
                     increase_only=False,
def normalize_volume(wav, target_dBFS, increase_only=False,
                     decrease_only=False):
    # this function implements loudness normalization, instead of peak
    # normalization, see https://en.wikipedia.org/wiki/Audio_normalization

@@ -44,8 +42,9 @@ def normalize_volume(wav,
    if increase_only and decrease_only:
        raise ValueError("Both increase only and decrease only are set")
    dBFS_change = target_dBFS - 10 * np.log10(np.mean(wav**2))
    if ((dBFS_change < 0 and increase_only) or
            (dBFS_change > 0 and decrease_only)):
    if dBFS_change < 0 and increase_only:
        return wav
    if dBFS_change > 0 and decrease_only:
        return wav
    gain = 10**(dBFS_change / 20)
    return wav * gain
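For intuition about the branch simplification above: the function converts the dBFS gap between the signal and the target into a linear amplitude gain. A minimal worked sketch with hypothetical numbers (not part of the diff):

import numpy as np

target_dBFS = -20.0
wav = 0.01 * np.random.randn(16000).astype(np.float32)      # quiet signal, roughly -40 dBFS
dBFS_change = target_dBFS - 10 * np.log10(np.mean(wav**2))  # ~ +20 dB needed
gain = 10**(dBFS_change / 20)                               # ~10x linear amplitude
louder = wav * gain                                         # now near the -20 dBFS target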
@@ -59,9 +58,14 @@ def trim_long_silences(wav,
    """
    Ensures that segments without voice in the waveform remain no longer than a
    threshold determined by the VAD parameters in params.py.

    :param wav: the raw waveform as a numpy array of floats
    :return: the same waveform with silences trimmed away (length <= original wav length)
    Parameters
    ----------
    wav : np.array
        the raw waveform as a numpy array of floats

    Returns
    ----------
    np.array
        the same waveform with silences trimmed away (length <= original wav length)
    """
    # Compute the voice detection window size
    samples_per_window = (vad_window_length * sampling_rate) // 1000
@@ -117,20 +121,25 @@ def compute_partial_slices(n_samples: int,

    The returned ranges may be indexing further than the length of the waveform. It is
    recommended that you pad the waveform with zeros up to wave_slices[-1].stop.

    Parameters
    ----------
    n_samples : int
        the number of samples in the waveform.
    partial_utterance_n_frames : int
        the number of mel spectrogram frames in each partial utterance.

    :param n_samples: the number of samples in the waveform
    :param partial_utterance_n_frames: the number of mel spectrogram frames in each partial
    utterance
    :param min_pad_coverage: when reaching the last partial utterance, it may or may not have
    enough frames. If at least <min_pad_coverage> of <partial_utterance_n_frames> are present,
    then the last partial utterance will be considered, as if we padded the audio. Otherwise,
    it will be discarded, as if we trimmed the audio. If there aren't enough frames for 1 partial
    utterance, this parameter is ignored so that the function always returns at least 1 slice.
    :param overlap: by how much the partial utterance should overlap. If set to 0, the partial
    utterances are entirely disjoint.
    :return: the waveform slices and mel spectrogram slices as lists of array slices. Index
    respectively the waveform and the mel spectrogram with these slices to obtain the partial
    utterances.
    min_pad_coverage : int
        when reaching the last partial utterance, it may or may not have enough frames.
        If at least <min_pad_coverage> of <partial_utterance_n_frames> are present,
        then the last partial utterance will be considered, as if we padded the audio. Otherwise,
        it will be discarded, as if we trimmed the audio. If there aren't enough frames for 1 partial
        utterance, this parameter is ignored so that the function always returns at least 1 slice.
    overlap : float
        by how much the partial utterance should overlap. If set to 0, the partial utterances are entirely disjoint.

    Returns
    ----------
    the waveform slices and mel spectrogram slices as lists of array slices.
    Index respectively the waveform and the mel spectrogram with these slices to obtain the partial utterances.
    """
    assert 0 <= overlap < 1
    assert 0 < min_pad_coverage <= 1

@@ -138,8 +147,8 @@ def compute_partial_slices(n_samples: int,
    # librosa's function to compute num_frames from num_samples
    n_frames = int(np.ceil((n_samples + 1) / hop_length))
    # frame shift between adjacent partials
    frame_step = max(
        1, int(np.round(partial_utterance_n_frames * (1 - overlap))))
    frame_step = max(1,
                     int(np.round(partial_utterance_n_frames * (1 - overlap))))

    # Compute the slices
    wav_slices, mel_slices = [], []
@@ -57,7 +57,7 @@ def _process_speaker(speaker_dir: Path,
        try:
            with sources_fpath.open("rt") as sources_file:
                existing_names = {line.split(",")[0] for line in sources_file}
        except:
        except Exception as e:
            existing_names = {}
    else:
        existing_names = {}

@@ -114,9 +114,7 @@ def process_librispeech(processor,
                    output_dir, "*.flac", skip_existing)


def process_voxceleb1(processor,
                      datasets_root,
                      output_dir,
def process_voxceleb1(processor, datasets_root, output_dir,
                      skip_existing=False):
    dataset_name = "VoxCeleb1"
    dataset_root = datasets_root / dataset_name

@@ -126,10 +124,7 @@ def process_voxceleb1(processor,
    metadata = [line.strip().split("\t") for line in metafile][1:]

    # speaker id -> nationality
    nationalities = {
        line[0]: line[3]
        for line in metadata if line[-1] == "dev"
    }
    nationalities = {line[0]: line[3] for line in metadata if line[-1] == "dev"}
    keep_speaker_ids = [
        speaker_id for speaker_id, nationality in nationalities.items()
        if nationality.lower() in anglophone_nationalites

@@ -147,9 +142,7 @@ def process_voxceleb1(processor,
                    output_dir, "*.wav", skip_existing)


def process_voxceleb2(processor,
                      datasets_root,
                      output_dir,
def process_voxceleb2(processor, datasets_root, output_dir,
                      skip_existing=False):
    dataset_name = "VoxCeleb2"
    dataset_root = datasets_root / dataset_name

@@ -171,9 +164,7 @@ def process_aidatatang_200zh(processor,
                    output_dir, "*.wav", skip_existing)


def process_magicdata(processor,
                      datasets_root,
                      output_dir,
def process_magicdata(processor, datasets_root, output_dir,
                      skip_existing=False):
    dataset_name = "magicdata/train"
    dataset_root = datasets_root / dataset_name
@ -52,7 +52,8 @@ if __name__ == "__main__":
|
|||
if not args.no_trim:
|
||||
try:
|
||||
import webrtcvad
|
||||
except:
|
||||
print(webrtcvad.__version__)
|
||||
except Exception as e:
|
||||
raise ModuleNotFoundError(
|
||||
"Package 'webrtcvad' not found. This package enables "
|
||||
"noise removal and is recommended. Please install and "
|
||||
|
@ -96,5 +97,5 @@ if __name__ == "__main__":
|
|||
|
||||
for dataset in args.datasets:
|
||||
print("Preprocessing %s" % dataset)
|
||||
preprocess_func[dataset](processor, args.datasets_root,
|
||||
args.output_dir, args.skip_existing)
|
||||
preprocess_func[dataset](processor, args.datasets_root, args.output_dir,
|
||||
args.skip_existing)
|
||||
|
|
|
@@ -83,12 +83,11 @@ class Ge2eExperiment(ExperimentBase):
        self.logger.info(msg)

        if dist.get_rank() == 0:
            self.visualizer.add_scalar("train/loss", loss_value,
                                       self.iteration)
            self.visualizer.add_scalar("train/loss", loss_value, self.iteration)
            self.visualizer.add_scalar("train/eer", eer, self.iteration)
            self.visualizer.add_scalar(
                "param/w",
                float(self.model_core.similarity_weight), self.iteration)
            self.visualizer.add_scalar("param/w",
                                       float(self.model_core.similarity_weight),
                                       self.iteration)
            self.visualizer.add_scalar("param/b",
                                       float(self.model_core.similarity_bias),
                                       self.iteration)
@@ -27,10 +27,14 @@ class Clip(object):
            aux_context_window=0, ):
        """Initialize customized collater for DataLoader.

        Args:
            batch_max_steps (int): The maximum length of input signal in batch.
            hop_size (int): Hop size of auxiliary features.
            aux_context_window (int): Context window size for auxiliary feature conv.
        Parameters
        ----------
        batch_max_steps : int
            The maximum length of input signal in batch.
        hop_size : int
            Hop size of auxiliary features.
        aux_context_window : int
            Context window size for auxiliary feature conv.

        """
        if batch_max_steps % hop_size != 0:

@@ -49,14 +53,18 @@ class Clip(object):
    def __call__(self, examples):
        """Convert into batch tensors.

        Args:
            batch (list): list of tuple of the pair of audio and features. Audio shape
                (T, ), features shape (T', C).
        Parameters
        ----------
        batch : list
            list of tuple of the pair of audio and features. Audio shape (T, ), features shape (T', C).

        Returns:
            Tensor: Auxiliary feature batch (B, C, T'), where
                T = (T' - 2 * aux_context_window) * hop_size.
            Tensor: Target signal batch (B, 1, T).
        Returns
        ----------
        Tensor
            Auxiliary feature batch (B, C, T'), where
            T = (T' - 2 * aux_context_window) * hop_size.
        Tensor
            Target signal batch (B, 1, T).

        """
        # check length

@@ -93,15 +101,15 @@ class Clip(object):
    def _adjust_length(self, x, c):
        """Adjust the audio and feature lengths.

        Note:
            Basically we assume that the length of x and c are adjusted
            through preprocessing stage, but if we use other library processed
            features, this process will be needed.
        Note
        -------
        Basically we assume that the length of x and c are adjusted
        through preprocessing stage, but if we use other library processed
        features, this process will be needed.

        """
        if len(x) < c.shape[1] * self.hop_size:
            x = np.pad(x, (0, c.shape[1] * self.hop_size - len(x)),
                       mode="edge")
            x = np.pad(x, (0, c.shape[1] * self.hop_size - len(x)), mode="edge")

        # check the length is valid
        assert len(x) == c.shape[
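The docstring changes in this hunk (and the similar ones above) convert Google-style "Args:/Returns:" blocks into NumPy-style sections. A minimal hypothetical function showing the target convention, matching the repo's own underline style (illustration only, not part of the diff):

def scale(x, factor=2):
    """Scale a value.

    Parameters
    ----------
    x : float
        Value to scale.
    factor : int
        Multiplier applied to x.

    Returns
    ----------
    float
        The scaled value.
    """
    return x * factor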
@@ -17,18 +17,12 @@ import argparse
import logging
import os

import numpy as np
import yaml
import json
import jsonlines

import numpy as np
from parakeet.datasets.data_table import DataTable
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm

from parakeet.datasets.data_table import DataTable
from parakeet.utils.h5_utils import read_hdf5
from parakeet.utils.h5_utils import write_hdf5

from config import get_cfg_default
@@ -82,7 +82,7 @@ lambda_adv: 4.0 # Loss balancing coefficient.
###########################################################
#                  DATA LOADER SETTING                    #
###########################################################
batch_size: 6          # Batch size.
batch_size: 8          # Batch size.
batch_max_steps: 25500 # Length of each audio in batch. Make sure divisible by hop_size.
pin_memory: true       # Whether to pin memory in Pytorch DataLoader.
num_workers: 4         # Number of workers in Pytorch DataLoader.
@@ -15,18 +15,15 @@

import argparse
import logging
import os
from operator import itemgetter
from pathlib import Path

import numpy as np
import yaml
import jsonlines
import numpy as np
from parakeet.datasets.data_table import DataTable
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm

from parakeet.datasets.data_table import DataTable

from config import get_cfg_default
@@ -12,95 +12,37 @@
# See the License for the specific language governing permissions and
# limitations under the License.

from typing import List, Dict, Any
import soundfile as sf
import librosa
import numpy as np
import argparse
import yaml
import json
import jsonlines
import concurrent.futures
from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor
from pathlib import Path
import tqdm
from operator import itemgetter
from praatio import tgio
from typing import Any
from typing import Dict
from typing import List

import argparse
import jsonlines
import librosa
import logging
import numpy as np
import tqdm
from concurrent.futures import ThreadPoolExecutor
from parakeet.data.get_feats import LogMelFBank
from pathlib import Path
from praatio import tgio

from config import get_cfg_default


def logmelfilterbank(audio,
                     sr,
                     n_fft=1024,
                     hop_length=256,
                     win_length=None,
                     window="hann",
                     n_mels=80,
                     fmin=None,
                     fmax=None,
                     eps=1e-10):
    """Compute log-Mel filterbank feature.

    Parameters
    ----------
    audio : ndarray
        Audio signal (T,).
    sr : int
        Sampling rate.
    n_fft : int
        FFT size. (Default value = 1024)
    hop_length : int
        Hop size. (Default value = 256)
    win_length : int
        Window length. If set to None, it will be the same as fft_size. (Default value = None)
    window : str
        Window function type. (Default value = "hann")
    n_mels : int
        Number of mel basis. (Default value = 80)
    fmin : int
        Minimum frequency in mel basis calculation. (Default value = None)
    fmax : int
        Maximum frequency in mel basis calculation. (Default value = None)
    eps : float
        Epsilon value to avoid inf in log calculation. (Default value = 1e-10)

    Returns
    -------
    np.ndarray
        Log Mel filterbank feature (#frames, num_mels).

    """
    # get amplitude spectrogram
    x_stft = librosa.stft(
        audio,
        n_fft=n_fft,
        hop_length=hop_length,
        win_length=win_length,
        window=window,
        pad_mode="reflect")
    spc = np.abs(x_stft)  # (#bins, #frames,)

    # get mel basis
    fmin = 0 if fmin is None else fmin
    fmax = sr / 2 if fmax is None else fmax
    mel_basis = librosa.filters.mel(sr, n_fft, n_mels, fmin, fmax)

    return np.log10(np.maximum(eps, np.dot(mel_basis, spc)))
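The removed logmelfilterbank helper can still be exercised standalone; a minimal sketch, assuming the function above is in scope and using a hypothetical test input (not part of the diff):

import numpy as np

sr = 24000
t = np.linspace(0, 1, sr, endpoint=False)
audio = (0.5 * np.sin(2 * np.pi * 440 * t)).astype(np.float32)  # 1 s test tone

mel = logmelfilterbank(audio, sr, n_fft=1024, hop_length=256, n_mels=80)
# Returns (n_mels, #frames); the old script saved logmel.T, i.e. (#frames, n_mels).
print(mel.shape)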
def process_sentence(config: Dict[str, Any],
                     fp: Path,
                     alignment_fp: Path,
                     output_dir: Path):
                     output_dir: Path,
                     mel_extractor=None):
    utt_id = fp.stem

    # reading
    y, sr = librosa.load(str(fp), sr=config.sr)  # resampling may occur
    assert len(y.shape) == 1, f"{utt_id} is not a mono-channel audio."
    assert np.abs(y).max(
    ) <= 1.0, f"{utt_id} seems to be different from 16 bit PCM."
    assert np.abs(
        y).max() <= 1.0, f"{utt_id} seems to be different from 16 bit PCM."
    duration = librosa.get_duration(y, sr=sr)

    # trim according to the alignment file
@@ -134,22 +76,14 @@ def process_sentence(config: Dict[str, Any],
        frame_length=config.trim_frame_length,
        hop_length=config.trim_hop_length)

    logmel = logmelfilterbank(
        y,
        sr=sr,
        n_fft=config.n_fft,
        window=config.window,
        win_length=config.win_length,
        hop_length=config.hop_length,
        n_mels=config.n_mels,
        fmin=config.fmin,
        fmax=config.fmax)
    # extract mel feats
    logmel = mel_extractor.get_log_mel_fbank(y)

    # adjust time to make num_samples == num_frames * hop_length
    num_frames = logmel.shape[1]
    num_frames = logmel.shape[0]
    if y.size < num_frames * config.hop_length:
        y = np.pad(y, (0, num_frames * config.hop_length - y.size),
                   mode="reflect")
        y = np.pad(
            y, (0, num_frames * config.hop_length - y.size), mode="reflect")
    else:
        y = y[:num_frames * config.hop_length]
    num_sample = y.shape[0]

@@ -157,7 +91,7 @@ def process_sentence(config: Dict[str, Any],
    mel_path = output_dir / (utt_id + "_feats.npy")
    wav_path = output_dir / (utt_id + "_wave.npy")
    np.save(wav_path, y)  # (num_samples, )
    np.save(mel_path, logmel.T)  # (num_frames, n_mels)
    np.save(mel_path, logmel)  # (num_frames, n_mels)
    record = {
        "utt_id": utt_id,
        "num_samples": num_sample,

@@ -172,19 +106,22 @@ def process_sentences(config,
                      fps: List[Path],
                      alignment_fps: List[Path],
                      output_dir: Path,
                      mel_extractor=None,
                      nprocs: int=1):
    if nprocs == 1:
        results = []
        for fp, alignment_fp in tqdm.tqdm(zip(fps, alignment_fps)):
            results.append(
                process_sentence(config, fp, alignment_fp, output_dir))
                process_sentence(config, fp, alignment_fp, output_dir,
                                 mel_extractor))
    else:
        with ThreadPoolExecutor(nprocs) as pool:
            futures = []
            with tqdm.tqdm(total=len(fps)) as progress:
                for fp, alignment_fp in zip(fps, alignment_fps):
                    future = pool.submit(process_sentence, config, fp,
                                         alignment_fp, output_dir)
                                         alignment_fp, output_dir,
                                         mel_extractor)
                    future.add_done_callback(lambda p: progress.update())
                    futures.append(future)

@@ -204,10 +141,7 @@ def main():
    parser = argparse.ArgumentParser(
        description="Preprocess audio and then extract features.")
    parser.add_argument(
        "--rootdir",
        default=None,
        type=str,
        help="directory to baker dataset.")
        "--rootdir", default=None, type=str, help="directory to baker dataset.")
    parser.add_argument(
        "--dumpdir",
        type=str,

@@ -260,24 +194,37 @@ def main():
    test_dump_dir = dumpdir / "test" / "raw"
    test_dump_dir.mkdir(parents=True, exist_ok=True)

    mel_extractor = LogMelFBank(
        sr=C.sr,
        n_fft=C.n_fft,
        hop_length=C.hop_length,
        win_length=C.win_length,
        window=C.window,
        n_mels=C.n_mels,
        fmin=C.fmin,
        fmax=C.fmax)

    # process for the 3 sections
    process_sentences(
        C,
        train_wav_files,
        train_alignment_files,
        train_dump_dir,
        mel_extractor=mel_extractor,
        nprocs=args.num_cpu)
    process_sentences(
        C,
        dev_wav_files,
        dev_alignment_files,
        dev_dump_dir,
        mel_extractor=mel_extractor,
        nprocs=args.num_cpu)
    process_sentences(
        C,
        test_wav_files,
        test_alignment_files,
        test_dump_dir,
        mel_extractor=mel_extractor,
        nprocs=args.num_cpu)
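The replacement path builds one LogMelFBank in main() and threads it through process_sentences to the workers. A minimal sketch of the same flow, with hypothetical config values (the parameter names match the constructor call above; "sample.wav" is a placeholder input):

import librosa
from parakeet.data.get_feats import LogMelFBank

mel_extractor = LogMelFBank(
    sr=24000, n_fft=2048, hop_length=300, win_length=1200,
    window="hann", n_mels=80, fmin=80, fmax=7600)  # hypothetical values

y, _ = librosa.load("sample.wav", sr=24000)   # hypothetical input file
logmel = mel_extractor.get_log_mel_fbank(y)   # (num_frames, n_mels), saved as-is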
@@ -20,17 +20,11 @@ from paddle.nn import Layer
from paddle.optimizer import Optimizer
from paddle.optimizer.lr import LRScheduler
from paddle.io import DataLoader
from paddle.io import DistributedBatchSampler
from timer import timer

from parakeet.datasets.data_table import DataTable
from parakeet.training.updaters.standard_updater import StandardUpdater, UpdaterState
from parakeet.training.extensions.evaluator import StandardEvaluator
from parakeet.training.trainer import Trainer
from parakeet.training.reporter import report
from parakeet.models.parallel_wavegan import PWGGenerator, PWGDiscriminator
from parakeet.modules.stft_loss import MultiResolutionSTFTLoss
from parakeet.utils.profile import synchronize


class PWGUpdater(StandardUpdater):
@@ -78,16 +72,17 @@ class PWGUpdater(StandardUpdater):
            wav_ = self.generator(noise, mel)
            logging.debug(f"Generator takes {t.elapse}s.")

        ## Multi-resolution stft loss
        # initialize
        gen_loss = 0.0

        ## Multi-resolution stft loss
        with timer() as t:
            sc_loss, mag_loss = self.criterion_stft(
                wav_.squeeze(1), wav.squeeze(1))
            sc_loss, mag_loss = self.criterion_stft(wav_, wav)
            logging.debug(f"Multi-resolution STFT loss takes {t.elapse}s.")

        report("train/spectral_convergence_loss", float(sc_loss))
        report("train/log_stft_magnitude_loss", float(mag_loss))
        gen_loss = sc_loss + mag_loss
        gen_loss += sc_loss + mag_loss

        ## Adversarial loss
        if self.state.iteration > self.discriminator_train_start_steps:

@@ -119,9 +114,9 @@ class PWGUpdater(StandardUpdater):
        p_ = self.discriminator(wav_.detach())
        real_loss = self.criterion_mse(p, paddle.ones_like(p))
        fake_loss = self.criterion_mse(p_, paddle.zeros_like(p_))
        dis_loss = real_loss + fake_loss
        report("train/real_loss", float(real_loss))
        report("train/fake_loss", float(fake_loss))
        dis_loss = real_loss + fake_loss
        report("train/discriminator_loss", float(dis_loss))

        self.optimizer_d.clear_grad()

@@ -164,8 +159,7 @@ class PWGEvaluator(StandardEvaluator):

        # stft loss
        with timer() as t:
            sc_loss, mag_loss = self.criterion_stft(
                wav_.squeeze(1), wav.squeeze(1))
            sc_loss, mag_loss = self.criterion_stft(wav_, wav)
            logging.debug(f"Multi-resolution STFT loss takes {t.elapse}s")

        report("eval/spectral_convergence_loss", float(sc_loss))

@@ -178,7 +172,7 @@ class PWGEvaluator(StandardEvaluator):
        p = self.discriminator(wav)
        real_loss = self.criterion_mse(p, paddle.ones_like(p))
        fake_loss = self.criterion_mse(p_, paddle.zeros_like(p_))
        dis_loss = real_loss + fake_loss
        report("eval/real_loss", float(real_loss))
        report("eval/fake_loss", float(fake_loss))
        dis_loss = real_loss + fake_loss
        report("eval/discriminator_loss", float(dis_loss))
@@ -12,34 +12,31 @@
# See the License for the specific language governing permissions and
# limitations under the License.

import os
import sys
from timer import timer
import logging
import argparse
import os
from pathlib import Path
from timer import timer

import yaml
import jsonlines
import paddle
import numpy as np
import paddle
import soundfile as sf
import yaml
from paddle import distributed as dist

from parakeet.datasets.data_table import DataTable
from parakeet.models.parallel_wavegan import PWGGenerator

from config import get_cfg_default

parser = argparse.ArgumentParser(
    description="synthesize with parallel wavegan.")
    description="Synthesize with parallel wavegan.")
parser.add_argument(
    "--config", type=str, help="config file to overwrite default config")
parser.add_argument("--checkpoint", type=str, help="snapshot to load")
parser.add_argument("--test-metadata", type=str, help="dev data")
parser.add_argument("--output-dir", type=str, help="output dir")
parser.add_argument("--device", type=str, default="gpu", help="device to run")
parser.add_argument("--verbose", type=int, default=1, help="verbose")
    "--config", type=str, help="config file to overwrite default config.")
parser.add_argument("--checkpoint", type=str, help="snapshot to load.")
parser.add_argument("--test-metadata", type=str, help="dev data.")
parser.add_argument("--output-dir", type=str, help="output dir.")
parser.add_argument("--device", type=str, default="gpu", help="device to run.")
parser.add_argument("--verbose", type=int, default=1, help="verbose.")

args = parser.parse_args()
config = get_cfg_default()

@@ -89,5 +86,5 @@ for example in test_dataset:
    print(
        f"{utt_id}, mel: {mel.shape}, wave: {wav.shape}, time: {t.elapse}s, Hz: {speed}, RTF: {config.sr / speed}."
    )
    sf.write(output_dir / (utt_id + ".wav"), wav, samplerate=config.sr)
    sf.write(str(output_dir / (utt_id + ".wav")), wav, samplerate=config.sr)
print(f"generation speed: {N / T}Hz, RTF: {config.sr / (N / T) }")
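The RTF (real-time factor) printed above is the ratio of the audio sample rate to the generation speed; RTF below 1 means faster than real time. A worked sketch of the same computation with hypothetical numbers (not part of the diff):

N = 72_000        # total samples generated
T = 2.0           # total wall-clock seconds spent generating
sr = 24000        # sample rate from the config
speed = N / T     # generation speed in Hz: 36,000 samples/s
rtf = sr / speed  # 24000 / 36000 ~= 0.67 -> about 1.5x faster than real time
print(f"generation speed: {speed}Hz, RTF: {rtf}")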
@@ -0,0 +1,5 @@
python3 synthesize.py \
    --config=conf/default.yaml \
    --checkpoint=exp/default/checkpoints/snapshot_iter_220000.pdz \
    --test-metadata=dump/test/norm/metadata.jsonl \
    --output-dir=exp/debug/test
@@ -0,0 +1,111 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import argparse
import os
import logging
from pathlib import Path

import librosa
import numpy as np
import paddle
import soundfile as sf
import yaml
from parakeet.data.get_feats import LogMelFBank
from parakeet.models.parallel_wavegan import PWGGenerator, PWGInference
from parakeet.modules.normalizer import ZScore

from config import get_cfg_default


def evaluate(args, config):
    # dataloader has been too verbose
    logging.getLogger("DataLoader").disabled = True

    vocoder = PWGGenerator(**config["generator_params"])
    state_dict = paddle.load(args.checkpoint)
    vocoder.set_state_dict(state_dict["generator_params"])
    vocoder.remove_weight_norm()
    vocoder.eval()
    print("model done!")

    stat = np.load(args.stat)
    mu, std = stat
    mu = paddle.to_tensor(mu)
    std = paddle.to_tensor(std)
    normalizer = ZScore(mu, std)

    pwg_inference = PWGInference(normalizer, vocoder)

    input_dir = Path(args.input_dir)
    output_dir = Path(args.output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    mel_extractor = LogMelFBank(
        sr=config.sr,
        n_fft=config.n_fft,
        hop_length=config.hop_length,
        win_length=config.win_length,
        window=config.window,
        n_mels=config.n_mels,
        fmin=config.fmin,
        fmax=config.fmax)

    for utt_name in os.listdir(input_dir):
        wav, _ = librosa.load(str(input_dir / utt_name), sr=config.sr)
        # extract mel feats
        mel = mel_extractor.get_log_mel_fbank(wav)
        mel = paddle.to_tensor(mel)
        gen_wav = pwg_inference(mel)
        sf.write(
            str(output_dir / ("gen_" + utt_name)),
            gen_wav.numpy(),
            samplerate=config.sr)
        print(f"{utt_name} done!")


def main():
    # parse args and config and redirect to evaluate
    parser = argparse.ArgumentParser(
        description="Synthesize with parallel wavegan.")

    parser.add_argument(
        "--config", type=str, help="config file to overwrite default config.")
    parser.add_argument("--checkpoint", type=str, help="snapshot to load.")
    parser.add_argument(
        "--stat",
        type=str,
        help="mean and standard deviation used to normalize spectrogram when training parallel wavegan."
    )
    parser.add_argument("--input-dir", type=str, help="input dir of wavs.")
    parser.add_argument("--output-dir", type=str, help="output dir.")
    parser.add_argument(
        "--device", type=str, default="gpu", help="device to run.")
    parser.add_argument("--verbose", type=int, default=1, help="verbose.")

    args = parser.parse_args()
    config = get_cfg_default()
    if args.config:
        config.merge_from_file(args.config)

    print("========Args========")
    print(yaml.safe_dump(vars(args)))
    print("========Config========")
    print(config)

    evaluate(args, config)


if __name__ == "__main__":
    main()
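The ZScore normalizer applies standard z-score normalization to the mel input using the training-set statistics stored in the --stat file. A minimal sketch of the idea in plain NumPy (the actual parakeet.modules.normalizer implementation may differ):

import numpy as np

def zscore(x, mu, std):
    # Standardize features to match training-time statistics.
    return (x - mu) / std

mel = np.random.rand(100, 80).astype(np.float32)      # hypothetical (frames, n_mels) features
stat = np.stack([mel.mean(axis=0), mel.std(axis=0)])  # what the stats file holds: mean, std
normalized = zscore(mel, stat[0], stat[1])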
@@ -12,36 +12,29 @@
# See the License for the specific language governing permissions and
# limitations under the License.

import os
import sys
import logging
import argparse
import dataclasses
from pathlib import Path
import os
import logging

import yaml
import jsonlines
import paddle
import numpy as np
from paddle import nn
from paddle.nn import functional as F
import paddle
import yaml
from paddle import DataParallel
from paddle import distributed as dist
from paddle import nn
from paddle.io import DataLoader, DistributedBatchSampler
from paddle.optimizer import Adam  # No RAdam
from paddle.optimizer.lr import StepDecay
from paddle import DataParallel
from visualdl import LogWriter

from parakeet.datasets.data_table import DataTable
from parakeet.training.updater import UpdaterBase
from parakeet.training.trainer import Trainer
from parakeet.training.reporter import report
from parakeet.training import extension
from parakeet.training.extensions.snapshot import Snapshot
from parakeet.training.extensions.visualizer import VisualDL
from parakeet.models.parallel_wavegan import PWGGenerator, PWGDiscriminator
from parakeet.modules.stft_loss import MultiResolutionSTFTLoss
from parakeet.training.extensions.snapshot import Snapshot
from parakeet.training.extensions.visualizer import VisualDL
from parakeet.training.seeding import seed_everything
from parakeet.training.trainer import Trainer
from pathlib import Path
from visualdl import LogWriter

from batch_fn import Clip
from config import get_cfg_default

@@ -137,8 +130,7 @@ def train_sp(args, config):
        parameters=generator.parameters(),
        **config["generator_optimizer_params"])
    lr_schedule_d = StepDecay(**config["discriminator_scheduler_params"])
    gradient_clip_d = nn.ClipGradByGlobalNorm(config[
        "discriminator_grad_norm"])
    gradient_clip_d = nn.ClipGradByGlobalNorm(config["discriminator_grad_norm"])
    optimizer_d = Adam(
        learning_rate=lr_schedule_d,
        grad_clip=gradient_clip_d,

@@ -191,8 +183,7 @@ def train_sp(args, config):
        stop_trigger=(config.train_max_steps, "iteration"),
        out=output_dir, )

    trainer.extend(
        evaluator, trigger=(config.eval_interval_steps, 'iteration'))
    trainer.extend(evaluator, trigger=(config.eval_interval_steps, 'iteration'))
    if dist.get_rank() == 0:
        writer = LogWriter(str(trainer.out))
        trainer.extend(VisualDL(writer), trigger=(1, 'iteration'))

@@ -210,15 +201,15 @@ def main():
    parser = argparse.ArgumentParser(description="Train a ParallelWaveGAN "
                                     "model with Baker Mandarin TTS dataset.")
    parser.add_argument(
        "--config", type=str, help="config file to overwrite default config")
    parser.add_argument("--train-metadata", type=str, help="training data")
    parser.add_argument("--dev-metadata", type=str, help="dev data")
    parser.add_argument("--output-dir", type=str, help="output dir")
        "--config", type=str, help="config file to overwrite default config.")
    parser.add_argument("--train-metadata", type=str, help="training data.")
    parser.add_argument("--dev-metadata", type=str, help="dev data.")
    parser.add_argument("--output-dir", type=str, help="output dir.")
    parser.add_argument(
        "--device", type=str, default="gpu", help="device type to use")
        "--device", type=str, default="gpu", help="device type to use.")
    parser.add_argument(
        "--nprocs", type=int, default=1, help="number of processes")
    parser.add_argument("--verbose", type=int, default=1, help="verbose")
        "--nprocs", type=int, default=1, help="number of processes.")
    parser.add_argument("--verbose", type=int, default=1, help="verbose.")

    args = parser.parse_args()
    if args.device == "cpu" and args.nprocs > 1:
@@ -22,8 +22,7 @@ def collate_baker_examples(examples):
    tones = [np.array(item["tones"], dtype=np.int64) for item in examples]
    feats = [np.array(item["feats"], dtype=np.float32) for item in examples]
    durations = [
        np.array(
            item["durations"], dtype=np.int64) for item in examples
        np.array(item["durations"], dtype=np.int64) for item in examples
    ]
    num_phones = np.array([item["num_phones"] for item in examples])
    num_frames = np.array([item["num_frames"] for item in examples])
@@ -15,21 +15,14 @@

import argparse
import logging
import os
from pathlib import Path

import numpy as np
import yaml
import json
import jsonlines

import numpy as np
from parakeet.datasets.data_table import DataTable
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm

from parakeet.datasets.data_table import DataTable
from parakeet.utils.h5_utils import read_hdf5
from parakeet.utils.h5_utils import write_hdf5

from config import get_cfg_default
@@ -17,7 +17,6 @@ from pathlib import Path

import numpy as np
import paddle
import pypinyin
from pypinyin import lazy_pinyin, Style
import jieba
import phkit
@@ -15,9 +15,8 @@
import argparse
from pathlib import Path

import numpy as np
from paddle import inference
import soundfile as sf
from paddle import inference

from frontend import text_analysis


@@ -73,8 +72,8 @@ def main():

    speedyspeech_predictor.run()
    output_names = speedyspeech_predictor.get_output_names()
    output_handle = speedyspeech_predictor.get_output_handle(output_names[
        0])
    output_handle = speedyspeech_predictor.get_output_handle(
        output_names[0])
    output_data = output_handle.copy_to_cpu()

    input_names = pwg_predictor.get_input_names()
@@ -15,19 +15,16 @@

import argparse
import logging
import os
from copy import copy
from operator import itemgetter
from pathlib import Path

import numpy as np
import yaml
import jsonlines
import numpy as np
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm

from parakeet.frontend.vocab import Vocab
from parakeet.datasets.data_table import DataTable
from parakeet.frontend.vocab import Vocab

from config import get_cfg_default
@@ -100,7 +97,10 @@ def main():
for item in metadata:
item["feats"] = str(metadata_dir / item["feats"])

dataset = DataTable(metadata, converters={'feats': np.load, })
dataset = DataTable(
metadata, converters={
'feats': np.load,
})
logging.info(f"The number of files = {len(dataset)}.")

# restore scaler
@@ -12,97 +12,39 @@
# See the License for the specific language governing permissions and
# limitations under the License.

from typing import List, Dict, Any
import soundfile as sf
import librosa
import numpy as np
import argparse
import yaml
import json
import re
import jsonlines
import concurrent.futures
from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor
from pathlib import Path
import tqdm
from operator import itemgetter
from praatio import tgio
from typing import Any
from typing import Dict
from typing import List

import argparse
import jsonlines
import librosa
import logging
import numpy as np
import re
import tqdm
from concurrent.futures import ThreadPoolExecutor
from parakeet.data.get_feats import LogMelFBank
from pathlib import Path
from praatio import tgio

from config import get_cfg_default
from tg_utils import validate_textgrid


def logmelfilterbank(audio,
sr,
n_fft=1024,
hop_length=256,
win_length=None,
window="hann",
n_mels=80,
fmin=None,
fmax=None,
eps=1e-10):
"""Compute log-Mel filterbank feature.

Parameters
----------
audio : ndarray
Audio signal (T,).
sr : int
Sampling rate.
n_fft : int
FFT size. (Default value = 1024)
hop_length : int
Hop size. (Default value = 256)
win_length : int
Window length. If set to None, it will be the same as fft_size. (Default value = None)
window : str
Window function type. (Default value = "hann")
n_mels : int
Number of mel basis. (Default value = 80)
fmin : int
Minimum frequency in mel basis calculation. (Default value = None)
fmax : int
Maximum frequency in mel basis calculation. (Default value = None)
eps : float
Epsilon value to avoid inf in log calculation. (Default value = 1e-10)

Returns
-------
np.ndarray
Log Mel filterbank feature (#frames, num_mels).

"""
# get amplitude spectrogram
x_stft = librosa.stft(
audio,
n_fft=n_fft,
hop_length=hop_length,
win_length=win_length,
window=window,
pad_mode="reflect")
spc = np.abs(x_stft)  # (#bins, #frames,)

# get mel basis
fmin = 0 if fmin is None else fmin
fmax = sr / 2 if fmax is None else fmax
mel_basis = librosa.filters.mel(sr, n_fft, n_mels, fmin, fmax)

return np.log10(np.maximum(eps, np.dot(mel_basis, spc)))


def process_sentence(config: Dict[str, Any],
fp: Path,
alignment_fp: Path,
output_dir: Path):
output_dir: Path,
mel_extractor=None):
utt_id = fp.stem

# reading
y, sr = librosa.load(fp, sr=config.sr)  # resampling may occur
y, sr = librosa.load(str(fp), sr=config.sr)  # resampling may occur
assert len(y.shape) == 1, f"{utt_id} is not a mono-channel audio."
assert np.abs(y).max(
) <= 1.0, f"{utt_id} seems to be different from 16 bit PCM."
assert np.abs(
y).max() <= 1.0, f"{utt_id} seems to be different from 16 bit PCM."
duration = librosa.get_duration(y, sr=sr)

# intervals with empty labels are ignored
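The hunk above retires the local `logmelfilterbank` helper in favor of the shared `LogMelFBank` extractor from `parakeet.data.get_feats`. For reference, a minimal self-contained sketch of the computation the old helper performed, using only librosa and numpy (the 24 kHz rate and the test tone are illustrative values, not repo config):

```python
import numpy as np
import librosa

def log_mel_fbank(y, sr, n_fft=1024, hop_length=256, n_mels=80, eps=1e-10):
    # amplitude spectrogram, shape (n_fft // 2 + 1, num_frames)
    spc = np.abs(
        librosa.stft(y, n_fft=n_fft, hop_length=hop_length,
                     pad_mode="reflect"))
    # mel basis, shape (n_mels, n_fft // 2 + 1)
    mel_basis = librosa.filters.mel(sr=sr, n_fft=n_fft, n_mels=n_mels)
    # clamp before log10 so silent frames do not produce -inf
    return np.log10(np.maximum(eps, np.dot(mel_basis, spc)))

sr = 24000  # illustrative sample rate
y = np.sin(2 * np.pi * 440 * np.arange(sr) / sr).astype(np.float32)
print(log_mel_fbank(y, sr).shape)  # (80, num_frames)
```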
@@ -125,16 +67,8 @@ def process_sentence(config: Dict[str, Any],
f" There is something wrong with the last interval {last} in utterance: {utt_id}"
)

logmel = logmelfilterbank(
y,
sr=sr,
n_fft=config.n_fft,
window=config.window,
win_length=config.win_length,
hop_length=config.hop_length,
n_mels=config.n_mels,
fmin=config.fmin,
fmax=config.fmax)
# extract mel feats
logmel = mel_extractor.get_log_mel_fbank(y)

# extract phone and duration
phones = []
@@ -162,7 +96,7 @@ def process_sentence(config: Dict[str, Any],
ends, sr=sr, hop_length=config.hop_length)
durations_frame = np.diff(frame_pos, prepend=0)

num_frames = logmel.shape[-1]  # number of frames of the spectrogram
num_frames = logmel.shape[0]  # number of frames of the spectrogram
extra = np.sum(durations_frame) - num_frames
assert extra <= 0, (
f"Number of frames inferred from alignment is "

@@ -173,7 +107,7 @@ def process_sentence(config: Dict[str, Any],
durations_frame = durations_frame.tolist()

mel_path = output_dir / (utt_id + "_feats.npy")
np.save(mel_path, logmel.T)  # (num_frames, n_mels)
np.save(mel_path, logmel)  # (num_frames, n_mels)
record = {
"utt_id": utt_id,
"phones": phones,
@@ -190,20 +124,23 @@ def process_sentences(config,
fps: List[Path],
alignment_fps: List[Path],
output_dir: Path,
mel_extractor=None,
nprocs: int=1):
if nprocs == 1:
results = []
for fp, alignment_fp in tqdm.tqdm(
zip(fps, alignment_fps), total=len(fps)):
results.append(
process_sentence(config, fp, alignment_fp, output_dir))
process_sentence(config, fp, alignment_fp, output_dir,
mel_extractor))
else:
with ThreadPoolExecutor(nprocs) as pool:
futures = []
with tqdm.tqdm(total=len(fps)) as progress:
for fp, alignment_fp in zip(fps, alignment_fps):
future = pool.submit(process_sentence, config, fp,
alignment_fp, output_dir)
alignment_fp, output_dir,
mel_extractor)
future.add_done_callback(lambda p: progress.update())
futures.append(future)
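The change threads `mel_extractor` through both branches of `process_sentences`. The thread-pool branch pairs `pool.submit` with a tqdm done-callback so the bar ticks as tasks finish; a minimal sketch of that pattern in isolation (the `work` function and its inputs are illustrative stand-ins, not repo code):

```python
import tqdm
from concurrent.futures import ThreadPoolExecutor

def work(item):
    return item * 2  # stand-in for per-utterance feature extraction

items = list(range(100))
with ThreadPoolExecutor(4) as pool:
    with tqdm.tqdm(total=len(items)) as progress:
        futures = []
        for item in items:
            future = pool.submit(work, item)
            # tick the bar whenever any task completes
            future.add_done_callback(lambda p: progress.update())
            futures.append(future)
        results = [f.result() for f in futures]
print(sum(results))
```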
@@ -227,10 +164,7 @@ def main():
parser = argparse.ArgumentParser(
description="Preprocess audio and then extract features.")
parser.add_argument(
"--rootdir",
default=None,
type=str,
help="directory to baker dataset.")
"--rootdir", default=None, type=str, help="directory to baker dataset.")
parser.add_argument(
"--dumpdir",
type=str,

@@ -288,24 +222,37 @@ def main():
test_dump_dir = dumpdir / "test" / "raw"
test_dump_dir.mkdir(parents=True, exist_ok=True)

mel_extractor = LogMelFBank(
sr=C.sr,
n_fft=C.n_fft,
hop_length=C.hop_length,
win_length=C.win_length,
window=C.window,
n_mels=C.n_mels,
fmin=C.fmin,
fmax=C.fmax)

# process for the 3 sections
process_sentences(
C,
train_wav_files,
train_alignment_files,
train_dump_dir,
mel_extractor=mel_extractor,
nprocs=args.num_cpu)
process_sentences(
C,
dev_wav_files,
dev_alignment_files,
dev_dump_dir,
mel_extractor=mel_extractor,
nprocs=args.num_cpu)
process_sentences(
C,
test_wav_files,
test_alignment_files,
test_dump_dir,
mel_extractor=mel_extractor,
nprocs=args.num_cpu)
@@ -13,15 +13,13 @@
# limitations under the License.

import paddle
from paddle.nn import functional as F
from paddle.fluid.layers import huber_loss

from parakeet.modules.ssim import ssim
from paddle.nn import functional as F
from parakeet.modules.losses import masked_l1_loss, weighted_mean
from parakeet.modules.ssim import ssim
from parakeet.training.extensions.evaluator import StandardEvaluator
from parakeet.training.reporter import report
from parakeet.training.updaters.standard_updater import StandardUpdater
from parakeet.training.extensions.evaluator import StandardEvaluator
from parakeet.models.speedyspeech import SpeedySpeech


class SpeedySpeechUpdater(StandardUpdater):
@@ -11,30 +11,25 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os
import sys
import logging
import argparse
import dataclasses
from pathlib import Path

import yaml
import jsonlines
import paddle
import numpy as np
import soundfile as sf
import paddle
from paddle import nn
from paddle.nn import functional as F
from paddle import distributed as dist
import yaml
from paddle import jit
from paddle.static import InputSpec
from yacs.config import CfgNode

from parakeet.datasets.data_table import DataTable
from parakeet.models.speedyspeech import SpeedySpeech, SpeedySpeechInference
from parakeet.models.parallel_wavegan import PWGGenerator, PWGInference
from parakeet.models.speedyspeech import SpeedySpeech
from parakeet.models.speedyspeech import SpeedySpeechInference
from parakeet.models.parallel_wavegan import PWGGenerator
from parakeet.models.parallel_wavegan import PWGInference
from parakeet.modules.normalizer import ZScore
@@ -79,9 +74,8 @@ def evaluate(args, speedyspeech_config, pwg_config):
speedyspeech_inference = jit.to_static(
speedyspeech_inference,
input_spec=[
InputSpec(
[-1], dtype=paddle.int64), InputSpec(
[-1], dtype=paddle.int64)
InputSpec([-1], dtype=paddle.int64), InputSpec(
[-1], dtype=paddle.int64)
])
paddle.jit.save(speedyspeech_inference,
os.path.join(args.inference_dir, "speedyspeech"))

@@ -91,9 +85,9 @@ def evaluate(args, speedyspeech_config, pwg_config):
pwg_inference = PWGInference(pwg_normalizer, vocoder)
pwg_inference.eval()
pwg_inference = jit.to_static(
pwg_inference,
input_spec=[InputSpec(
[-1, 80], dtype=paddle.float32), ])
pwg_inference, input_spec=[
InputSpec([-1, 80], dtype=paddle.float32),
])
paddle.jit.save(pwg_inference, os.path.join(args.inference_dir, "pwg"))
pwg_inference = paddle.jit.load(os.path.join(args.inference_dir, "pwg"))
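Both hunks only rewrap calls to `jit.to_static` before `paddle.jit.save`; the `InputSpec` shapes are unchanged ([-1] token ids for the acoustic model, [-1, 80] mel frames for the vocoder). A hedged sketch of the same export-and-reload round trip, with a toy layer standing in for `PWGInference` (`TinyNet` and the output path are made up for illustration):

```python
import paddle
from paddle import jit, nn
from paddle.static import InputSpec

class TinyNet(nn.Layer):
    # stand-in for PWGInference: maps 80-dim mel frames to samples
    def __init__(self):
        super().__init__()
        self.fc = nn.Linear(80, 1)

    def forward(self, mel):
        return self.fc(mel)

net = TinyNet()
net.eval()
# trace with a variable-length time axis and a fixed 80-bin mel axis
static_net = jit.to_static(
    net, input_spec=[InputSpec([-1, 80], dtype=paddle.float32)])
paddle.jit.save(static_net, "/tmp/tiny_pwg")
reloaded = paddle.jit.load("/tmp/tiny_pwg")
print(reloaded(paddle.randn([100, 80])).shape)  # [100, 1]
```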
@@ -119,9 +113,7 @@ def main():
parser = argparse.ArgumentParser(
description="Synthesize with speedyspeech & parallel wavegan.")
parser.add_argument(
"--speedyspeech-config",
type=str,
help="config file for speedyspeech.")
"--speedyspeech-config", type=str, help="config file for speedyspeech.")
parser.add_argument(
"--speedyspeech-checkpoint",
type=str,
@@ -1,6 +1,6 @@
python synthesize.py \
--speedyspeech-config=conf/default.yaml \
--speedyspeech-checkpoint=exp/debug/checkpoints/snapshot_iter_91800.pdz \
--speedyspeech-checkpoint=exp/default/checkpoints/snapshot_iter_91800.pdz \
--speedyspeech-stat=dump/train/stats.npy \
--pwg-config=../../parallelwave_gan/baker/conf/default.yaml \
--pwg-params=../../parallelwave_gan/baker/converted.pdparams \
@@ -13,28 +13,22 @@
# limitations under the License.

import os
import sys
import logging
import argparse
import dataclasses
from pathlib import Path

import yaml
import jsonlines
import paddle
import numpy as np
import soundfile as sf
import paddle
import yaml
from paddle import jit
from paddle.static import InputSpec
from paddle import nn
from paddle.nn import functional as F
from paddle import distributed as dist
from yacs.config import CfgNode

from parakeet.datasets.data_table import DataTable
from parakeet.models.speedyspeech import SpeedySpeech, SpeedySpeechInference
from parakeet.models.parallel_wavegan import PWGGenerator, PWGInference
from parakeet.models.speedyspeech import SpeedySpeech
from parakeet.models.speedyspeech import SpeedySpeechInference
from parakeet.models.parallel_wavegan import PWGGenerator
from parakeet.models.parallel_wavegan import PWGInference
from parakeet.modules.normalizer import ZScore

from frontend import text_analysis

@@ -57,8 +51,7 @@ def evaluate(args, speedyspeech_config, pwg_config):
model.eval()

vocoder = PWGGenerator(**pwg_config["generator_params"])
vocoder.set_state_dict(
paddle.load(args.pwg_checkpoint)["generator_params"])
vocoder.set_state_dict(paddle.load(args.pwg_checkpoint)["generator_params"])
vocoder.remove_weight_norm()
vocoder.eval()
print("model done!")
@@ -81,9 +74,8 @@ def evaluate(args, speedyspeech_config, pwg_config):
speedyspeech_inference = jit.to_static(
speedyspeech_inference,
input_spec=[
InputSpec(
[-1], dtype=paddle.int64), InputSpec(
[-1], dtype=paddle.int64)
InputSpec([-1], dtype=paddle.int64), InputSpec(
[-1], dtype=paddle.int64)
])
paddle.jit.save(speedyspeech_inference,
os.path.join(args.inference_dir, "speedyspeech"))

@@ -93,9 +85,9 @@ def evaluate(args, speedyspeech_config, pwg_config):
pwg_inference = PWGInference(pwg_normalizer, vocoder)
pwg_inference.eval()
pwg_inference = jit.to_static(
pwg_inference,
input_spec=[InputSpec(
[-1, 80], dtype=paddle.float32), ])
pwg_inference, input_spec=[
InputSpec([-1, 80], dtype=paddle.float32),
])
paddle.jit.save(pwg_inference, os.path.join(args.inference_dir, "pwg"))
pwg_inference = paddle.jit.load(os.path.join(args.inference_dir, "pwg"))

@@ -119,9 +111,7 @@ def main():
parser = argparse.ArgumentParser(
description="Synthesize with speedyspeech & parallel wavegan.")
parser.add_argument(
"--speedyspeech-config",
type=str,
help="config file for speedyspeech.")
"--speedyspeech-config", type=str, help="config file for speedyspeech.")
parser.add_argument(
"--speedyspeech-checkpoint",
type=str,
@@ -13,7 +13,6 @@
# limitations under the License.

import librosa
from praatio import tgio


def validate_textgrid(text_grid, num_samples, sr):
@@ -12,40 +12,31 @@
# See the License for the specific language governing permissions and
# limitations under the License.

import os
import sys
import logging
import argparse
import dataclasses
from pathlib import Path
import logging
import os

import yaml
import jsonlines
import paddle
import numpy as np
from paddle import nn
from paddle.nn import functional as F
import paddle
import yaml
from paddle import distributed as dist
from paddle import DataParallel
from paddle import nn
from paddle.io import DataLoader, DistributedBatchSampler
from paddle.optimizer import Adam  # No RAdam
from paddle.optimizer.lr import StepDecay
from paddle import DataParallel
from visualdl import LogWriter

from parakeet.datasets.data_table import DataTable
from parakeet.models.speedyspeech import SpeedySpeech

from parakeet.training.updater import UpdaterBase
from parakeet.training.trainer import Trainer
from parakeet.training.reporter import report
from parakeet.training import extension
from parakeet.training.extensions.snapshot import Snapshot
from parakeet.training.extensions.visualizer import VisualDL
from parakeet.training.seeding import seed_everything
from parakeet.training.trainer import Trainer
from pathlib import Path
from visualdl import LogWriter

from batch_fn import collate_baker_examples
from speedyspeech_updater import SpeedySpeechUpdater, SpeedySpeechEvaluator
from config import get_cfg_default
from speedyspeech_updater import SpeedySpeechUpdater, SpeedySpeechEvaluator


def train_sp(args, config):
@@ -81,7 +72,9 @@ def train_sp(args, config):
fields=[
"phones", "tones", "num_phones", "num_frames", "feats", "durations"
],
converters={"feats": np.load, }, )
converters={
"feats": np.load,
}, )
with jsonlines.open(args.dev_metadata, 'r') as reader:
dev_metadata = list(reader)
metadata_dir = Path(args.dev_metadata).parent

@@ -92,7 +85,9 @@ def train_sp(args, config):
fields=[
"phones", "tones", "num_phones", "num_frames", "feats", "durations"
],
converters={"feats": np.load, }, )
converters={
"feats": np.load,
}, )

# collate function and dataloader
train_sampler = DistributedBatchSampler(

@@ -100,10 +95,6 @@ def train_sp(args, config):
batch_size=config.batch_size,
shuffle=False,
drop_last=True)
# dev_sampler = DistributedBatchSampler(dev_dataset,
# batch_size=config.batch_size,
# shuffle=False,
# drop_last=False)
print("samplers done!")

train_dataloader = DataLoader(

@@ -123,7 +114,6 @@ def train_sp(args, config):
model = SpeedySpeech(**config["model"])
if world_size > 1:
model = DataParallel(model)  # TODO, do not use vocab size from config
# print(model)
print("model done!")
optimizer = Adam(
0.001,
@@ -154,15 +144,15 @@ def main():
parser = argparse.ArgumentParser(description="Train a Speedyspeech "
"model with Baker Mandarin TTS dataset.")
parser.add_argument(
"--config", type=str, help="config file to overwrite default config")
parser.add_argument("--train-metadata", type=str, help="training data")
parser.add_argument("--dev-metadata", type=str, help="dev data")
parser.add_argument("--output-dir", type=str, help="output dir")
"--config", type=str, help="config file to overwrite default config.")
parser.add_argument("--train-metadata", type=str, help="training data.")
parser.add_argument("--dev-metadata", type=str, help="dev data.")
parser.add_argument("--output-dir", type=str, help="output dir.")
parser.add_argument(
"--device", type=str, default="gpu", help="device type to use")
"--device", type=str, default="gpu", help="device type to use.")
parser.add_argument(
"--nprocs", type=int, default=1, help="number of processes")
parser.add_argument("--verbose", type=int, default=1, help="verbose")
"--nprocs", type=int, default=1, help="number of processes.")
parser.add_argument("--verbose", type=int, default=1, help="verbose.")

args, rest = parser.parse_known_args()
if args.device == "cpu" and args.nprocs > 1:
@@ -46,8 +46,7 @@ class LJSpeech(Dataset):
class LJSpeechCollector(object):
"""A simple callable to batch LJSpeech examples."""

def __init__(self, padding_idx=0, padding_value=0.,
padding_stop_token=1.0):
def __init__(self, padding_idx=0, padding_value=0., padding_stop_token=1.0):
self.padding_idx = padding_idx
self.padding_value = padding_value
self.padding_stop_token = padding_stop_token

@@ -63,8 +63,7 @@ def create_dataset(config, source_path, target_path, verbose=False):
with open(target_path / "metadata.pkl", 'wb') as f:
pickle.dump(records, f)
if verbose:
print("saved metadata into {}".format(target_path /
"metadata.pkl"))
print("saved metadata into {}".format(target_path / "metadata.pkl"))

print("Done.")
@@ -14,14 +14,13 @@

import time
from collections import defaultdict

import numpy as np

import paddle
from paddle.io import DataLoader
from paddle.io import DistributedBatchSampler
from paddle import distributed as dist
from paddle.io import DataLoader, DistributedBatchSampler

from parakeet.data import dataset
from parakeet.frontend import EnglishCharacter  # pylint: disable=unused-import
from parakeet.training.cli import default_argument_parser
from parakeet.training.experiment import ExperimentBase
from parakeet.utils import display, mp_tools

@@ -74,8 +73,7 @@ class Experiment(ExperimentBase):

if dist.get_rank() == 0:
for k, v in losses_np.items():
self.visualizer.add_scalar(f"train_loss/{k}", v,
self.iteration)
self.visualizer.add_scalar(f"train_loss/{k}", v, self.iteration)

@mp_tools.rank_zero_only
@paddle.no_grad()
@@ -65,8 +65,8 @@ def collate_aishell3_examples(examples):
text_lengths = np.array([item.shape[0] for item in phones], dtype=np.int64)
spec_lengths = np.array([item.shape[1] for item in mel], dtype=np.int64)
T_dec = np.max(spec_lengths)
stop_tokens = (np.arange(T_dec) >= np.expand_dims(spec_lengths, -1)
).astype(np.float32)
stop_tokens = (
np.arange(T_dec) >= np.expand_dims(spec_lengths, -1)).astype(np.float32)
phones, _ = batch_text_id(phones)
tones, _ = batch_text_id(tones)
mel, _ = batch_spec(mel)
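The reformatted `stop_tokens` expression builds the stop mask by broadcasting a frame index row against a column of per-utterance lengths. A small worked example of exactly that line:

```python
import numpy as np

spec_lengths = np.array([3, 5, 4], dtype=np.int64)
T_dec = np.max(spec_lengths)
# row i is 0.0 for valid frames and 1.0 from frame spec_lengths[i] onward
stop_tokens = (
    np.arange(T_dec) >= np.expand_dims(spec_lengths, -1)).astype(np.float32)
print(stop_tokens)
# [[0. 0. 0. 1. 1.]
#  [0. 0. 0. 0. 0.]
#  [0. 0. 0. 0. 1.]]
```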
@@ -121,8 +121,8 @@ def convert(syllable):
syllable = syllable.replace("ing", "ieng").replace("in", "ien")

# expansion for un, ui, iu
syllable = syllable.replace("un", "uen").replace(
"ui", "uei").replace("iu", "iou")
syllable = syllable.replace("un", "uen").replace("ui",
"uei").replace("iu", "iou")

# rule for variants of i
syllable = syllable.replace("zi", "zii").replace("ci", "cii").replace("si", "sii")\

@@ -68,8 +68,7 @@ def preprocess_aishell3(source_dir, target_dir, alignment_dir):
alignment_dir=alignment_dir)
with Pool(16) as p:
list(
tqdm(
p.imap(fx, wav_paths), total=len(wav_paths), unit="utterance"))
tqdm(p.imap(fx, wav_paths), total=len(wav_paths), unit="utterance"))


if __name__ == "__main__":
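The `convert` hunk only rewraps the chained `str.replace` calls; the transcription convention itself (un -> uen, ui -> uei, iu -> iou) is unchanged. A tiny runnable illustration (`expand_finals` is a made-up name for the excerpted step):

```python
def expand_finals(syllable):
    # expansion for un, ui, iu, as in the diff above
    syllable = syllable.replace("un", "uen").replace("ui",
                                                     "uei").replace("iu", "iou")
    return syllable

print(expand_finals("lun"), expand_finals("hui"), expand_finals("liu"))
# luen huei liou
```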
@@ -109,8 +109,7 @@ class Experiment(ExperimentBase):
mel_pred = outputs['mel_outputs_postnet']
self.visualizer.add_figure(
f"valid_sentence_{i}_predicted_spectrogram",
display.plot_spectrogram(mel_pred[0].numpy().T),
self.iteration)
display.plot_spectrogram(mel_pred[0].numpy().T), self.iteration)

# write visual log
valid_losses = {k: np.mean(v) for k, v in valid_losses.items()}

@@ -13,7 +13,6 @@
# limitations under the License.

import argparse
import re
from pathlib import Path
@@ -40,6 +40,7 @@ def get_avg_wer(raw_dict, ref_dict, frontend, output_dir):
raw_text = raw_dict[utt_id]
text = text_cleaner(raw_text)
g2p_phones = frontend.get_phonemes(text)
g2p_phones = sum(g2p_phones, [])
gt_phones = ref_dict[utt_id].split(" ")
# delete silence tokens in predicted phones and ground truth phones
g2p_phones = [phn for phn in g2p_phones if phn not in SILENCE_TOKENS]
@@ -53,10 +53,10 @@ class Transform(object):
ids, mel = example  # ids already have <s> and </s>
ids = np.array(ids, dtype=np.int64)
# add start and end frame
mel = np.pad(mel, [(0, 0), (1, 1)],
mode='constant',
constant_values=[(0, 0),
(self.start_value, self.end_value)])
mel = np.pad(
mel, [(0, 0), (1, 1)],
mode='constant',
constant_values=[(0, 0), (self.start_value, self.end_value)])
stop_labels = np.ones([mel.shape[1]], dtype=np.int64)
stop_labels[-1] = 2
# actually this thing can also be done within the model

@@ -64,8 +64,7 @@ def create_dataset(config, source_path, target_path, verbose=False):
with open(target_path / "metadata.pkl", 'wb') as f:
pickle.dump(records, f)
if verbose:
print("saved metadata into {}".format(target_path /
"metadata.pkl"))
print("saved metadata into {}".format(target_path / "metadata.pkl"))

# also save meta data into text format for inspection
with open(target_path / "metadata.txt", 'wt') as f:

@@ -73,8 +72,7 @@ def create_dataset(config, source_path, target_path, verbose=False):
phoneme_str = "|".join(phonemes)
f.write("{}\t{}\t{}\n".format(mel_name, text, phoneme_str))
if verbose:
print("saved metadata into {}".format(target_path /
"metadata.txt"))
print("saved metadata into {}".format(target_path / "metadata.txt"))

print("Done.")
@@ -60,7 +60,7 @@ def main(config, args):
display.plot_multilayer_multihead_alignments(attns)
plt.savefig(str(output_dir / f"sentence_{i}.png"))

mel_output = mel_output.T  #(C, T)
mel_output = mel_output.T  # (C, T)
np.save(str(output_dir / f"sentence_{i}"), mel_output)
if args.verbose:
print("spectrogram saved at {}".format(output_dir /

@@ -76,8 +76,7 @@ class TransformerTTSExperiment(ExperimentBase):
ljspeech_dataset = LJSpeech(args.data)
transform = Transform(config.data.mel_start_value,
config.data.mel_end_value)
ljspeech_dataset = dataset.TransformDataset(ljspeech_dataset,
transform)
ljspeech_dataset = dataset.TransformDataset(ljspeech_dataset, transform)
valid_set, train_set = dataset.split(ljspeech_dataset,
config.data.valid_size)
batch_fn = LJSpeechCollector(padding_idx=config.data.padding_idx)

@@ -159,8 +158,7 @@ class TransformerTTSExperiment(ExperimentBase):

if dist.get_rank() == 0:
for k, v in losses_np.items():
self.visualizer.add_scalar(f"train_loss/{k}", v,
self.iteration)
self.visualizer.add_scalar(f"train_loss/{k}", v, self.iteration)

@mp_tools.rank_zero_only
@paddle.no_grad()
@@ -90,8 +90,8 @@ def rule(C, V, R, T):
return None

# ua, uai, uang cannot be combined with d, t, n, l, r, z, c, s
if V in ['ua', 'uai', 'uang'
] and C in ['d', 't', 'n', 'l', 'r', 'z', 'c', 's']:
if V in ['ua', 'uai',
'uang'] and C in ['d', 't', 'n', 'l', 'r', 'z', 'c', 's']:
return None

# sh cannot be combined with ong
@@ -28,8 +28,8 @@ from config import get_cfg_defaults


class Transform(object):
def __init__(self, sample_rate, n_fft, win_length, hop_length, n_mels,
fmin, fmax):
def __init__(self, sample_rate, n_fft, win_length, hop_length, n_mels, fmin,
fmax):
self.sample_rate = sample_rate
self.n_fft = n_fft
self.win_length = win_length

@@ -79,11 +79,8 @@ class Transform(object):
spectrogram_magnitude = np.abs(spectrogram)

# Compute mel-spectrograms.
mel_filter_bank = librosa.filters.mel(sr=sr,
n_fft=n_fft,
n_mels=n_mels,
fmin=fmin,
fmax=fmax)
mel_filter_bank = librosa.filters.mel(
sr=sr, n_fft=n_fft, n_mels=n_mels, fmin=fmin, fmax=fmax)
mel_spectrogram = np.dot(mel_filter_bank, spectrogram_magnitude)

# log scale mel_spectrogram.
@@ -39,8 +39,7 @@ def main(config, args):
mel = np.load(str(file_path))
with paddle.amp.auto_cast():
audio = model.predict(mel)
audio_path = output_dir / (
os.path.splitext(file_path.name)[0] + ".wav")
audio_path = output_dir / (os.path.splitext(file_path.name)[0] + ".wav")
sf.write(audio_path, audio, config.data.sample_rate)
print("[synthesize] {} -> {}".format(file_path, audio_path))

@@ -114,8 +114,7 @@ class Experiment(ExperimentBase):
msg += "loss: {:>.6f}".format(loss_value)
self.logger.info(msg)
if dist.get_rank() == 0:
self.visualizer.add_scalar("train/loss", loss_value,
self.iteration)
self.visualizer.add_scalar("train/loss", loss_value, self.iteration)

@mp_tools.rank_zero_only
@paddle.no_grad()
@@ -13,6 +13,3 @@
# limitations under the License.

__version__ = "0.0.0"

import logging
from parakeet import audio, data, datasets, frontend, models, modules, training, utils

@@ -11,6 +11,3 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from .audio import AudioProcessor
from .spec_normalizer import NormalizerBase, LogMagnitude
@@ -11,10 +11,9 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import librosa
import soundfile as sf
import numpy as np
import soundfile as sf

__all__ = ["AudioProcessor"]
@@ -53,11 +52,12 @@ class AudioProcessor(object):
self.inv_mel_filter = np.linalg.pinv(self.mel_filter)

def _create_mel_filter(self):
mel_filter = librosa.filters.mel(self.sample_rate,
self.n_fft,
n_mels=self.n_mels,
fmin=self.fmin,
fmax=self.fmax)
mel_filter = librosa.filters.mel(
self.sample_rate,
self.n_fft,
n_mels=self.n_mels,
fmin=self.fmin,
fmax=self.fmax)
return mel_filter

def read_wav(self, filename):
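`AudioProcessor` keeps the mel filter alongside its Moore-Penrose pseudo-inverse (`np.linalg.pinv`) so mel spectrograms can be approximately mapped back to linear spectrograms. A minimal sketch of that pairing (the shapes and 22.05 kHz rate are illustrative):

```python
import numpy as np
import librosa

sr, n_fft, n_mels = 22050, 1024, 80
mel_filter = librosa.filters.mel(sr=sr, n_fft=n_fft, n_mels=n_mels)
# pseudo-inverse maps mel energies back to an approximate linear spectrogram
inv_mel_filter = np.linalg.pinv(mel_filter)

spec = np.abs(np.random.randn(n_fft // 2 + 1, 10))
approx = inv_mel_filter @ (mel_filter @ spec)
print(mel_filter.shape, inv_mel_filter.shape, approx.shape)
# (80, 513) (513, 80) (513, 10)
```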
@@ -13,19 +13,3 @@
# limitations under the License.
"""Parakeet's infrastructure for data processing.
"""
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from parakeet.data.dataset import *
from parakeet.data.batch import *
@@ -61,9 +61,10 @@ def batch_text_id(minibatch, pad_id=0, dtype=np.int64):
for example in minibatch:
pad_len = max_len - example.shape[0]
batch.append(
np.pad(example, [(0, pad_len)],
mode='constant',
constant_values=pad_id))
np.pad(
example, [(0, pad_len)],
mode='constant',
constant_values=pad_id))

return np.array(batch, dtype=dtype), np.array(lengths, dtype=np.int64)

@@ -103,9 +104,10 @@ def batch_wav(minibatch, pad_value=0., dtype=np.float32):
for example in minibatch:
pad_len = max_len - example.shape[-1]
batch.append(
np.pad(example, [(0, pad_len)],
mode='constant',
constant_values=pad_value))
np.pad(
example, [(0, pad_len)],
mode='constant',
constant_values=pad_value))
return np.array(batch, dtype=dtype), np.array(lengths, dtype=np.int64)

@@ -152,14 +154,16 @@ def batch_spec(minibatch, pad_value=0., time_major=False, dtype=np.float32):
pad_len = max_len - example.shape[time_idx]
if time_major:
batch.append(
np.pad(example, [(0, pad_len), (0, 0)],
mode='constant',
constant_values=pad_value))
np.pad(
example, [(0, pad_len), (0, 0)],
mode='constant',
constant_values=pad_value))
else:
batch.append(
np.pad(example, [(0, 0), (0, pad_len)],
mode='constant',
constant_values=pad_value))
np.pad(
example, [(0, 0), (0, pad_len)],
mode='constant',
constant_values=pad_value))
return np.array(batch, dtype=dtype), np.array(lengths, dtype=np.int64)
@@ -178,10 +182,8 @@ def batch_sequences(sequences, axis=0, pad_value=0):
for seq, length in zip(sequences, seq_lengths):
padding = [(0, 0)] * axis + [(0, max_length - length)] + [(0, 0)] * (
ndim - axis - 1)
padded_seq = np.pad(seq,
padding,
mode='constant',
constant_values=pad_value)
padded_seq = np.pad(
seq, padding, mode='constant', constant_values=pad_value)
padded_sequences.append(padded_seq)
batch = np.stack(padded_sequences)
return batch
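All four batching helpers above share one idea: right-pad every example to the batch maximum with `np.pad`, then stack. A condensed sketch of the 1-D id case (`pad_batch` is a made-up name, not a repo function):

```python
import numpy as np

def pad_batch(seqs, pad_id=0):
    # right-pad 1-D id sequences to the longest item in the batch
    lengths = [len(s) for s in seqs]
    max_len = max(lengths)
    batch = [
        np.pad(s, [(0, max_len - len(s))], mode='constant',
               constant_values=pad_id) for s in seqs
    ]
    return np.stack(batch), np.array(lengths, dtype=np.int64)

ids, lens = pad_batch([np.array([1, 2, 3]), np.array([4, 5])])
print(ids.tolist(), lens.tolist())  # [[1, 2, 3], [4, 5, 0]] [3, 2]
```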
@@ -11,9 +11,7 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import six
import paddle
from paddle.io import Dataset

__all__ = [
@@ -69,7 +67,7 @@ class CacheDataset(Dataset):
return len(self._dataset)

def __getitem__(self, i):
if not i in self._cache:
if i not in self._cache:
self._cache[i] = self._dataset[i]
return self._cache[i]

@@ -86,9 +84,8 @@ class TupleDataset(Dataset):
length = len(datasets[0])
for i, dataset in enumerate(datasets):
if len(dataset) != length:
raise ValueError(
"all the datasets should have the same length."
"dataset {} has a different length".format(i))
raise ValueError("all the datasets should have the same length."
"dataset {} has a different length".format(i))
self._datasets = datasets
self._length = length

@@ -115,7 +112,7 @@ class DictDataset(Dataset):
A compound dataset made from several datasets of the same length. An
example of the `DictDataset` is a dict of examples from the constituent
datasets.

WARNING: paddle does not have good support for DictDataset, because
every batch yielded from a DataLoader is a list, but it cannot be a dict.
So you have to provide a collate function because you cannot use the
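The `DictDataset` docstring warns that dict examples need an explicit collate function with paddle's DataLoader. A hedged minimal sketch of that arrangement (this toy dataset and `collate` are illustrations, not the repo classes):

```python
import numpy as np
from paddle.io import DataLoader, Dataset

class ToyDictDataset(Dataset):
    # zips several same-length arrays into dict examples
    def __init__(self, **arrays):
        self.arrays = arrays

    def __len__(self):
        return min(len(a) for a in self.arrays.values())

    def __getitem__(self, i):
        return {k: a[i] for k, a in self.arrays.items()}

def collate(examples):
    # turn a list of dicts back into a dict of batched arrays
    return {k: np.stack([e[k] for e in examples]) for k in examples[0]}

ds = ToyDictDataset(
    x=np.arange(8, dtype=np.float32), y=np.arange(8, dtype=np.int64))
loader = DataLoader(ds, batch_size=4, collate_fn=collate)
```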
@@ -11,14 +11,11 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import librosa
import numpy as np
import pyworld
from scipy.interpolate import interp1d

from config import get_cfg_default


class LogMelFBank():
def __init__(self,
@@ -42,17 +39,18 @@ class LogMelFBank():

# mel
self.n_mels = n_mels
self.fmin = fmin
self.fmax = fmax
self.fmin = 0 if fmin is None else fmin
self.fmax = sr / 2 if fmax is None else fmax

self.mel_filter = self._create_mel_filter()

def _create_mel_filter(self):
mel_filter = librosa.filters.mel(sr=self.sr,
n_fft=self.n_fft,
n_mels=self.n_mels,
fmin=self.fmin,
fmax=self.fmax)
mel_filter = librosa.filters.mel(
sr=self.sr,
n_fft=self.n_fft,
n_mels=self.n_mels,
fmin=self.fmin,
fmax=self.fmax)
return mel_filter

def _stft(self, wav):
@@ -123,11 +121,12 @@ class Pitch():
use_log_f0=True) -> np.array:
input = input.astype(np.float)
frame_period = 1000 * self.hop_length / self.sr
f0, timeaxis = pyworld.dio(input,
fs=self.sr,
f0_floor=self.f0min,
f0_ceil=self.f0max,
frame_period=frame_period)
f0, timeaxis = pyworld.dio(
input,
fs=self.sr,
f0_floor=self.f0min,
f0_ceil=self.f0max,
frame_period=frame_period)
f0 = pyworld.stonemask(input, f0, timeaxis, self.sr)
if use_continuous_f0:
f0 = self._convert_to_continuous_f0(f0)
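`Pitch` estimates F0 with pyworld's DIO algorithm and refines it with StoneMask; `frame_period` converts the hop size into milliseconds, which is what pyworld expects. A minimal sketch on a synthetic tone (sample rate, hop, and F0 bounds are illustrative; pyworld wants float64 input):

```python
import numpy as np
import pyworld

sr, hop_length = 24000, 300
x = np.sin(2 * np.pi * 220 * np.arange(sr) / sr)  # float64 by default
frame_period = 1000 * hop_length / sr  # hop size in milliseconds
f0, timeaxis = pyworld.dio(
    x, fs=sr, f0_floor=80.0, f0_ceil=400.0, frame_period=frame_period)
f0 = pyworld.stonemask(x, f0, timeaxis, sr)  # refine the coarse estimate
print(f0.shape, float(np.median(f0[f0 > 0])))  # median should be near 220 Hz
```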
@@ -197,8 +196,7 @@ class Energy():
input_power = np.abs(input_stft)**2
energy = np.sqrt(
np.clip(
np.sum(input_power, axis=0), a_min=1.0e-10, a_max=float(
'inf')))
np.sum(input_power, axis=0), a_min=1.0e-10, a_max=float('inf')))
return energy

def _average_by_duration(self, input: np.array, d: np.array) -> np.array:
@@ -217,41 +215,3 @@ class Energy():
if use_token_averaged_energy and duration is not None:
energy = self._average_by_duration(energy, duration)
return energy


if __name__ == "__main__":
C = get_cfg_default()
filename = "../raw_data/data/format.1/000001.flac"
wav, _ = librosa.load(filename, sr=C.fs)
mel_extractor = LogMelFBank(
sr=C.fs,
n_fft=C.n_fft,
hop_length=C.n_shift,
win_length=C.win_length,
window=C.window,
n_mels=C.n_mels,
fmin=C.fmin,
fmax=C.fmax, )
mel = mel_extractor.get_log_mel_fbank(wav)
print(mel)
print(mel.shape)

pitch_extractor = Pitch(
sr=C.fs, hop_length=C.n_shift, f0min=C.f0min, f0max=C.f0max)
duration = "2 8 8 8 12 11 10 13 11 10 18 9 12 10 12 11 5"
duration = np.array([int(x) for x in duration.split(" ")])
avg_f0 = pitch_extractor.get_pitch(wav, duration=duration)
print(avg_f0)
print(avg_f0.shape)

energy_extractor = Energy(
sr=C.fs,
n_fft=C.n_fft,
hop_length=C.n_shift,
win_length=C.win_length,
window=C.window)
duration = "2 8 8 8 12 11 10 13 11 10 18 9 12 10 12 11 5"
duration = np.array([int(x) for x in duration.split(" ")])
avg_energy = energy_extractor.get_energy(wav, duration=duration)
print(avg_energy)
print(avg_energy.sum())
@@ -11,6 +11,3 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from parakeet.datasets.common import *
from parakeet.datasets.ljspeech import *

@@ -11,14 +11,13 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from paddle.io import Dataset
import os
import librosa
from pathlib import Path
import numpy as np
from typing import List

import librosa
import numpy as np
from paddle.io import Dataset

__all__ = ["AudioSegmentDataset", "AudioDataset", "AudioFolderDataset"]
@@ -57,7 +56,7 @@ class AudioSegmentDataset(Dataset):


class AudioDataset(Dataset):
"""A simple dataset adaptor for the audio files.
Read -> trim silence -> normalize
"""
@@ -11,12 +11,12 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from typing import Union, Optional, Callable, Tuple, List, Dict, Any
from pathlib import Path
from multiprocessing import Manager
from typing import Any
from typing import Callable
from typing import Dict
from typing import List

import numpy as np
from paddle.io import Dataset
@@ -11,9 +11,9 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from pathlib import Path

from paddle.io import Dataset
from pathlib import Path

__all__ = ["LJSpeechMetaData"]
@@ -11,11 +11,3 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from parakeet.frontend.vocab import *
from parakeet.frontend.phonectic import *
from parakeet.frontend.punctuation import *
from parakeet.frontend.normalizer import *
from parakeet.frontend.cn_normalization import *
from parakeet.frontend.tone_sandhi import *
from parakeet.frontend.generate_lexicon import *
@@ -11,7 +11,6 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from parakeet.frontend.phonectic import Phonetics
"""
A phonology system with ARPABET symbols and limited punctuations. The G2P
@@ -200,8 +199,7 @@ class ARPABET(Phonetics):
The list of pronunciation id sequence.
"""
return self.numericalize(
self.phoneticize(
sentence, add_start_end=add_start_end))
self.phoneticize(sentence, add_start_end=add_start_end))

@property
def vocab_size(self):

@@ -217,9 +215,9 @@ class ARPABETWithStress(Phonetics):
'AO1', 'AO2', 'AW0', 'AW1', 'AW2', 'AY0', 'AY1', 'AY2', 'B', 'CH', 'D',
'DH', 'EH0', 'EH1', 'EH2', 'ER0', 'ER1', 'ER2', 'EY0', 'EY1', 'EY2',
'F', 'G', 'HH', 'IH0', 'IH1', 'IH2', 'IY0', 'IY1', 'IY2', 'JH', 'K',
'L', 'M', 'N', 'NG', 'OW0', 'OW1', 'OW2', 'OY0', 'OY1', 'OY2', 'P',
'R', 'S', 'SH', 'T', 'TH', 'UH0', 'UH1', 'UH2', 'UW0', 'UW1', 'UW2',
'V', 'W', 'Y', 'Z', 'ZH'
'L', 'M', 'N', 'NG', 'OW0', 'OW1', 'OW2', 'OY0', 'OY1', 'OY2', 'P', 'R',
'S', 'SH', 'T', 'TH', 'UH0', 'UH1', 'UH2', 'UW0', 'UW1', 'UW2', 'V',
'W', 'Y', 'Z', 'ZH'
]
punctuations = [',', '.', '?', '!']
symbols = phonemes + punctuations
@@ -294,8 +292,7 @@ class ARPABETWithStress(Phonetics):
The list of pronunciation id sequence.
"""
return self.numericalize(
self.phoneticize(
sentence, add_start_end=add_start_end))
self.phoneticize(sentence, add_start_end=add_start_end))

@property
def vocab_size(self):
@@ -11,17 +11,16 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import re

import jieba.posseg as psg
import numpy as np
import paddle
import re
from g2pM import G2pM
from parakeet.frontend.tone_sandhi import ToneSandhi
from parakeet.frontend.cn_normalization.text_normlization import TextNormalizer
from pypinyin import lazy_pinyin, Style
from pypinyin import lazy_pinyin
from pypinyin import Style

from parakeet.frontend.cn_normalization.text_normlization import TextNormalizer
from parakeet.frontend.generate_lexicon import generate_lexicon
from parakeet.frontend.tone_sandhi import ToneSandhi


class Frontend():
@@ -11,5 +11,3 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from parakeet.frontend.cn_normalization.text_normlization import *

@@ -11,10 +11,12 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import re

from .num import verbalize_cardinal, verbalize_digit, num2str, DIGITS
from .num import DIGITS
from .num import num2str
from .num import verbalize_cardinal
from .num import verbalize_digit


def _time_num2str(num_string: str) -> str:
@@ -11,9 +11,9 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import re
import string

from pypinyin.constants import SUPPORT_UCS4

# full-width / half-width conversion

@@ -32,10 +32,7 @@ F2H_DIGITS = {chr(ord(char) + 65248): char for char in string.digits}
H2F_DIGITS = {value: key for key, value in F2H_DIGITS.items()}

# full-width -> half-width punctuation mapping table (num: 32)
F2H_PUNCTUATIONS = {
chr(ord(char) + 65248): char
for char in string.punctuation
}
F2H_PUNCTUATIONS = {chr(ord(char) + 65248): char for char in string.punctuation}
# half-width -> full-width punctuation mapping table
H2F_PUNCTUATIONS = {value: key for key, value in F2H_PUNCTUATIONS.items()}
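The mapping tables above rely on full-width ASCII variants sitting exactly 0xFEE0 (65248) code points above their half-width counterparts. A two-line demonstration:

```python
import string

F2H_DIGITS = {chr(ord(c) + 65248): c for c in string.digits}
print(F2H_DIGITS['１'])  # -> '1'  (U+FF11 maps to U+0031)
print(''.join(F2H_DIGITS.get(c, c) for c in '２０２１年'))  # -> '2021年'
```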
@@ -15,7 +15,6 @@
Rules to verbalize numbers into Chinese characters.
https://zh.wikipedia.org/wiki/中文数字#現代中文
"""

import re
from collections import OrderedDict
from typing import List

@@ -11,7 +11,6 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import re

from .num import verbalize_digit
@@ -32,14 +31,12 @@ def phone2str(phone_string: str, mobile=True) -> str:
if mobile:
sp_parts = phone_string.strip('+').split()
result = ''.join(
[verbalize_digit(
part, alt_one=True) for part in sp_parts])
[verbalize_digit(part, alt_one=True) for part in sp_parts])
return result
else:
sil_parts = phone_string.split('-')
result = ''.join(
[verbalize_digit(
part, alt_one=True) for part in sil_parts])
[verbalize_digit(part, alt_one=True) for part in sil_parts])
return result

@@ -11,7 +11,6 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import re

from .num import num2str
@@ -11,16 +11,37 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import re
from typing import List

from .chronology import RE_TIME, RE_DATE, RE_DATE2
from .chronology import replace_time, replace_date, replace_date2
from .constants import F2H_ASCII_LETTERS, F2H_DIGITS, F2H_SPACE
from .num import RE_NUMBER, RE_FRAC, RE_PERCENTAGE, RE_RANGE, RE_INTEGER, RE_DEFAULT_NUM, RE_DECIMAL_NUM, RE_POSITIVE_QUANTIFIERS
from .num import replace_number, replace_frac, replace_percentage, replace_range, replace_default_num, replace_negative_num, replace_positive_quantifier
from .phonecode import RE_MOBILE_PHONE, RE_TELEPHONE, replace_phone, replace_mobile
from .chronology import RE_DATE
from .chronology import RE_DATE2
from .chronology import RE_TIME
from .chronology import replace_date
from .chronology import replace_date2
from .chronology import replace_time
from .constants import F2H_ASCII_LETTERS
from .constants import F2H_DIGITS
from .constants import F2H_SPACE
from .num import RE_DECIMAL_NUM
from .num import RE_DEFAULT_NUM
from .num import RE_FRAC
from .num import RE_INTEGER
from .num import RE_NUMBER
from .num import RE_PERCENTAGE
from .num import RE_POSITIVE_QUANTIFIERS
from .num import RE_RANGE
from .num import replace_default_num
from .num import replace_frac
from .num import replace_negative_num
from .num import replace_number
from .num import replace_percentage
from .num import replace_positive_quantifier
from .num import replace_range
from .phonecode import RE_MOBILE_PHONE
from .phonecode import RE_TELEPHONE
from .phonecode import replace_mobile
from .phonecode import replace_phone
from .quantifier import RE_TEMPERATURE
from .quantifier import replace_temperature
@@ -18,8 +18,6 @@ than words are used in transcriptions produced by `reorganize_baker.py`.
We make this choice to better leverage other software for chinese text to
pinyin tools like pypinyin. This is the convention for G2P in Chinese.
"""

import argparse
import re
from collections import OrderedDict

@@ -41,10 +39,10 @@ SPECIALS = ['sil', 'sp']
def rule(C, V, R, T):
"""Generate a syllable given the initial, the final, erhua indicator, and tone.
Orthographical rules for pinyin are applied. (special case for y, w, ui, un, iu)

Note that in this system, 'ü' is always written as 'v' when it appears in a phoneme, but converted to
'u' in syllables when certain conditions are satisfied.

'i' is distinguished when it appears in phonemes, and separated into 3 categories, 'i', 'ii' and 'iii'.
Erhua is possibly applied to every final, except for finals that already end with 'r'.
When a syllable is impossible or does not have any characters with this pronunciation, return None
@@ -86,8 +84,8 @@ def rule(C, V, R, T):
return None

# ua, uai, uang cannot be combined with d, t, n, l, r, z, c, s
if V in ['ua', 'uai', 'uang'
] and C in ['d', 't', 'n', 'l', 'r', 'z', 'c', 's']:
if V in ['ua', 'uai',
'uang'] and C in ['d', 't', 'n', 'l', 'r', 'z', 'c', 's']:
return None

# sh cannot be combined with ong
@@ -11,6 +11,3 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from parakeet.frontend.normalizer.normalizer import *
from parakeet.frontend.normalizer.numbers import *

@@ -11,10 +11,10 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import re
import unicodedata
from builtins import str as unicode

from parakeet.frontend.normalizer.numbers import normalize_numbers
@@ -11,11 +11,3 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# number expansion is not that easy
import inflect
import re

import inflect

_inflect = inflect.engine()
_comma_number_re = re.compile(r'([0-9][0-9\,]+[0-9])')
_decimal_number_re = re.compile(r'([0-9]+\.[0-9]+)')
@@ -11,16 +11,18 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from abc import ABC
from abc import abstractmethod

from abc import ABC, abstractmethod
from typing import Union
from g2p_en import G2p
from g2pM import G2pM

from parakeet.frontend import Vocab
from parakeet.frontend.normalizer.normalizer import normalize
from parakeet.frontend.punctuation import get_punctuations

# discard opencc until we find an easy solution to install it on windows
# from opencc import OpenCC
from parakeet.frontend.punctuation import get_punctuations
from parakeet.frontend.normalizer.normalizer import normalize

__all__ = ["Phonetics", "English", "EnglishCharacter", "Chinese"]
@@ -65,14 +67,14 @@ class English(Phonetics):
start = self.vocab.start_symbol
end = self.vocab.end_symbol
phonemes = ([] if start is None else [start]) \
+ self.backend(sentence) \
+ ([] if end is None else [end])
+ self.backend(sentence) \
+ ([] if end is None else [end])
phonemes = [item for item in phonemes if item in self.vocab.stoi]
return phonemes

def numericalize(self, phonemes):
""" Convert pronunciation sequence into pronunciation id sequence.

Parameters
-----------
phonemes: List[str]

@@ -91,7 +93,7 @@ class English(Phonetics):

def reverse(self, ids):
""" Reverse the list of pronunciation id sequence to a list of pronunciation sequence.

Parameters
-----------
ids: List[int]

@@ -183,7 +185,7 @@ class EnglishCharacter(Phonetics):
----------
str
The input text sequence.

"""
return [self.vocab.reverse(i) for i in ids]

@@ -244,8 +246,8 @@ class Chinese(Phonetics):
start = self.vocab.start_symbol
end = self.vocab.end_symbol
phonemes = ([] if start is None else [start]) \
+ phonemes \
+ ([] if end is None else [end])
+ phonemes \
+ ([] if end is None else [end])
return self._filter_symbols(phonemes)

def _filter_symbols(self, phonemes):

@@ -261,7 +263,7 @@ class Chinese(Phonetics):

def numericalize(self, phonemes):
""" Convert pronunciation sequence into pronunciation id sequence.

Parameters
-----------
phonemes: List[str]

@@ -298,7 +300,7 @@ class Chinese(Phonetics):

def reverse(self, ids):
""" Reverse the list of pronunciation id sequence to a list of pronunciation sequence.

Parameters
-----------
ids: List[int]
@@ -19,13 +19,15 @@ text -> pinyin to other part of a TTS system. Other NLP techniques may be used
(e.g. tokenization, tagging, NER...)
"""
import re
from itertools import product

from pypinyin.contrib.neutral_tone import NeutralToneWith5Mixin
from pypinyin.core import DefaultConverter
from pypinyin.core import Pinyin
from pypinyin.core import Style

from parakeet.frontend.phonectic import Phonetics
from parakeet.frontend.vocab import Vocab
import pypinyin
from pypinyin.core import Pinyin, Style
from pypinyin.core import DefaultConverter
from pypinyin.contrib.neutral_tone import NeutralToneWith5Mixin
from itertools import product

_punctuations = [',', '。', '?', '!']
_initials = [

@@ -33,10 +35,10 @@ _initials = [
'ch', 'sh', 'r', 'z', 'c', 's'
]
_finals = [
'ii', 'iii', 'a', 'o', 'e', 'ea', 'ai', 'ei', 'ao', 'ou', 'an', 'en',
'ang', 'eng', 'er', 'i', 'ia', 'io', 'ie', 'iai', 'iao', 'iou', 'ian',
'ien', 'iang', 'ieng', 'u', 'ua', 'uo', 'uai', 'uei', 'uan', 'uen', 'uang',
'ueng', 'v', 've', 'van', 'ven', 'veng'
'ii', 'iii', 'a', 'o', 'e', 'ea', 'ai', 'ei', 'ao', 'ou', 'an', 'en', 'ang',
'eng', 'er', 'i', 'ia', 'io', 'ie', 'iai', 'iao', 'iou', 'ian', 'ien',
'iang', 'ieng', 'u', 'ua', 'uo', 'uai', 'uei', 'uan', 'uen', 'uang', 'ueng',
'v', 've', 'van', 'ven', 'veng'
]
_ernized_symbol = ['&r']
_phones = _initials + _finals + _ernized_symbol + _punctuations
@ -76,12 +78,12 @@ class ParakeetPinyin(Phonetics):
|
|||
|
||||
def phoneticize(self, sentence, add_start_end=False):
|
||||
""" Normalize the input text sequence and convert it into pronunciation sequence.
|
||||
|
||||
|
||||
Parameters
|
||||
-----------
|
||||
sentence: str
|
||||
The input text sequence.
|
||||
|
||||
|
||||
Returns
|
||||
----------
|
||||
List[str]
|
||||
|
@ -95,12 +97,12 @@ class ParakeetPinyin(Phonetics):
|
|||
|
||||
def numericalize(self, phonemes, tones):
|
||||
""" Convert pronunciation sequence into pronunciation id sequence.
|
||||
|
||||
|
||||
Parameters
|
||||
-----------
|
||||
phonemes: List[str]
|
||||
The list of pronunciation sequence.
|
||||
|
||||
|
||||
Returns
|
||||
----------
|
||||
List[int]
|
||||
|
@ -112,12 +114,12 @@ class ParakeetPinyin(Phonetics):
|
|||
|
||||
def __call__(self, sentence, add_start_end=False):
|
||||
""" Convert the input text sequence into pronunciation id sequence.
|
||||
|
||||
|
||||
Parameters
|
||||
-----------
|
||||
sentence: str
|
||||
The input text sequence.
|
||||
|
||||
|
||||
Returns
|
||||
----------
|
||||
List[str]
|
||||
|
@ -159,12 +161,12 @@ class ParakeetPinyinWithTone(Phonetics):
|
|||
|
||||
def phoneticize(self, sentence, add_start_end=False):
|
||||
""" Normalize the input text sequence and convert it into pronunciation sequence.
|
||||
|
||||
|
||||
Parameters
|
||||
-----------
|
||||
sentence: str
|
||||
The input text sequence.
|
||||
|
||||
|
||||
Returns
|
||||
----------
|
||||
List[str]
|
||||
|
@ -178,12 +180,12 @@ class ParakeetPinyinWithTone(Phonetics):
|
|||
|
||||
def numericalize(self, phonemes):
|
||||
""" Convert pronunciation sequence into pronunciation id sequence.
|
||||
|
||||
|
||||
Parameters
|
||||
-----------
|
||||
phonemes: List[str]
|
||||
The list of pronunciation sequence.
|
||||
|
||||
|
||||
Returns
|
||||
----------
|
||||
List[int]
|
||||
|
@ -194,12 +196,12 @@ class ParakeetPinyinWithTone(Phonetics):
|
|||
|
||||
def __call__(self, sentence, add_start_end=False):
|
||||
""" Convert the input text sequence into pronunciation id sequence.
|
||||
|
||||
|
||||
Parameters
|
||||
-----------
|
||||
sentence: str
|
||||
The input text sequence.
|
||||
|
||||
|
||||
Returns
|
||||
----------
|
||||
List[str]
|
||||
|
@ -232,17 +234,17 @@ def _convert_to_parakeet_convension(syllable):
|
|||
syllable = syllable.replace("ing", "ieng").replace("in", "ien")
|
||||
|
||||
# expansion for un, ui, iu
|
||||
syllable = syllable.replace("un","uen")\
|
||||
.replace("ui", "uei")\
|
||||
syllable = syllable.replace("un", "uen") \
|
||||
.replace("ui", "uei") \
|
||||
.replace("iu", "iou")
|
||||
|
||||
# rule for variants of i
|
||||
syllable = syllable.replace("zi", "zii")\
|
||||
.replace("ci", "cii")\
|
||||
.replace("si", "sii")\
|
||||
.replace("zhi", "zhiii")\
|
||||
.replace("chi", "chiii")\
|
||||
.replace("shi", "shiii")\
|
||||
syllable = syllable.replace("zi", "zii") \
|
||||
.replace("ci", "cii") \
|
||||
.replace("si", "sii") \
|
||||
.replace("zhi", "zhiii") \
|
||||
.replace("chi", "chiii") \
|
||||
.replace("shi", "shiii") \
|
||||
.replace("ri", "riii")
|
||||
|
||||
# rule for y preceding i, u
|
||||
|
@ -252,8 +254,8 @@ def _convert_to_parakeet_convension(syllable):
|
|||
syllable = syllable.replace("wu", "u").replace("w", "u")
|
||||
|
||||
# rule for v following j, q, x
|
||||
syllable = syllable.replace("ju", "jv")\
|
||||
.replace("qu", "qv")\
|
||||
syllable = syllable.replace("ju", "jv") \
|
||||
.replace("qu", "qv") \
|
||||
.replace("xu", "xv")
|
||||
|
||||
return syllable + tone
|
||||
|
|
|
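The replace chains above are plain string rewrites; the following standalone illustration (re-implemented here, with the tone digit already split off as in the surrounding function) shows their effect on a few syllables:

def to_parakeet_syllable(syllable, tone):
    # same rewrite rules as in the hunks above, applied to a toneless syllable
    syllable = syllable.replace("un", "uen") \
        .replace("ui", "uei") \
        .replace("iu", "iou")
    syllable = syllable.replace("zi", "zii") \
        .replace("ci", "cii") \
        .replace("si", "sii") \
        .replace("zhi", "zhiii") \
        .replace("chi", "chiii") \
        .replace("shi", "shiii") \
        .replace("ri", "riii")
    syllable = syllable.replace("ju", "jv") \
        .replace("qu", "qv") \
        .replace("xu", "xv")
    return syllable + tone

print(to_parakeet_syllable("shi", "4"))  # shiii4
print(to_parakeet_syllable("liu", "2"))  # liou2
print(to_parakeet_syllable("qu", "4"))   # qv4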
@@ -12,9 +12,6 @@
# See the License for the specific language governing permissions and
# limitations under the License.

import abc
import string

__all__ = ["get_punctuations"]

EN_PUNCT = [
@@ -11,8 +11,8 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from typing import List, Tuple
from typing import List
from typing import Tuple

import jieba
from pypinyin import lazy_pinyin

@@ -76,8 +76,7 @@ class ToneSandhi():

        # reduplication words for n. and v. e.g. 奶奶, 试试, 旺旺
        for j, item in enumerate(word):
            if j - 1 >= 0 and item == word[j - 1] and pos[
                    0] in {"n", "v", "a"}:
            if j - 1 >= 0 and item == word[j - 1] and pos[0] in {"n", "v", "a"}:
                finals[j] = finals[j][:-1] + "5"
        ge_idx = word.find("个")
        if len(word) >= 1 and word[-1] in "吧呢哈啊呐噻嘛吖嗨呐哦哒额滴哩哟喽啰耶喔诶":

@@ -125,8 +124,8 @@ class ToneSandhi():
        else:
            for i, char in enumerate(word):
                # "不" before tone4 should be bu2, e.g. 不怕
                if char == "不" and i + 1 < len(word) and finals[i + 1][
                        -1] == "4":
                if char == "不" and i + 1 < len(word) and finals[i +
                                                                1][-1] == "4":
                    finals[i] = finals[i][:-1] + "2"
        return finals

@@ -266,12 +265,12 @@ class ToneSandhi():
        assert len(sub_finals_list) == len(seg)
        merge_last = [False] * len(seg)
        for i, (word, pos) in enumerate(seg):
            if i - 1 >= 0 and self._all_tone_three(sub_finals_list[
                    i - 1]) and self._all_tone_three(sub_finals_list[
                        i]) and not merge_last[i - 1]:
            if i - 1 >= 0 and self._all_tone_three(
                    sub_finals_list[i - 1]) and self._all_tone_three(
                        sub_finals_list[i]) and not merge_last[i - 1]:
                # if the last word is reduplication, don't merge, because reduplication needs to go through _neural_sandhi
                if not self._is_reduplication(seg[i - 1][0]) and len(seg[
                        i - 1][0]) + len(seg[i][0]) <= 3:
                if not self._is_reduplication(seg[i - 1][0]) and len(
                        seg[i - 1][0]) + len(seg[i][0]) <= 3:
                    new_seg[-1][0] = new_seg[-1][0] + seg[i][0]
                    merge_last[i] = True
                else:

@@ -299,8 +298,8 @@ class ToneSandhi():
        if i - 1 >= 0 and sub_finals_list[i - 1][-1][-1] == "3" and sub_finals_list[i][0][-1] == "3" and not \
                merge_last[i - 1]:
            # if the last word is reduplication, don't merge, because reduplication needs to go through _neural_sandhi
            if not self._is_reduplication(seg[i - 1][0]) and len(seg[
                    i - 1][0]) + len(seg[i][0]) <= 3:
            if not self._is_reduplication(seg[i - 1][0]) and len(
                    seg[i - 1][0]) + len(seg[i][0]) <= 3:
                new_seg[-1][0] = new_seg[-1][0] + seg[i][0]
                merge_last[i] = True
            else:
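The 不 rule shown in the hunk above is easy to check in isolation; a standalone sketch (not the class method itself):

def bu_sandhi(word, finals):
    # "不" before a tone-4 syllable is pronounced with tone 2, e.g. 不怕 -> bu2 pa4
    for i, char in enumerate(word):
        if char == "不" and i + 1 < len(word) and finals[i + 1][-1] == "4":
            finals[i] = finals[i][:-1] + "2"
    return finals

print(bu_sandhi("不怕", ["bu4", "pa4"]))  # ['bu2', 'pa4']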
@@ -11,9 +11,8 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from typing import Dict, Iterable, List
from collections import OrderedDict
from typing import Iterable

__all__ = ["Vocab"]

@@ -25,13 +24,13 @@ class Vocab(object):
    -----------
    symbols: Iterable[str]
        Common symbols.

    padding_symbol: str, optional
        Symbol for pad. Defaults to "<pad>".

    unk_symbol: str, optional
        Symbol for unknown. Defaults to "<unk>".

    start_symbol: str, optional
        Symbol for start. Defaults to "<s>".
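A minimal usage sketch of this class, based only on the docstring above and the `stoi`/`reverse` accesses seen earlier in phonectic.py (default special symbols assumed, not verified against this commit):

from parakeet.frontend.vocab import Vocab

vocab = Vocab(["a", "b", "c"])         # specials like <pad>, <unk>, <s> assumed prepended
print(vocab.stoi)                      # symbol -> id mapping
print(vocab.reverse(vocab.stoi["b"]))  # id -> symbol, prints "b"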
@@ -11,13 +11,3 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

#from parakeet.models.clarinet import *
from parakeet.models.waveflow import *
#from parakeet.models.wavenet import *

from parakeet.models.transformer_tts import *
#from parakeet.models.deepvoice3 import *
# from parakeet.models.fastspeech import *
from parakeet.models.tacotron2 import *
from parakeet.models.fastspeech2 import *
@@ -12,20 +12,24 @@
# See the License for the specific language governing permissions and
# limitations under the License.
"""Fastspeech2 related modules for paddle"""
from typing import Sequence
from typing import Tuple

from typing import Dict, Sequence, Tuple

import numpy as np
import paddle
from paddle import nn
from parakeet.modules.fastspeech2_predictor.duration_predictor import DurationPredictor, DurationPredictorLoss
from typeguard import check_argument_types

from parakeet.modules.fastspeech2_predictor.duration_predictor import DurationPredictor
from parakeet.modules.fastspeech2_predictor.duration_predictor import DurationPredictorLoss
from parakeet.modules.fastspeech2_predictor.length_regulator import LengthRegulator
from parakeet.modules.fastspeech2_predictor.postnet import Postnet
from parakeet.modules.fastspeech2_predictor.variance_predictor import VariancePredictor
from parakeet.modules.fastspeech2_transformer.embedding import PositionalEncoding, ScaledPositionalEncoding
from parakeet.modules.fastspeech2_transformer.embedding import PositionalEncoding
from parakeet.modules.fastspeech2_transformer.embedding import ScaledPositionalEncoding
from parakeet.modules.fastspeech2_transformer.encoder import Encoder as TransformerEncoder
from parakeet.modules.nets_utils import initialize, make_non_pad_mask, make_pad_mask
from typeguard import check_argument_types
from parakeet.modules.nets_utils import initialize
from parakeet.modules.nets_utils import make_non_pad_mask
from parakeet.modules.nets_utils import make_pad_mask


class FastSpeech2(nn.Layer):
@@ -252,36 +256,36 @@ class FastSpeech2(nn.Layer):

        Parameters
        ----------
        text : Tensor
            Batch of padded token ids (B, Tmax).
        text_lengths : Tensor
            Batch of lengths of each input (B,).
        speech : Tensor
            Batch of padded target features (B, Lmax, odim).
        speech_lengths : Tensor
            Batch of the lengths of each target (B,).
        durations : Tensor
            Batch of padded durations (B, Tmax).
        pitch : Tensor
            Batch of padded token-averaged pitch (B, Tmax, 1).
        energy : Tensor
            Batch of padded token-averaged energy (B, Tmax, 1).

        Returns
        ----------
        Tensor
            mel outs before postnet
        Tensor
            mel outs after postnet
        Tensor
            duration predictor's output
        Tensor
            pitch predictor's output
        Tensor
            energy predictor's output
        Tensor
            speech
        Tensor
            speech_lengths, modified if reduction_factor > 1
        """

        xs = text
@@ -294,9 +298,8 @@ class FastSpeech2(nn.Layer):
            xs, ilens, ys, olens, ds, ps, es, is_inference=False)
        # modify mod part of groundtruth
        if self.reduction_factor > 1:
            olens = paddle.to_tensor([
                olen - olen % self.reduction_factor for olen in olens.numpy()
            ])
            olens = paddle.to_tensor(
                [olen - olen % self.reduction_factor for olen in olens.numpy()])
            max_olen = max(olens)
            ys = ys[:, :max_olen]
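The two formattings above are equivalent; the arithmetic itself just truncates each target length down to a multiple of the reduction factor, e.g.:

reduction_factor = 2
olens = [7, 10, 5]
olens = [olen - olen % reduction_factor for olen in olens]
print(olens)  # [6, 10, 4]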
@@ -389,26 +392,26 @@ class FastSpeech2(nn.Layer):

        Parameters
        ----------
        text : Tensor
            Input sequence of characters (T,).
        speech : Tensor, optional
            Feature sequence to extract style (N, idim).
        durations : Tensor, optional
            Groundtruth of duration (T,).
        pitch : Tensor, optional
            Groundtruth of token-averaged pitch (T, 1).
        energy : Tensor, optional
            Groundtruth of token-averaged energy (T, 1).
        alpha : float, optional
            Alpha to control the speed.
        use_teacher_forcing : bool, optional
            Whether to use teacher forcing.
            If true, groundtruth of duration, pitch and energy will be used.

        Returns
        ----------
        Tensor
            Output sequence of features (L, odim).
        """
        x, y = text, speech
        d, p, e = durations, pitch, energy
@@ -448,21 +451,21 @@ class FastSpeech2(nn.Layer):

        Parameters
        ----------
        ilens : Tensor
            Batch of lengths (B,).

        Returns
        -------
        Tensor
            Mask tensor for self-attention.
            dtype=paddle.bool

        Examples
        -------
        >>> ilens = [5, 3]
        >>> self._source_mask(ilens)
        tensor([[[1, 1, 1, 1, 1],
                 [1, 1, 1, 0, 0]]]) bool

        """
        x_masks = make_non_pad_mask(ilens)
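The doctest above can be reproduced without Paddle; a numpy mimic of what `make_non_pad_mask` returns for those lengths (illustrative only):

import numpy as np

def non_pad_mask(ilens):
    max_len = max(ilens)
    # 1 for positions inside each sequence, 0 for padding
    return np.array([[t < l for t in range(max_len)] for l in ilens], dtype=int)

print(non_pad_mask([5, 3]))
# [[1 1 1 1 1]
#  [1 1 1 0 0]]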
@@ -502,17 +505,16 @@ class FastSpeech2Inference(nn.Layer):
class FastSpeech2Loss(nn.Layer):
    """Loss function module for FastSpeech2."""

    def __init__(self,
                 use_masking: bool=True,
    def __init__(self, use_masking: bool=True,
                 use_weighted_masking: bool=False):
        """Initialize feed-forward Transformer loss module.

        Parameters
        ----------
        use_masking : bool
            Whether to apply masking for padded part in loss calculation.
        use_weighted_masking : bool
            Whether to apply weighted masking in loss calculation.
        """
        assert check_argument_types()
        super().__init__()
@@ -539,45 +541,45 @@ class FastSpeech2Loss(nn.Layer):
                ps: paddle.Tensor,
                es: paddle.Tensor,
                ilens: paddle.Tensor,
                olens: paddle.Tensor, ) -> Tuple[paddle.Tensor, paddle.Tensor,
                                                 paddle.Tensor, paddle.Tensor]:
                olens: paddle.Tensor,
                ) -> Tuple[paddle.Tensor, paddle.Tensor, paddle.Tensor, paddle.Tensor]:
        """Calculate forward propagation.

        Parameters
        ----------
        after_outs : Tensor
            Batch of outputs after postnets (B, Lmax, odim).
        before_outs : Tensor
            Batch of outputs before postnets (B, Lmax, odim).
        d_outs : Tensor
            Batch of outputs of duration predictor (B, Tmax).
        p_outs : Tensor
            Batch of outputs of pitch predictor (B, Tmax, 1).
        e_outs : Tensor
            Batch of outputs of energy predictor (B, Tmax, 1).
        ys : Tensor
            Batch of target features (B, Lmax, odim).
        ds : Tensor
            Batch of durations (B, Tmax).
        ps : Tensor
            Batch of target token-averaged pitch (B, Tmax, 1).
        es : Tensor
            Batch of target token-averaged energy (B, Tmax, 1).
        ilens : Tensor
            Batch of the lengths of each input (B,).
        olens : Tensor
            Batch of the lengths of each target (B,).

        Returns
        ----------
        Tensor
            L1 loss value.
        Tensor
            Duration predictor loss value.
        Tensor
            Pitch predictor loss value.
        Tensor
            Energy predictor loss value.

        """
        # apply mask to remove padded part
@@ -612,9 +614,9 @@ class FastSpeech2Loss(nn.Layer):
        # make weighted mask and apply it
        if self.use_weighted_masking:
            out_masks = make_non_pad_mask(olens).unsqueeze(-1)
            out_weights = out_masks.cast(
                dtype=paddle.float32) / out_masks.cast(
                    dtype=paddle.float32).sum(axis=1, keepdim=True)
            out_weights = out_masks.cast(dtype=paddle.float32) / out_masks.cast(
                dtype=paddle.float32).sum(
                    axis=1, keepdim=True)
            out_weights /= ys.shape[0] * ys.shape[2]
            duration_masks = make_non_pad_mask(ilens)
            duration_weights = (duration_masks.cast(dtype=paddle.float32) /
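Both formattings compute the same weights: each valid frame is scaled by the reciprocal of its sequence's valid-frame count, so short and long targets contribute equally. In numpy terms (illustrative, not the Paddle code):

import numpy as np

mask = np.array([[1., 1., 1., 0.],
                 [1., 1., 1., 1.]])  # (B, Lmax) non-pad mask
weights = mask / mask.sum(axis=1, keepdims=True)
print(weights[0])  # [0.3333 0.3333 0.3333 0.    ]
print(weights[1])  # [0.25   0.25   0.25   0.25  ]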
@@ -11,17 +11,14 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import numpy as np
import paddle
from paddle import nn
from paddle.fluid.param_attr import ParamAttr
from paddle.nn import functional as F
from paddle.nn import initializer as I

from scipy.interpolate import interp1d
from sklearn.metrics import roc_curve
from scipy.optimize import brentq
from sklearn.metrics import roc_curve


class LSTMSpeakerEncoder(nn.Layer):

@@ -81,8 +78,7 @@ class LSTMSpeakerEncoder(nn.Layer):
        # print("p1: ", p1.shape)
        p2 = paddle.bmm(
            embeds.reshape([-1, 1, embed_dim]),
            normalized_centroids_excl.reshape(
                [-1, embed_dim, 1]))  # (NM, 1, 1)
            normalized_centroids_excl.reshape([-1, embed_dim, 1]))  # (NM, 1, 1)
        p2 = p2.reshape([-1])  # (NM)

        # begin: alternative implementation for scatter

@@ -94,9 +90,8 @@ class LSTMSpeakerEncoder(nn.Layer):
        index = index * speakers_per_batch + paddle.arange(
            0, speakers_per_batch, dtype="int64").unsqueeze(-1)
        index = paddle.reshape(index, [-1])
        ones = paddle.ones([
            speakers_per_batch * utterances_per_speaker * speakers_per_batch
        ])
        ones = paddle.ones(
            [speakers_per_batch * utterances_per_speaker * speakers_per_batch])
        zeros = paddle.zeros_like(index, dtype=ones.dtype)
        mask_p1 = paddle.scatter(ones, index, zeros)
        p = p1 * mask_p1 + (1 - mask_p1) * paddle.scatter(ones, index, p2)

@@ -113,6 +108,9 @@ class LSTMSpeakerEncoder(nn.Layer):
            g = p._grad_ivar()
            g[...] = g * 0.01

    def inv_argmax(self, i, num):
        return np.eye(1, num, i, dtype=np.int)[0]

    def loss(self, embeds):
        """
        Computes the softmax loss according to section 2.1 of GE2E.

@@ -138,8 +136,8 @@ class LSTMSpeakerEncoder(nn.Layer):
        # EER (not backpropagated)
        with paddle.no_grad():
            ground_truth = target.numpy()
            inv_argmax = lambda i: np.eye(1, speakers_per_batch, i, dtype=np.int)[0]
            labels = np.array([inv_argmax(i) for i in ground_truth])
            labels = np.array(
                [self.inv_argmax(i, speakers_per_batch) for i in ground_truth])
            preds = sim_matrix.numpy()

            # Snippet from https://yangcha.github.io/EER-ROC/
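The refactor above lifts the one-hot lambda into a reusable `inv_argmax` method. What it computes: `np.eye(1, num, i)` is a 1 x num matrix whose single 1 sits at column i, and `[0]` takes that row as a vector. A standalone check (using plain `int`, since `np.int` is deprecated in recent numpy):

import numpy as np

def inv_argmax(i, num):
    # one-hot row vector of length `num` with the 1 at position `i`
    return np.eye(1, num, i, dtype=int)[0]

print(inv_argmax(2, 5))  # [0 0 1 0 0]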
@@ -11,13 +11,14 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import math
from typing import List, Dict, Any, Union, Optional, Tuple
from typing import Any
from typing import Dict
from typing import List
from typing import Optional

import numpy as np
import paddle
from paddle import Tensor
from paddle import nn
from paddle.nn import functional as F
@@ -63,8 +64,8 @@ class Stretch2D(nn.Layer):


class UpsampleNet(nn.Layer):
    """A Layer to upsample spectrogram by applying consecutive stretch and
    convolutions.

    Parameters
    ----------

@@ -81,10 +82,10 @@ class UpsampleNet(nn.Layer):
    use_causal_conv : bool, optional
        Whether to use causal padding before convolution, by default False

        If True, causal padding is used along the time axis, i.e. padding
        amount is ``receptive field - 1`` and 0 for before and after,
        respectively.

        If False, "same" padding is used along the time axis.
    """
@@ -158,7 +159,7 @@ class ConvInUpsampleNet(nn.Layer):
    aux_context_window : int, optional
        Context window of the first 1D convolution applied to the input. It
        is related to the kernel size of the convolution, by default 0

        If causal convolution is used, the kernel size is ``window + 1``,
        else the kernel size is ``2 * window + 1``.
    use_causal_conv : bool, optional

@@ -167,7 +168,7 @@ class ConvInUpsampleNet(nn.Layer):
        If True, causal padding is used along the time axis, i.e. padding
        amount is ``receptive field - 1`` and 0 for before and after,
        respectively.

        If False, "same" padding is used along the time axis.
    """
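A quick check of the kernel-size rule quoted above (values are illustrative):

aux_context_window = 2
kernel_size_causal = aux_context_window + 1    # 3
kernel_size_same = 2 * aux_context_window + 1  # 5
print(kernel_size_causal, kernel_size_same)    # 3 5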
@@ -276,10 +277,7 @@ class ResidualBlock(nn.Layer):

        gate_out_channels = gate_channels // 2
        self.conv1x1_out = nn.Conv1D(
            gate_out_channels,
            residual_channels,
            kernel_size=1,
            bias_attr=bias)
            gate_out_channels, residual_channels, kernel_size=1, bias_attr=bias)
        self.conv1x1_skip = nn.Conv1D(
            gate_out_channels, skip_channels, kernel_size=1, bias_attr=bias)

@@ -428,13 +426,18 @@ class PWGGenerator(nn.Layer):
                use_causal_conv=use_causal_conv)
            self.conv_layers.append(conv)

        self.last_conv_layers = nn.Sequential(
            nn.ReLU(),
            nn.Conv1D(
                skip_channels, skip_channels, 1, bias_attr=True),
            nn.ReLU(),
            nn.Conv1D(
                skip_channels, out_channels, 1, bias_attr=True))
        self.last_conv_layers = nn.Sequential(nn.ReLU(),
                                              nn.Conv1D(
                                                  skip_channels,
                                                  skip_channels,
                                                  1,
                                                  bias_attr=True),
                                              nn.ReLU(),
                                              nn.Conv1D(
                                                  skip_channels,
                                                  out_channels,
                                                  1,
                                                  bias_attr=True))

        if use_weight_norm:
            self.apply_weight_norm()
@@ -548,18 +551,18 @@ class PWGDiscriminator(nn.Layer):
        by default True
    """

    def __init__(self,
                 in_channels: int=1,
                 out_channels: int=1,
                 kernel_size: int=3,
                 layers: int=10,
                 conv_channels: int=64,
                 dilation_factor: int=1,
                 nonlinear_activation: str="LeakyReLU",
                 nonlinear_activation_params: Dict[
                     str, Any]={"negative_slope": 0.2},
                 bias: bool=True,
                 use_weight_norm: bool=True):
    def __init__(
            self,
            in_channels: int=1,
            out_channels: int=1,
            kernel_size: int=3,
            layers: int=10,
            conv_channels: int=64,
            dilation_factor: int=1,
            nonlinear_activation: str="LeakyReLU",
            nonlinear_activation_params: Dict[str, Any]={"negative_slope": 0.2},
            bias: bool=True,
            use_weight_norm: bool=True):
        super().__init__()
        assert kernel_size % 2 == 1
        assert dilation_factor > 0
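For orientation, a smoke-test sketch of the signature above (the import path is assumed from the PR title and may differ in this commit; shapes follow the (B, C, T) convention of nn.Conv1D):

import paddle
from parakeet.models.parallel_wavegan import PWGDiscriminator  # path assumed

disc = PWGDiscriminator()          # defaults from the signature above
wav = paddle.randn([2, 1, 16000])  # a batch of two 1-channel waveforms
scores = disc(wav)                 # non-downsampling conv stack -> (2, 1, 16000)
print(scores.shape)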
@@ -693,8 +696,7 @@ class ResidualPWGDiscriminator(nn.Layer):
        layers_per_stack = layers // stacks

        self.first_conv = nn.Sequential(
            nn.Conv1D(
                in_channels, residual_channels, 1, bias_attr=True),
            nn.Conv1D(in_channels, residual_channels, 1, bias_attr=True),
            getattr(nn, nonlinear_activation)(**nonlinear_activation_params))

        self.conv_layers = nn.LayerList()

@@ -714,11 +716,9 @@ class ResidualPWGDiscriminator(nn.Layer):

        self.last_conv_layers = nn.Sequential(
            getattr(nn, nonlinear_activation)(**nonlinear_activation_params),
            nn.Conv1D(
                skip_channels, skip_channels, 1, bias_attr=True),
            nn.Conv1D(skip_channels, skip_channels, 1, bias_attr=True),
            getattr(nn, nonlinear_activation)(**nonlinear_activation_params),
            nn.Conv1D(
                skip_channels, out_channels, 1, bias_attr=True))
            nn.Conv1D(skip_channels, out_channels, 1, bias_attr=True))

        if use_weight_norm:
            self.apply_weight_norm()
@@ -11,18 +11,11 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import math

import numpy as np
import paddle
from paddle import Tensor
from paddle import nn
from paddle.nn import functional as F
from paddle.nn import initializer as I

from parakeet.modules.positional_encoding import sinusoid_position_encoding
from parakeet.modules.expansion import expand
from parakeet.modules.positional_encoding import sinusoid_position_encoding


class ResidualBlock(nn.Layer):

@@ -38,8 +31,7 @@ class ResidualBlock(nn.Layer):
                padding="same",
                data_format="NLC"),
            nn.ReLU(),
            nn.BatchNorm1D(
                channels, data_format="NLC"), ) for _ in range(n)
            nn.BatchNorm1D(channels, data_format="NLC"), ) for _ in range(n)
        ]
        self.blocks = nn.Sequential(*blocks)

@@ -95,16 +87,14 @@ class SpeedySpeechEncoder(nn.Layer):
            nn.Linear(hidden_size, hidden_size),
            nn.ReLU(), )
        res_blocks = [
            ResidualBlock(
                hidden_size, kernel_size, d, n=2) for d in dilations
            ResidualBlock(hidden_size, kernel_size, d, n=2) for d in dilations
        ]
        self.res_blocks = nn.Sequential(*res_blocks)

        self.postnet1 = nn.Sequential(nn.Linear(hidden_size, hidden_size))
        self.postnet2 = nn.Sequential(
            nn.ReLU(),
            nn.BatchNorm1D(
                hidden_size, data_format="NLC"),
            nn.BatchNorm1D(hidden_size, data_format="NLC"),
            nn.Linear(hidden_size, hidden_size), )

    def forward(self, text, tones):

@@ -120,13 +110,9 @@ class DurationPredictor(nn.Layer):
    def __init__(self, hidden_size):
        super().__init__()
        self.layers = nn.Sequential(
            ResidualBlock(
                hidden_size, 4, 1, n=1),
            ResidualBlock(
                hidden_size, 3, 1, n=1),
            ResidualBlock(
                hidden_size, 1, 1, n=1),
            nn.Linear(hidden_size, 1))
            ResidualBlock(hidden_size, 4, 1, n=1),
            ResidualBlock(hidden_size, 3, 1, n=1),
            ResidualBlock(hidden_size, 1, 1, n=1), nn.Linear(hidden_size, 1))

    def forward(self, x):
        return paddle.squeeze(self.layers(x), -1)

@@ -136,15 +122,13 @@ class SpeedySpeechDecoder(nn.Layer):
    def __init__(self, hidden_size, output_size, kernel_size, dilations):
        super().__init__()
        res_blocks = [
            ResidualBlock(
                hidden_size, kernel_size, d, n=2) for d in dilations
            ResidualBlock(hidden_size, kernel_size, d, n=2) for d in dilations
        ]
        self.res_blocks = nn.Sequential(*res_blocks)

        self.postnet1 = nn.Sequential(nn.Linear(hidden_size, hidden_size))
        self.postnet2 = nn.Sequential(
            ResidualBlock(
                hidden_size, kernel_size, 1, n=2),
            ResidualBlock(hidden_size, kernel_size, 1, n=2),
            nn.Linear(hidden_size, output_size))
    def forward(self, x):
Some files were not shown because too many files have changed in this diff.