format

This commit is contained in:
parent 30f344a6d0
commit c497fd843d
@@ -0,0 +1,28 @@
# This file is used by clang-format to autoformat paddle source code
#
# clang-format is part of the llvm toolchain.
# You need to install llvm and clang to format source code.
#
# The basic usage is,
#     clang-format -i -style=file PATH/TO/SOURCE/CODE
#
# -style=file implicitly uses the ".clang-format" file located in one of the
# parent directories.
# -i means in-place change.
#
# The documentation for clang-format is at
#     http://clang.llvm.org/docs/ClangFormat.html
#     http://clang.llvm.org/docs/ClangFormatStyleOptions.html
---
Language: Cpp
BasedOnStyle: Google
IndentWidth: 4
TabWidth: 4
ContinuationIndentWidth: 4
MaxEmptyLinesToKeep: 2
AccessModifierOffset: -2 # private/protected/public keywords get no extra indent inside a class
Standard: Cpp11
AllowAllParametersOfDeclarationOnNextLine: true
BinPackParameters: false
BinPackArguments: false
...
@@ -0,0 +1,50 @@
[flake8]

########## OPTIONS ##########
# Set the maximum length that any line (with some exceptions) may be.
max-line-length = 120


################### FILE PATTERNS ##########################
# Provide a comma-separated list of glob patterns to exclude from checks.
exclude =
    # git folder
    .git,
    # python cache
    __pycache__,
    third_party/,
# Provide a comma-separated list of glob patterns to include for checks.
filename =
    *.py


########## RULES ##########

# ERROR CODES
#
# E/W - PEP8 errors/warnings (pycodestyle)
# F - linting errors (pyflakes)
# C - McCabe complexity error (mccabe)
#
# W503 - line break before binary operator

# Specify a list of codes to ignore.
ignore =
    W503
    E252,E262,E127,E265,E126,E266,E241,E261,E128,E125
    W291,W293,W605
    E203,E305,E402,E501,E721,E741,F403,F405,F821,F841,F999,W503,W504,C408,E302,W291,E303,
    # shebang has extra meaning in fbcode lints, so I think it's not worth trying
    # to line this up with executable bit
    EXE001,
    # these ignores are from flake8-bugbear; please fix!
    B007,B008,
    # these ignores are from flake8-comprehensions; please fix!
    C400,C401,C402,C403,C404,C405,C407,C411,C413,C414,C415

# Specify the list of error codes you wish Flake8 to report.
select =
    E,
    W,
    F,
    C
@@ -1,11 +1,11 @@
repos:
- repo: https://github.com/PaddlePaddle/mirrors-yapf.git
  rev: 0d79c0c469bab64f7229c9aca2b1186ef47f0e37
- repo: https://github.com/pre-commit/mirrors-yapf.git
  sha: v0.16.0
  hooks:
  - id: yapf
    files: \.py$
    exclude: (?=third_party).*(\.py)$
- repo: https://github.com/pre-commit/pre-commit-hooks
  rev: a11d9314b22d8f8c7556443875b731ef05965464
  sha: a11d9314b22d8f8c7556443875b731ef05965464
  hooks:
  - id: check-merge-conflict
  - id: check-symlinks
@@ -15,8 +15,23 @@ repos:
    files: \.md$
  - id: trailing-whitespace
    files: \.md$
- repo: https://github.com/Lucas-C/pre-commit-hooks
  rev: v1.0.1
  - id: requirements-txt-fixer
    exclude: (?=third_party).*$
  - id: check-yaml
  - id: check-json
  - id: pretty-format-json
    args:
    - --no-sort-keys
    - --autofix
  - id: check-merge-conflict
  - id: flake8
    args:
    - --ignore=E501,E228,E226,E261,E266,E128,E402,W503
    - --builtins=G,request
    - --jobs=1
    exclude: (?=third_party).*(\.py)$
- repo: https://github.com/Lucas-C/pre-commit-hooks
  sha: v1.0.1
  hooks:
  - id: forbid-crlf
    files: \.md$
@@ -28,9 +43,15 @@ repos:
    files: \.md$
- repo: local
  hooks:
  - id: clang-format
    name: clang-format
    description: Format files with ClangFormat
    entry: bash .pre-commit-hooks/clang-format.hook -i
    language: system
    files: \.(c|cc|cxx|cpp|cu|h|hpp|hxx|cuh|proto)$
  - id: copyright_checker
    name: copyright_checker
    entry: python ./tools/copyright.hook
    entry: python .pre-commit-hooks/copyright-check.hook
    language: system
    files: \.(c|cc|cxx|cpp|cu|h|hpp|hxx|proto|py)$
    exclude: (?!.*third_party)^.*$ | (?!.*book)^.*$
    exclude: (?=third_party|pypinyin).*(\.cpp|\.h|\.py)$
@@ -0,0 +1,15 @@
#!/usr/bin/env bash
set -e

readonly VERSION="3.9"

version=$(clang-format -version)

# if ! [[ $version == *"$VERSION"* ]]; then
#     echo "clang-format version check failed."
#     echo "a version containing '$VERSION' is needed, but got '$version'"
#     echo "you can install the right version and make a soft link to it on your '\$PATH'"
#     exit -1
# fi

clang-format "$@"
@@ -0,0 +1,133 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import argparse
import io
import os
import re
import sys
import subprocess
import platform

COPYRIGHT = '''
Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
'''

LANG_COMMENT_MARK = None

NEW_LINE_MARK = None

COPYRIGHT_HEADER = None

if platform.system() == "Windows":
    NEW_LINE_MARK = "\r\n"
else:
    NEW_LINE_MARK = '\n'
    COPYRIGHT_HEADER = COPYRIGHT.split(NEW_LINE_MARK)[1]
    p = re.search(r'(\d{4})', COPYRIGHT_HEADER).group(0)
    process = subprocess.Popen(["date", "+%Y"], stdout=subprocess.PIPE)
    date, err = process.communicate()
    date = date.decode("utf-8").rstrip("\n")
    COPYRIGHT_HEADER = COPYRIGHT_HEADER.replace(p, date)


def generate_copyright(template, lang='C'):
    if lang == 'Python':
        LANG_COMMENT_MARK = '#'
    else:
        LANG_COMMENT_MARK = "//"

    lines = template.split(NEW_LINE_MARK)
    BLANK = " "
    ans = LANG_COMMENT_MARK + BLANK + COPYRIGHT_HEADER + NEW_LINE_MARK
    for lino, line in enumerate(lines):
        if lino == 0 or lino == 1 or lino == len(lines) - 1: continue
        if len(line) == 0:
            BLANK = ""
        else:
            BLANK = " "
        ans += LANG_COMMENT_MARK + BLANK + line + NEW_LINE_MARK

    return ans + "\n"


def lang_type(filename):
    if filename.endswith(".py"):
        return "Python"
    elif filename.endswith(".h"):
        return "C"
    elif filename.endswith(".c"):
        return "C"
    elif filename.endswith(".hpp"):
        return "C"
    elif filename.endswith(".cc"):
        return "C"
    elif filename.endswith(".cpp"):
        return "C"
    elif filename.endswith(".cu"):
        return "C"
    elif filename.endswith(".cuh"):
        return "C"
    elif filename.endswith(".go"):
        return "C"
    elif filename.endswith(".proto"):
        return "C"
    else:
        print("Unsupported filetype %s" % filename)
        exit(0)


PYTHON_ENCODE = re.compile("^[ \t\v]*#.*?coding[:=][ \t]*([-_.a-zA-Z0-9]+)")


def main(argv=None):
    parser = argparse.ArgumentParser(
        description='Checker for copyright declaration.')
    parser.add_argument('filenames', nargs='*', help='Filenames to check')
    args = parser.parse_args(argv)

    retv = 0
    for filename in args.filenames:
        fd = io.open(filename, encoding="utf-8")
        first_line = fd.readline()
        second_line = fd.readline()
        if "COPYRIGHT (C)" in first_line.upper(): continue
        if first_line.startswith("#!") or PYTHON_ENCODE.match(
                second_line) is not None or PYTHON_ENCODE.match(
                    first_line) is not None:
            continue
        original_contents = io.open(filename, encoding="utf-8").read()
        new_contents = generate_copyright(
            COPYRIGHT, lang_type(filename)) + original_contents
        print('Auto Insert Copyright Header {}'.format(filename))
        retv = 1
        with io.open(filename, 'w') as output_file:
            output_file.write(new_contents)

    return retv


if __name__ == '__main__':
    exit(main())
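To see what the hook's generate_copyright() above actually produces, here is a minimal standalone sketch of its comment-prefixing step (the shortened two-clause template is illustrative, not the hook's full text):

# Minimal sketch of the comment-prefixing done by generate_copyright() above.
template = """
Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
"""

comment_mark = '#'  # '#' for Python files, '//' for C-family files
header_lines = []
for line in template.strip('\n').split('\n'):
    # blank template lines become a bare comment mark, others "<mark> <text>"
    header_lines.append(comment_mark if not line else comment_mark + ' ' + line)
print('\n'.join(header_lines))
# # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
# #
# # Licensed under the Apache License, Version 2.0 (the "License");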
@@ -0,0 +1,3 @@
[style]
based_on_style = pep8
column_limit = 80
@@ -11,15 +11,12 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Configuration file for the Sphinx documentation builder.
#
# This file only contains a selection of the most common options. For a full
# list see the documentation:
# https://www.sphinx-doc.org/en/master/usage/configuration.html

# -- Path setup --------------------------------------------------------------

# If extensions (or modules to document with autodoc) are in another directory,
# add these directories to sys.path here. If the directory is relative to the
# documentation root, use os.path.abspath to make it absolute, like shown here.

@@ -11,9 +11,9 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import numpy as np
import paddle

from parakeet.data.batch import batch_sequences

@@ -24,8 +24,7 @@ def collate_baker_examples(examples):
    pitch = [np.array(item["pitch"], dtype=np.float32) for item in examples]
    energy = [np.array(item["energy"], dtype=np.float32) for item in examples]
    durations = [
        np.array(
            item["durations"], dtype=np.int64) for item in examples
        np.array(item["durations"], dtype=np.int64) for item in examples
    ]
    text_lengths = np.array([item["text_lengths"] for item in examples])
    speech_lengths = np.array([item["speech_lengths"] for item in examples])
@@ -54,4 +53,4 @@ def collate_baker_examples(examples):
        "pitch": pitch,
        "energy": energy
    }
    return batch
    return batch

@@ -12,18 +12,17 @@
# See the License for the specific language governing permissions and
# limitations under the License.
"""Calculate statistics of feature files."""

import argparse
import logging
from pathlib import Path

import jsonlines
import numpy as np
from parakeet.datasets.data_table import DataTable
from config import get_cfg_default
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm

from config import get_cfg_default
from parakeet.datasets.data_table import DataTable


def main():

@@ -75,8 +74,8 @@ def main():

    # check directory existence
    if args.output is None:
        args.output = Path(args.metadata).parent.with_name(args.field_name +
                                                           "_stats.npy")
        args.output = Path(
            args.metadata).parent.with_name(args.field_name + "_stats.npy")
    else:
        args.output = Path(args.output)
    args.output.parent.mkdir(parents=True, exist_ok=True)
@@ -11,11 +11,10 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from pathlib import Path

from yacs.config import CfgNode as Configuration
import yaml
from yacs.config import CfgNode as Configuration

config_path = (Path(__file__).parent / "conf" / "default.yaml").resolve()

@@ -11,8 +11,7 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from parakeet.models.fastspeech2 import FastSpeech2, FastSpeech2Loss
from parakeet.models.fastspeech2 import FastSpeech2Loss
from parakeet.training.extensions.evaluator import StandardEvaluator
from parakeet.training.reporter import report
from parakeet.training.updaters.standard_updater import StandardUpdater
@@ -11,10 +11,11 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import re

import numpy as np
import paddle

from parakeet.frontend.cn_frontend import Frontend as cnFrontend


@@ -87,8 +88,7 @@ class Frontend():
            phones.append(phone)
        return phones, tones

    def get_input_ids(self, sentence, merge_sentences=True,
                      get_tone_ids=False):
    def get_input_ids(self, sentence, merge_sentences=True, get_tone_ids=False):
        phonemes = self.frontend.get_phonemes(
            sentence, merge_sentences=merge_sentences)
        result = {}
@@ -11,16 +11,14 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import argparse
import os
from pathlib import Path

import librosa
import numpy as np
from praatio import tgio

from config import get_cfg_default
from praatio import tgio


def readtg(config, tg_path):

@@ -50,10 +50,7 @@ def main():
        required=True,
        help="speech statistics file.")
    parser.add_argument(
        "--pitch-stats",
        type=str,
        required=True,
        help="pitch statistics file.")
        "--pitch-stats", type=str, required=True, help="pitch statistics file.")
    parser.add_argument(
        "--energy-stats",
        type=str,

@@ -262,10 +262,7 @@ def main():
    parser = argparse.ArgumentParser(
        description="Preprocess audio and then extract features.")
    parser.add_argument(
        "--rootdir",
        default=None,
        type=str,
        help="directory to baker dataset.")
        "--rootdir", default=None, type=str, help="directory to baker dataset.")
    parser.add_argument(
        "--dur-file",
        default=None,

@@ -67,8 +67,7 @@ def evaluate(args, fastspeech2_config, pwg_config):
    std = paddle.to_tensor(std)
    pwg_normalizer = ZScore(mu, std)

    fastspeech2_inferencce = FastSpeech2Inference(fastspeech2_normalizer,
                                                  model)
    fastspeech2_inferencce = FastSpeech2Inference(fastspeech2_normalizer, model)
    pwg_inference = PWGInference(pwg_normalizer, vocoder)

    output_dir = Path(args.output_dir)

@@ -154,8 +154,7 @@ def train_sp(args, config):
    output_dir = Path(args.output_dir)
    trainer = Trainer(updater, (config.max_epoch, 'epoch'), output_dir)

    evaluator = FastSpeech2Evaluator(model, dev_dataloader,
                                     **config["updater"])
    evaluator = FastSpeech2Evaluator(model, dev_dataloader, **config["updater"])

    if dist.get_rank() == 0:
        trainer.extend(evaluator, trigger=(1, "epoch"))
@@ -30,9 +30,7 @@ except ModuleNotFoundError:
INT16_MAX = (2**15) - 1


def normalize_volume(wav,
                     target_dBFS,
                     increase_only=False,
def normalize_volume(wav, target_dBFS, increase_only=False,
                     decrease_only=False):
    # this function implements loudness normalization, instead of peak
    # normalization, see https://en.wikipedia.org/wiki/Audio_normalization
@@ -44,8 +42,9 @@ def normalize_volume(wav,
    if increase_only and decrease_only:
        raise ValueError("Both increase only and decrease only are set")
    dBFS_change = target_dBFS - 10 * np.log10(np.mean(wav**2))
    if ((dBFS_change < 0 and increase_only) or
            (dBFS_change > 0 and decrease_only)):
    if dBFS_change < 0 and increase_only:
        return wav
    if dBFS_change > 0 and decrease_only:
        return wav
    gain = 10**(dBFS_change / 20)
    return wav * gain
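As a sanity check on the gain arithmetic above, a small self-contained sketch (the synthetic signal and target level are made up) showing the normalized output landing on the requested dBFS:

# Sketch of the loudness normalization above, on synthetic data.
import numpy as np

wav = 0.1 * np.sin(np.linspace(0, 1000, 16000))  # quiet test signal
target_dBFS = -30.0

dBFS_change = target_dBFS - 10 * np.log10(np.mean(wav**2))
gain = 10**(dBFS_change / 20)  # dB difference -> linear amplitude factor
normalized = wav * gain

# mean power of the result should sit at the target level
print(10 * np.log10(np.mean(normalized**2)))  # ~ -30.0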
@@ -59,9 +58,14 @@ def trim_long_silences(wav,
    """
    Ensures that segments without voice in the waveform remain no longer than a
    threshold determined by the VAD parameters in params.py.

    :param wav: the raw waveform as a numpy array of floats
    :return: the same waveform with silences trimmed away (length <= original wav length)
    Parameters
    ----------
    wav : np.array
        the raw waveform as a numpy array of floats
    Returns
    ----------
    np.array
        the same waveform with silences trimmed away (length <= original wav length)
    """
    # Compute the voice detection window size
    samples_per_window = (vad_window_length * sampling_rate) // 1000

@@ -117,20 +121,25 @@ def compute_partial_slices(n_samples: int,

    The returned ranges may be indexing further than the length of the waveform. It is
    recommended that you pad the waveform with zeros up to wave_slices[-1].stop.
    Parameters
    ----------
    n_samples : int
        the number of samples in the waveform.
    partial_utterance_n_frames : int
        the number of mel spectrogram frames in each partial utterance.

    :param n_samples: the number of samples in the waveform
    :param partial_utterance_n_frames: the number of mel spectrogram frames in each partial
        utterance
    :param min_pad_coverage: when reaching the last partial utterance, it may or may not have
        enough frames. If at least <min_pad_coverage> of <partial_utterance_n_frames> are present,
        then the last partial utterance will be considered, as if we padded the audio. Otherwise,
        it will be discarded, as if we trimmed the audio. If there aren't enough frames for 1 partial
        utterance, this parameter is ignored so that the function always returns at least 1 slice.
    :param overlap: by how much the partial utterance should overlap. If set to 0, the partial
        utterances are entirely disjoint.
    :return: the waveform slices and mel spectrogram slices as lists of array slices. Index
        respectively the waveform and the mel spectrogram with these slices to obtain the partial
        utterances.
    min_pad_coverage : float
        when reaching the last partial utterance, it may or may not have enough frames.
        If at least <min_pad_coverage> of <partial_utterance_n_frames> are present,
        then the last partial utterance will be considered, as if we padded the audio. Otherwise,
        it will be discarded, as if we trimmed the audio. If there aren't enough frames for 1 partial
        utterance, this parameter is ignored so that the function always returns at least 1 slice.
    overlap : float
        by how much the partial utterance should overlap. If set to 0, the partial utterances are entirely disjoint.
    Returns
    ----------
    the waveform slices and mel spectrogram slices as lists of array slices.
    Index respectively the waveform and the mel spectrogram with these slices to obtain the partial utterances.
    """
    assert 0 <= overlap < 1
    assert 0 < min_pad_coverage <= 1
@@ -138,8 +147,8 @@ def compute_partial_slices(n_samples: int,
    # librosa's function to compute num_frames from num_samples
    n_frames = int(np.ceil((n_samples + 1) / hop_length))
    # frame shift between adjacent partials
    frame_step = max(
        1, int(np.round(partial_utterance_n_frames * (1 - overlap))))
    frame_step = max(1,
                     int(np.round(partial_utterance_n_frames * (1 - overlap))))

    # Compute the slices
    wav_slices, mel_slices = [], []
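A quick worked example of the frame_step formula above (the partial length is illustrative): with 160-frame partials, overlap 0.5 makes consecutive partials start 80 frames apart, and overlap 0 makes them disjoint.

# Illustrative check of the frame shift between adjacent partials.
import numpy as np

partial_utterance_n_frames = 160  # hypothetical partial length, in mel frames
for overlap in (0.0, 0.5, 0.9):
    frame_step = max(1, int(np.round(partial_utterance_n_frames * (1 - overlap))))
    print(overlap, frame_step)  # 0.0 -> 160, 0.5 -> 80, 0.9 -> 16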
@@ -57,7 +57,7 @@ def _process_speaker(speaker_dir: Path,
        try:
            with sources_fpath.open("rt") as sources_file:
                existing_names = {line.split(",")[0] for line in sources_file}
        except:
        except Exception as e:
            existing_names = {}
    else:
        existing_names = {}

@@ -114,9 +114,7 @@ def process_librispeech(processor,
                        output_dir, "*.flac", skip_existing)


def process_voxceleb1(processor,
                      datasets_root,
                      output_dir,
def process_voxceleb1(processor, datasets_root, output_dir,
                      skip_existing=False):
    dataset_name = "VoxCeleb1"
    dataset_root = datasets_root / dataset_name

@@ -126,10 +124,7 @@ def process_voxceleb1(processor,
    metadata = [line.strip().split("\t") for line in metafile][1:]

    # speaker id -> nationality
    nationalities = {
        line[0]: line[3]
        for line in metadata if line[-1] == "dev"
    }
    nationalities = {line[0]: line[3] for line in metadata if line[-1] == "dev"}
    keep_speaker_ids = [
        speaker_id for speaker_id, nationality in nationalities.items()
        if nationality.lower() in anglophone_nationalites

@@ -147,9 +142,7 @@ def process_voxceleb1(processor,
                        output_dir, "*.wav", skip_existing)


def process_voxceleb2(processor,
                      datasets_root,
                      output_dir,
def process_voxceleb2(processor, datasets_root, output_dir,
                      skip_existing=False):
    dataset_name = "VoxCeleb2"
    dataset_root = datasets_root / dataset_name

@@ -171,9 +164,7 @@ def process_aidatatang_200zh(processor,
                        output_dir, "*.wav", skip_existing)


def process_magicdata(processor,
                      datasets_root,
                      output_dir,
def process_magicdata(processor, datasets_root, output_dir,
                      skip_existing=False):
    dataset_name = "magicdata/train"
    dataset_root = datasets_root / dataset_name

@@ -52,7 +52,8 @@ if __name__ == "__main__":
    if not args.no_trim:
        try:
            import webrtcvad
        except:
            print(webrtcvad.__version__)
        except Exception as e:
            raise ModuleNotFoundError(
                "Package 'webrtcvad' not found. This package enables "
                "noise removal and is recommended. Please install and "

@@ -96,5 +97,5 @@ if __name__ == "__main__":

    for dataset in args.datasets:
        print("Preprocessing %s" % dataset)
        preprocess_func[dataset](processor, args.datasets_root,
                                 args.output_dir, args.skip_existing)
        preprocess_func[dataset](processor, args.datasets_root, args.output_dir,
                                 args.skip_existing)
@@ -83,12 +83,11 @@ class Ge2eExperiment(ExperimentBase):
            self.logger.info(msg)

        if dist.get_rank() == 0:
            self.visualizer.add_scalar("train/loss", loss_value,
                                       self.iteration)
            self.visualizer.add_scalar("train/loss", loss_value, self.iteration)
            self.visualizer.add_scalar("train/eer", eer, self.iteration)
            self.visualizer.add_scalar(
                "param/w",
                float(self.model_core.similarity_weight), self.iteration)
            self.visualizer.add_scalar("param/w",
                                       float(self.model_core.similarity_weight),
                                       self.iteration)
            self.visualizer.add_scalar("param/b",
                                       float(self.model_core.similarity_bias),
                                       self.iteration)

@@ -109,8 +109,7 @@ class Clip(object):

        """
        if len(x) < c.shape[1] * self.hop_size:
            x = np.pad(x, (0, c.shape[1] * self.hop_size - len(x)),
                       mode="edge")
            x = np.pad(x, (0, c.shape[1] * self.hop_size - len(x)), mode="edge")

        # check the length is valid
        assert len(x) == c.shape[
@@ -17,18 +17,12 @@ import argparse
import logging
import os

import numpy as np
import yaml
import json
import jsonlines

import numpy as np
from parakeet.datasets.data_table import DataTable
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm

from parakeet.datasets.data_table import DataTable
from parakeet.utils.h5_utils import read_hdf5
from parakeet.utils.h5_utils import write_hdf5

from config import get_cfg_default


@@ -15,18 +15,15 @@

import argparse
import logging
import os
from operator import itemgetter
from pathlib import Path

import numpy as np
import yaml
import jsonlines
import numpy as np
from parakeet.datasets.data_table import DataTable
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm

from parakeet.datasets.data_table import DataTable

from config import get_cfg_default


@@ -13,7 +13,9 @@
# limitations under the License.

from operator import itemgetter
from typing import List, Dict, Any
from typing import Any
from typing import Dict
from typing import List

import argparse
import jsonlines
@@ -39,8 +41,8 @@ def process_sentence(config: Dict[str, Any],
    # reading
    y, sr = librosa.load(str(fp), sr=config.sr)  # resampling may occur
    assert len(y.shape) == 1, f"{utt_id} is not a mono-channel audio."
    assert np.abs(y).max(
    ) <= 1.0, f"{utt_id} seems to be different from 16 bit PCM."
    assert np.abs(
        y).max() <= 1.0, f"{utt_id} seems to be different from 16 bit PCM."
    duration = librosa.get_duration(y, sr=sr)

    # trim according to the alignment file

@@ -80,8 +82,8 @@ def process_sentence(config: Dict[str, Any],
    # adjust time to make num_samples == num_frames * hop_length
    num_frames = logmel.shape[0]
    if y.size < num_frames * config.hop_length:
        y = np.pad(y, (0, num_frames * config.hop_length - y.size),
                   mode="reflect")
        y = np.pad(
            y, (0, num_frames * config.hop_length - y.size), mode="reflect")
    else:
        y = y[:num_frames * config.hop_length]
    num_sample = y.shape[0]

@@ -139,10 +141,7 @@ def main():
    parser = argparse.ArgumentParser(
        description="Preprocess audio and then extract features.")
    parser.add_argument(
        "--rootdir",
        default=None,
        type=str,
        help="directory to baker dataset.")
        "--rootdir", default=None, type=str, help="directory to baker dataset.")
    parser.add_argument(
        "--dumpdir",
        type=str,
@@ -20,17 +20,11 @@ from paddle.nn import Layer
from paddle.optimizer import Optimizer
from paddle.optimizer.lr import LRScheduler
from paddle.io import DataLoader
from paddle.io import DistributedBatchSampler
from timer import timer

from parakeet.datasets.data_table import DataTable
from parakeet.training.updaters.standard_updater import StandardUpdater, UpdaterState
from parakeet.training.extensions.evaluator import StandardEvaluator
from parakeet.training.trainer import Trainer
from parakeet.training.reporter import report
from parakeet.models.parallel_wavegan import PWGGenerator, PWGDiscriminator
from parakeet.modules.stft_loss import MultiResolutionSTFTLoss
from parakeet.utils.profile import synchronize


class PWGUpdater(StandardUpdater):

@@ -12,20 +12,17 @@
# See the License for the specific language governing permissions and
# limitations under the License.

import os
import sys
from timer import timer
import logging
import argparse
import os
from pathlib import Path
from timer import timer

import yaml
import jsonlines
import paddle
import numpy as np
import paddle
import soundfile as sf
import yaml
from paddle import distributed as dist

from parakeet.datasets.data_table import DataTable
from parakeet.models.parallel_wavegan import PWGGenerator


@@ -130,8 +130,7 @@ def train_sp(args, config):
        parameters=generator.parameters(),
        **config["generator_optimizer_params"])
    lr_schedule_d = StepDecay(**config["discriminator_scheduler_params"])
    gradient_clip_d = nn.ClipGradByGlobalNorm(config[
        "discriminator_grad_norm"])
    gradient_clip_d = nn.ClipGradByGlobalNorm(config["discriminator_grad_norm"])
    optimizer_d = Adam(
        learning_rate=lr_schedule_d,
        grad_clip=gradient_clip_d,

@@ -184,8 +183,7 @@ def train_sp(args, config):
        stop_trigger=(config.train_max_steps, "iteration"),
        out=output_dir, )

    trainer.extend(
        evaluator, trigger=(config.eval_interval_steps, 'iteration'))
    trainer.extend(evaluator, trigger=(config.eval_interval_steps, 'iteration'))
    if dist.get_rank() == 0:
        writer = LogWriter(str(trainer.out))
        trainer.extend(VisualDL(writer), trigger=(1, 'iteration'))
@@ -22,8 +22,7 @@ def collate_baker_examples(examples):
    tones = [np.array(item["tones"], dtype=np.int64) for item in examples]
    feats = [np.array(item["feats"], dtype=np.float32) for item in examples]
    durations = [
        np.array(
            item["durations"], dtype=np.int64) for item in examples
        np.array(item["durations"], dtype=np.int64) for item in examples
    ]
    num_phones = np.array([item["num_phones"] for item in examples])
    num_frames = np.array([item["num_frames"] for item in examples])

@@ -15,21 +15,14 @@

import argparse
import logging
import os
from pathlib import Path

import numpy as np
import yaml
import json
import jsonlines

import numpy as np
from parakeet.datasets.data_table import DataTable
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm

from parakeet.datasets.data_table import DataTable
from parakeet.utils.h5_utils import read_hdf5
from parakeet.utils.h5_utils import write_hdf5

from config import get_cfg_default


@@ -17,7 +17,6 @@ from pathlib import Path

import numpy as np
import paddle
import pypinyin
from pypinyin import lazy_pinyin, Style
import jieba
import phkit

@@ -15,9 +15,8 @@
import argparse
from pathlib import Path

import numpy as np
from paddle import inference
import soundfile as sf
from paddle import inference

from frontend import text_analysis

@@ -73,8 +72,8 @@ def main():

    speedyspeech_predictor.run()
    output_names = speedyspeech_predictor.get_output_names()
    output_handle = speedyspeech_predictor.get_output_handle(output_names[
        0])
    output_handle = speedyspeech_predictor.get_output_handle(
        output_names[0])
    output_data = output_handle.copy_to_cpu()

    input_names = pwg_predictor.get_input_names()

@@ -15,19 +15,16 @@

import argparse
import logging
import os
from copy import copy
from operator import itemgetter
from pathlib import Path

import numpy as np
import yaml
import jsonlines
import numpy as np
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm

from parakeet.frontend.vocab import Vocab
from parakeet.datasets.data_table import DataTable
from parakeet.frontend.vocab import Vocab

from config import get_cfg_default

@@ -100,7 +97,10 @@ def main():
    for item in metadata:
        item["feats"] = str(metadata_dir / item["feats"])

    dataset = DataTable(metadata, converters={'feats': np.load, })
    dataset = DataTable(
        metadata, converters={
            'feats': np.load,
        })
    logging.info(f"The number of files = {len(dataset)}.")

    # restore scaler
@@ -13,7 +13,9 @@
# limitations under the License.

from operator import itemgetter
from typing import List, Dict, Any
from typing import Any
from typing import Dict
from typing import List

import argparse
import jsonlines

@@ -41,8 +43,8 @@ def process_sentence(config: Dict[str, Any],
    # reading
    y, sr = librosa.load(str(fp), sr=config.sr)  # resampling may occur
    assert len(y.shape) == 1, f"{utt_id} is not a mono-channel audio."
    assert np.abs(y).max(
    ) <= 1.0, f"{utt_id} seems to be different from 16 bit PCM."
    assert np.abs(
        y).max() <= 1.0, f"{utt_id} seems to be different from 16 bit PCM."
    duration = librosa.get_duration(y, sr=sr)

    # intervals with empty labels are ignored

@@ -162,10 +164,7 @@ def main():
    parser = argparse.ArgumentParser(
        description="Preprocess audio and then extract features.")
    parser.add_argument(
        "--rootdir",
        default=None,
        type=str,
        help="directory to baker dataset.")
        "--rootdir", default=None, type=str, help="directory to baker dataset.")
    parser.add_argument(
        "--dumpdir",
        type=str,
@@ -13,15 +13,13 @@
# limitations under the License.

import paddle
from paddle.nn import functional as F
from paddle.fluid.layers import huber_loss

from parakeet.modules.ssim import ssim
from paddle.nn import functional as F
from parakeet.modules.losses import masked_l1_loss, weighted_mean
from parakeet.modules.ssim import ssim
from parakeet.training.extensions.evaluator import StandardEvaluator
from parakeet.training.reporter import report
from parakeet.training.updaters.standard_updater import StandardUpdater
from parakeet.training.extensions.evaluator import StandardEvaluator
from parakeet.models.speedyspeech import SpeedySpeech


class SpeedySpeechUpdater(StandardUpdater):

@@ -11,30 +11,25 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os
import sys
import logging
import argparse
import dataclasses
from pathlib import Path

import yaml
import jsonlines
import paddle
import numpy as np
import soundfile as sf
import paddle
from paddle import nn
from paddle.nn import functional as F
from paddle import distributed as dist
import yaml
from paddle import jit
from paddle.static import InputSpec
from yacs.config import CfgNode

from parakeet.datasets.data_table import DataTable
from parakeet.models.speedyspeech import SpeedySpeech, SpeedySpeechInference
from parakeet.models.parallel_wavegan import PWGGenerator, PWGInference
from parakeet.models.speedyspeech import SpeedySpeech
from parakeet.models.speedyspeech import SpeedySpeechInference
from parakeet.models.parallel_wavegan import PWGGenerator
from parakeet.models.parallel_wavegan import PWGInference
from parakeet.modules.normalizer import ZScore



@@ -79,9 +74,8 @@ def evaluate(args, speedyspeech_config, pwg_config):
    speedyspeech_inference = jit.to_static(
        speedyspeech_inference,
        input_spec=[
            InputSpec(
                [-1], dtype=paddle.int64), InputSpec(
                    [-1], dtype=paddle.int64)
            InputSpec([-1], dtype=paddle.int64), InputSpec(
                [-1], dtype=paddle.int64)
        ])
    paddle.jit.save(speedyspeech_inference,
                    os.path.join(args.inference_dir, "speedyspeech"))

@@ -91,9 +85,9 @@ def evaluate(args, speedyspeech_config, pwg_config):
    pwg_inference = PWGInference(pwg_normalizer, vocoder)
    pwg_inference.eval()
    pwg_inference = jit.to_static(
        pwg_inference,
        input_spec=[InputSpec(
            [-1, 80], dtype=paddle.float32), ])
        pwg_inference, input_spec=[
            InputSpec([-1, 80], dtype=paddle.float32),
        ])
    paddle.jit.save(pwg_inference, os.path.join(args.inference_dir, "pwg"))
    pwg_inference = paddle.jit.load(os.path.join(args.inference_dir, "pwg"))


@@ -119,9 +113,7 @@ def main():
    parser = argparse.ArgumentParser(
        description="Synthesize with speedyspeech & parallel wavegan.")
    parser.add_argument(
        "--speedyspeech-config",
        type=str,
        help="config file for speedyspeech.")
        "--speedyspeech-config", type=str, help="config file for speedyspeech.")
    parser.add_argument(
        "--speedyspeech-checkpoint",
        type=str,

@@ -1,6 +1,6 @@
python synthesize.py \
    --speedyspeech-config=conf/default.yaml \
    --speedyspeech-checkpoint=exp/debug/checkpoints/snapshot_iter_91800.pdz \
    --speedyspeech-checkpoint=exp/default/checkpoints/snapshot_iter_91800.pdz \
    --speedyspeech-stat=dump/train/stats.npy \
    --pwg-config=../../parallelwave_gan/baker/conf/default.yaml \
    --pwg-params=../../parallelwave_gan/baker/converted.pdparams \
@@ -13,28 +13,22 @@
# limitations under the License.

import os
import sys
import logging
import argparse
import dataclasses
from pathlib import Path

import yaml
import jsonlines
import paddle
import numpy as np
import soundfile as sf
import paddle
import yaml
from paddle import jit
from paddle.static import InputSpec
from paddle import nn
from paddle.nn import functional as F
from paddle import distributed as dist
from yacs.config import CfgNode

from parakeet.datasets.data_table import DataTable
from parakeet.models.speedyspeech import SpeedySpeech, SpeedySpeechInference
from parakeet.models.parallel_wavegan import PWGGenerator, PWGInference
from parakeet.models.speedyspeech import SpeedySpeech
from parakeet.models.speedyspeech import SpeedySpeechInference
from parakeet.models.parallel_wavegan import PWGGenerator
from parakeet.models.parallel_wavegan import PWGInference
from parakeet.modules.normalizer import ZScore

from frontend import text_analysis

@@ -57,8 +51,7 @@ def evaluate(args, speedyspeech_config, pwg_config):
    model.eval()

    vocoder = PWGGenerator(**pwg_config["generator_params"])
    vocoder.set_state_dict(
        paddle.load(args.pwg_checkpoint)["generator_params"])
    vocoder.set_state_dict(paddle.load(args.pwg_checkpoint)["generator_params"])
    vocoder.remove_weight_norm()
    vocoder.eval()
    print("model done!")

@@ -81,9 +74,8 @@ def evaluate(args, speedyspeech_config, pwg_config):
    speedyspeech_inference = jit.to_static(
        speedyspeech_inference,
        input_spec=[
            InputSpec(
                [-1], dtype=paddle.int64), InputSpec(
                    [-1], dtype=paddle.int64)
            InputSpec([-1], dtype=paddle.int64), InputSpec(
                [-1], dtype=paddle.int64)
        ])
    paddle.jit.save(speedyspeech_inference,
                    os.path.join(args.inference_dir, "speedyspeech"))

@@ -93,9 +85,9 @@ def evaluate(args, speedyspeech_config, pwg_config):
    pwg_inference = PWGInference(pwg_normalizer, vocoder)
    pwg_inference.eval()
    pwg_inference = jit.to_static(
        pwg_inference,
        input_spec=[InputSpec(
            [-1, 80], dtype=paddle.float32), ])
        pwg_inference, input_spec=[
            InputSpec([-1, 80], dtype=paddle.float32),
        ])
    paddle.jit.save(pwg_inference, os.path.join(args.inference_dir, "pwg"))
    pwg_inference = paddle.jit.load(os.path.join(args.inference_dir, "pwg"))


@@ -119,9 +111,7 @@ def main():
    parser = argparse.ArgumentParser(
        description="Synthesize with speedyspeech & parallel wavegan.")
    parser.add_argument(
        "--speedyspeech-config",
        type=str,
        help="config file for speedyspeech.")
        "--speedyspeech-config", type=str, help="config file for speedyspeech.")
    parser.add_argument(
        "--speedyspeech-checkpoint",
        type=str,
@@ -13,7 +13,6 @@
# limitations under the License.

import librosa
from praatio import tgio


def validate_textgrid(text_grid, num_samples, sr):

@@ -72,7 +72,9 @@ def train_sp(args, config):
        fields=[
            "phones", "tones", "num_phones", "num_frames", "feats", "durations"
        ],
        converters={"feats": np.load, }, )
        converters={
            "feats": np.load,
        }, )
    with jsonlines.open(args.dev_metadata, 'r') as reader:
        dev_metadata = list(reader)
    metadata_dir = Path(args.dev_metadata).parent

@@ -83,7 +85,9 @@ def train_sp(args, config):
        fields=[
            "phones", "tones", "num_phones", "num_frames", "feats", "durations"
        ],
        converters={"feats": np.load, }, )
        converters={
            "feats": np.load,
        }, )

    # collate function and dataloader
    train_sampler = DistributedBatchSampler(

@@ -46,8 +46,7 @@ class LJSpeech(Dataset):
class LJSpeechCollector(object):
    """A simple callable to batch LJSpeech examples."""

    def __init__(self, padding_idx=0, padding_value=0.,
                 padding_stop_token=1.0):
    def __init__(self, padding_idx=0, padding_value=0., padding_stop_token=1.0):
        self.padding_idx = padding_idx
        self.padding_value = padding_value
        self.padding_stop_token = padding_stop_token

@@ -63,8 +63,7 @@ def create_dataset(config, source_path, target_path, verbose=False):
    with open(target_path / "metadata.pkl", 'wb') as f:
        pickle.dump(records, f)
    if verbose:
        print("saved metadata into {}".format(target_path /
                                              "metadata.pkl"))
        print("saved metadata into {}".format(target_path / "metadata.pkl"))

    print("Done.")

@@ -14,14 +14,13 @@

import time
from collections import defaultdict

import numpy as np

import paddle
from paddle.io import DataLoader
from paddle.io import DistributedBatchSampler
from paddle import distributed as dist
from paddle.io import DataLoader, DistributedBatchSampler

from parakeet.data import dataset
from parakeet.frontend import EnglishCharacter  # pylint: disable=unused-import
from parakeet.training.cli import default_argument_parser
from parakeet.training.experiment import ExperimentBase
from parakeet.utils import display, mp_tools

@@ -74,8 +73,7 @@ class Experiment(ExperimentBase):

        if dist.get_rank() == 0:
            for k, v in losses_np.items():
                self.visualizer.add_scalar(f"train_loss/{k}", v,
                                           self.iteration)
                self.visualizer.add_scalar(f"train_loss/{k}", v, self.iteration)

    @mp_tools.rank_zero_only
    @paddle.no_grad()
@@ -65,8 +65,8 @@ def collate_aishell3_examples(examples):
    text_lengths = np.array([item.shape[0] for item in phones], dtype=np.int64)
    spec_lengths = np.array([item.shape[1] for item in mel], dtype=np.int64)
    T_dec = np.max(spec_lengths)
    stop_tokens = (np.arange(T_dec) >= np.expand_dims(spec_lengths, -1)
                   ).astype(np.float32)
    stop_tokens = (
        np.arange(T_dec) >= np.expand_dims(spec_lengths, -1)).astype(np.float32)
    phones, _ = batch_text_id(phones)
    tones, _ = batch_text_id(tones)
    mel, _ = batch_spec(mel)
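The stop_tokens expression above builds a (batch, T_dec) mask by broadcasting the per-utterance lengths against a frame index; a tiny sketch with made-up lengths:

# Broadcasting sketch for the stop-token mask above (made-up lengths).
import numpy as np

spec_lengths = np.array([2, 4])  # per-utterance spectrogram lengths
T_dec = np.max(spec_lengths)     # longest length in the batch
# frame t of utterance i becomes 1.0 once t >= spec_lengths[i]
stop_tokens = (np.arange(T_dec) >= np.expand_dims(spec_lengths, -1)).astype(np.float32)
print(stop_tokens)
# [[0. 0. 1. 1.]
#  [0. 0. 0. 0.]]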
@@ -121,8 +121,8 @@ def convert(syllable):
    syllable = syllable.replace("ing", "ieng").replace("in", "ien")

    # expansion for un, ui, iu
    syllable = syllable.replace("un", "uen").replace(
        "ui", "uei").replace("iu", "iou")
    syllable = syllable.replace("un", "uen").replace("ui",
                                                     "uei").replace("iu", "iou")

    # rule for variants of i
    syllable = syllable.replace("zi", "zii").replace("ci", "cii").replace("si", "sii")\
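For context, a tiny run of the un/ui/iu expansion rule above on a few illustrative syllables:

# Illustrative run of the un/ui/iu expansion above.
for syllable in ["lun", "gui", "niu"]:
    expanded = syllable.replace("un", "uen").replace("ui", "uei").replace("iu", "iou")
    print(syllable, "->", expanded)  # lun -> luen, gui -> guei, niu -> niou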
@@ -68,8 +68,7 @@ def preprocess_aishell3(source_dir, target_dir, alignment_dir):
        alignment_dir=alignment_dir)
    with Pool(16) as p:
        list(
            tqdm(
                p.imap(fx, wav_paths), total=len(wav_paths), unit="utterance"))
            tqdm(p.imap(fx, wav_paths), total=len(wav_paths), unit="utterance"))


if __name__ == "__main__":

@@ -109,8 +109,7 @@ class Experiment(ExperimentBase):
            mel_pred = outputs['mel_outputs_postnet']
            self.visualizer.add_figure(
                f"valid_sentence_{i}_predicted_spectrogram",
                display.plot_spectrogram(mel_pred[0].numpy().T),
                self.iteration)
                display.plot_spectrogram(mel_pred[0].numpy().T), self.iteration)

        # write visual log
        valid_losses = {k: np.mean(v) for k, v in valid_losses.items()}

@@ -13,7 +13,6 @@
# limitations under the License.

import argparse
import re
from pathlib import Path


@@ -40,6 +40,7 @@ def get_avg_wer(raw_dict, ref_dict, frontend, output_dir):
        raw_text = raw_dict[utt_id]
        text = text_cleaner(raw_text)
        g2p_phones = frontend.get_phonemes(text)
        g2p_phones = sum(g2p_phones, [])
        gt_phones = ref_dict[utt_id].split(" ")
        # delete silence tokens in predicted phones and ground truth phones
        g2p_phones = [phn for phn in g2p_phones if phn not in SILENCE_TOKENS]

@@ -53,10 +53,10 @@ class Transform(object):
        ids, mel = example  # ids already have <s> and </s>
        ids = np.array(ids, dtype=np.int64)
        # add start and end frame
        mel = np.pad(mel, [(0, 0), (1, 1)],
                     mode='constant',
                     constant_values=[(0, 0),
                                      (self.start_value, self.end_value)])
        mel = np.pad(
            mel, [(0, 0), (1, 1)],
            mode='constant',
            constant_values=[(0, 0), (self.start_value, self.end_value)])
        stop_labels = np.ones([mel.shape[1]], dtype=np.int64)
        stop_labels[-1] = 2
        # actually this thing can also be done within the model

@@ -64,8 +64,7 @@ def create_dataset(config, source_path, target_path, verbose=False):
    with open(target_path / "metadata.pkl", 'wb') as f:
        pickle.dump(records, f)
    if verbose:
        print("saved metadata into {}".format(target_path /
                                              "metadata.pkl"))
        print("saved metadata into {}".format(target_path / "metadata.pkl"))

    # also save meta data into text format for inspection
    with open(target_path / "metadata.txt", 'wt') as f:

@@ -73,8 +72,7 @@ def create_dataset(config, source_path, target_path, verbose=False):
        phoneme_str = "|".join(phonemes)
        f.write("{}\t{}\t{}\n".format(mel_name, text, phoneme_str))
    if verbose:
        print("saved metadata into {}".format(target_path /
                                              "metadata.txt"))
        print("saved metadata into {}".format(target_path / "metadata.txt"))

    print("Done.")

@@ -60,7 +60,7 @@ def main(config, args):
        display.plot_multilayer_multihead_alignments(attns)
        plt.savefig(str(output_dir / f"sentence_{i}.png"))

        mel_output = mel_output.T  #(C, T)
        mel_output = mel_output.T  # (C, T)
        np.save(str(output_dir / f"sentence_{i}"), mel_output)
        if args.verbose:
            print("spectrogram saved at {}".format(output_dir /

@@ -76,8 +76,7 @@ class TransformerTTSExperiment(ExperimentBase):
        ljspeech_dataset = LJSpeech(args.data)
        transform = Transform(config.data.mel_start_value,
                              config.data.mel_end_value)
        ljspeech_dataset = dataset.TransformDataset(ljspeech_dataset,
                                                    transform)
        ljspeech_dataset = dataset.TransformDataset(ljspeech_dataset, transform)
        valid_set, train_set = dataset.split(ljspeech_dataset,
                                             config.data.valid_size)
        batch_fn = LJSpeechCollector(padding_idx=config.data.padding_idx)

@@ -159,8 +158,7 @@ class TransformerTTSExperiment(ExperimentBase):

        if dist.get_rank() == 0:
            for k, v in losses_np.items():
                self.visualizer.add_scalar(f"train_loss/{k}", v,
                                           self.iteration)
                self.visualizer.add_scalar(f"train_loss/{k}", v, self.iteration)

    @mp_tools.rank_zero_only
    @paddle.no_grad()
@@ -90,8 +90,8 @@ def rule(C, V, R, T):
        return None

    # ua, uai, uang cannot be combined with d, t, n, l, r, z, c, s
    if V in ['ua', 'uai', 'uang'
             ] and C in ['d', 't', 'n', 'l', 'r', 'z', 'c', 's']:
    if V in ['ua', 'uai',
             'uang'] and C in ['d', 't', 'n', 'l', 'r', 'z', 'c', 's']:
        return None

    # sh cannot be combined with ong
@@ -28,8 +28,8 @@ from config import get_cfg_defaults


class Transform(object):
    def __init__(self, sample_rate, n_fft, win_length, hop_length, n_mels,
                 fmin, fmax):
    def __init__(self, sample_rate, n_fft, win_length, hop_length, n_mels, fmin,
                 fmax):
        self.sample_rate = sample_rate
        self.n_fft = n_fft
        self.win_length = win_length

@@ -79,11 +79,8 @@ class Transform(object):
        spectrogram_magnitude = np.abs(spectrogram)

        # Compute mel-spectrograms.
        mel_filter_bank = librosa.filters.mel(sr=sr,
                                              n_fft=n_fft,
                                              n_mels=n_mels,
                                              fmin=fmin,
                                              fmax=fmax)
        mel_filter_bank = librosa.filters.mel(
            sr=sr, n_fft=n_fft, n_mels=n_mels, fmin=fmin, fmax=fmax)
        mel_spectrogram = np.dot(mel_filter_bank, spectrogram_magnitude)

        # log scale mel_spectrogram.

@@ -39,8 +39,7 @@ def main(config, args):
        mel = np.load(str(file_path))
        with paddle.amp.auto_cast():
            audio = model.predict(mel)
        audio_path = output_dir / (
            os.path.splitext(file_path.name)[0] + ".wav")
        audio_path = output_dir / (os.path.splitext(file_path.name)[0] + ".wav")
        sf.write(audio_path, audio, config.data.sample_rate)
        print("[synthesize] {} -> {}".format(file_path, audio_path))


@@ -114,8 +114,7 @@ class Experiment(ExperimentBase):
            msg += "loss: {:>.6f}".format(loss_value)
            self.logger.info(msg)
            if dist.get_rank() == 0:
                self.visualizer.add_scalar("train/loss", loss_value,
                                           self.iteration)
                self.visualizer.add_scalar("train/loss", loss_value, self.iteration)

    @mp_tools.rank_zero_only
    @paddle.no_grad()

@@ -13,6 +13,3 @@
# limitations under the License.

__version__ = "0.0.0"

import logging
from parakeet import audio, data, datasets, frontend, models, modules, training, utils

@@ -11,6 +11,3 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from .audio import AudioProcessor
from .spec_normalizer import NormalizerBase, LogMagnitude
@@ -11,10 +11,9 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import librosa
import soundfile as sf
import numpy as np
import soundfile as sf

__all__ = ["AudioProcessor"]

@@ -53,11 +52,12 @@ class AudioProcessor(object):
        self.inv_mel_filter = np.linalg.pinv(self.mel_filter)

    def _create_mel_filter(self):
        mel_filter = librosa.filters.mel(self.sample_rate,
                                         self.n_fft,
                                         n_mels=self.n_mels,
                                         fmin=self.fmin,
                                         fmax=self.fmax)
        mel_filter = librosa.filters.mel(
            self.sample_rate,
            self.n_fft,
            n_mels=self.n_mels,
            fmin=self.fmin,
            fmax=self.fmax)
        return mel_filter

    def read_wav(self, filename):
@ -13,20 +13,3 @@
|
|||
# limitations under the License.
|
||||
"""Parakeet's infrastructure for data processing.
|
||||
"""
|
||||
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from parakeet.data.batch import *
|
||||
from parakeet.data.dataset import *
|
||||
from parakeet.data.get_feats import *
|
||||
|
|
|
@@ -61,9 +61,10 @@ def batch_text_id(minibatch, pad_id=0, dtype=np.int64):
    for example in minibatch:
        pad_len = max_len - example.shape[0]
        batch.append(
            np.pad(example, [(0, pad_len)],
                   mode='constant',
                   constant_values=pad_id))
            np.pad(
                example, [(0, pad_len)],
                mode='constant',
                constant_values=pad_id))

    return np.array(batch, dtype=dtype), np.array(lengths, dtype=np.int64)

@@ -103,9 +104,10 @@ def batch_wav(minibatch, pad_value=0., dtype=np.float32):
    for example in minibatch:
        pad_len = max_len - example.shape[-1]
        batch.append(
            np.pad(example, [(0, pad_len)],
                   mode='constant',
                   constant_values=pad_value))
            np.pad(
                example, [(0, pad_len)],
                mode='constant',
                constant_values=pad_value))
    return np.array(batch, dtype=dtype), np.array(lengths, dtype=np.int64)

@@ -152,14 +154,16 @@ def batch_spec(minibatch, pad_value=0., time_major=False, dtype=np.float32):
        pad_len = max_len - example.shape[time_idx]
        if time_major:
            batch.append(
                np.pad(example, [(0, pad_len), (0, 0)],
                       mode='constant',
                       constant_values=pad_value))
                np.pad(
                    example, [(0, pad_len), (0, 0)],
                    mode='constant',
                    constant_values=pad_value))
        else:
            batch.append(
                np.pad(example, [(0, 0), (0, pad_len)],
                       mode='constant',
                       constant_values=pad_value))
                np.pad(
                    example, [(0, 0), (0, pad_len)],
                    mode='constant',
                    constant_values=pad_value))
    return np.array(batch, dtype=dtype), np.array(lengths, dtype=np.int64)

@@ -178,10 +182,8 @@ def batch_sequences(sequences, axis=0, pad_value=0):
    for seq, length in zip(sequences, seq_lengths):
        padding = [(0, 0)] * axis + [(0, max_length - length)] + [(0, 0)] * (
            ndim - axis - 1)
        padded_seq = np.pad(seq,
                            padding,
                            mode='constant',
                            constant_values=pad_value)
        padded_seq = np.pad(
            seq, padding, mode='constant', constant_values=pad_value)
        padded_sequences.append(padded_seq)
    batch = np.stack(padded_sequences)
    return batch
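Aside on the padding pattern reformatted above: the batching helpers all right-pad each example to the longest length in the minibatch with np.pad and then stack. A minimal runnable sketch with toy values (not taken from the repository):

import numpy as np

minibatch = [np.array([1, 2, 3]), np.array([4, 5])]
max_len = max(ex.shape[0] for ex in minibatch)
batch = [
    np.pad(ex, [(0, max_len - ex.shape[0])],
           mode='constant', constant_values=0) for ex in minibatch
]
stacked = np.array(batch, dtype=np.int64)  # shape (2, 3)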
@@ -11,9 +11,7 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import six
import paddle
from paddle.io import Dataset

__all__ = [

@@ -69,7 +67,7 @@ class CacheDataset(Dataset):
        return len(self._dataset)

    def __getitem__(self, i):
        if not i in self._cache:
        if i not in self._cache:
            self._cache[i] = self._dataset[i]
        return self._cache[i]

@@ -86,9 +84,8 @@ class TupleDataset(Dataset):
        length = len(datasets[0])
        for i, dataset in enumerate(datasets):
            if len(dataset) != length:
                raise ValueError(
                    "all the datasets should have the same length."
                    "dataset {} has a different length".format(i))
                raise ValueError("all the datasets should have the same length."
                                 "dataset {} has a different length".format(i))
        self._datasets = datasets
        self._length = length

@@ -115,7 +112,7 @@ class DictDataset(Dataset):
    A compound dataset made from several datasets of the same length. An
    example of the `DictDataset` is a dict of examples from the constituent
    datasets.

    WARNING: paddle does not have a good support for DictDataset, because
    every batch yield from a DataLoader is a list, but it cannot be a dict.
    So you have to provide a collate function because you cannot use the
@@ -11,7 +11,6 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import librosa
import numpy as np
import pyworld

@@ -46,11 +45,12 @@ class LogMelFBank():
        self.mel_filter = self._create_mel_filter()

    def _create_mel_filter(self):
        mel_filter = librosa.filters.mel(sr=self.sr,
                                         n_fft=self.n_fft,
                                         n_mels=self.n_mels,
                                         fmin=self.fmin,
                                         fmax=self.fmax)
        mel_filter = librosa.filters.mel(
            sr=self.sr,
            n_fft=self.n_fft,
            n_mels=self.n_mels,
            fmin=self.fmin,
            fmax=self.fmax)
        return mel_filter

    def _stft(self, wav):

@@ -121,11 +121,12 @@ class Pitch():
                 use_log_f0=True) -> np.array:
        input = input.astype(np.float)
        frame_period = 1000 * self.hop_length / self.sr
        f0, timeaxis = pyworld.dio(input,
                                   fs=self.sr,
                                   f0_floor=self.f0min,
                                   f0_ceil=self.f0max,
                                   frame_period=frame_period)
        f0, timeaxis = pyworld.dio(
            input,
            fs=self.sr,
            f0_floor=self.f0min,
            f0_ceil=self.f0max,
            frame_period=frame_period)
        f0 = pyworld.stonemask(input, f0, timeaxis, self.sr)
        if use_continuous_f0:
            f0 = self._convert_to_continuous_f0(f0)

@@ -195,8 +196,7 @@ class Energy():
        input_power = np.abs(input_stft)**2
        energy = np.sqrt(
            np.clip(
                np.sum(input_power, axis=0), a_min=1.0e-10, a_max=float(
                    'inf')))
                np.sum(input_power, axis=0), a_min=1.0e-10, a_max=float('inf')))
        return energy

    def _average_by_duration(self, input: np.array, d: np.array) -> np.array:
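Aside on the mel filter bank built above: librosa.filters.mel returns a (n_mels, 1 + n_fft // 2) projection matrix, and the code also keeps its pseudo-inverse for rough inversion. A hedged sketch with illustrative parameter values (not the repository's config):

import numpy as np
import librosa

mel_filter = librosa.filters.mel(
    sr=24000, n_fft=2048, n_mels=80, fmin=80, fmax=7600)
inv_mel_filter = np.linalg.pinv(mel_filter)  # rough inverse, as in the diff
print(mel_filter.shape)  # (80, 1025)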
@@ -11,6 +11,3 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from parakeet.datasets.common import *
from parakeet.datasets.ljspeech import *
@@ -11,14 +11,13 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from paddle.io import Dataset
import os
import librosa
from pathlib import Path
import numpy as np
from typing import List

import librosa
import numpy as np
from paddle.io import Dataset

__all__ = ["AudioSegmentDataset", "AudioDataset", "AudioFolderDataset"]


@@ -57,7 +56,7 @@ class AudioSegmentDataset(Dataset):


class AudioDataset(Dataset):
    """A simple dataset adaptor for the audio files.
    """A simple dataset adaptor for the audio files.
    Read -> trim silence -> normalize
    """
@@ -11,12 +11,12 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from typing import Union, Optional, Callable, Tuple, List, Dict, Any
from pathlib import Path
from multiprocessing import Manager
from typing import Any
from typing import Callable
from typing import Dict
from typing import List

import numpy as np
from paddle.io import Dataset
@@ -11,9 +11,9 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from pathlib import Path

from paddle.io import Dataset
from pathlib import Path

__all__ = ["LJSpeechMetaData"]
@@ -11,11 +11,3 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from parakeet.frontend.vocab import *
from parakeet.frontend.phonectic import *
from parakeet.frontend.punctuation import *
from parakeet.frontend.normalizer import *
from parakeet.frontend.cn_normalization import *
from parakeet.frontend.tone_sandhi import *
from parakeet.frontend.generate_lexicon import *
@@ -11,7 +11,6 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from parakeet.frontend.phonectic import Phonetics
"""
A phonology system with ARPABET symbols and limited punctuations. The G2P

@@ -200,8 +199,7 @@ class ARPABET(Phonetics):
            The list of pronunciation id sequence.
        """
        return self.numericalize(
            self.phoneticize(
                sentence, add_start_end=add_start_end))
            self.phoneticize(sentence, add_start_end=add_start_end))

    @property
    def vocab_size(self):

@@ -217,9 +215,9 @@ class ARPABETWithStress(Phonetics):
        'AO1', 'AO2', 'AW0', 'AW1', 'AW2', 'AY0', 'AY1', 'AY2', 'B', 'CH', 'D',
        'DH', 'EH0', 'EH1', 'EH2', 'ER0', 'ER1', 'ER2', 'EY0', 'EY1', 'EY2',
        'F', 'G', 'HH', 'IH0', 'IH1', 'IH2', 'IY0', 'IY1', 'IY2', 'JH', 'K',
        'L', 'M', 'N', 'NG', 'OW0', 'OW1', 'OW2', 'OY0', 'OY1', 'OY2', 'P',
        'R', 'S', 'SH', 'T', 'TH', 'UH0', 'UH1', 'UH2', 'UW0', 'UW1', 'UW2',
        'V', 'W', 'Y', 'Z', 'ZH'
        'L', 'M', 'N', 'NG', 'OW0', 'OW1', 'OW2', 'OY0', 'OY1', 'OY2', 'P', 'R',
        'S', 'SH', 'T', 'TH', 'UH0', 'UH1', 'UH2', 'UW0', 'UW1', 'UW2', 'V',
        'W', 'Y', 'Z', 'ZH'
    ]
    punctuations = [',', '.', '?', '!']
    symbols = phonemes + punctuations

@@ -294,8 +292,7 @@ class ARPABETWithStress(Phonetics):
            The list of pronunciation id sequence.
        """
        return self.numericalize(
            self.phoneticize(
                sentence, add_start_end=add_start_end))
            self.phoneticize(sentence, add_start_end=add_start_end))

    @property
    def vocab_size(self):
@@ -11,17 +11,16 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import re

import jieba.posseg as psg
import numpy as np
import paddle
import re
from g2pM import G2pM
from parakeet.frontend.tone_sandhi import ToneSandhi
from parakeet.frontend.cn_normalization.text_normlization import TextNormalizer
from pypinyin import lazy_pinyin, Style
from pypinyin import lazy_pinyin
from pypinyin import Style

from parakeet.frontend.cn_normalization.text_normlization import TextNormalizer
from parakeet.frontend.generate_lexicon import generate_lexicon
from parakeet.frontend.tone_sandhi import ToneSandhi


class Frontend():
@@ -11,5 +11,3 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from parakeet.frontend.cn_normalization.text_normlization import *
@@ -11,10 +11,12 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import re

from .num import verbalize_cardinal, verbalize_digit, num2str, DIGITS
from .num import DIGITS
from .num import num2str
from .num import verbalize_cardinal
from .num import verbalize_digit


def _time_num2str(num_string: str) -> str:
@@ -11,9 +11,9 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import re
import string

from pypinyin.constants import SUPPORT_UCS4

# full-width / half-width conversion

@@ -32,10 +32,7 @@ F2H_DIGITS = {chr(ord(char) + 65248): char for char in string.digits}
H2F_DIGITS = {value: key for key, value in F2H_DIGITS.items()}

# punctuation full-width -> half-width mapping table (num: 32)
F2H_PUNCTUATIONS = {
    chr(ord(char) + 65248): char
    for char in string.punctuation
}
F2H_PUNCTUATIONS = {chr(ord(char) + 65248): char for char in string.punctuation}
# punctuation half-width -> full-width mapping table
H2F_PUNCTUATIONS = {value: key for key, value in F2H_PUNCTUATIONS.items()}
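Aside on the mappings above: full-width ASCII forms sit at a fixed offset of 0xFEE0 (65248) above their half-width counterparts, which is all the dict comprehensions rely on. A tiny self-contained check:

import string

F2H_DIGITS = {chr(ord(char) + 65248): char for char in string.digits}
assert F2H_DIGITS['１'] == '1'  # U+FF11 FULLWIDTH DIGIT ONE -> '1'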
@@ -15,7 +15,6 @@
Rules to verbalize numbers into Chinese characters.
https://zh.wikipedia.org/wiki/中文数字#現代中文
"""

import re
from collections import OrderedDict
from typing import List
@@ -11,7 +11,6 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import re

from .num import verbalize_digit

@@ -32,14 +31,12 @@ def phone2str(phone_string: str, mobile=True) -> str:
    if mobile:
        sp_parts = phone_string.strip('+').split()
        result = ''.join(
            [verbalize_digit(
                part, alt_one=True) for part in sp_parts])
            [verbalize_digit(part, alt_one=True) for part in sp_parts])
        return result
    else:
        sil_parts = phone_string.split('-')
        result = ''.join(
            [verbalize_digit(
                part, alt_one=True) for part in sil_parts])
            [verbalize_digit(part, alt_one=True) for part in sil_parts])
        return result
@@ -11,7 +11,6 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import re

from .num import num2str
@@ -11,16 +11,37 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import re
from typing import List

from .chronology import RE_TIME, RE_DATE, RE_DATE2
from .chronology import replace_time, replace_date, replace_date2
from .constants import F2H_ASCII_LETTERS, F2H_DIGITS, F2H_SPACE
from .num import RE_NUMBER, RE_FRAC, RE_PERCENTAGE, RE_RANGE, RE_INTEGER, RE_DEFAULT_NUM, RE_DECIMAL_NUM, RE_POSITIVE_QUANTIFIERS
from .num import replace_number, replace_frac, replace_percentage, replace_range, replace_default_num, replace_negative_num, replace_positive_quantifier
from .phonecode import RE_MOBILE_PHONE, RE_TELEPHONE, replace_phone, replace_mobile
from .chronology import RE_DATE
from .chronology import RE_DATE2
from .chronology import RE_TIME
from .chronology import replace_date
from .chronology import replace_date2
from .chronology import replace_time
from .constants import F2H_ASCII_LETTERS
from .constants import F2H_DIGITS
from .constants import F2H_SPACE
from .num import RE_DECIMAL_NUM
from .num import RE_DEFAULT_NUM
from .num import RE_FRAC
from .num import RE_INTEGER
from .num import RE_NUMBER
from .num import RE_PERCENTAGE
from .num import RE_POSITIVE_QUANTIFIERS
from .num import RE_RANGE
from .num import replace_default_num
from .num import replace_frac
from .num import replace_negative_num
from .num import replace_number
from .num import replace_percentage
from .num import replace_positive_quantifier
from .num import replace_range
from .phonecode import RE_MOBILE_PHONE
from .phonecode import RE_TELEPHONE
from .phonecode import replace_mobile
from .phonecode import replace_phone
from .quantifier import RE_TEMPERATURE
from .quantifier import replace_temperature
@@ -18,8 +18,6 @@ than words are used in transcriptions produced by `reorganize_baker.py`.
We make this choice to better leverage other software for chinese text to
pinyin tools like pypinyin. This is the convention for G2P in Chinese.
"""

import argparse
import re
from collections import OrderedDict

@@ -41,10 +39,10 @@ SPECIALS = ['sil', 'sp']
def rule(C, V, R, T):
    """Generate a syllable given the initial, the final, erhua indicator, and tone.
    Orthographical rules for pinyin are applied. (special case for y, w, ui, un, iu)

    Note that in this system, 'ü' is alway written as 'v' when appeared in phoneme, but converted to
    'u' in syllables when certain conditions are satisfied.

    'i' is distinguished when appeared in phonemes, and separated into 3 categories, 'i', 'ii' and 'iii'.
    Erhua is is possibly applied to every finals, except for finals that already ends with 'r'.
    When a syllable is impossible or does not have any characters with this pronunciation, return None

@@ -86,8 +84,8 @@ def rule(C, V, R, T):
        return None

    # ua, uai, uang cannot be combined with d, t, n, l, r, z, c, s
    if V in ['ua', 'uai', 'uang'
             ] and C in ['d', 't', 'n', 'l', 'r', 'z', 'c', 's']:
    if V in ['ua', 'uai',
             'uang'] and C in ['d', 't', 'n', 'l', 'r', 'z', 'c', 's']:
        return None

    # sh cannot be combined with ong
@@ -11,6 +11,3 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from parakeet.frontend.normalizer.normalizer import *
from parakeet.frontend.normalizer.numbers import *
@@ -11,10 +11,10 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import re
import unicodedata
from builtins import str as unicode

from parakeet.frontend.normalizer.numbers import normalize_numbers
@@ -11,11 +11,11 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# number expansion is not that easy
import inflect
import re

import inflect

_inflect = inflect.engine()
_comma_number_re = re.compile(r'([0-9][0-9\,]+[0-9])')
_decimal_number_re = re.compile(r'([0-9]+\.[0-9]+)')
@@ -11,16 +11,18 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from abc import ABC
from abc import abstractmethod

from abc import ABC, abstractmethod
from typing import Union
from g2p_en import G2p
from g2pM import G2pM

from parakeet.frontend import Vocab
from parakeet.frontend.normalizer.normalizer import normalize
from parakeet.frontend.punctuation import get_punctuations

# discard opencc untill we find an easy solution to install it on windows
# from opencc import OpenCC
from parakeet.frontend.punctuation import get_punctuations
from parakeet.frontend.normalizer.normalizer import normalize

__all__ = ["Phonetics", "English", "EnglishCharacter", "Chinese"]

@@ -65,14 +67,14 @@ class English(Phonetics):
        start = self.vocab.start_symbol
        end = self.vocab.end_symbol
        phonemes = ([] if start is None else [start]) \
                   + self.backend(sentence) \
                   + ([] if end is None else [end])
            + self.backend(sentence) \
            + ([] if end is None else [end])
        phonemes = [item for item in phonemes if item in self.vocab.stoi]
        return phonemes

    def numericalize(self, phonemes):
        """ Convert pronunciation sequence into pronunciation id sequence.

        Parameters
        -----------
        phonemes: List[str]

@@ -91,7 +93,7 @@ class English(Phonetics):

    def reverse(self, ids):
        """ Reverse the list of pronunciation id sequence to a list of pronunciation sequence.

        Parameters
        -----------
        ids: List[int]

@@ -183,7 +185,7 @@ class EnglishCharacter(Phonetics):
        ----------
        str
            The input text sequence.

        """
        return [self.vocab.reverse(i) for i in ids]

@@ -244,8 +246,8 @@ class Chinese(Phonetics):
        start = self.vocab.start_symbol
        end = self.vocab.end_symbol
        phonemes = ([] if start is None else [start]) \
                   + phonemes \
                   + ([] if end is None else [end])
            + phonemes \
            + ([] if end is None else [end])
        return self._filter_symbols(phonemes)

    def _filter_symbols(self, phonemes):

@@ -261,7 +263,7 @@ class Chinese(Phonetics):

    def numericalize(self, phonemes):
        """ Convert pronunciation sequence into pronunciation id sequence.

        Parameters
        -----------
        phonemes: List[str]

@@ -298,7 +300,7 @@ class Chinese(Phonetics):

    def reverse(self, ids):
        """ Reverse the list of pronunciation id sequence to a list of pronunciation sequence.

        Parameters
        -----------
        ids: List[int]
@@ -19,13 +19,15 @@ text -> pinyin to other part of a TTS system. Other NLP techniques may be used
(e.g. tokenization, tagging, NER...)
"""
import re
from itertools import product

from pypinyin.contrib.neutral_tone import NeutralToneWith5Mixin
from pypinyin.core import DefaultConverter
from pypinyin.core import Pinyin
from pypinyin.core import Style

from parakeet.frontend.phonectic import Phonetics
from parakeet.frontend.vocab import Vocab
import pypinyin
from pypinyin.core import Pinyin, Style
from pypinyin.core import DefaultConverter
from pypinyin.contrib.neutral_tone import NeutralToneWith5Mixin
from itertools import product

_punctuations = [',', '。', '?', '!']
_initials = [

@@ -33,10 +35,10 @@ _initials = [
    'ch', 'sh', 'r', 'z', 'c', 's'
]
_finals = [
    'ii', 'iii', 'a', 'o', 'e', 'ea', 'ai', 'ei', 'ao', 'ou', 'an', 'en',
    'ang', 'eng', 'er', 'i', 'ia', 'io', 'ie', 'iai', 'iao', 'iou', 'ian',
    'ien', 'iang', 'ieng', 'u', 'ua', 'uo', 'uai', 'uei', 'uan', 'uen', 'uang',
    'ueng', 'v', 've', 'van', 'ven', 'veng'
    'ii', 'iii', 'a', 'o', 'e', 'ea', 'ai', 'ei', 'ao', 'ou', 'an', 'en', 'ang',
    'eng', 'er', 'i', 'ia', 'io', 'ie', 'iai', 'iao', 'iou', 'ian', 'ien',
    'iang', 'ieng', 'u', 'ua', 'uo', 'uai', 'uei', 'uan', 'uen', 'uang', 'ueng',
    'v', 've', 'van', 'ven', 'veng'
]
_ernized_symbol = ['&r']
_phones = _initials + _finals + _ernized_symbol + _punctuations

@@ -76,12 +78,12 @@ class ParakeetPinyin(Phonetics):

    def phoneticize(self, sentence, add_start_end=False):
        """ Normalize the input text sequence and convert it into pronunciation sequence.

        Parameters
        -----------
        sentence: str
            The input text sequence.

        Returns
        ----------
        List[str]

@@ -95,12 +97,12 @@ class ParakeetPinyin(Phonetics):

    def numericalize(self, phonemes, tones):
        """ Convert pronunciation sequence into pronunciation id sequence.

        Parameters
        -----------
        phonemes: List[str]
            The list of pronunciation sequence.

        Returns
        ----------
        List[int]

@@ -112,12 +114,12 @@ class ParakeetPinyin(Phonetics):

    def __call__(self, sentence, add_start_end=False):
        """ Convert the input text sequence into pronunciation id sequence.

        Parameters
        -----------
        sentence: str
            The input text sequence.

        Returns
        ----------
        List[str]

@@ -159,12 +161,12 @@ class ParakeetPinyinWithTone(Phonetics):

    def phoneticize(self, sentence, add_start_end=False):
        """ Normalize the input text sequence and convert it into pronunciation sequence.

        Parameters
        -----------
        sentence: str
            The input text sequence.

        Returns
        ----------
        List[str]

@@ -178,12 +180,12 @@ class ParakeetPinyinWithTone(Phonetics):

    def numericalize(self, phonemes):
        """ Convert pronunciation sequence into pronunciation id sequence.

        Parameters
        -----------
        phonemes: List[str]
            The list of pronunciation sequence.

        Returns
        ----------
        List[int]

@@ -194,12 +196,12 @@ class ParakeetPinyinWithTone(Phonetics):

    def __call__(self, sentence, add_start_end=False):
        """ Convert the input text sequence into pronunciation id sequence.

        Parameters
        -----------
        sentence: str
            The input text sequence.

        Returns
        ----------
        List[str]

@@ -232,17 +234,17 @@ def _convert_to_parakeet_convension(syllable):
    syllable = syllable.replace("ing", "ieng").replace("in", "ien")

    # expansion for un, ui, iu
    syllable = syllable.replace("un","uen")\
        .replace("ui", "uei")\
    syllable = syllable.replace("un", "uen") \
        .replace("ui", "uei") \
        .replace("iu", "iou")

    # rule for variants of i
    syllable = syllable.replace("zi", "zii")\
        .replace("ci", "cii")\
        .replace("si", "sii")\
        .replace("zhi", "zhiii")\
        .replace("chi", "chiii")\
        .replace("shi", "shiii")\
    syllable = syllable.replace("zi", "zii") \
        .replace("ci", "cii") \
        .replace("si", "sii") \
        .replace("zhi", "zhiii") \
        .replace("chi", "chiii") \
        .replace("shi", "shiii") \
        .replace("ri", "riii")

    # rule for y preceding i, u

@@ -252,8 +254,8 @@ def _convert_to_parakeet_convension(syllable):
    syllable = syllable.replace("wu", "u").replace("w", "u")

    # rule for v following j, q, x
    syllable = syllable.replace("ju", "jv")\
        .replace("qu", "qv")\
    syllable = syllable.replace("ju", "jv") \
        .replace("qu", "qv") \
        .replace("xu", "xv")

    return syllable + tone
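Aside on the pypinyin API whose imports are regrouped above: Style.TONE3 appends the tone digit to each syllable, which is the raw form the conversion helpers then rewrite. A small usage sketch:

from pypinyin import lazy_pinyin, Style

print(lazy_pinyin('中文', style=Style.TONE3))  # ['zhong1', 'wen2']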
@@ -12,9 +12,6 @@
# See the License for the specific language governing permissions and
# limitations under the License.

import abc
import string

__all__ = ["get_punctuations"]

EN_PUNCT = [
@@ -11,8 +11,8 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from typing import List, Tuple
from typing import List
from typing import Tuple

import jieba
from pypinyin import lazy_pinyin

@@ -76,8 +76,7 @@ class ToneSandhi():

        # reduplication words for n. and v. e.g. 奶奶, 试试, 旺旺
        for j, item in enumerate(word):
            if j - 1 >= 0 and item == word[j - 1] and pos[
                    0] in {"n", "v", "a"}:
            if j - 1 >= 0 and item == word[j - 1] and pos[0] in {"n", "v", "a"}:
                finals[j] = finals[j][:-1] + "5"
        ge_idx = word.find("个")
        if len(word) >= 1 and word[-1] in "吧呢哈啊呐噻嘛吖嗨呐哦哒额滴哩哟喽啰耶喔诶":

@@ -125,8 +124,8 @@ class ToneSandhi():
        else:
            for i, char in enumerate(word):
                # "不" before tone4 should be bu2, e.g. 不怕
                if char == "不" and i + 1 < len(word) and finals[i + 1][
                        -1] == "4":
                if char == "不" and i + 1 < len(word) and finals[i +
                                                                1][-1] == "4":
                    finals[i] = finals[i][:-1] + "2"
        return finals

@@ -266,12 +265,12 @@ class ToneSandhi():
        assert len(sub_finals_list) == len(seg)
        merge_last = [False] * len(seg)
        for i, (word, pos) in enumerate(seg):
            if i - 1 >= 0 and self._all_tone_three(sub_finals_list[
                    i - 1]) and self._all_tone_three(sub_finals_list[
                        i]) and not merge_last[i - 1]:
            if i - 1 >= 0 and self._all_tone_three(
                    sub_finals_list[i - 1]) and self._all_tone_three(
                        sub_finals_list[i]) and not merge_last[i - 1]:
                # if the last word is reduplication, not merge, because reduplication need to be _neural_sandhi
                if not self._is_reduplication(seg[i - 1][0]) and len(seg[
                        i - 1][0]) + len(seg[i][0]) <= 3:
                if not self._is_reduplication(seg[i - 1][0]) and len(
                        seg[i - 1][0]) + len(seg[i][0]) <= 3:
                    new_seg[-1][0] = new_seg[-1][0] + seg[i][0]
                    merge_last[i] = True
                else:

@@ -299,8 +298,8 @@ class ToneSandhi():
            if i - 1 >= 0 and sub_finals_list[i - 1][-1][-1] == "3" and sub_finals_list[i][0][-1] == "3" and not \
                    merge_last[i - 1]:
                # if the last word is reduplication, not merge, because reduplication need to be _neural_sandhi
                if not self._is_reduplication(seg[i - 1][0]) and len(seg[
                        i - 1][0]) + len(seg[i][0]) <= 3:
                if not self._is_reduplication(seg[i - 1][0]) and len(
                        seg[i - 1][0]) + len(seg[i][0]) <= 3:
                    new_seg[-1][0] = new_seg[-1][0] + seg[i][0]
                    merge_last[i] = True
                else:
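Aside on the finals convention used by the sandhi rules above: each final carries its tone as a trailing digit, so retoning is a slice-and-append on the string. A toy illustration (values made up):

finals = ['nai3', 'nai3']
finals[1] = finals[1][:-1] + '5'  # reduplicated syllable goes neutral
print(finals)  # ['nai3', 'nai5']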
@@ -11,9 +11,8 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from typing import Dict, Iterable, List
from collections import OrderedDict
from typing import Iterable

__all__ = ["Vocab"]

@@ -25,13 +24,13 @@ class Vocab(object):
    -----------
    symbols: Iterable[str]
        Common symbols.

    padding_symbol: str, optional
        Symbol for pad. Defaults to "<pad>".

    unk_symbol: str, optional
        Symbol for unknow. Defaults to "<unk>"

    start_symbol: str, optional
        Symbol for start. Defaults to "<s>"
@@ -11,13 +11,3 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

#from parakeet.models.clarinet import *
from parakeet.models.waveflow import *
#from parakeet.models.wavenet import *

from parakeet.models.transformer_tts import *
#from parakeet.models.deepvoice3 import *
# from parakeet.models.fastspeech import *
from parakeet.models.tacotron2 import *
from parakeet.models.fastspeech2 import *
@@ -12,19 +12,24 @@
# See the License for the specific language governing permissions and
# limitations under the License.
"""Fastspeech2 related modules for paddle"""

from typing import Dict, Sequence, Tuple
from typing import Sequence
from typing import Tuple

import paddle
from paddle import nn
from parakeet.modules.fastspeech2_predictor.duration_predictor import DurationPredictor, DurationPredictorLoss
from typeguard import check_argument_types

from parakeet.modules.fastspeech2_predictor.duration_predictor import DurationPredictor
from parakeet.modules.fastspeech2_predictor.duration_predictor import DurationPredictorLoss
from parakeet.modules.fastspeech2_predictor.length_regulator import LengthRegulator
from parakeet.modules.fastspeech2_predictor.postnet import Postnet
from parakeet.modules.fastspeech2_predictor.variance_predictor import VariancePredictor
from parakeet.modules.fastspeech2_transformer.embedding import PositionalEncoding, ScaledPositionalEncoding
from parakeet.modules.fastspeech2_transformer.embedding import PositionalEncoding
from parakeet.modules.fastspeech2_transformer.embedding import ScaledPositionalEncoding
from parakeet.modules.fastspeech2_transformer.encoder import Encoder as TransformerEncoder
from parakeet.modules.nets_utils import initialize, make_non_pad_mask, make_pad_mask
from typeguard import check_argument_types
from parakeet.modules.nets_utils import initialize
from parakeet.modules.nets_utils import make_non_pad_mask
from parakeet.modules.nets_utils import make_pad_mask


class FastSpeech2(nn.Layer):

@@ -293,9 +298,8 @@ class FastSpeech2(nn.Layer):
            xs, ilens, ys, olens, ds, ps, es, is_inference=False)
        # modify mod part of groundtruth
        if self.reduction_factor > 1:
            olens = paddle.to_tensor([
                olen - olen % self.reduction_factor for olen in olens.numpy()
            ])
            olens = paddle.to_tensor(
                [olen - olen % self.reduction_factor for olen in olens.numpy()])
            max_olen = max(olens)
            ys = ys[:, :max_olen]

@@ -501,8 +505,7 @@ class FastSpeech2Inference(nn.Layer):
class FastSpeech2Loss(nn.Layer):
    """Loss function module for FastSpeech2."""

    def __init__(self,
                 use_masking: bool=True,
    def __init__(self, use_masking: bool=True,
                 use_weighted_masking: bool=False):
        """Initialize feed-forward Transformer loss module.

@@ -538,8 +541,8 @@ class FastSpeech2Loss(nn.Layer):
            ps: paddle.Tensor,
            es: paddle.Tensor,
            ilens: paddle.Tensor,
            olens: paddle.Tensor, ) -> Tuple[paddle.Tensor, paddle.Tensor,
                                             paddle.Tensor, paddle.Tensor]:
            olens: paddle.Tensor,
    ) -> Tuple[paddle.Tensor, paddle.Tensor, paddle.Tensor, paddle.Tensor]:
        """Calculate forward propagation.

        Parameters

@@ -611,9 +614,9 @@ class FastSpeech2Loss(nn.Layer):
        # make weighted mask and apply it
        if self.use_weighted_masking:
            out_masks = make_non_pad_mask(olens).unsqueeze(-1)
            out_weights = out_masks.cast(
                dtype=paddle.float32) / out_masks.cast(
                    dtype=paddle.float32).sum(axis=1, keepdim=True)
            out_weights = out_masks.cast(dtype=paddle.float32) / out_masks.cast(
                dtype=paddle.float32).sum(
                    axis=1, keepdim=True)
            out_weights /= ys.shape[0] * ys.shape[2]
            duration_masks = make_non_pad_mask(ilens)
            duration_weights = (duration_masks.cast(dtype=paddle.float32) /
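Aside on the reduction-factor adjustment reformatted above: ground-truth output lengths are truncated to a multiple of the reduction factor so decoder frames can be grouped. A numpy sketch with an illustrative r:

import numpy as np

r = 2
olens = np.array([7, 10, 5])
olens = np.array([olen - olen % r for olen in olens])
print(olens)  # [ 6 10  4]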
@@ -11,17 +11,14 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import numpy as np
import paddle
from paddle import nn
from paddle.fluid.param_attr import ParamAttr
from paddle.nn import functional as F
from paddle.nn import initializer as I

from scipy.interpolate import interp1d
from sklearn.metrics import roc_curve
from scipy.optimize import brentq
from sklearn.metrics import roc_curve


class LSTMSpeakerEncoder(nn.Layer):

@@ -81,8 +78,7 @@ class LSTMSpeakerEncoder(nn.Layer):
        # print("p1: ", p1.shape)
        p2 = paddle.bmm(
            embeds.reshape([-1, 1, embed_dim]),
            normalized_centroids_excl.reshape(
                [-1, embed_dim, 1]))  # (NM, 1, 1)
            normalized_centroids_excl.reshape([-1, embed_dim, 1]))  # (NM, 1, 1)
        p2 = p2.reshape([-1])  # (NM)

        # begin: alternative implementation for scatter

@@ -94,9 +90,8 @@ class LSTMSpeakerEncoder(nn.Layer):
        index = index * speakers_per_batch + paddle.arange(
            0, speakers_per_batch, dtype="int64").unsqueeze(-1)
        index = paddle.reshape(index, [-1])
        ones = paddle.ones([
            speakers_per_batch * utterances_per_speaker * speakers_per_batch
        ])
        ones = paddle.ones(
            [speakers_per_batch * utterances_per_speaker * speakers_per_batch])
        zeros = paddle.zeros_like(index, dtype=ones.dtype)
        mask_p1 = paddle.scatter(ones, index, zeros)
        p = p1 * mask_p1 + (1 - mask_p1) * paddle.scatter(ones, index, p2)

@@ -113,6 +108,9 @@ class LSTMSpeakerEncoder(nn.Layer):
            g = p._grad_ivar()
            g[...] = g * 0.01

    def inv_argmax(self, i, num):
        return np.eye(1, num, i, dtype=np.int)[0]

    def loss(self, embeds):
        """
        Computes the softmax loss according the section 2.1 of GE2E.

@@ -138,8 +136,8 @@ class LSTMSpeakerEncoder(nn.Layer):
        # EER (not backpropagated)
        with paddle.no_grad():
            ground_truth = target.numpy()
            inv_argmax = lambda i: np.eye(1, speakers_per_batch, i, dtype=np.int)[0]
            labels = np.array([inv_argmax(i) for i in ground_truth])
            labels = np.array(
                [self.inv_argmax(i, speakers_per_batch) for i in ground_truth])
            preds = sim_matrix.numpy()

            # Snippet from https://yangcha.github.io/EER-ROC/
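Aside on the EER computation the hunk above leads into (the linked snippet): the equal error rate is the operating point where the false positive rate equals one minus the true positive rate. A sketch with toy scores:

import numpy as np
from scipy.interpolate import interp1d
from scipy.optimize import brentq
from sklearn.metrics import roc_curve

labels = np.array([0, 0, 1, 1])
scores = np.array([0.1, 0.4, 0.35, 0.8])
fpr, tpr, _ = roc_curve(labels, scores)
eer = brentq(lambda x: 1. - x - interp1d(fpr, tpr)(x), 0., 1.)
print(eer)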
@@ -11,13 +11,14 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import math
from typing import List, Dict, Any, Union, Optional, Tuple
from typing import Any
from typing import Dict
from typing import List
from typing import Optional

import numpy as np
import paddle
from paddle import Tensor
from paddle import nn
from paddle.nn import functional as F

@@ -63,8 +64,8 @@ class Stretch2D(nn.Layer):


class UpsampleNet(nn.Layer):
    """A Layer to upsample spectrogram by applying consecutive stretch and
    convolutions.
    """A Layer to upsample spectrogram by applying consecutive stretch and
    convolutions.

    Parameters
    ----------

@@ -81,10 +82,10 @@ class UpsampleNet(nn.Layer):
    use_causal_conv : bool, optional
        Whether to use causal padding before convolution, by default False

        If True, Causal padding is used along the time axis, i.e. padding
        amount is ``receptive field - 1`` and 0 for before and after,
        If True, Causal padding is used along the time axis, i.e. padding
        amount is ``receptive field - 1`` and 0 for before and after,
        respectively.

        If False, "same" padding is used along the time axis.
    """

@@ -158,7 +159,7 @@ class ConvInUpsampleNet(nn.Layer):
    aux_context_window : int, optional
        Context window of the first 1D convolution applied to the input. It
        related to the kernel size of the convolution, by default 0

        If use causal convolution, the kernel size is ``window + 1``, else
        the kernel size is ``2 * window + 1``.
    use_causal_conv : bool, optional

@@ -167,7 +168,7 @@ class ConvInUpsampleNet(nn.Layer):
        If True, Causal padding is used along the time axis, i.e. padding
        amount is ``receptive field - 1`` and 0 for before and after,
        respectively.

        If False, "same" padding is used along the time axis.
    """

@@ -276,10 +277,7 @@ class ResidualBlock(nn.Layer):

        gate_out_channels = gate_channels // 2
        self.conv1x1_out = nn.Conv1D(
            gate_out_channels,
            residual_channels,
            kernel_size=1,
            bias_attr=bias)
            gate_out_channels, residual_channels, kernel_size=1, bias_attr=bias)
        self.conv1x1_skip = nn.Conv1D(
            gate_out_channels, skip_channels, kernel_size=1, bias_attr=bias)

@@ -428,13 +426,18 @@ class PWGGenerator(nn.Layer):
                use_causal_conv=use_causal_conv)
            self.conv_layers.append(conv)

        self.last_conv_layers = nn.Sequential(
            nn.ReLU(),
            nn.Conv1D(
                skip_channels, skip_channels, 1, bias_attr=True),
            nn.ReLU(),
            nn.Conv1D(
                skip_channels, out_channels, 1, bias_attr=True))
        self.last_conv_layers = nn.Sequential(nn.ReLU(),
                                              nn.Conv1D(
                                                  skip_channels,
                                                  skip_channels,
                                                  1,
                                                  bias_attr=True),
                                              nn.ReLU(),
                                              nn.Conv1D(
                                                  skip_channels,
                                                  out_channels,
                                                  1,
                                                  bias_attr=True))

        if use_weight_norm:
            self.apply_weight_norm()

@@ -548,18 +551,18 @@ class PWGDiscriminator(nn.Layer):
        by default True
    """

    def __init__(self,
                 in_channels: int=1,
                 out_channels: int=1,
                 kernel_size: int=3,
                 layers: int=10,
                 conv_channels: int=64,
                 dilation_factor: int=1,
                 nonlinear_activation: str="LeakyReLU",
                 nonlinear_activation_params: Dict[
                     str, Any]={"negative_slope": 0.2},
                 bias: bool=True,
                 use_weight_norm: bool=True):
    def __init__(
            self,
            in_channels: int=1,
            out_channels: int=1,
            kernel_size: int=3,
            layers: int=10,
            conv_channels: int=64,
            dilation_factor: int=1,
            nonlinear_activation: str="LeakyReLU",
            nonlinear_activation_params: Dict[str, Any]={"negative_slope": 0.2},
            bias: bool=True,
            use_weight_norm: bool=True):
        super().__init__()
        assert kernel_size % 2 == 1
        assert dilation_factor > 0

@@ -693,8 +696,7 @@ class ResidualPWGDiscriminator(nn.Layer):
        layers_per_stack = layers // stacks

        self.first_conv = nn.Sequential(
            nn.Conv1D(
                in_channels, residual_channels, 1, bias_attr=True),
            nn.Conv1D(in_channels, residual_channels, 1, bias_attr=True),
            getattr(nn, nonlinear_activation)(**nonlinear_activation_params))

        self.conv_layers = nn.LayerList()

@@ -714,11 +716,9 @@ class ResidualPWGDiscriminator(nn.Layer):

        self.last_conv_layers = nn.Sequential(
            getattr(nn, nonlinear_activation)(**nonlinear_activation_params),
            nn.Conv1D(
                skip_channels, skip_channels, 1, bias_attr=True),
            nn.Conv1D(skip_channels, skip_channels, 1, bias_attr=True),
            getattr(nn, nonlinear_activation)(**nonlinear_activation_params),
            nn.Conv1D(
                skip_channels, out_channels, 1, bias_attr=True))
            nn.Conv1D(skip_channels, out_channels, 1, bias_attr=True))

        if use_weight_norm:
            self.apply_weight_norm()
@@ -11,18 +11,11 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import math

import numpy as np
import paddle
from paddle import Tensor
from paddle import nn
from paddle.nn import functional as F
from paddle.nn import initializer as I

from parakeet.modules.positional_encoding import sinusoid_position_encoding
from parakeet.modules.expansion import expand
from parakeet.modules.positional_encoding import sinusoid_position_encoding


class ResidualBlock(nn.Layer):

@@ -38,8 +31,7 @@ class ResidualBlock(nn.Layer):
                    padding="same",
                    data_format="NLC"),
                nn.ReLU(),
                nn.BatchNorm1D(
                    channels, data_format="NLC"), ) for _ in range(n)
                nn.BatchNorm1D(channels, data_format="NLC"), ) for _ in range(n)
        ]
        self.blocks = nn.Sequential(*blocks)

@@ -95,16 +87,14 @@ class SpeedySpeechEncoder(nn.Layer):
            nn.Linear(hidden_size, hidden_size),
            nn.ReLU(), )
        res_blocks = [
            ResidualBlock(
                hidden_size, kernel_size, d, n=2) for d in dilations
            ResidualBlock(hidden_size, kernel_size, d, n=2) for d in dilations
        ]
        self.res_blocks = nn.Sequential(*res_blocks)

        self.postnet1 = nn.Sequential(nn.Linear(hidden_size, hidden_size))
        self.postnet2 = nn.Sequential(
            nn.ReLU(),
            nn.BatchNorm1D(
                hidden_size, data_format="NLC"),
            nn.BatchNorm1D(hidden_size, data_format="NLC"),
            nn.Linear(hidden_size, hidden_size), )

    def forward(self, text, tones):

@@ -120,13 +110,9 @@ class DurationPredictor(nn.Layer):
    def __init__(self, hidden_size):
        super().__init__()
        self.layers = nn.Sequential(
            ResidualBlock(
                hidden_size, 4, 1, n=1),
            ResidualBlock(
                hidden_size, 3, 1, n=1),
            ResidualBlock(
                hidden_size, 1, 1, n=1),
            nn.Linear(hidden_size, 1))
            ResidualBlock(hidden_size, 4, 1, n=1),
            ResidualBlock(hidden_size, 3, 1, n=1),
            ResidualBlock(hidden_size, 1, 1, n=1), nn.Linear(hidden_size, 1))

    def forward(self, x):
        return paddle.squeeze(self.layers(x), -1)

@@ -136,15 +122,13 @@ class SpeedySpeechDecoder(nn.Layer):
    def __init__(self, hidden_size, output_size, kernel_size, dilations):
        super().__init__()
        res_blocks = [
            ResidualBlock(
                hidden_size, kernel_size, d, n=2) for d in dilations
            ResidualBlock(hidden_size, kernel_size, d, n=2) for d in dilations
        ]
        self.res_blocks = nn.Sequential(*res_blocks)

        self.postnet1 = nn.Sequential(nn.Linear(hidden_size, hidden_size))
        self.postnet2 = nn.Sequential(
            ResidualBlock(
                hidden_size, kernel_size, 1, n=2),
            ResidualBlock(hidden_size, kernel_size, 1, n=2),
            nn.Linear(hidden_size, output_size))

    def forward(self, x):
@@ -11,20 +11,19 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import math

import paddle
from paddle import nn
from paddle.fluid.layers import sequence_mask
from paddle.nn import functional as F
from paddle.nn import initializer as I
from paddle.fluid.layers import sequence_mask
from tqdm import trange

from parakeet.modules.conv import Conv1dBatchNorm
from parakeet.modules.attention import LocationSensitiveAttention
from parakeet.modules.conv import Conv1dBatchNorm
from parakeet.modules.losses import guided_attention_loss
from parakeet.utils import checkpoint
from tqdm import trange

__all__ = ["Tacotron2", "Tacotron2Loss"]


@@ -74,8 +73,7 @@ class DecoderPreNet(nn.Layer):

        """

        x = F.dropout(
            F.relu(self.linear1(x)), self.dropout_rate, training=True)
        x = F.dropout(F.relu(self.linear1(x)), self.dropout_rate, training=True)
        output = F.dropout(
            F.relu(self.linear2(x)), self.dropout_rate, training=True)
        return output

@@ -745,10 +743,10 @@ class Tacotron2(nn.Layer):

        if global_condition is not None:
            global_condition = global_condition.unsqueeze(1)
            global_condition = paddle.expand(
                global_condition, [-1, encoder_outputs.shape[1], -1])
            encoder_outputs = paddle.concat(
                [encoder_outputs, global_condition], -1)
            global_condition = paddle.expand(global_condition,
                                             [-1, encoder_outputs.shape[1], -1])
            encoder_outputs = paddle.concat([encoder_outputs, global_condition],
                                            -1)

        # [B, T_enc, 1]
        mask = sequence_mask(

@@ -813,10 +811,10 @@ class Tacotron2(nn.Layer):

        if global_condition is not None:
            global_condition = global_condition.unsqueeze(1)
            global_condition = paddle.expand(
                global_condition, [-1, encoder_outputs.shape[1], -1])
            encoder_outputs = paddle.concat(
                [encoder_outputs, global_condition], -1)
            global_condition = paddle.expand(global_condition,
                                             [-1, encoder_outputs.shape[1], -1])
            encoder_outputs = paddle.concat([encoder_outputs, global_condition],
                                            -1)
        if self.decoder.use_stop_token:
            mel_outputs, alignments, stop_logits = self.decoder.infer(
                encoder_outputs, max_decoder_steps=max_decoder_steps)
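Aside on the global-condition broadcast reformatted above: a per-utterance vector is tiled along the encoder time axis and concatenated to every frame. A hedged paddle sketch (shapes illustrative):

import paddle

enc = paddle.randn([2, 5, 8])           # (B, T_enc, C)
g = paddle.randn([2, 4]).unsqueeze(1)   # (B, 1, C_g)
g = paddle.expand(g, [-1, enc.shape[1], -1])
enc = paddle.concat([enc, g], -1)       # (B, T_enc, C + C_g)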
@@ -11,22 +11,26 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import math
from tqdm import trange

import paddle
from paddle import nn
from paddle.nn import functional as F
from paddle.nn import initializer as I
from tqdm import trange

import parakeet
from parakeet.modules.attention import _split_heads, _concat_heads, drop_head, scaled_dot_product_attention
from parakeet.modules.transformer import PositionwiseFFN
from parakeet.modules import masking
from parakeet.modules.conv import Conv1dBatchNorm
from parakeet.modules import positional_encoding as pe
from parakeet.modules import losses as L
from parakeet.utils import checkpoint, scheduler
from parakeet.modules import masking
from parakeet.modules import positional_encoding as pe
from parakeet.modules.attention import _concat_heads
from parakeet.modules.attention import _split_heads
from parakeet.modules.attention import drop_head
from parakeet.modules.attention import scaled_dot_product_attention
from parakeet.modules.conv import Conv1dBatchNorm
from parakeet.modules.transformer import PositionwiseFFN
from parakeet.utils import checkpoint
from parakeet.utils import scheduler

__all__ = ["TransformerTTS", "TransformerTTSLoss"]


@@ -404,16 +408,14 @@ class TransformerTTS(nn.Layer):
            self.toned = False
        # position encoding matrix may be extended later
        self.encoder_pe = pe.sinusoid_position_encoding(1000, d_encoder)
        self.encoder_pe_scalar = self.create_parameter(
            [1], attr=I.Constant(1.))
        self.encoder_pe_scalar = self.create_parameter([1], attr=I.Constant(1.))
        self.encoder = TransformerEncoder(d_encoder, n_heads, d_ffn,
                                          encoder_layers, dropout)

        # decoder
        self.decoder_prenet = MLPPreNet(d_mel, d_prenet, d_decoder, dropout)
        self.decoder_pe = pe.sinusoid_position_encoding(1000, d_decoder)
        self.decoder_pe_scalar = self.create_parameter(
            [1], attr=I.Constant(1.))
        self.decoder_pe_scalar = self.create_parameter([1], attr=I.Constant(1.))
        self.decoder = TransformerDecoder(
            d_decoder,
            n_heads,

@@ -470,14 +472,13 @@ class TransformerTTS(nn.Layer):
            self.encoder_pe = pe.sinusoid_position_encoding(new_T,
                                                            self.d_encoder)
        pos_enc = self.encoder_pe[:T_enc, :]  # (T, C)
        x = embed.scale(math.sqrt(
            self.d_encoder)) + pos_enc * self.encoder_pe_scalar
        x = embed.scale(
            math.sqrt(self.d_encoder)) + pos_enc * self.encoder_pe_scalar
        x = F.dropout(x, self.dropout, training=self.training)

        # TODO(chenfeiyu): unsqueeze a decoder_time_steps=1 for the mask
        encoder_padding_mask = paddle.unsqueeze(
            masking.id_mask(
                text, self.padding_idx, dtype=x.dtype), 1)
            masking.id_mask(text, self.padding_idx, dtype=x.dtype), 1)
        x, attention_weights = self.encoder(x, encoder_padding_mask,
                                            self.drop_n_heads)
        return x, attention_weights, encoder_padding_mask

@@ -492,8 +493,8 @@ class TransformerTTS(nn.Layer):
            self.decoder_pe = pe.sinusoid_position_encoding(new_T,
                                                            self.d_decoder)
        pos_enc = self.decoder_pe[:T_dec * self.r:self.r, :]
        x = x.scale(math.sqrt(
            self.d_decoder)) + pos_enc * self.decoder_pe_scalar
        x = x.scale(
            math.sqrt(self.d_decoder)) + pos_enc * self.decoder_pe_scalar
        x = F.dropout(x, self.dropout, training=self.training)

        no_future_mask = masking.future_mask(T_dec, dtype=input.dtype)

@@ -547,9 +548,8 @@ class TransformerTTS(nn.Layer):
            # stop condition: (if any ouput frame of the output multiframes hits the stop condition)
            # import pdb; pdb.set_trace()
            if paddle.any(
                    paddle.argmax(
                        stop_logits[0, -self.r:, :], axis=-1) ==
                    self.stop_prob_index):
                    paddle.argmax(stop_logits[0, -self.r:, :],
                                  axis=-1) == self.stop_prob_index):
                if verbose:
                    print("Hits stop condition.")
                break

@@ -602,8 +602,7 @@ class TransformerTTSLoss(nn.Layer):

    def forward(self, mel_output, mel_intermediate, mel_target, stop_logits,
                stop_probs):
        mask = masking.feature_mask(
            mel_target, axis=-1, dtype=mel_target.dtype)
        mask = masking.feature_mask(mel_target, axis=-1, dtype=mel_target.dtype)
        mask1 = paddle.unsqueeze(mask, -1)
        mel_loss1 = L.masked_l1_loss(mel_output, mel_target, mask1)
        mel_loss2 = L.masked_l1_loss(mel_intermediate, mel_target, mask1)
@@ -11,10 +11,11 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import time
import math
from typing import List, Union, Tuple
import time
from typing import List
from typing import Tuple
from typing import Union

import numpy as np
import paddle

@@ -22,8 +23,8 @@ from paddle import nn
from paddle.nn import functional as F
from paddle.nn import initializer as I

from parakeet.utils import checkpoint
from parakeet.modules import geometry as geo
from parakeet.utils import checkpoint

__all__ = ["WaveFlow", "ConditionalWaveFlow", "WaveFlowLoss"]


@@ -120,7 +121,7 @@ class UpsampleNet(nn.LayerList):
        If trim_conv_artifact is ``True``, the output time steps is less
        than ``time_steps \* upsample_factors``.
        """
        x = paddle.unsqueeze(x, 1)  #(B, C, T) -> (B, 1, C, T)
        x = paddle.unsqueeze(x, 1)  # (B, C, T) -> (B, 1, C, T)
        for layer in self:
            x = layer(x)
            if trim_conv_artifact:

@@ -795,7 +796,7 @@ class ConditionalWaveFlow(nn.LayerList):
            The synthesized audio, where``T <= T_mel \* upsample_factors``.
        """
        start = time.time()
        condition = self.encoder(mel, trim_conv_artifact=True)  #(B, C, T)
        condition = self.encoder(mel, trim_conv_artifact=True)  # (B, C, T)
        batch_size, _, time_steps = condition.shape
        z = paddle.randn([batch_size, time_steps], dtype=mel.dtype)
        x = self.decoder.inverse(z, condition)

@@ -893,12 +894,12 @@ class WaveFlowLoss(nn.Layer):
class ConditionalWaveFlow2Infer(ConditionalWaveFlow):
    def forward(self, mel):
        """Generate raw audio given mel spectrogram.

        Parameters
        ----------
        mel : np.ndarray [shape=(C_mel, T_mel)]
            Mel spectrogram of an utterance(in log-magnitude).

            Mel spectrogram of an utterance(in log-magnitude).

        Returns
        -------
        np.ndarray [shape=(T,)]
@@ -11,11 +11,3 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from parakeet.modules.attention import *
from parakeet.modules.conv import *
from parakeet.modules.geometry import *
from parakeet.modules.losses import *
from parakeet.modules.masking import *
from parakeet.modules.positional_encoding import *
from parakeet.modules.transformer import *
@@ -11,19 +11,15 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import math

import numpy as np
import paddle
from paddle import nn
from paddle.nn import functional as F


def scaled_dot_product_attention(q,
                                 k,
                                 v,
                                 mask=None,
                                 dropout=0.0,
def scaled_dot_product_attention(q, k, v, mask=None, dropout=0.0,
                                 training=True):
    r"""Scaled dot product attention with masking.

@@ -33,24 +29,19 @@ def scaled_dot_product_attention(q,

    Parameters
    -----------

    q : Tensor [shape=(\*, T_q, d)]
        the query tensor.

    k : Tensor [shape=(\*, T_k, d)]
        the key tensor.

    v : Tensor [shape=(\*, T_k, d_v)]
        the value tensor.

    mask : Tensor, [shape=(\*, T_q, T_k) or broadcastable shape], optional
        the mask tensor, zeros correspond to paddings. Defaults to None.

    Returns
    ----------
    out : Tensor [shape=(\*, T_q, d_v)]
    out : Tensor [shape=(\*, T_q, d_v)]
        the context vector.

    attn_weights : Tensor [shape=(\*, T_q, T_k)]
        the attention weights.
    """
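Aside on what the reformatted function above computes: softmax(q k^T / sqrt(d)) v, with zeros in the mask marking padding. A self-contained numpy sketch of the same computation:

import numpy as np

def sdpa(q, k, v, mask=None):
    d = q.shape[-1]
    logits = q @ k.swapaxes(-1, -2) / np.sqrt(d)
    if mask is not None:
        logits = logits * mask - 1e9 * (1 - mask)  # zeros in mask = padding
    w = np.exp(logits - logits.max(axis=-1, keepdims=True))
    w /= w.sum(axis=-1, keepdims=True)
    return w @ v, w

q, k, v = (np.random.randn(2, 4, 8), np.random.randn(2, 6, 8),
           np.random.randn(2, 6, 8))
out, attn = sdpa(q, k, v)
print(out.shape, attn.shape)  # (2, 4, 8) (2, 4, 6)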
@@ -74,10 +65,8 @@ def drop_head(x, drop_n_heads, training=True):
     ----------
     x : Tensor [shape=(batch_size, num_heads, time_steps, channels)]
         The input, multiple context vectors.
-
     drop_n_heads : int [0<= drop_n_heads <= num_heads]
         Number of vectors to drop.
-
     training : bool
         A flag indicating whether it is in training. If `False`, no dropout is
         applied.
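A sketch of what such head dropout amounts to, under a straightforward reading of the docstring (zero out that many randomly chosen heads per batch item; the real implementation may also rescale the survivors):

import numpy as np
import paddle

def drop_head_sketch(x, drop_n_heads, training=True):
    if not training or drop_n_heads == 0:
        return x
    batch_size, num_heads = x.shape[0], x.shape[1]
    # build a (batch_size, num_heads) 0/1 mask with drop_n_heads zeros per row
    mask = np.ones([batch_size, num_heads], dtype=np.float32)
    for row in mask:
        row[np.random.choice(num_heads, drop_n_heads, replace=False)] = 0.0
    mask = paddle.to_tensor(mask).unsqueeze(-1).unsqueeze(-1)
    return x * mask  # broadcasts over time_steps and channels

y = drop_head_sketch(paddle.randn([2, 4, 10, 8]), drop_n_heads=1)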
@@ -127,17 +116,14 @@ class MonoheadAttention(nn.Layer):
     ----------
     model_dim : int
         Feature size of the query.
 
     dropout : float, optional
-        Dropout probability of scaled dot product attention and final context
+        Dropout probability of scaled dot product attention and final context
         vector. Defaults to 0.0.
 
     k_dim : int, optional
-        Feature size of the key of each scaled dot product attention. If not
+        Feature size of the key of each scaled dot product attention. If not
         provided, it is set to `model_dim / num_heads`. Defaults to None.
 
     v_dim : int, optional
-        Feature size of the key of each scaled dot product attention. If not
+        Feature size of the key of each scaled dot product attention. If not
         provided, it is set to `model_dim / num_heads`. Defaults to None.
     """
@@ -162,23 +148,19 @@ class MonoheadAttention(nn.Layer):
 
         Parameters
         -----------
-        q : Tensor [shape=(batch_size, time_steps_q, model_dim)]
+        q : Tensor [shape=(batch_size, time_steps_q, model_dim)]
             The queries.
 
-        k : Tensor [shape=(batch_size, time_steps_k, model_dim)]
+        k : Tensor [shape=(batch_size, time_steps_k, model_dim)]
             The keys.
 
-        v : Tensor [shape=(batch_size, time_steps_k, model_dim)]
+        v : Tensor [shape=(batch_size, time_steps_k, model_dim)]
             The values.
 
         mask : Tensor [shape=(batch_size, times_steps_q, time_steps_k] or broadcastable shape
             The mask.
 
         Returns
         ----------
-        out : Tensor [shape=(batch_size, time_steps_q, model_dim)]
+        out : Tensor [shape=(batch_size, time_steps_q, model_dim)]
             The context vector.
 
         attention_weights : Tensor [shape=(batch_size, times_steps_q, time_steps_k)]
             The attention weights.
         """
@@ -200,20 +182,16 @@ class MultiheadAttention(nn.Layer):
     -----------
     model_dim: int
         The feature size of query.
 
     num_heads : int
         The number of attention heads.
 
     dropout : float, optional
-        Dropout probability of scaled dot product attention and final context
+        Dropout probability of scaled dot product attention and final context
         vector. Defaults to 0.0.
 
     k_dim : int, optional
-        Feature size of the key of each scaled dot product attention. If not
+        Feature size of the key of each scaled dot product attention. If not
         provided, it is set to ``model_dim / num_heads``. Defaults to None.
 
     v_dim : int, optional
-        Feature size of the key of each scaled dot product attention. If not
+        Feature size of the key of each scaled dot product attention. If not
         provided, it is set to ``model_dim / num_heads``. Defaults to None.
 
     Raises
@@ -248,23 +226,19 @@ class MultiheadAttention(nn.Layer):
 
         Parameters
         -----------
-        q : Tensor [shape=(batch_size, time_steps_q, model_dim)]
+        q : Tensor [shape=(batch_size, time_steps_q, model_dim)]
             The queries.
 
-        k : Tensor [shape=(batch_size, time_steps_k, model_dim)]
+        k : Tensor [shape=(batch_size, time_steps_k, model_dim)]
             The keys.
 
-        v : Tensor [shape=(batch_size, time_steps_k, model_dim)]
+        v : Tensor [shape=(batch_size, time_steps_k, model_dim)]
             The values.
 
         mask : Tensor [shape=(batch_size, times_steps_q, time_steps_k] or broadcastable shape
             The mask.
 
         Returns
         ----------
-        out : Tensor [shape=(batch_size, time_steps_q, model_dim)]
+        out : Tensor [shape=(batch_size, time_steps_q, model_dim)]
             The context vector.
 
         attention_weights : Tensor [shape=(batch_size, times_steps_q, time_steps_k)]
             The attention weights.
         """
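For orientation, the head bookkeeping that multi-head attention performs around the scaled dot product core: (B, T, D) is split into (B, num_heads, T, D / num_heads) before attention and merged back after. This split/merge is the standard construction, assumed here since the hunk only documents shapes:

import paddle

B, T, D, H = 2, 6, 32, 4                  # batch, time, model_dim, num_heads
x = paddle.randn([B, T, D])
heads = x.reshape([B, T, H, D // H]).transpose([0, 2, 1, 3])  # (B, H, T, d)
merged = heads.transpose([0, 2, 1, 3]).reshape([B, T, D])     # (B, T, D)
print(bool(paddle.allclose(x, merged)))   # True: the reshape round-trips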
@@ -290,16 +264,12 @@ class LocationSensitiveAttention(nn.Layer):
     -----------
     d_query: int
         The feature size of query.
 
     d_key : int
         The feature size of key.
 
     d_attention : int
-        The feature size of dimension.
-
+        The feature size of dimension.
     location_filters : int
         Filter size of attention convolution.
 
     location_kernel_size : int
         Kernel size of attention convolution.
     """
@@ -337,27 +307,22 @@ class LocationSensitiveAttention(nn.Layer):
 
         Parameters
         -----------
-        query : Tensor [shape=(batch_size, d_query)]
+        query : Tensor [shape=(batch_size, d_query)]
             The queries.
 
-        processed_key : Tensor [shape=(batch_size, time_steps_k, d_attention)]
+        processed_key : Tensor [shape=(batch_size, time_steps_k, d_attention)]
             The keys after linear layer.
 
-        value : Tensor [shape=(batch_size, time_steps_k, d_key)]
+        value : Tensor [shape=(batch_size, time_steps_k, d_key)]
             The values.
 
         attention_weights_cat : Tensor [shape=(batch_size, time_step_k, 2)]
             Attention weights concat.
 
         mask : Tensor, optional
             The mask. Shape should be (batch_size, times_steps_k, 1).
             Defaults to None.
 
         Returns
         ----------
-        attention_context : Tensor [shape=(batch_size, d_attention)]
+        attention_context : Tensor [shape=(batch_size, d_attention)]
             The context vector.
 
         attention_weights : Tensor [shape=(batch_size, time_steps_k)]
             The attention weights.
         """
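``attention_weights_cat`` is documented but not computed in this hunk. In the Tacotron2-style design this layer follows, it stacks the previous and cumulative alignments so a 1D convolution can extract location features. A hedged sketch of that step (shapes per the docstring; the conv here is a stand-in, not the layer's own):

import paddle
import paddle.nn.functional as F

B, T_k, filters, kernel = 2, 50, 32, 31
prev_attn = F.softmax(paddle.randn([B, T_k]))   # alignment at step t-1
cum_attn = F.softmax(paddle.randn([B, T_k]))    # running sum of alignments
weights_cat = paddle.stack([prev_attn, cum_attn], axis=-1)  # (B, T_k, 2)

# Conv1D wants channels first, so transpose to (B, 2, T_k) before convolving.
location_conv = paddle.nn.Conv1D(2, filters, kernel, padding=(kernel - 1) // 2)
location_features = location_conv(weights_cat.transpose([0, 2, 1]))  # (B, filters, T_k)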
@@ -11,20 +11,19 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import librosa
-import numpy as np
 import paddle
-from librosa.util import pad_center
 from paddle import nn
 from paddle.nn import functional as F
 from scipy import signal
+import librosa
+from librosa.util import pad_center
+import numpy as np
 
 __all__ = ["quantize", "dequantize", "STFT", "MelScale"]
 
 
 def quantize(values, n_bands):
-    """Linearlly quantize a float Tensor in [-1, 1) to an interger Tensor in
+    """Linearlly quantize a float Tensor in [-1, 1) to an interger Tensor in
     [0, n_bands).
 
     Parameters
@@ -33,7 +32,7 @@ def quantize(values, n_bands):
         The floating point value.
 
     n_bands : int
-        The number of bands. The output integer Tensor's value is in the range
+        The number of bands. The output integer Tensor's value is in the range
         [0, n_bans).
 
     Returns
@@ -46,7 +45,7 @@ def quantize(values, n_bands):
 
 
 def dequantize(quantized, n_bands, dtype=None):
-    """Linearlly dequantize an integer Tensor into a float Tensor in the range
+    """Linearlly dequantize an integer Tensor into a float Tensor in the range
     [-1, 1).
 
     Parameters
@@ -55,7 +54,7 @@ def dequantize(quantized, n_bands, dtype=None):
         The quantized value in the range [0, n_bands).
 
     n_bands : int
-        Number of bands. The input integer Tensor's value is in the range
+        Number of bands. The input integer Tensor's value is in the range
         [0, n_bans).
 
     dtype : str, optional
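A small numpy illustration of the two mappings these docstrings describe: linear quantization to ``n_bands`` levels and its midpoint inverse. This mirrors the documented contract, not necessarily the exact library code:

import numpy as np

def quantize_sketch(values, n_bands):
    # [-1, 1) -> {0, ..., n_bands - 1}
    return ((values + 1.0) / 2.0 * n_bands).astype(np.int64)

def dequantize_sketch(quantized, n_bands):
    # {0, ..., n_bands - 1} -> band midpoints in [-1, 1)
    return (quantized + 0.5) / n_bands * 2.0 - 1.0

x = np.array([-1.0, -0.5, 0.0, 0.999])
q = quantize_sketch(x, 256)          # [  0,  64, 128, 255]
x_hat = dequantize_sketch(q, 256)    # [-0.996, -0.496, 0.004, 0.996]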
@@ -73,43 +72,36 @@ def dequantize(quantized, n_bands, dtype=None):
 
 
 class STFT(nn.Layer):
-    """A module for computing stft transformation in a differentiable way.
+    """A module for computing stft transformation in a differentiable way.
 
     Parameters
     ------------
     n_fft : int
         Number of samples in a frame.
 
     hop_length : int
         Number of samples shifted between adjacent frames.
 
     win_length : int
         Length of the window.
 
     window : str, optional
-        Name of window function, see `scipy.signal.get_window` for more
+        Name of window function, see `scipy.signal.get_window` for more
         details. Defaults to "hanning".
 
     center : bool
         If True, the signal y is padded so that frame D[:, t] is centered
         at y[t * hop_length]. If False, then D[:, t] begins at y[t * hop_length].
         Defaults to True.
 
     pad_mode : string or function
-        If center=True, this argument is passed to np.pad for padding the edges
-        of the signal y. By default (pad_mode="reflect"), y is padded on both
-        sides with its own reflection, mirrored around its first and last
+        If center=True, this argument is passed to np.pad for padding the edges
+        of the signal y. By default (pad_mode="reflect"), y is padded on both
+        sides with its own reflection, mirrored around its first and last
         sample respectively. If center=False, this argument is ignored.
 
 
     Notes
     -----------
-    It behaves like ``librosa.core.stft``. See ``librosa.core.stft`` for more
+    It behaves like ``librosa.core.stft``. See ``librosa.core.stft`` for more
     details.
 
-    Given a audio which ``T`` samples, it the STFT transformation outputs a
-    spectrum with (C, frames) and complex dtype, where ``C = 1 + n_fft / 2``
+    Given a audio which ``T`` samples, it the STFT transformation outputs a
+    spectrum with (C, frames) and complex dtype, where ``C = 1 + n_fft / 2``
     and ``frames = 1 + T // hop_lenghth``.
 
     Ony ``center`` and ``reflect`` padding is supported now.
@@ -144,19 +136,19 @@ class STFT(nn.Layer):
         # pad window to n_fft size
         if n_fft != win_length:
             window = pad_center(window, n_fft, mode="constant")
-            #lpad = (n_fft - win_length) // 2
-            #rpad = n_fft - win_length - lpad
-            #window = np.pad(window, ((lpad, pad), ), 'constant')
+            # lpad = (n_fft - win_length) // 2
+            # rpad = n_fft - win_length - lpad
+            # window = np.pad(window, ((lpad, pad), ), 'constant')
 
         # calculate weights
-        #r = np.arange(0, n_fft)
-        #M = np.expand_dims(r, -1) * np.expand_dims(r, 0)
-        #w_real = np.reshape(window *
-                            #np.cos(2 * np.pi * M / n_fft)[:self.n_bin],
-                            #(self.n_bin, 1, self.n_fft))
-        #w_imag = np.reshape(window *
-                            #np.sin(-2 * np.pi * M / n_fft)[:self.n_bin],
-                            #(self.n_bin, 1, self.n_fft))
+        # r = np.arange(0, n_fft)
+        # M = np.expand_dims(r, -1) * np.expand_dims(r, 0)
+        # w_real = np.reshape(window *
+        #                     np.cos(2 * np.pi * M / n_fft)[:self.n_bin],
+        #                     (self.n_bin, 1, self.n_fft))
+        # w_imag = np.reshape(window *
+        #                     np.sin(-2 * np.pi * M / n_fft)[:self.n_bin],
+        #                     (self.n_bin, 1, self.n_fft))
         weight = np.fft.fft(np.eye(n_fft))[:self.n_bin]
         w_real = weight.real
         w_imag = weight.imag
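The live line kept after the commented-out block builds the analysis weights from a DFT matrix: ``np.fft.fft(np.eye(n_fft))`` is exactly the matrix whose row ``f`` holds ``exp(-2j*pi*f*t/n_fft)``, so convolving frames with its first ``1 + n_fft // 2`` rows computes a one-sided STFT. A quick check of that identity:

import numpy as np

n_fft = 8
n_bin = 1 + n_fft // 2
weight = np.fft.fft(np.eye(n_fft))[:n_bin]  # (n_bin, n_fft) DFT rows

frame = np.random.randn(n_fft)              # one analysis frame
manual = weight @ frame                     # spectrum via the weight matrix
reference = np.fft.rfft(frame)              # numpy's one-sided FFT
print(np.allclose(manual, reference))       # True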
@@ -174,17 +166,18 @@ class STFT(nn.Layer):
             The input waveform.
         Returns
         ------------
-        real : Tensor [shape=(B, C, frames)]
+        real : Tensor [shape=(B, C, frames)]
             The real part of the spectrogram.
 
-        imag : Tensor [shape=(B, C, frames)]
+        imag : Tensor [shape=(B, C, frames)]
             The image part of the spectrogram.
         """
         x = paddle.unsqueeze(x, axis=1)
         if self.center:
-            x = F.pad(x, [self.n_fft // 2, self.n_fft // 2],
-                      data_format='NCL',
-                      mode=self.pad_mode)
+            x = F.pad(
+                x, [self.n_fft // 2, self.n_fft // 2],
+                data_format='NCL',
+                mode=self.pad_mode)
 
         # to BCT, C=1
         out = F.conv1d(x, self.weight, stride=self.hop_length)
@@ -199,7 +192,7 @@ class STFT(nn.Layer):
             The input waveform.
         Returns
         ------------
-        Tensor [shape=(B, C, T)]
+        Tensor [shape=(B, C, T)]
             The power spectrum.
         """
         real, imag = self.forward(x)
@@ -214,7 +207,7 @@ class STFT(nn.Layer):
             The input waveform.
         Returns
         ------------
-        Tensor [shape=(B, C, T)]
+        Tensor [shape=(B, C, T)]
             The magnitude of the spectrum.
         """
         power = self.power(x)
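Per the two docstrings above, power and magnitude are derived elementwise from the real and imaginary outputs of ``forward``. The relation is the standard one, stated here as an assumption about the implementation:

import numpy as np

spec = np.fft.rfft(np.random.randn(256))     # stand-in complex spectrum
power = spec.real ** 2 + spec.imag ** 2      # what power(x) returns per bin
magnitude = np.sqrt(power)                   # what magnitude(x) returns
print(np.allclose(magnitude, np.abs(spec)))  # True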
@@ -11,7 +11,6 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import paddle
 from paddle import nn
 
@@ -22,48 +21,40 @@ __all__ = [
 
 
 class Conv1dCell(nn.Conv1D):
-    """A subclass of Conv1D layer, which can be used in an autoregressive
+    """A subclass of Conv1D layer, which can be used in an autoregressive
     decoder like an RNN cell.
 
-    When used in autoregressive decoding, it performs causal temporal
-    convolution incrementally. At each time step, it takes a step input and
-    returns a step output.
+    When used in autoregressive decoding, it performs causal temporal
+    convolution incrementally. At each time step, it takes a step input and
+    returns a step output.
 
     Notes
     ------
-    It is done by caching an internal buffer of length ``receptive_file - 1``.
-    when adding a step input, the buffer is shited by one step, the latest
-    input is added to be buffer and the oldest step is discarded. And it
-    returns a step output. For single step case, convolution is equivalent to a
+    It is done by caching an internal buffer of length ``receptive_file - 1``.
+    when adding a step input, the buffer is shited by one step, the latest
+    input is added to be buffer and the oldest step is discarded. And it
+    returns a step output. For single step case, convolution is equivalent to a
     linear transformation.
 
     That it can be used as a cell depends on several restrictions:
 
     1. stride must be 1;
     2. padding must be a causal padding (recpetive_field - 1, 0).
 
-    Thus, these arguments are removed from the ``__init__`` method of this
+    Thus, these arguments are removed from the ``__init__`` method of this
     class.
 
     Parameters
     ----------
     in_channels: int
         The feature size of the input.
 
     out_channels: int
         The feature size of the output.
 
     kernel_size: int or Tuple[int]
         The size of the kernel.
 
     dilation: int or Tuple[int]
         The dilation of the convolution, by default 1
 
     weight_attr: ParamAttr, Initializer, str or bool, optional
         The parameter attribute of the convolution kernel, by default None.
 
     bias_attr: ParamAttr, Initializer, str or bool, optional
-        The parameter attribute of the bias. If ``False``, this layer does not
+        The parameter attribute of the bias. If ``False``, this layer does not
         have a bias, by default None.
 
     Examples
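The buffering trick the Notes describe can be demonstrated with a plain causal ``Conv1D``: feeding samples one at a time through a sliding window of the receptive field reproduces the full-sequence result. A self-contained sketch that re-derives the idea (it is not ``Conv1dCell`` itself):

import paddle
from paddle import nn
import paddle.nn.functional as F

k, C = 3, 4
conv = nn.Conv1D(C, C, k)                  # no padding; causality added by hand
x = paddle.randn([1, C, 10])

# full-sequence causal convolution: left-pad by receptive field minus one
full = conv(F.pad(x, [k - 1, 0], data_format='NCL'))

# incremental decoding: keep a buffer of the last k samples, shift one in per step
buf = paddle.zeros([1, C, k])
steps = []
for t in range(x.shape[-1]):
    buf = paddle.concat([buf[:, :, 1:], x[:, :, t:t + 1]], axis=-1)
    steps.append(conv(buf))                # one output step, shape (1, C, 1)
incremental = paddle.concat(steps, axis=-1)

print(bool(paddle.allclose(full, incremental)))  # True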
@@ -114,7 +105,7 @@ class Conv1dCell(nn.Conv1D):
 
         Warnings
         ---------
-        This method should be called before a sequence of calls to
+        This method should be called before a sequence of calls to
         ``add_input``.
 
         Raises
@@ -165,12 +156,12 @@ class Conv1dCell(nn.Conv1D):
 
         Parameters
         -----------
-        x_t : Tensor [shape=(batch_size, in_channels)]
+        x_t : Tensor [shape=(batch_size, in_channels)]
             The step input.
 
         Returns
         -------
-        y_t :Tensor [shape=(batch_size, out_channels)]
+        y_t :Tensor [shape=(batch_size, out_channels)]
             The step output.
         """
         batch_size = x_t.shape[0]
@@ -199,36 +190,27 @@ class Conv1dBatchNorm(nn.Layer):
     ----------
     in_channels : int
         The feature size of the input.
 
     out_channels : int
         The feature size of the output.
 
     kernel_size : int
         The size of the convolution kernel.
 
     stride : int, optional
         The stride of the convolution, by default 1.
 
     padding : int, str or Tuple[int], optional
-        The padding of the convolution.
+        The padding of the convolution.
         If int, a symmetrical padding is applied before convolution;
         If str, it should be "same" or "valid";
-        If Tuple[int], its length should be 2, meaning
+        If Tuple[int], its length should be 2, meaning
         ``(pad_before, pad_after)``, by default 0.
 
     weight_attr : ParamAttr, Initializer, str or bool, optional
         The parameter attribute of the convolution kernel, by default None.
 
     bias_attr : ParamAttr, Initializer, str or bool, optional
-        The parameter attribute of the bias of the convolution, by default
+        The parameter attribute of the bias of the convolution, by default
         None.
 
     data_format : str ["NCL" or "NLC"], optional
         The data layout of the input, by default "NCL"
 
     momentum : float, optional
         The momentum of the BatchNorm1D layer, by default 0.9
 
     epsilon : [type], optional
         The epsilon of the BatchNorm1D layer, by default 1e-05
     """
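Going by the parameter list, the layer composes a ``Conv1D`` with a ``BatchNorm1D``. A hedged re-sketch of that composition (names and defaults are inferred from the docstring, not copied from the implementation):

import paddle
from paddle import nn

class Conv1dBatchNormSketch(nn.Layer):
    def __init__(self, in_channels, out_channels, kernel_size,
                 stride=1, padding=0, momentum=0.9, epsilon=1e-5):
        super().__init__()
        self.conv = nn.Conv1D(in_channels, out_channels, kernel_size,
                              stride=stride, padding=padding)
        self.bn = nn.BatchNorm1D(out_channels, momentum=momentum,
                                 epsilon=epsilon)

    def forward(self, x):                    # x: (B, in_channels, T), NCL
        return self.bn(self.conv(x))

layer = Conv1dBatchNormSketch(80, 256, 5, padding=2)
y = layer(paddle.randn([4, 80, 100]))        # (4, 256, 100)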
@@ -11,9 +11,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
 import numpy as np
-
 import paddle
 from paddle import Tensor
-