TianYuan 2021-08-17 07:29:30 +00:00
parent 30f344a6d0
commit c497fd843d
157 changed files with 1005 additions and 1100 deletions

.clang-format Normal file

@ -0,0 +1,28 @@
# This file is used by clang-format to autoformat paddle source code
#
# clang-format is part of the LLVM toolchain.
# LLVM and Clang need to be installed to format source code with it.
#
# The basic usage is,
# clang-format -i -style=file PATH/TO/SOURCE/CODE
#
# The -style=file option implicitly uses the ".clang-format" file located in
# one of the parent directories.
# The -i option means in-place change.
#
# The documentation for clang-format is at:
# http://clang.llvm.org/docs/ClangFormat.html
# http://clang.llvm.org/docs/ClangFormatStyleOptions.html
---
Language: Cpp
BasedOnStyle: Google
IndentWidth: 4
TabWidth: 4
ContinuationIndentWidth: 4
MaxEmptyLinesToKeep: 2
AccessModifierOffset: -2 # do not indent private/protected/public in a class
Standard: Cpp11
AllowAllParametersOfDeclarationOnNextLine: true
BinPackParameters: false
BinPackArguments: false
...
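An illustrative sketch (an editor's addition, not part of this commit): the workflow described in the comments above can be scripted. The src/ tree and the extension list here are assumptions.

import subprocess
from pathlib import Path

# Format every C/C++/CUDA source under a hypothetical src/ tree in place;
# -style=file makes clang-format discover the .clang-format file above.
for pattern in ("*.h", "*.cc", "*.cpp", "*.cu"):
    for source in Path("src").rglob(pattern):
        subprocess.run(
            ["clang-format", "-i", "-style=file", str(source)], check=True)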

.flake8 Normal file

@ -0,0 +1,50 @@
[flake8]
########## OPTIONS ##########
# Set the maximum length that any line (with some exceptions) may be.
max-line-length = 120
################### FILE PATTERNS ##########################
# Provide a comma-separated list of glob patterns to exclude from checks.
exclude =
# git folder
.git,
# python cache
__pycache__,
third_party/,
# Provide a comma-separated list of glob patterns to include for checks.
filename =
*.py
########## RULES ##########
# ERROR CODES
#
# E/W - PEP8 errors/warnings (pycodestyle)
# F - linting errors (pyflakes)
# C - McCabe complexity error (mccabe)
#
# W503 - line break before binary operator
# Specify a list of codes to ignore.
ignore =
W503,
E252,E262,E127,E265,E126,E266,E241,E261,E128,E125,
W291,W293,W605,
E203,E305,E402,E501,E721,E741,F403,F405,F821,F841,F999,W504,C408,E302,E303,
# shebang has extra meaning in fbcode lints, so I think it's not worth trying
# to line this up with the executable bit
EXE001,
# these ignores are from flake8-bugbear; please fix!
B007,B008,
# these ignores are from flake8-comprehensions; please fix!
C400,C401,C402,C403,C404,C405,C407,C411,C413,C414,C415
# Specify the list of error codes you wish Flake8 to report.
select =
E,
W,
F,
C
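A hedged aside, not part of the commit: with this file at the repository root, a plain flake8 run picks these options up automatically. Minimal sketch, assuming the flake8 package is installed and a parakeet/ package directory exists:

import subprocess

# flake8 discovers the [flake8] section of .flake8 in the working directory;
# only E/W/F/C codes are reported, minus the ignore list above.
result = subprocess.run(
    ["flake8", "parakeet/"], capture_output=True, text=True)
print(result.stdout)  # one "path:line:col: code message" line per finding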


@ -1,11 +1,11 @@
repos:
- repo: https://github.com/PaddlePaddle/mirrors-yapf.git
rev: 0d79c0c469bab64f7229c9aca2b1186ef47f0e37
- repo: https://github.com/pre-commit/mirrors-yapf.git
sha: v0.16.0
hooks:
- id: yapf
files: \.py$
exclude: (?=third_party).*(\.py)$
- repo: https://github.com/pre-commit/pre-commit-hooks
rev: a11d9314b22d8f8c7556443875b731ef05965464
sha: a11d9314b22d8f8c7556443875b731ef05965464
hooks:
- id: check-merge-conflict
- id: check-symlinks
@ -15,8 +15,23 @@ repos:
files: \.md$
- id: trailing-whitespace
files: \.md$
- repo: https://github.com/Lucas-C/pre-commit-hooks
rev: v1.0.1
- id: requirements-txt-fixer
exclude: (?=third_party).*$
- id: check-yaml
- id: check-json
- id: pretty-format-json
args:
- --no-sort-keys
- --autofix
- id: check-merge-conflict
- id: flake8
args:
- --ignore=E501,E228,E226,E261,E266,E128,E402,W503
- --builtins=G,request
- --jobs=1
exclude: (?=third_party).*(\.py)$
- repo: https://github.com/Lucas-C/pre-commit-hooks
sha: v1.0.1
hooks:
- id: forbid-crlf
files: \.md$
@ -28,9 +43,15 @@ repos:
files: \.md$
- repo: local
hooks:
- id: clang-format
name: clang-format
description: Format files with ClangFormat
entry: bash .pre-commit-hooks/clang-format.hook -i
language: system
files: \.(c|cc|cxx|cpp|cu|h|hpp|hxx|cuh|proto)$
- id: copyright_checker
name: copyright_checker
entry: python ./tools/copyright.hook
entry: python .pre-commit-hooks/copyright-check.hook
language: system
files: \.(c|cc|cxx|cpp|cu|h|hpp|hxx|proto|py)$
exclude: (?!.*third_party)^.*$ | (?!.*book)^.*$
exclude: (?=third_party|pypinyin).*(\.cpp|\.h|\.py)$
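A hedged usage note, not part of the diff: pre-commit materializes this config as a git hook, after which the hooks above (yapf, flake8, clang-format, the copyright checker) run against staged files on every commit. Sketch, assuming the pre-commit package is installed:

import subprocess

# Register the hooks from .pre-commit-config.yaml as a git pre-commit hook,
# then run them once over the whole tree to normalize existing files.
subprocess.run(["pre-commit", "install"], check=True)
subprocess.run(["pre-commit", "run", "--all-files"], check=False)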


@ -0,0 +1,15 @@
#!/usr/bin/env bash
set -e
readonly VERSION="3.9"
version=$(clang-format -version)
# if ! [[ $version == *"$VERSION"* ]]; then
# echo "clang-format version check failed."
# echo "a version contains '$VERSION' is needed, but get '$version'"
# echo "you can install the right version, and make an soft-link to '\$PATH' env"
# exit -1
# fi
clang-format "$@"


@ -0,0 +1,133 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import io
import os
import re
import sys
import subprocess
import platform
COPYRIGHT = '''
Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
'''
LANG_COMMENT_MARK = None
NEW_LINE_MARK = None
COPYRIGHT_HEADER = None
if platform.system() == "Windows":
NEW_LINE_MARK = "\r\n"
else:
NEW_LINE_MARK = '\n'
COPYRIGHT_HEADER = COPYRIGHT.split(NEW_LINE_MARK)[1]
p = re.search(r'(\d{4})', COPYRIGHT_HEADER).group(0)
process = subprocess.Popen(["date", "+%Y"], stdout=subprocess.PIPE)
date, err = process.communicate()
date = date.decode("utf-8").rstrip("\n")
COPYRIGHT_HEADER = COPYRIGHT_HEADER.replace(p, date)
def generate_copyright(template, lang='C'):
if lang == 'Python':
LANG_COMMENT_MARK = '#'
else:
LANG_COMMENT_MARK = "//"
lines = template.split(NEW_LINE_MARK)
BLANK = " "
ans = LANG_COMMENT_MARK + BLANK + COPYRIGHT_HEADER + NEW_LINE_MARK
for lino, line in enumerate(lines):
if lino == 0 or lino == 1 or lino == len(lines) - 1: continue
if len(line) == 0:
BLANK = ""
else:
BLANK = " "
ans += LANG_COMMENT_MARK + BLANK + line + NEW_LINE_MARK
return ans + "\n"
def lang_type(filename):
if filename.endswith(".py"):
return "Python"
elif filename.endswith(".h"):
return "C"
elif filename.endswith(".c"):
return "C"
elif filename.endswith(".hpp"):
return "C"
elif filename.endswith(".cc"):
return "C"
elif filename.endswith(".cpp"):
return "C"
elif filename.endswith(".cu"):
return "C"
elif filename.endswith(".cuh"):
return "C"
elif filename.endswith(".go"):
return "C"
elif filename.endswith(".proto"):
return "C"
else:
print("Unsupported filetype %s", filename)
exit(0)
PYTHON_ENCODE = re.compile("^[ \t\v]*#.*?coding[:=][ \t]*([-_.a-zA-Z0-9]+)")
def main(argv=None):
parser = argparse.ArgumentParser(
description='Checker for copyright declaration.')
parser.add_argument('filenames', nargs='*', help='Filenames to check')
args = parser.parse_args(argv)
retv = 0
for filename in args.filenames:
fd = io.open(filename, encoding="utf-8")
first_line = fd.readline()
second_line = fd.readline()
if "COPYRIGHT (C)" in first_line.upper(): continue
if first_line.startswith("#!") or PYTHON_ENCODE.match(
second_line) != None or PYTHON_ENCODE.match(first_line) != None:
continue
original_contents = io.open(filename, encoding="utf-8").read()
new_contents = generate_copyright(
COPYRIGHT, lang_type(filename)) + original_contents
print('Auto Insert Copyright Header {}'.format(filename))
retv = 1
with io.open(filename, 'w', encoding="utf-8") as output_file:
output_file.write(new_contents)
return retv
if __name__ == '__main__':
exit(main())
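A hedged illustration of what this hook produces (editor's sketch, not part of the commit): lang_type maps the file extension to a language, and generate_copyright prefixes each line of the template with the matching comment marker, '#' for Python and '//' for C-family sources.

# Hypothetical usage of the functions defined above:
header = generate_copyright(COPYRIGHT, lang_type("example.py"))
print(header.splitlines()[0])
# -> "# Copyright (c) <current year> PaddlePaddle Authors. All Rights Reserved."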

.style.yapf Normal file

@ -0,0 +1,3 @@
[style]
based_on_style = pep8
column_limit = 80


@ -11,15 +11,12 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Configuration file for the Sphinx documentation builder.
#
# This file only contains a selection of the most common options. For a full
# list see the documentation:
# https://www.sphinx-doc.org/en/master/usage/configuration.html
# -- Path setup --------------------------------------------------------------
# If extensions (or modules to document with autodoc) are in another directory,
# add these directories to sys.path here. If the directory is relative to the
# documentation root, use os.path.abspath to make it absolute, like shown here.


@ -11,9 +11,9 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
import paddle
from parakeet.data.batch import batch_sequences
@ -24,8 +24,7 @@ def collate_baker_examples(examples):
pitch = [np.array(item["pitch"], dtype=np.float32) for item in examples]
energy = [np.array(item["energy"], dtype=np.float32) for item in examples]
durations = [
np.array(
item["durations"], dtype=np.int64) for item in examples
np.array(item["durations"], dtype=np.int64) for item in examples
]
text_lengths = np.array([item["text_lengths"] for item in examples])
speech_lengths = np.array([item["speech_lengths"] for item in examples])
@ -54,4 +53,4 @@ def collate_baker_examples(examples):
"pitch": pitch,
"energy": energy
}
return batch
return batch


@ -12,18 +12,17 @@
# See the License for the specific language governing permissions and
# limitations under the License.
"""Calculate statistics of feature files."""
import argparse
import logging
from pathlib import Path
import jsonlines
import numpy as np
from parakeet.datasets.data_table import DataTable
from config import get_cfg_default
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm
from config import get_cfg_default
from parakeet.datasets.data_table import DataTable
def main():
@ -75,8 +74,8 @@ def main():
# check directory existence
if args.output is None:
args.output = Path(args.metadata).parent.with_name(args.field_name +
"_stats.npy")
args.output = Path(
args.metadata).parent.with_name(args.field_name + "_stats.npy")
else:
args.output = Path(args.output)
args.output.parent.mkdir(parents=True, exist_ok=True)


@ -11,11 +11,10 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from pathlib import Path
from yacs.config import CfgNode as Configuration
import yaml
from yacs.config import CfgNode as Configuration
config_path = (Path(__file__).parent / "conf" / "default.yaml").resolve()


@ -11,8 +11,7 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from parakeet.models.fastspeech2 import FastSpeech2, FastSpeech2Loss
from parakeet.models.fastspeech2 import FastSpeech2Loss
from parakeet.training.extensions.evaluator import StandardEvaluator
from parakeet.training.reporter import report
from parakeet.training.updaters.standard_updater import StandardUpdater


@ -11,10 +11,11 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import re
import numpy as np
import paddle
from parakeet.frontend.cn_frontend import Frontend as cnFrontend
@ -87,8 +88,7 @@ class Frontend():
phones.append(phone)
return phones, tones
def get_input_ids(self, sentence, merge_sentences=True,
get_tone_ids=False):
def get_input_ids(self, sentence, merge_sentences=True, get_tone_ids=False):
phonemes = self.frontend.get_phonemes(
sentence, merge_sentences=merge_sentences)
result = {}


@ -11,16 +11,14 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import os
from pathlib import Path
import librosa
import numpy as np
from praatio import tgio
from config import get_cfg_default
from praatio import tgio
def readtg(config, tg_path):


@ -50,10 +50,7 @@ def main():
required=True,
help="speech statistics file.")
parser.add_argument(
"--pitch-stats",
type=str,
required=True,
help="pitch statistics file.")
"--pitch-stats", type=str, required=True, help="pitch statistics file.")
parser.add_argument(
"--energy-stats",
type=str,


@ -262,10 +262,7 @@ def main():
parser = argparse.ArgumentParser(
description="Preprocess audio and then extract features.")
parser.add_argument(
"--rootdir",
default=None,
type=str,
help="directory to baker dataset.")
"--rootdir", default=None, type=str, help="directory to baker dataset.")
parser.add_argument(
"--dur-file",
default=None,


@ -67,8 +67,7 @@ def evaluate(args, fastspeech2_config, pwg_config):
std = paddle.to_tensor(std)
pwg_normalizer = ZScore(mu, std)
fastspeech2_inferencce = FastSpeech2Inference(fastspeech2_normalizer,
model)
fastspeech2_inferencce = FastSpeech2Inference(fastspeech2_normalizer, model)
pwg_inference = PWGInference(pwg_normalizer, vocoder)
output_dir = Path(args.output_dir)


@ -154,8 +154,7 @@ def train_sp(args, config):
output_dir = Path(args.output_dir)
trainer = Trainer(updater, (config.max_epoch, 'epoch'), output_dir)
evaluator = FastSpeech2Evaluator(model, dev_dataloader,
**config["updater"])
evaluator = FastSpeech2Evaluator(model, dev_dataloader, **config["updater"])
if dist.get_rank() == 0:
trainer.extend(evaluator, trigger=(1, "epoch"))


@ -30,9 +30,7 @@ except ModuleNotFoundError:
INT16_MAX = (2**15) - 1
def normalize_volume(wav,
target_dBFS,
increase_only=False,
def normalize_volume(wav, target_dBFS, increase_only=False,
decrease_only=False):
# this function implements loudness normalization, instead of peak
# normalization; see https://en.wikipedia.org/wiki/Audio_normalization
@ -44,8 +42,9 @@ def normalize_volume(wav,
if increase_only and decrease_only:
raise ValueError("Both increase only and decrease only are set")
dBFS_change = target_dBFS - 10 * np.log10(np.mean(wav**2))
if ((dBFS_change < 0 and increase_only) or
(dBFS_change > 0 and decrease_only)):
if dBFS_change < 0 and increase_only:
return wav
if dBFS_change > 0 and decrease_only:
return wav
gain = 10**(dBFS_change / 20)
return wav * gain
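A hedged worked example of the gain arithmetic above (editor's sketch, not from the diff): a full-scale sine has mean power 0.5, roughly -3 dBFS, so normalizing it to -20 dBFS asks for about -17 dB of change, and the linear gain follows as 10**(dBFS_change / 20).

import numpy as np

t = np.linspace(0, 1, 16000, endpoint=False)
wav = np.sin(2 * np.pi * 440 * t)  # mean(wav**2) == 0.5
dBFS_change = -20.0 - 10 * np.log10(np.mean(wav**2))  # ~ -17.0
gain = 10**(dBFS_change / 20)
print(10 * np.log10(np.mean((wav * gain)**2)))  # ~ -20.0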
@ -59,9 +58,14 @@ def trim_long_silences(wav,
"""
Ensures that segments without voice in the waveform remain no longer than a
threshold determined by the VAD parameters in params.py.
:param wav: the raw waveform as a numpy array of floats
:return: the same waveform with silences trimmed away (length <= original wav length)
Parameters
----------
wav : np.array
the raw waveform as a numpy array of floats
Returns
----------
np.array
the same waveform with silences trimmed away (length <= original wav length)
"""
# Compute the voice detection window size
samples_per_window = (vad_window_length * sampling_rate) // 1000
@ -117,20 +121,25 @@ def compute_partial_slices(n_samples: int,
The returned ranges may index past the end of the waveform. It is
recommended that you pad the waveform with zeros up to wave_slices[-1].stop.
Parameters
----------
n_samples : int
the number of samples in the waveform.
partial_utterance_n_frames : int
the number of mel spectrogram frames in each partial utterance.
:param n_samples: the number of samples in the waveform
:param partial_utterance_n_frames: the number of mel spectrogram frames in each partial
utterance
:param min_pad_coverage: when reaching the last partial utterance, it may or may not have
enough frames. If at least <min_pad_coverage> of <partial_utterance_n_frames> are present,
then the last partial utterance will be considered, as if we padded the audio. Otherwise,
it will be discarded, as if we trimmed the audio. If there aren't enough frames for 1 partial
utterance, this parameter is ignored so that the function always returns at least 1 slice.
:param overlap: by how much the partial utterance should overlap. If set to 0, the partial
utterances are entirely disjoint.
:return: the waveform slices and mel spectrogram slices as lists of array slices. Index
respectively the waveform and the mel spectrogram with these slices to obtain the partial
utterances.
min_pad_coverage : float
when reaching the last partial utterance, it may or may not have enough frames.
If at least <min_pad_coverage> of <partial_utterance_n_frames> are present,
then the last partial utterance will be considered, as if we padded the audio. Otherwise,
it will be discarded, as if we trimmed the audio. If there aren't enough frames for 1 partial
utterance, this parameter is ignored so that the function always returns at least 1 slice.
overlap : float
by how much the partial utterance should overlap. If set to 0, the partial utterances are entirely disjoint.
Returns
----------
the waveform slices and mel spectrogram slices as lists of array slices.
Index respectively the waveform and the mel spectrogram with these slices to obtain the partial utterances.
"""
assert 0 <= overlap < 1
assert 0 < min_pad_coverage <= 1
@ -138,8 +147,8 @@ def compute_partial_slices(n_samples: int,
# librosa's function to compute num_frames from num_samples
n_frames = int(np.ceil((n_samples + 1) / hop_length))
# frame shift between adjacent partials
frame_step = max(
1, int(np.round(partial_utterance_n_frames * (1 - overlap))))
frame_step = max(1,
int(np.round(partial_utterance_n_frames * (1 - overlap))))
# Compute the slices
wav_slices, mel_slices = [], []
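A hedged worked example of the frame_step formula above (editor's sketch, not from the diff): with partial_utterance_n_frames = 160, zero overlap makes consecutive partials disjoint, while larger overlaps shrink the shift between them, bounded below by one frame.

import numpy as np

partial_utterance_n_frames = 160
for overlap in (0.0, 0.5, 0.9):
    frame_step = max(
        1, int(np.round(partial_utterance_n_frames * (1 - overlap))))
    print(overlap, frame_step)  # -> 160, 80, 16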


@ -57,7 +57,7 @@ def _process_speaker(speaker_dir: Path,
try:
with sources_fpath.open("rt") as sources_file:
existing_names = {line.split(",")[0] for line in sources_file}
except:
except Exception as e:
existing_names = {}
else:
existing_names = {}
@ -114,9 +114,7 @@ def process_librispeech(processor,
output_dir, "*.flac", skip_existing)
def process_voxceleb1(processor,
datasets_root,
output_dir,
def process_voxceleb1(processor, datasets_root, output_dir,
skip_existing=False):
dataset_name = "VoxCeleb1"
dataset_root = datasets_root / dataset_name
@ -126,10 +124,7 @@ def process_voxceleb1(processor,
metadata = [line.strip().split("\t") for line in metafile][1:]
# speaker id -> nationality
nationalities = {
line[0]: line[3]
for line in metadata if line[-1] == "dev"
}
nationalities = {line[0]: line[3] for line in metadata if line[-1] == "dev"}
keep_speaker_ids = [
speaker_id for speaker_id, nationality in nationalities.items()
if nationality.lower() in anglophone_nationalites
@ -147,9 +142,7 @@ def process_voxceleb1(processor,
output_dir, "*.wav", skip_existing)
def process_voxceleb2(processor,
datasets_root,
output_dir,
def process_voxceleb2(processor, datasets_root, output_dir,
skip_existing=False):
dataset_name = "VoxCeleb2"
dataset_root = datasets_root / dataset_name
@ -171,9 +164,7 @@ def process_aidatatang_200zh(processor,
output_dir, "*.wav", skip_existing)
def process_magicdata(processor,
datasets_root,
output_dir,
def process_magicdata(processor, datasets_root, output_dir,
skip_existing=False):
dataset_name = "magicdata/train"
dataset_root = datasets_root / dataset_name


@ -52,7 +52,8 @@ if __name__ == "__main__":
if not args.no_trim:
try:
import webrtcvad
except:
print(webrtcvad.__version__)
except Exception as e:
raise ModuleNotFoundError(
"Package 'webrtcvad' not found. This package enables "
"noise removal and is recommended. Please install and "
@ -96,5 +97,5 @@ if __name__ == "__main__":
for dataset in args.datasets:
print("Preprocessing %s" % dataset)
preprocess_func[dataset](processor, args.datasets_root,
args.output_dir, args.skip_existing)
preprocess_func[dataset](processor, args.datasets_root, args.output_dir,
args.skip_existing)


@ -83,12 +83,11 @@ class Ge2eExperiment(ExperimentBase):
self.logger.info(msg)
if dist.get_rank() == 0:
self.visualizer.add_scalar("train/loss", loss_value,
self.iteration)
self.visualizer.add_scalar("train/loss", loss_value, self.iteration)
self.visualizer.add_scalar("train/eer", eer, self.iteration)
self.visualizer.add_scalar(
"param/w",
float(self.model_core.similarity_weight), self.iteration)
self.visualizer.add_scalar("param/w",
float(self.model_core.similarity_weight),
self.iteration)
self.visualizer.add_scalar("param/b",
float(self.model_core.similarity_bias),
self.iteration)


@ -109,8 +109,7 @@ class Clip(object):
"""
if len(x) < c.shape[1] * self.hop_size:
x = np.pad(x, (0, c.shape[1] * self.hop_size - len(x)),
mode="edge")
x = np.pad(x, (0, c.shape[1] * self.hop_size - len(x)), mode="edge")
# check the length is valid
assert len(x) == c.shape[


@ -17,18 +17,12 @@ import argparse
import logging
import os
import numpy as np
import yaml
import json
import jsonlines
import numpy as np
from parakeet.datasets.data_table import DataTable
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm
from parakeet.datasets.data_table import DataTable
from parakeet.utils.h5_utils import read_hdf5
from parakeet.utils.h5_utils import write_hdf5
from config import get_cfg_default


@ -15,18 +15,15 @@
import argparse
import logging
import os
from operator import itemgetter
from pathlib import Path
import numpy as np
import yaml
import jsonlines
import numpy as np
from parakeet.datasets.data_table import DataTable
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm
from parakeet.datasets.data_table import DataTable
from config import get_cfg_default


@ -13,7 +13,9 @@
# limitations under the License.
from operator import itemgetter
from typing import List, Dict, Any
from typing import Any
from typing import Dict
from typing import List
import argparse
import jsonlines
@ -39,8 +41,8 @@ def process_sentence(config: Dict[str, Any],
# reading
y, sr = librosa.load(str(fp), sr=config.sr) # resampling may occur
assert len(y.shape) == 1, f"{utt_id} is not a mono-channel audio."
assert np.abs(y).max(
) <= 1.0, f"{utt_id} is seems to be different that 16 bit PCM."
assert np.abs(
y).max() <= 1.0, f"{utt_id} is seems to be different that 16 bit PCM."
duration = librosa.get_duration(y, sr=sr)
# trim according to the alignment file
@ -80,8 +82,8 @@ def process_sentence(config: Dict[str, Any],
# adjust time to make num_samples == num_frames * hop_length
num_frames = logmel.shape[0]
if y.size < num_frames * config.hop_length:
y = np.pad(y, (0, num_frames * config.hop_length - y.size),
mode="reflect")
y = np.pad(
y, (0, num_frames * config.hop_length - y.size), mode="reflect")
else:
y = y[:num_frames * config.hop_length]
num_sample = y.shape[0]
@ -139,10 +141,7 @@ def main():
parser = argparse.ArgumentParser(
description="Preprocess audio and then extract features .")
parser.add_argument(
"--rootdir",
default=None,
type=str,
help="directory to baker dataset.")
"--rootdir", default=None, type=str, help="directory to baker dataset.")
parser.add_argument(
"--dumpdir",
type=str,


@ -20,17 +20,11 @@ from paddle.nn import Layer
from paddle.optimizer import Optimizer
from paddle.optimizer.lr import LRScheduler
from paddle.io import DataLoader
from paddle.io import DistributedBatchSampler
from timer import timer
from parakeet.datasets.data_table import DataTable
from parakeet.training.updaters.standard_updater import StandardUpdater, UpdaterState
from parakeet.training.extensions.evaluator import StandardEvaluator
from parakeet.training.trainer import Trainer
from parakeet.training.reporter import report
from parakeet.models.parallel_wavegan import PWGGenerator, PWGDiscriminator
from parakeet.modules.stft_loss import MultiResolutionSTFTLoss
from parakeet.utils.profile import synchronize
class PWGUpdater(StandardUpdater):


@ -12,20 +12,17 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import sys
from timer import timer
import logging
import argparse
import os
from pathlib import Path
from timer import timer
import yaml
import jsonlines
import paddle
import numpy as np
import paddle
import soundfile as sf
import yaml
from paddle import distributed as dist
from parakeet.datasets.data_table import DataTable
from parakeet.models.parallel_wavegan import PWGGenerator


@ -130,8 +130,7 @@ def train_sp(args, config):
parameters=generator.parameters(),
**config["generator_optimizer_params"])
lr_schedule_d = StepDecay(**config["discriminator_scheduler_params"])
gradient_clip_d = nn.ClipGradByGlobalNorm(config[
"discriminator_grad_norm"])
gradient_clip_d = nn.ClipGradByGlobalNorm(config["discriminator_grad_norm"])
optimizer_d = Adam(
learning_rate=lr_schedule_d,
grad_clip=gradient_clip_d,
@ -184,8 +183,7 @@ def train_sp(args, config):
stop_trigger=(config.train_max_steps, "iteration"),
out=output_dir, )
trainer.extend(
evaluator, trigger=(config.eval_interval_steps, 'iteration'))
trainer.extend(evaluator, trigger=(config.eval_interval_steps, 'iteration'))
if dist.get_rank() == 0:
writer = LogWriter(str(trainer.out))
trainer.extend(VisualDL(writer), trigger=(1, 'iteration'))


@ -22,8 +22,7 @@ def collate_baker_examples(examples):
tones = [np.array(item["tones"], dtype=np.int64) for item in examples]
feats = [np.array(item["feats"], dtype=np.float32) for item in examples]
durations = [
np.array(
item["durations"], dtype=np.int64) for item in examples
np.array(item["durations"], dtype=np.int64) for item in examples
]
num_phones = np.array([item["num_phones"] for item in examples])
num_frames = np.array([item["num_frames"] for item in examples])


@ -15,21 +15,14 @@
import argparse
import logging
import os
from pathlib import Path
import numpy as np
import yaml
import json
import jsonlines
import numpy as np
from parakeet.datasets.data_table import DataTable
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm
from parakeet.datasets.data_table import DataTable
from parakeet.utils.h5_utils import read_hdf5
from parakeet.utils.h5_utils import write_hdf5
from config import get_cfg_default


@ -17,7 +17,6 @@ from pathlib import Path
import numpy as np
import paddle
import pypinyin
from pypinyin import lazy_pinyin, Style
import jieba
import phkit


@ -15,9 +15,8 @@
import argparse
from pathlib import Path
import numpy as np
from paddle import inference
import soundfile as sf
from paddle import inference
from frontend import text_analysis
@ -73,8 +72,8 @@ def main():
speedyspeech_predictor.run()
output_names = speedyspeech_predictor.get_output_names()
output_handle = speedyspeech_predictor.get_output_handle(output_names[
0])
output_handle = speedyspeech_predictor.get_output_handle(
output_names[0])
output_data = output_handle.copy_to_cpu()
input_names = pwg_predictor.get_input_names()


@ -15,19 +15,16 @@
import argparse
import logging
import os
from copy import copy
from operator import itemgetter
from pathlib import Path
import numpy as np
import yaml
import jsonlines
import numpy as np
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm
from parakeet.frontend.vocab import Vocab
from parakeet.datasets.data_table import DataTable
from parakeet.frontend.vocab import Vocab
from config import get_cfg_default
@ -100,7 +97,10 @@ def main():
for item in metadata:
item["feats"] = str(metadata_dir / item["feats"])
dataset = DataTable(metadata, converters={'feats': np.load, })
dataset = DataTable(
metadata, converters={
'feats': np.load,
})
logging.info(f"The number of files = {len(dataset)}.")
# restore scaler


@ -13,7 +13,9 @@
# limitations under the License.
from operator import itemgetter
from typing import List, Dict, Any
from typing import Any
from typing import Dict
from typing import List
import argparse
import jsonlines
@ -41,8 +43,8 @@ def process_sentence(config: Dict[str, Any],
# reading
y, sr = librosa.load(str(fp), sr=config.sr) # resampling may occur
assert len(y.shape) == 1, f"{utt_id} is not a mono-channel audio."
assert np.abs(y).max(
) <= 1.0, f"{utt_id} is seems to be different that 16 bit PCM."
assert np.abs(
y).max() <= 1.0, f"{utt_id} is seems to be different that 16 bit PCM."
duration = librosa.get_duration(y, sr=sr)
# intervals with empty labels are ignored
@ -162,10 +164,7 @@ def main():
parser = argparse.ArgumentParser(
description="Preprocess audio and then extract features.")
parser.add_argument(
"--rootdir",
default=None,
type=str,
help="directory to baker dataset.")
"--rootdir", default=None, type=str, help="directory to baker dataset.")
parser.add_argument(
"--dumpdir",
type=str,


@ -13,15 +13,13 @@
# limitations under the License.
import paddle
from paddle.nn import functional as F
from paddle.fluid.layers import huber_loss
from parakeet.modules.ssim import ssim
from paddle.nn import functional as F
from parakeet.modules.losses import masked_l1_loss, weighted_mean
from parakeet.modules.ssim import ssim
from parakeet.training.extensions.evaluator import StandardEvaluator
from parakeet.training.reporter import report
from parakeet.training.updaters.standard_updater import StandardUpdater
from parakeet.training.extensions.evaluator import StandardEvaluator
from parakeet.models.speedyspeech import SpeedySpeech
class SpeedySpeechUpdater(StandardUpdater):


@ -11,30 +11,25 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import sys
import logging
import argparse
import dataclasses
from pathlib import Path
import yaml
import jsonlines
import paddle
import numpy as np
import soundfile as sf
import paddle
from paddle import nn
from paddle.nn import functional as F
from paddle import distributed as dist
import yaml
from paddle import jit
from paddle.static import InputSpec
from yacs.config import CfgNode
from parakeet.datasets.data_table import DataTable
from parakeet.models.speedyspeech import SpeedySpeech, SpeedySpeechInference
from parakeet.models.parallel_wavegan import PWGGenerator, PWGInference
from parakeet.models.speedyspeech import SpeedySpeech
from parakeet.models.speedyspeech import SpeedySpeechInference
from parakeet.models.parallel_wavegan import PWGGenerator
from parakeet.models.parallel_wavegan import PWGInference
from parakeet.modules.normalizer import ZScore
@ -79,9 +74,8 @@ def evaluate(args, speedyspeech_config, pwg_config):
speedyspeech_inference = jit.to_static(
speedyspeech_inference,
input_spec=[
InputSpec(
[-1], dtype=paddle.int64), InputSpec(
[-1], dtype=paddle.int64)
InputSpec([-1], dtype=paddle.int64), InputSpec(
[-1], dtype=paddle.int64)
])
paddle.jit.save(speedyspeech_inference,
os.path.join(args.inference_dir, "speedyspeech"))
@ -91,9 +85,9 @@ def evaluate(args, speedyspeech_config, pwg_config):
pwg_inference = PWGInference(pwg_normalizer, vocoder)
pwg_inference.eval()
pwg_inference = jit.to_static(
pwg_inference,
input_spec=[InputSpec(
[-1, 80], dtype=paddle.float32), ])
pwg_inference, input_spec=[
InputSpec([-1, 80], dtype=paddle.float32),
])
paddle.jit.save(pwg_inference, os.path.join(args.inference_dir, "pwg"))
pwg_inference = paddle.jit.load(os.path.join(args.inference_dir, "pwg"))
@ -119,9 +113,7 @@ def main():
parser = argparse.ArgumentParser(
description="Synthesize with speedyspeech & parallel wavegan.")
parser.add_argument(
"--speedyspeech-config",
type=str,
help="config file for speedyspeech.")
"--speedyspeech-config", type=str, help="config file for speedyspeech.")
parser.add_argument(
"--speedyspeech-checkpoint",
type=str,


@ -1,6 +1,6 @@
python synthesize.py \
--speedyspeech-config=conf/default.yaml \
--speedyspeech-checkpoint=exp/debug/checkpoints/snapshot_iter_91800.pdz \
--speedyspeech-checkpoint=exp/default/checkpoints/snapshot_iter_91800.pdz \
--speedyspeech-stat=dump/train/stats.npy \
--pwg-config=../../parallelwave_gan/baker/conf/default.yaml \
--pwg-params=../../parallelwave_gan/baker/converted.pdparams \


@ -13,28 +13,22 @@
# limitations under the License.
import os
import sys
import logging
import argparse
import dataclasses
from pathlib import Path
import yaml
import jsonlines
import paddle
import numpy as np
import soundfile as sf
import paddle
import yaml
from paddle import jit
from paddle.static import InputSpec
from paddle import nn
from paddle.nn import functional as F
from paddle import distributed as dist
from yacs.config import CfgNode
from parakeet.datasets.data_table import DataTable
from parakeet.models.speedyspeech import SpeedySpeech, SpeedySpeechInference
from parakeet.models.parallel_wavegan import PWGGenerator, PWGInference
from parakeet.models.speedyspeech import SpeedySpeech
from parakeet.models.speedyspeech import SpeedySpeechInference
from parakeet.models.parallel_wavegan import PWGGenerator
from parakeet.models.parallel_wavegan import PWGInference
from parakeet.modules.normalizer import ZScore
from frontend import text_analysis
@ -57,8 +51,7 @@ def evaluate(args, speedyspeech_config, pwg_config):
model.eval()
vocoder = PWGGenerator(**pwg_config["generator_params"])
vocoder.set_state_dict(
paddle.load(args.pwg_checkpoint)["generator_params"])
vocoder.set_state_dict(paddle.load(args.pwg_checkpoint)["generator_params"])
vocoder.remove_weight_norm()
vocoder.eval()
print("model done!")
@ -81,9 +74,8 @@ def evaluate(args, speedyspeech_config, pwg_config):
speedyspeech_inference = jit.to_static(
speedyspeech_inference,
input_spec=[
InputSpec(
[-1], dtype=paddle.int64), InputSpec(
[-1], dtype=paddle.int64)
InputSpec([-1], dtype=paddle.int64), InputSpec(
[-1], dtype=paddle.int64)
])
paddle.jit.save(speedyspeech_inference,
os.path.join(args.inference_dir, "speedyspeech"))
@ -93,9 +85,9 @@ def evaluate(args, speedyspeech_config, pwg_config):
pwg_inference = PWGInference(pwg_normalizer, vocoder)
pwg_inference.eval()
pwg_inference = jit.to_static(
pwg_inference,
input_spec=[InputSpec(
[-1, 80], dtype=paddle.float32), ])
pwg_inference, input_spec=[
InputSpec([-1, 80], dtype=paddle.float32),
])
paddle.jit.save(pwg_inference, os.path.join(args.inference_dir, "pwg"))
pwg_inference = paddle.jit.load(os.path.join(args.inference_dir, "pwg"))
@ -119,9 +111,7 @@ def main():
parser = argparse.ArgumentParser(
description="Synthesize with speedyspeech & parallel wavegan.")
parser.add_argument(
"--speedyspeech-config",
type=str,
help="config file for speedyspeech.")
"--speedyspeech-config", type=str, help="config file for speedyspeech.")
parser.add_argument(
"--speedyspeech-checkpoint",
type=str,


@ -13,7 +13,6 @@
# limitations under the License.
import librosa
from praatio import tgio
def validate_textgrid(text_grid, num_samples, sr):


@ -72,7 +72,9 @@ def train_sp(args, config):
fields=[
"phones", "tones", "num_phones", "num_frames", "feats", "durations"
],
converters={"feats": np.load, }, )
converters={
"feats": np.load,
}, )
with jsonlines.open(args.dev_metadata, 'r') as reader:
dev_metadata = list(reader)
metadata_dir = Path(args.dev_metadata).parent
@ -83,7 +85,9 @@ def train_sp(args, config):
fields=[
"phones", "tones", "num_phones", "num_frames", "feats", "durations"
],
converters={"feats": np.load, }, )
converters={
"feats": np.load,
}, )
# collate function and dataloader
train_sampler = DistributedBatchSampler(


@ -46,8 +46,7 @@ class LJSpeech(Dataset):
class LJSpeechCollector(object):
"""A simple callable to batch LJSpeech examples."""
def __init__(self, padding_idx=0, padding_value=0.,
padding_stop_token=1.0):
def __init__(self, padding_idx=0, padding_value=0., padding_stop_token=1.0):
self.padding_idx = padding_idx
self.padding_value = padding_value
self.padding_stop_token = padding_stop_token


@ -63,8 +63,7 @@ def create_dataset(config, source_path, target_path, verbose=False):
with open(target_path / "metadata.pkl", 'wb') as f:
pickle.dump(records, f)
if verbose:
print("saved metadata into {}".format(target_path /
"metadata.pkl"))
print("saved metadata into {}".format(target_path / "metadata.pkl"))
print("Done.")


@ -14,14 +14,13 @@
import time
from collections import defaultdict
import numpy as np
import paddle
from paddle.io import DataLoader
from paddle.io import DistributedBatchSampler
from paddle import distributed as dist
from paddle.io import DataLoader, DistributedBatchSampler
from parakeet.data import dataset
from parakeet.frontend import EnglishCharacter # pylint: disable=unused-import
from parakeet.training.cli import default_argument_parser
from parakeet.training.experiment import ExperimentBase
from parakeet.utils import display, mp_tools
@ -74,8 +73,7 @@ class Experiment(ExperimentBase):
if dist.get_rank() == 0:
for k, v in losses_np.items():
self.visualizer.add_scalar(f"train_loss/{k}", v,
self.iteration)
self.visualizer.add_scalar(f"train_loss/{k}", v, self.iteration)
@mp_tools.rank_zero_only
@paddle.no_grad()


@ -65,8 +65,8 @@ def collate_aishell3_examples(examples):
text_lengths = np.array([item.shape[0] for item in phones], dtype=np.int64)
spec_lengths = np.array([item.shape[1] for item in mel], dtype=np.int64)
T_dec = np.max(spec_lengths)
stop_tokens = (np.arange(T_dec) >= np.expand_dims(spec_lengths, -1)
).astype(np.float32)
stop_tokens = (
np.arange(T_dec) >= np.expand_dims(spec_lengths, -1)).astype(np.float32)
phones, _ = batch_text_id(phones)
tones, _ = batch_text_id(tones)
mel, _ = batch_spec(mel)


@ -121,8 +121,8 @@ def convert(syllable):
syllable = syllable.replace("ing", "ieng").replace("in", "ien")
# expansion for un, ui, iu
syllable = syllable.replace("un", "uen").replace(
"ui", "uei").replace("iu", "iou")
syllable = syllable.replace("un", "uen").replace("ui",
"uei").replace("iu", "iou")
# rule for variants of i
syllable = syllable.replace("zi", "zii").replace("ci", "cii").replace("si", "sii")\


@ -68,8 +68,7 @@ def preprocess_aishell3(source_dir, target_dir, alignment_dir):
alignment_dir=alignment_dir)
with Pool(16) as p:
list(
tqdm(
p.imap(fx, wav_paths), total=len(wav_paths), unit="utterance"))
tqdm(p.imap(fx, wav_paths), total=len(wav_paths), unit="utterance"))
if __name__ == "__main__":


@ -109,8 +109,7 @@ class Experiment(ExperimentBase):
mel_pred = outputs['mel_outputs_postnet']
self.visualizer.add_figure(
f"valid_sentence_{i}_predicted_spectrogram",
display.plot_spectrogram(mel_pred[0].numpy().T),
self.iteration)
display.plot_spectrogram(mel_pred[0].numpy().T), self.iteration)
# write visual log
valid_losses = {k: np.mean(v) for k, v in valid_losses.items()}


@ -13,7 +13,6 @@
# limitations under the License.
import argparse
import re
from pathlib import Path


@ -40,6 +40,7 @@ def get_avg_wer(raw_dict, ref_dict, frontend, output_dir):
raw_text = raw_dict[utt_id]
text = text_cleaner(raw_text)
g2p_phones = frontend.get_phonemes(text)
g2p_phones = sum(g2p_phones, [])
gt_phones = ref_dict[utt_id].split(" ")
# delete silence tokens in predicted phones and ground truth phones
g2p_phones = [phn for phn in g2p_phones if phn not in SILENCE_TOKENS]


@ -53,10 +53,10 @@ class Transform(object):
ids, mel = example # ids already have <s> and </s>
ids = np.array(ids, dtype=np.int64)
# add start and end frame
mel = np.pad(mel, [(0, 0), (1, 1)],
mode='constant',
constant_values=[(0, 0),
(self.start_value, self.end_value)])
mel = np.pad(
mel, [(0, 0), (1, 1)],
mode='constant',
constant_values=[(0, 0), (self.start_value, self.end_value)])
stop_labels = np.ones([mel.shape[1]], dtype=np.int64)
stop_labels[-1] = 2
# actually this thing can also be done within the model


@ -64,8 +64,7 @@ def create_dataset(config, source_path, target_path, verbose=False):
with open(target_path / "metadata.pkl", 'wb') as f:
pickle.dump(records, f)
if verbose:
print("saved metadata into {}".format(target_path /
"metadata.pkl"))
print("saved metadata into {}".format(target_path / "metadata.pkl"))
# also save meta data into text format for inspection
with open(target_path / "metadata.txt", 'wt') as f:
@ -73,8 +72,7 @@ def create_dataset(config, source_path, target_path, verbose=False):
phoneme_str = "|".join(phonemes)
f.write("{}\t{}\t{}\n".format(mel_name, text, phoneme_str))
if verbose:
print("saved metadata into {}".format(target_path /
"metadata.txt"))
print("saved metadata into {}".format(target_path / "metadata.txt"))
print("Done.")


@ -60,7 +60,7 @@ def main(config, args):
display.plot_multilayer_multihead_alignments(attns)
plt.savefig(str(output_dir / f"sentence_{i}.png"))
mel_output = mel_output.T #(C, T)
mel_output = mel_output.T # (C, T)
np.save(str(output_dir / f"sentence_{i}"), mel_output)
if args.verbose:
print("spectrogram saved at {}".format(output_dir /


@ -76,8 +76,7 @@ class TransformerTTSExperiment(ExperimentBase):
ljspeech_dataset = LJSpeech(args.data)
transform = Transform(config.data.mel_start_value,
config.data.mel_end_value)
ljspeech_dataset = dataset.TransformDataset(ljspeech_dataset,
transform)
ljspeech_dataset = dataset.TransformDataset(ljspeech_dataset, transform)
valid_set, train_set = dataset.split(ljspeech_dataset,
config.data.valid_size)
batch_fn = LJSpeechCollector(padding_idx=config.data.padding_idx)
@ -159,8 +158,7 @@ class TransformerTTSExperiment(ExperimentBase):
if dist.get_rank() == 0:
for k, v in losses_np.items():
self.visualizer.add_scalar(f"train_loss/{k}", v,
self.iteration)
self.visualizer.add_scalar(f"train_loss/{k}", v, self.iteration)
@mp_tools.rank_zero_only
@paddle.no_grad()


@ -90,8 +90,8 @@ def rule(C, V, R, T):
return None
# ua, uai, uang cannot be combined with d, t, n, l, r, z, c, s
if V in ['ua', 'uai', 'uang'
] and C in ['d', 't', 'n', 'l', 'r', 'z', 'c', 's']:
if V in ['ua', 'uai',
'uang'] and C in ['d', 't', 'n', 'l', 'r', 'z', 'c', 's']:
return None
# sh cannot be combined with ong


@ -28,8 +28,8 @@ from config import get_cfg_defaults
class Transform(object):
def __init__(self, sample_rate, n_fft, win_length, hop_length, n_mels,
fmin, fmax):
def __init__(self, sample_rate, n_fft, win_length, hop_length, n_mels, fmin,
fmax):
self.sample_rate = sample_rate
self.n_fft = n_fft
self.win_length = win_length
@ -79,11 +79,8 @@ class Transform(object):
spectrogram_magnitude = np.abs(spectrogram)
# Compute mel-spectrograms.
mel_filter_bank = librosa.filters.mel(sr=sr,
n_fft=n_fft,
n_mels=n_mels,
fmin=fmin,
fmax=fmax)
mel_filter_bank = librosa.filters.mel(
sr=sr, n_fft=n_fft, n_mels=n_mels, fmin=fmin, fmax=fmax)
mel_spectrogram = np.dot(mel_filter_bank, spectrogram_magnitude)
# log scale mel_spectrogram.


@ -39,8 +39,7 @@ def main(config, args):
mel = np.load(str(file_path))
with paddle.amp.auto_cast():
audio = model.predict(mel)
audio_path = output_dir / (
os.path.splitext(file_path.name)[0] + ".wav")
audio_path = output_dir / (os.path.splitext(file_path.name)[0] + ".wav")
sf.write(audio_path, audio, config.data.sample_rate)
print("[synthesize] {} -> {}".format(file_path, audio_path))


@ -114,8 +114,7 @@ class Experiment(ExperimentBase):
msg += "loss: {:>.6f}".format(loss_value)
self.logger.info(msg)
if dist.get_rank() == 0:
self.visualizer.add_scalar("train/loss", loss_value,
self.iteration)
self.visualizer.add_scalar("train/loss", loss_value, self.iteration)
@mp_tools.rank_zero_only
@paddle.no_grad()


@ -13,6 +13,3 @@
# limitations under the License.
__version__ = "0.0.0"
import logging
from parakeet import audio, data, datasets, frontend, models, modules, training, utils


@ -11,6 +11,3 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from .audio import AudioProcessor
from .spec_normalizer import NormalizerBase, LogMagnitude


@ -11,10 +11,9 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import librosa
import soundfile as sf
import numpy as np
import soundfile as sf
__all__ = ["AudioProcessor"]
@ -53,11 +52,12 @@ class AudioProcessor(object):
self.inv_mel_filter = np.linalg.pinv(self.mel_filter)
def _create_mel_filter(self):
mel_filter = librosa.filters.mel(self.sample_rate,
self.n_fft,
n_mels=self.n_mels,
fmin=self.fmin,
fmax=self.fmax)
mel_filter = librosa.filters.mel(
self.sample_rate,
self.n_fft,
n_mels=self.n_mels,
fmin=self.fmin,
fmax=self.fmax)
return mel_filter
def read_wav(self, filename):


@ -13,20 +13,3 @@
# limitations under the License.
"""Parakeet's infrastructure for data processing.
"""
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from parakeet.data.batch import *
from parakeet.data.dataset import *
from parakeet.data.get_feats import *


@ -61,9 +61,10 @@ def batch_text_id(minibatch, pad_id=0, dtype=np.int64):
for example in minibatch:
pad_len = max_len - example.shape[0]
batch.append(
np.pad(example, [(0, pad_len)],
mode='constant',
constant_values=pad_id))
np.pad(
example, [(0, pad_len)],
mode='constant',
constant_values=pad_id))
return np.array(batch, dtype=dtype), np.array(lengths, dtype=np.int64)
@ -103,9 +104,10 @@ def batch_wav(minibatch, pad_value=0., dtype=np.float32):
for example in minibatch:
pad_len = max_len - example.shape[-1]
batch.append(
np.pad(example, [(0, pad_len)],
mode='constant',
constant_values=pad_value))
np.pad(
example, [(0, pad_len)],
mode='constant',
constant_values=pad_value))
return np.array(batch, dtype=dtype), np.array(lengths, dtype=np.int64)
@ -152,14 +154,16 @@ def batch_spec(minibatch, pad_value=0., time_major=False, dtype=np.float32):
pad_len = max_len - example.shape[time_idx]
if time_major:
batch.append(
np.pad(example, [(0, pad_len), (0, 0)],
mode='constant',
constant_values=pad_value))
np.pad(
example, [(0, pad_len), (0, 0)],
mode='constant',
constant_values=pad_value))
else:
batch.append(
np.pad(example, [(0, 0), (0, pad_len)],
mode='constant',
constant_values=pad_value))
np.pad(
example, [(0, 0), (0, pad_len)],
mode='constant',
constant_values=pad_value))
return np.array(batch, dtype=dtype), np.array(lengths, dtype=np.int64)
@ -178,10 +182,8 @@ def batch_sequences(sequences, axis=0, pad_value=0):
for seq, length in zip(sequences, seq_lengths):
padding = [(0, 0)] * axis + [(0, max_length - length)] + [(0, 0)] * (
ndim - axis - 1)
padded_seq = np.pad(seq,
padding,
mode='constant',
constant_values=pad_value)
padded_seq = np.pad(
seq, padding, mode='constant', constant_values=pad_value)
padded_sequences.append(padded_seq)
batch = np.stack(padded_sequences)
return batch
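All of the batch_* helpers above share one pattern: right-pad every example to the batch maximum with np.pad, then stack. A self-contained sketch of that pattern (editor's addition, not from the diff):

import numpy as np

seqs = [np.array([1, 2, 3]), np.array([4, 5])]
max_len = max(s.shape[0] for s in seqs)
batch = np.stack([
    np.pad(s, [(0, max_len - s.shape[0])],
           mode='constant', constant_values=0) for s in seqs
])
print(batch)  # [[1 2 3] [4 5 0]]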


@ -11,9 +11,7 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import six
import paddle
from paddle.io import Dataset
__all__ = [
@ -69,7 +67,7 @@ class CacheDataset(Dataset):
return len(self._dataset)
def __getitem__(self, i):
if not i in self._cache:
if i not in self._cache:
self._cache[i] = self._dataset[i]
return self._cache[i]
@ -86,9 +84,8 @@ class TupleDataset(Dataset):
length = len(datasets[0])
for i, dataset in enumerate(datasets):
if len(dataset) != length:
raise ValueError(
"all the datasets should have the same length."
"dataset {} has a different length".format(i))
raise ValueError("all the datasets should have the same length."
"dataset {} has a different length".format(i))
self._datasets = datasets
self._length = length
@ -115,7 +112,7 @@ class DictDataset(Dataset):
A compound dataset made from several datasets of the same length. An
example of the `DictDataset` is a dict of examples from the constituent
datasets.
WARNING: paddle does not have good support for DictDataset, because
every batch yielded from a DataLoader is a list, but it cannot be a dict.
So you have to provide a collate function because you cannot use the


@ -11,7 +11,6 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import librosa
import numpy as np
import pyworld
@ -46,11 +45,12 @@ class LogMelFBank():
self.mel_filter = self._create_mel_filter()
def _create_mel_filter(self):
mel_filter = librosa.filters.mel(sr=self.sr,
n_fft=self.n_fft,
n_mels=self.n_mels,
fmin=self.fmin,
fmax=self.fmax)
mel_filter = librosa.filters.mel(
sr=self.sr,
n_fft=self.n_fft,
n_mels=self.n_mels,
fmin=self.fmin,
fmax=self.fmax)
return mel_filter
def _stft(self, wav):
@ -121,11 +121,12 @@ class Pitch():
use_log_f0=True) -> np.array:
input = input.astype(np.float64)  # np.float is deprecated; pyworld expects float64
frame_period = 1000 * self.hop_length / self.sr
f0, timeaxis = pyworld.dio(input,
fs=self.sr,
f0_floor=self.f0min,
f0_ceil=self.f0max,
frame_period=frame_period)
f0, timeaxis = pyworld.dio(
input,
fs=self.sr,
f0_floor=self.f0min,
f0_ceil=self.f0max,
frame_period=frame_period)
f0 = pyworld.stonemask(input, f0, timeaxis, self.sr)
if use_continuous_f0:
f0 = self._convert_to_continuous_f0(f0)
@ -195,8 +196,7 @@ class Energy():
input_power = np.abs(input_stft)**2
energy = np.sqrt(
np.clip(
np.sum(input_power, axis=0), a_min=1.0e-10, a_max=float(
'inf')))
np.sum(input_power, axis=0), a_min=1.0e-10, a_max=float('inf')))
return energy
def _average_by_duration(self, input: np.array, d: np.array) -> np.array:


@ -11,6 +11,3 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from parakeet.datasets.common import *
from parakeet.datasets.ljspeech import *


@ -11,14 +11,13 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from paddle.io import Dataset
import os
import librosa
from pathlib import Path
import numpy as np
from typing import List
import librosa
import numpy as np
from paddle.io import Dataset
__all__ = ["AudioSegmentDataset", "AudioDataset", "AudioFolderDataset"]
@ -57,7 +56,7 @@ class AudioSegmentDataset(Dataset):
class AudioDataset(Dataset):
"""A simple dataset adaptor for the audio files.
"""A simple dataset adaptor for the audio files.
Read -> trim silence -> normalize
"""


@ -11,12 +11,12 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import Union, Optional, Callable, Tuple, List, Dict, Any
from pathlib import Path
from multiprocessing import Manager
from typing import Any
from typing import Callable
from typing import Dict
from typing import List
import numpy as np
from paddle.io import Dataset


@ -11,9 +11,9 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from pathlib import Path
from paddle.io import Dataset
from pathlib import Path
__all__ = ["LJSpeechMetaData"]


@ -11,11 +11,3 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from parakeet.frontend.vocab import *
from parakeet.frontend.phonectic import *
from parakeet.frontend.punctuation import *
from parakeet.frontend.normalizer import *
from parakeet.frontend.cn_normalization import *
from parakeet.frontend.tone_sandhi import *
from parakeet.frontend.generate_lexicon import *


@ -11,7 +11,6 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from parakeet.frontend.phonectic import Phonetics
"""
A phonology system with ARPABET symbols and limited punctuations. The G2P
@ -200,8 +199,7 @@ class ARPABET(Phonetics):
The list of pronunciation id sequence.
"""
return self.numericalize(
self.phoneticize(
sentence, add_start_end=add_start_end))
self.phoneticize(sentence, add_start_end=add_start_end))
@property
def vocab_size(self):
@ -217,9 +215,9 @@ class ARPABETWithStress(Phonetics):
'AO1', 'AO2', 'AW0', 'AW1', 'AW2', 'AY0', 'AY1', 'AY2', 'B', 'CH', 'D',
'DH', 'EH0', 'EH1', 'EH2', 'ER0', 'ER1', 'ER2', 'EY0', 'EY1', 'EY2',
'F', 'G', 'HH', 'IH0', 'IH1', 'IH2', 'IY0', 'IY1', 'IY2', 'JH', 'K',
'L', 'M', 'N', 'NG', 'OW0', 'OW1', 'OW2', 'OY0', 'OY1', 'OY2', 'P',
'R', 'S', 'SH', 'T', 'TH', 'UH0', 'UH1', 'UH2', 'UW0', 'UW1', 'UW2',
'V', 'W', 'Y', 'Z', 'ZH'
'L', 'M', 'N', 'NG', 'OW0', 'OW1', 'OW2', 'OY0', 'OY1', 'OY2', 'P', 'R',
'S', 'SH', 'T', 'TH', 'UH0', 'UH1', 'UH2', 'UW0', 'UW1', 'UW2', 'V',
'W', 'Y', 'Z', 'ZH'
]
punctuations = [',', '.', '?', '!']
symbols = phonemes + punctuations
@ -294,8 +292,7 @@ class ARPABETWithStress(Phonetics):
The list of pronunciation id sequence.
"""
return self.numericalize(
self.phoneticize(
sentence, add_start_end=add_start_end))
self.phoneticize(sentence, add_start_end=add_start_end))
@property
def vocab_size(self):


@ -11,17 +11,16 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import re
import jieba.posseg as psg
import numpy as np
import paddle
import re
from g2pM import G2pM
from parakeet.frontend.tone_sandhi import ToneSandhi
from parakeet.frontend.cn_normalization.text_normlization import TextNormalizer
from pypinyin import lazy_pinyin, Style
from pypinyin import lazy_pinyin
from pypinyin import Style
from parakeet.frontend.cn_normalization.text_normlization import TextNormalizer
from parakeet.frontend.generate_lexicon import generate_lexicon
from parakeet.frontend.tone_sandhi import ToneSandhi
class Frontend():


@ -11,5 +11,3 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from parakeet.frontend.cn_normalization.text_normlization import *


@ -11,10 +11,12 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import re
from .num import verbalize_cardinal, verbalize_digit, num2str, DIGITS
from .num import DIGITS
from .num import num2str
from .num import verbalize_cardinal
from .num import verbalize_digit
def _time_num2str(num_string: str) -> str:


@ -11,9 +11,9 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import re
import string
from pypinyin.constants import SUPPORT_UCS4
# full-width / half-width conversion
@ -32,10 +32,7 @@ F2H_DIGITS = {chr(ord(char) + 65248): char for char in string.digits}
H2F_DIGITS = {value: key for key, value in F2H_DIGITS.items()}
# 标点符号全角 -> 半角映射表 (num: 32)
F2H_PUNCTUATIONS = {
chr(ord(char) + 65248): char
for char in string.punctuation
}
F2H_PUNCTUATIONS = {chr(ord(char) + 65248): char for char in string.punctuation}
# 标点符号半角 -> 全角映射表
H2F_PUNCTUATIONS = {value: key for key, value in F2H_PUNCTUATIONS.items()}


@ -15,7 +15,6 @@
Rules to verbalize numbers into Chinese characters.
https://zh.wikipedia.org/wiki/中文数字#現代中文
"""
import re
from collections import OrderedDict
from typing import List


@ -11,7 +11,6 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import re
from .num import verbalize_digit
@ -32,14 +31,12 @@ def phone2str(phone_string: str, mobile=True) -> str:
if mobile:
sp_parts = phone_string.strip('+').split()
result = ''.join(
[verbalize_digit(
part, alt_one=True) for part in sp_parts])
[verbalize_digit(part, alt_one=True) for part in sp_parts])
return result
else:
sil_parts = phone_string.split('-')
result = ''.join(
[verbalize_digit(
part, alt_one=True) for part in sil_parts])
[verbalize_digit(part, alt_one=True) for part in sil_parts])
return result
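A hedged usage trace of phone2str after the reformat (inputs invented for illustration; verbalize_digit is assumed to read each digit aloud, using the alternative reading of 1 when alt_one=True):

phone2str('+86 138 0013 8000', mobile=True)   # mobile: strip '+', split on spaces
phone2str('010-62345678', mobile=False)       # landline: split on '-'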

View File

@ -11,7 +11,6 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import re
from .num import num2str

View File

@ -11,16 +11,37 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import re
from typing import List
from .chronology import RE_TIME, RE_DATE, RE_DATE2
from .chronology import replace_time, replace_date, replace_date2
from .constants import F2H_ASCII_LETTERS, F2H_DIGITS, F2H_SPACE
from .num import RE_NUMBER, RE_FRAC, RE_PERCENTAGE, RE_RANGE, RE_INTEGER, RE_DEFAULT_NUM, RE_DECIMAL_NUM, RE_POSITIVE_QUANTIFIERS
from .num import replace_number, replace_frac, replace_percentage, replace_range, replace_default_num, replace_negative_num, replace_positive_quantifier
from .phonecode import RE_MOBILE_PHONE, RE_TELEPHONE, replace_phone, replace_mobile
from .chronology import RE_DATE
from .chronology import RE_DATE2
from .chronology import RE_TIME
from .chronology import replace_date
from .chronology import replace_date2
from .chronology import replace_time
from .constants import F2H_ASCII_LETTERS
from .constants import F2H_DIGITS
from .constants import F2H_SPACE
from .num import RE_DECIMAL_NUM
from .num import RE_DEFAULT_NUM
from .num import RE_FRAC
from .num import RE_INTEGER
from .num import RE_NUMBER
from .num import RE_PERCENTAGE
from .num import RE_POSITIVE_QUANTIFIERS
from .num import RE_RANGE
from .num import replace_default_num
from .num import replace_frac
from .num import replace_negative_num
from .num import replace_number
from .num import replace_percentage
from .num import replace_positive_quantifier
from .num import replace_range
from .phonecode import RE_MOBILE_PHONE
from .phonecode import RE_TELEPHONE
from .phonecode import replace_mobile
from .phonecode import replace_phone
from .quantifier import RE_TEMPERATURE
from .quantifier import replace_temperature
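Each RE_* pattern above is paired with a replace_* callback, which suggests the normalizer is a chain of re.sub passes. A sketch of that pattern (assumed shape, not the actual TextNormalizer code):

def normalize_sentence(sentence: str) -> str:
    # every rule is a (compiled pattern, match -> str) pair applied in order
    sentence = RE_TIME.sub(replace_time, sentence)
    sentence = RE_DATE.sub(replace_date, sentence)
    sentence = RE_MOBILE_PHONE.sub(replace_mobile, sentence)
    sentence = RE_NUMBER.sub(replace_number, sentence)
    return sentence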

View File

@ -18,8 +18,6 @@ than words are used in transcriptions produced by `reorganize_baker.py`.
We make this choice to better leverage existing Chinese text-to-pinyin tools
such as pypinyin. This is the convention for G2P in Chinese.
"""
import argparse
import re
from collections import OrderedDict
@ -41,10 +39,10 @@ SPECIALS = ['sil', 'sp']
def rule(C, V, R, T):
"""Generate a syllable given the initial, the final, erhua indicator, and tone.
Orthographical rules for pinyin are applied. (special case for y, w, ui, un, iu)
Note that in this system, 'ü' is alway written as 'v' when appeared in phoneme, but converted to
'u' in syllables when certain conditions are satisfied.
'i' is distinguished when appeared in phonemes, and separated into 3 categories, 'i', 'ii' and 'iii'.
Erhua is is possibly applied to every finals, except for finals that already ends with 'r'.
When a syllable is impossible or does not have any characters with this pronunciation, return None
@ -86,8 +84,8 @@ def rule(C, V, R, T):
return None
# ua, uai, uang cannot be combined with d, t, n, l, r, z, c, s
if V in ['ua', 'uai', 'uang'
] and C in ['d', 't', 'n', 'l', 'r', 'z', 'c', 's']:
if V in ['ua', 'uai',
'uang'] and C in ['d', 't', 'n', 'l', 'r', 'z', 'c', 's']:
return None
# sh cannot be combined with ong
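The two commented constraints can be read as a small predicate; a toy restatement (hypothetical helper, not in the source):

def _is_forbidden(C, V):
    # ua, uai, uang cannot be combined with d, t, n, l, r, z, c, s
    if V in ('ua', 'uai', 'uang') and C in ('d', 't', 'n', 'l', 'r', 'z', 'c', 's'):
        return True
    # sh cannot be combined with ong
    return C == 'sh' and V == 'ong'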

View File

@ -11,6 +11,3 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from parakeet.frontend.normalizer.normalizer import *
from parakeet.frontend.normalizer.numbers import *

View File

@ -11,10 +11,10 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import re
import unicodedata
from builtins import str as unicode
from parakeet.frontend.normalizer.numbers import normalize_numbers

View File

@ -11,11 +11,11 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# number expansion is not that easy
import inflect
import re
import inflect
_inflect = inflect.engine()
_comma_number_re = re.compile(r'([0-9][0-9\,]+[0-9])')
_decimal_number_re = re.compile(r'([0-9]+\.[0-9]+)')

View File

@ -11,16 +11,18 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from abc import ABC
from abc import abstractmethod
from abc import ABC, abstractmethod
from typing import Union
from g2p_en import G2p
from g2pM import G2pM
from parakeet.frontend import Vocab
from parakeet.frontend.normalizer.normalizer import normalize
from parakeet.frontend.punctuation import get_punctuations
# discard opencc until we find an easy solution to install it on Windows
# from opencc import OpenCC
from parakeet.frontend.punctuation import get_punctuations
from parakeet.frontend.normalizer.normalizer import normalize
__all__ = ["Phonetics", "English", "EnglishCharacter", "Chinese"]
@ -65,14 +67,14 @@ class English(Phonetics):
start = self.vocab.start_symbol
end = self.vocab.end_symbol
phonemes = ([] if start is None else [start]) \
+ self.backend(sentence) \
+ ([] if end is None else [end])
+ self.backend(sentence) \
+ ([] if end is None else [end])
phonemes = [item for item in phonemes if item in self.vocab.stoi]
return phonemes
def numericalize(self, phonemes):
""" Convert pronunciation sequence into pronunciation id sequence.
Parameters
-----------
phonemes: List[str]
@ -91,7 +93,7 @@ class English(Phonetics):
def reverse(self, ids):
""" Reverse the list of pronunciation id sequence to a list of pronunciation sequence.
Parameters
-----------
ids: List[int]
@ -183,7 +185,7 @@ class EnglishCharacter(Phonetics):
----------
str
The input text sequence.
"""
return [self.vocab.reverse(i) for i in ids]
@ -244,8 +246,8 @@ class Chinese(Phonetics):
start = self.vocab.start_symbol
end = self.vocab.end_symbol
phonemes = ([] if start is None else [start]) \
+ phonemes \
+ ([] if end is None else [end])
+ phonemes \
+ ([] if end is None else [end])
return self._filter_symbols(phonemes)
def _filter_symbols(self, phonemes):
@ -261,7 +263,7 @@ class Chinese(Phonetics):
def numericalize(self, phonemes):
""" Convert pronunciation sequence into pronunciation id sequence.
Parameters
-----------
phonemes: List[str]
@ -298,7 +300,7 @@ class Chinese(Phonetics):
def reverse(self, ids):
""" Reverse the list of pronunciation id sequence to a list of pronunciation sequence.
Parameters
-----------
ids: List[int]

View File

@ -19,13 +19,15 @@ text -> pinyin to other part of a TTS system. Other NLP techniques may be used
(e.g. tokenization, tagging, NER...)
"""
import re
from itertools import product
from pypinyin.contrib.neutral_tone import NeutralToneWith5Mixin
from pypinyin.core import DefaultConverter
from pypinyin.core import Pinyin
from pypinyin.core import Style
from parakeet.frontend.phonectic import Phonetics
from parakeet.frontend.vocab import Vocab
import pypinyin
from pypinyin.core import Pinyin, Style
from pypinyin.core import DefaultConverter
from pypinyin.contrib.neutral_tone import NeutralToneWith5Mixin
from itertools import product
_punctuations = ['，', '。', '？', '！']
_initials = [
@ -33,10 +35,10 @@ _initials = [
'ch', 'sh', 'r', 'z', 'c', 's'
]
_finals = [
'ii', 'iii', 'a', 'o', 'e', 'ea', 'ai', 'ei', 'ao', 'ou', 'an', 'en',
'ang', 'eng', 'er', 'i', 'ia', 'io', 'ie', 'iai', 'iao', 'iou', 'ian',
'ien', 'iang', 'ieng', 'u', 'ua', 'uo', 'uai', 'uei', 'uan', 'uen', 'uang',
'ueng', 'v', 've', 'van', 'ven', 'veng'
'ii', 'iii', 'a', 'o', 'e', 'ea', 'ai', 'ei', 'ao', 'ou', 'an', 'en', 'ang',
'eng', 'er', 'i', 'ia', 'io', 'ie', 'iai', 'iao', 'iou', 'ian', 'ien',
'iang', 'ieng', 'u', 'ua', 'uo', 'uai', 'uei', 'uan', 'uen', 'uang', 'ueng',
'v', 've', 'van', 'ven', 'veng'
]
_ernized_symbol = ['&r']
_phones = _initials + _finals + _ernized_symbol + _punctuations
@ -76,12 +78,12 @@ class ParakeetPinyin(Phonetics):
def phoneticize(self, sentence, add_start_end=False):
""" Normalize the input text sequence and convert it into pronunciation sequence.
Parameters
-----------
sentence: str
The input text sequence.
Returns
----------
List[str]
@ -95,12 +97,12 @@ class ParakeetPinyin(Phonetics):
def numericalize(self, phonemes, tones):
""" Convert pronunciation sequence into pronunciation id sequence.
Parameters
-----------
phonemes: List[str]
The list of pronunciation sequence.
Returns
----------
List[int]
@ -112,12 +114,12 @@ class ParakeetPinyin(Phonetics):
def __call__(self, sentence, add_start_end=False):
""" Convert the input text sequence into pronunciation id sequence.
Parameters
-----------
sentence: str
The input text sequence.
Returns
----------
List[str]
@ -159,12 +161,12 @@ class ParakeetPinyinWithTone(Phonetics):
def phoneticize(self, sentence, add_start_end=False):
""" Normalize the input text sequence and convert it into pronunciation sequence.
Parameters
-----------
sentence: str
The input text sequence.
Returns
----------
List[str]
@ -178,12 +180,12 @@ class ParakeetPinyinWithTone(Phonetics):
def numericalize(self, phonemes):
""" Convert pronunciation sequence into pronunciation id sequence.
Parameters
-----------
phonemes: List[str]
The list of pronunciation sequence.
Returns
----------
List[int]
@ -194,12 +196,12 @@ class ParakeetPinyinWithTone(Phonetics):
def __call__(self, sentence, add_start_end=False):
""" Convert the input text sequence into pronunciation id sequence.
Parameters
-----------
sentence: str
The input text sequence.
Returns
----------
List[str]
@ -232,17 +234,17 @@ def _convert_to_parakeet_convension(syllable):
syllable = syllable.replace("ing", "ieng").replace("in", "ien")
# expansion for un, ui, iu
syllable = syllable.replace("un","uen")\
.replace("ui", "uei")\
syllable = syllable.replace("un", "uen") \
.replace("ui", "uei") \
.replace("iu", "iou")
# rule for variants of i
syllable = syllable.replace("zi", "zii")\
.replace("ci", "cii")\
.replace("si", "sii")\
.replace("zhi", "zhiii")\
.replace("chi", "chiii")\
.replace("shi", "shiii")\
syllable = syllable.replace("zi", "zii") \
.replace("ci", "cii") \
.replace("si", "sii") \
.replace("zhi", "zhiii") \
.replace("chi", "chiii") \
.replace("shi", "shiii") \
.replace("ri", "riii")
# rule for y preceding i, u
@ -252,8 +254,8 @@ def _convert_to_parakeet_convension(syllable):
syllable = syllable.replace("wu", "u").replace("w", "u")
# rule for v following j, q, x
syllable = syllable.replace("ju", "jv")\
.replace("qu", "qv")\
syllable = syllable.replace("ju", "jv") \
.replace("qu", "qv") \
.replace("xu", "xv")
return syllable + tone
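Tracing a few syllables through the replace chain above (tone digit kept at the end; worked out by hand from the rules shown, so treat as illustrative):

# "zi3"  -> "zii3"    variant of i after z
# "zhi4" -> "zhiii4"  variant of i after zh
# "dui4" -> "duei4"   ui expanded to uei
# "qu4"  -> "qv4"     v restored after q
# "jun1" -> "jven1"   un -> uen first, then ju -> jv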

View File

@ -12,9 +12,6 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import abc
import string
__all__ = ["get_punctuations"]
EN_PUNCT = [

View File

@ -11,8 +11,8 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import List, Tuple
from typing import List
from typing import Tuple
import jieba
from pypinyin import lazy_pinyin
@ -76,8 +76,7 @@ class ToneSandhi():
# reduplication words for n. and v. e.g. 奶奶, 试试, 旺旺
for j, item in enumerate(word):
if j - 1 >= 0 and item == word[j - 1] and pos[
0] in {"n", "v", "a"}:
if j - 1 >= 0 and item == word[j - 1] and pos[0] in {"n", "v", "a"}:
finals[j] = finals[j][:-1] + "5"
ge_idx = word.find("个")
if len(word) >= 1 and word[-1] in "吧呢哈啊呐噻嘛吖嗨呐哦哒额滴哩哟喽啰耶喔诶":
@ -125,8 +124,8 @@ class ToneSandhi():
else:
for i, char in enumerate(word):
# "不" before tone4 should be bu2, e.g. 不怕
if char == "" and i + 1 < len(word) and finals[i + 1][
-1] == "4":
if char == "" and i + 1 < len(word) and finals[i +
1][-1] == "4":
finals[i] = finals[i][:-1] + "2"
return finals
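A worked instance of the rule just above, assuming each final carries its tone as a trailing digit:

word = "不怕"
finals = ["u4", "a4"]
# "不" precedes a tone-4 syllable, so the branch above rewrites its tone to 2
finals[0] = finals[0][:-1] + "2"
assert finals == ["u2", "a4"]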
@ -266,12 +265,12 @@ class ToneSandhi():
assert len(sub_finals_list) == len(seg)
merge_last = [False] * len(seg)
for i, (word, pos) in enumerate(seg):
if i - 1 >= 0 and self._all_tone_three(sub_finals_list[
i - 1]) and self._all_tone_three(sub_finals_list[
i]) and not merge_last[i - 1]:
if i - 1 >= 0 and self._all_tone_three(
sub_finals_list[i - 1]) and self._all_tone_three(
sub_finals_list[i]) and not merge_last[i - 1]:
# if the last word is reduplication, not merge, because reduplication needs to be _neural_sandhi
if not self._is_reduplication(seg[i - 1][0]) and len(seg[
i - 1][0]) + len(seg[i][0]) <= 3:
if not self._is_reduplication(seg[i - 1][0]) and len(
seg[i - 1][0]) + len(seg[i][0]) <= 3:
new_seg[-1][0] = new_seg[-1][0] + seg[i][0]
merge_last[i] = True
else:
@ -299,8 +298,8 @@ class ToneSandhi():
if i - 1 >= 0 and sub_finals_list[i - 1][-1][-1] == "3" and sub_finals_list[i][0][-1] == "3" and not \
merge_last[i - 1]:
# if the last word is reduplication, not merge, because reduplication needs to be _neural_sandhi
if not self._is_reduplication(seg[i - 1][0]) and len(seg[
i - 1][0]) + len(seg[i][0]) <= 3:
if not self._is_reduplication(seg[i - 1][0]) and len(
seg[i - 1][0]) + len(seg[i][0]) <= 3:
new_seg[-1][0] = new_seg[-1][0] + seg[i][0]
merge_last[i] = True
else:

View File

@ -11,9 +11,8 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import Dict, Iterable, List
from collections import OrderedDict
from typing import Iterable
__all__ = ["Vocab"]
@ -25,13 +24,13 @@ class Vocab(object):
-----------
symbols: Iterable[str]
Common symbols.
padding_symbol: str, optional
Symbol for pad. Defaults to "<pad>".
unk_symbol: str, optional
Symbol for unknown. Defaults to "<unk>".
start_symbol: str, optional
Symbol for start. Defaults to "<s>".
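A hedged construction sketch, using only the defaults listed above and the stoi / reverse accessors that appear elsewhere in this diff (the exact signature may differ):

vocab = Vocab(symbols)        # symbols: Iterable[str]; specials default to <pad>, <unk>, <s>
idx = vocab.stoi["<unk>"]     # stoi maps symbol -> id
sym = vocab.reverse(idx)      # reverse maps id -> symbol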

View File

@ -11,13 +11,3 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#from parakeet.models.clarinet import *
from parakeet.models.waveflow import *
#from parakeet.models.wavenet import *
from parakeet.models.transformer_tts import *
#from parakeet.models.deepvoice3 import *
# from parakeet.models.fastspeech import *
from parakeet.models.tacotron2 import *
from parakeet.models.fastspeech2 import *

View File

@ -12,19 +12,24 @@
# See the License for the specific language governing permissions and
# limitations under the License.
"""Fastspeech2 related modules for paddle"""
from typing import Dict, Sequence, Tuple
from typing import Sequence
from typing import Tuple
import paddle
from paddle import nn
from parakeet.modules.fastspeech2_predictor.duration_predictor import DurationPredictor, DurationPredictorLoss
from typeguard import check_argument_types
from parakeet.modules.fastspeech2_predictor.duration_predictor import DurationPredictor
from parakeet.modules.fastspeech2_predictor.duration_predictor import DurationPredictorLoss
from parakeet.modules.fastspeech2_predictor.length_regulator import LengthRegulator
from parakeet.modules.fastspeech2_predictor.postnet import Postnet
from parakeet.modules.fastspeech2_predictor.variance_predictor import VariancePredictor
from parakeet.modules.fastspeech2_transformer.embedding import PositionalEncoding, ScaledPositionalEncoding
from parakeet.modules.fastspeech2_transformer.embedding import PositionalEncoding
from parakeet.modules.fastspeech2_transformer.embedding import ScaledPositionalEncoding
from parakeet.modules.fastspeech2_transformer.encoder import Encoder as TransformerEncoder
from parakeet.modules.nets_utils import initialize, make_non_pad_mask, make_pad_mask
from typeguard import check_argument_types
from parakeet.modules.nets_utils import initialize
from parakeet.modules.nets_utils import make_non_pad_mask
from parakeet.modules.nets_utils import make_pad_mask
class FastSpeech2(nn.Layer):
@ -293,9 +298,8 @@ class FastSpeech2(nn.Layer):
xs, ilens, ys, olens, ds, ps, es, is_inference=False)
# modify mod part of groundtruth
if self.reduction_factor > 1:
olens = paddle.to_tensor([
olen - olen % self.reduction_factor for olen in olens.numpy()
])
olens = paddle.to_tensor(
[olen - olen % self.reduction_factor for olen in olens.numpy()])
max_olen = max(olens)
ys = ys[:, :max_olen]
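A small worked example of the mod trimming above (dummy shapes, reduction_factor = 2):

import paddle

ys = paddle.zeros([2, 40, 80])   # (batch, frames, mel bins), dummy targets
olens = paddle.to_tensor([37, 40])
reduction_factor = 2
olens = paddle.to_tensor(
    [olen - olen % reduction_factor for olen in olens.numpy()])  # -> [36, 40]
ys = ys[:, :int(max(olens.numpy()))]  # truncate targets to the new max length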
@ -501,8 +505,7 @@ class FastSpeech2Inference(nn.Layer):
class FastSpeech2Loss(nn.Layer):
"""Loss function module for FastSpeech2."""
def __init__(self,
use_masking: bool=True,
def __init__(self, use_masking: bool=True,
use_weighted_masking: bool=False):
"""Initialize feed-forward Transformer loss module.
@ -538,8 +541,8 @@ class FastSpeech2Loss(nn.Layer):
ps: paddle.Tensor,
es: paddle.Tensor,
ilens: paddle.Tensor,
olens: paddle.Tensor, ) -> Tuple[paddle.Tensor, paddle.Tensor,
paddle.Tensor, paddle.Tensor]:
olens: paddle.Tensor,
) -> Tuple[paddle.Tensor, paddle.Tensor, paddle.Tensor, paddle.Tensor]:
"""Calculate forward propagation.
Parameters
@ -611,9 +614,9 @@ class FastSpeech2Loss(nn.Layer):
# make weighted mask and apply it
if self.use_weighted_masking:
out_masks = make_non_pad_mask(olens).unsqueeze(-1)
out_weights = out_masks.cast(
dtype=paddle.float32) / out_masks.cast(
dtype=paddle.float32).sum(axis=1, keepdim=True)
out_weights = out_masks.cast(dtype=paddle.float32) / out_masks.cast(
dtype=paddle.float32).sum(
axis=1, keepdim=True)
out_weights /= ys.shape[0] * ys.shape[2]
duration_masks = make_non_pad_mask(ilens)
duration_weights = (duration_masks.cast(dtype=paddle.float32) /

View File

@ -11,17 +11,14 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
import paddle
from paddle import nn
from paddle.fluid.param_attr import ParamAttr
from paddle.nn import functional as F
from paddle.nn import initializer as I
from scipy.interpolate import interp1d
from sklearn.metrics import roc_curve
from scipy.optimize import brentq
from sklearn.metrics import roc_curve
class LSTMSpeakerEncoder(nn.Layer):
@ -81,8 +78,7 @@ class LSTMSpeakerEncoder(nn.Layer):
# print("p1: ", p1.shape)
p2 = paddle.bmm(
embeds.reshape([-1, 1, embed_dim]),
normalized_centroids_excl.reshape(
[-1, embed_dim, 1])) # (NM, 1, 1)
normalized_centroids_excl.reshape([-1, embed_dim, 1])) # (NM, 1, 1)
p2 = p2.reshape([-1])  # (NM,)
# begin: alternative implementation for scatter
@ -94,9 +90,8 @@ class LSTMSpeakerEncoder(nn.Layer):
index = index * speakers_per_batch + paddle.arange(
0, speakers_per_batch, dtype="int64").unsqueeze(-1)
index = paddle.reshape(index, [-1])
ones = paddle.ones([
speakers_per_batch * utterances_per_speaker * speakers_per_batch
])
ones = paddle.ones(
[speakers_per_batch * utterances_per_speaker * speakers_per_batch])
zeros = paddle.zeros_like(index, dtype=ones.dtype)
mask_p1 = paddle.scatter(ones, index, zeros)
p = p1 * mask_p1 + (1 - mask_p1) * paddle.scatter(ones, index, p2)
@ -113,6 +108,9 @@ class LSTMSpeakerEncoder(nn.Layer):
g = p._grad_ivar()
g[...] = g * 0.01
def inv_argmax(self, i, num):
return np.eye(1, num, i, dtype=np.int)[0]
def loss(self, embeds):
"""
Computes the softmax loss according the section 2.1 of GE2E.
@ -138,8 +136,8 @@ class LSTMSpeakerEncoder(nn.Layer):
# EER (not backpropagated)
with paddle.no_grad():
ground_truth = target.numpy()
inv_argmax = lambda i: np.eye(1, speakers_per_batch, i, dtype=np.int)[0]
labels = np.array([inv_argmax(i) for i in ground_truth])
labels = np.array(
[self.inv_argmax(i, speakers_per_batch) for i in ground_truth])
preds = sim_matrix.numpy()
# Snippet from https://yangcha.github.io/EER-ROC/
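For reference, the snippet from the page cited above computes EER as the point where the false-positive and false-negative rates cross, using the imports already present in this file (labels and preds as computed above):

fpr, tpr, _ = roc_curve(labels.flatten(), preds.flatten())
eer = brentq(lambda x: 1. - x - interp1d(fpr, tpr)(x), 0., 1.)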

View File

@ -11,13 +11,14 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import math
from typing import List, Dict, Any, Union, Optional, Tuple
from typing import Any
from typing import Dict
from typing import List
from typing import Optional
import numpy as np
import paddle
from paddle import Tensor
from paddle import nn
from paddle.nn import functional as F
@ -63,8 +64,8 @@ class Stretch2D(nn.Layer):
class UpsampleNet(nn.Layer):
"""A Layer to upsample spectrogram by applying consecutive stretch and
convolutions.
"""A Layer to upsample spectrogram by applying consecutive stretch and
convolutions.
Parameters
----------
@ -81,10 +82,10 @@ class UpsampleNet(nn.Layer):
use_causal_conv : bool, optional
Whether to use causal padding before convolution, by default False
If True, causal padding is used along the time axis, i.e. padding
amount is ``receptive field - 1`` and 0 for before and after,
respectively.
If False, "same" padding is used along the time axis.
"""
@ -158,7 +159,7 @@ class ConvInUpsampleNet(nn.Layer):
aux_context_window : int, optional
Context window of the first 1D convolution applied to the input. It
is related to the kernel size of the convolution, by default 0.
If causal convolution is used, the kernel size is ``window + 1``,
else the kernel size is ``2 * window + 1``.
use_causal_conv : bool, optional
@ -167,7 +168,7 @@ class ConvInUpsampleNet(nn.Layer):
If True, causal padding is used along the time axis, i.e. padding
amount is ``receptive field - 1`` and 0 for before and after,
respectively.
If False, "same" padding is used along the time axis.
"""
@ -276,10 +277,7 @@ class ResidualBlock(nn.Layer):
gate_out_channels = gate_channels // 2
self.conv1x1_out = nn.Conv1D(
gate_out_channels,
residual_channels,
kernel_size=1,
bias_attr=bias)
gate_out_channels, residual_channels, kernel_size=1, bias_attr=bias)
self.conv1x1_skip = nn.Conv1D(
gate_out_channels, skip_channels, kernel_size=1, bias_attr=bias)
@ -428,13 +426,18 @@ class PWGGenerator(nn.Layer):
use_causal_conv=use_causal_conv)
self.conv_layers.append(conv)
self.last_conv_layers = nn.Sequential(
nn.ReLU(),
nn.Conv1D(
skip_channels, skip_channels, 1, bias_attr=True),
nn.ReLU(),
nn.Conv1D(
skip_channels, out_channels, 1, bias_attr=True))
self.last_conv_layers = nn.Sequential(nn.ReLU(),
nn.Conv1D(
skip_channels,
skip_channels,
1,
bias_attr=True),
nn.ReLU(),
nn.Conv1D(
skip_channels,
out_channels,
1,
bias_attr=True))
if use_weight_norm:
self.apply_weight_norm()
@ -548,18 +551,18 @@ class PWGDiscriminator(nn.Layer):
by default True
"""
def __init__(self,
in_channels: int=1,
out_channels: int=1,
kernel_size: int=3,
layers: int=10,
conv_channels: int=64,
dilation_factor: int=1,
nonlinear_activation: str="LeakyReLU",
nonlinear_activation_params: Dict[
str, Any]={"negative_slope": 0.2},
bias: bool=True,
use_weight_norm: bool=True):
def __init__(
self,
in_channels: int=1,
out_channels: int=1,
kernel_size: int=3,
layers: int=10,
conv_channels: int=64,
dilation_factor: int=1,
nonlinear_activation: str="LeakyReLU",
nonlinear_activation_params: Dict[str, Any]={"negative_slope": 0.2},
bias: bool=True,
use_weight_norm: bool=True):
super().__init__()
assert kernel_size % 2 == 1
assert dilation_factor > 0
@ -693,8 +696,7 @@ class ResidualPWGDiscriminator(nn.Layer):
layers_per_stack = layers // stacks
self.first_conv = nn.Sequential(
nn.Conv1D(
in_channels, residual_channels, 1, bias_attr=True),
nn.Conv1D(in_channels, residual_channels, 1, bias_attr=True),
getattr(nn, nonlinear_activation)(**nonlinear_activation_params))
self.conv_layers = nn.LayerList()
@ -714,11 +716,9 @@ class ResidualPWGDiscriminator(nn.Layer):
self.last_conv_layers = nn.Sequential(
getattr(nn, nonlinear_activation)(**nonlinear_activation_params),
nn.Conv1D(
skip_channels, skip_channels, 1, bias_attr=True),
nn.Conv1D(skip_channels, skip_channels, 1, bias_attr=True),
getattr(nn, nonlinear_activation)(**nonlinear_activation_params),
nn.Conv1D(
skip_channels, out_channels, 1, bias_attr=True))
nn.Conv1D(skip_channels, out_channels, 1, bias_attr=True))
if use_weight_norm:
self.apply_weight_norm()

View File

@ -11,18 +11,11 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import math
import numpy as np
import paddle
from paddle import Tensor
from paddle import nn
from paddle.nn import functional as F
from paddle.nn import initializer as I
from parakeet.modules.positional_encoding import sinusoid_position_encoding
from parakeet.modules.expansion import expand
from parakeet.modules.positional_encoding import sinusoid_position_encoding
class ResidualBlock(nn.Layer):
@ -38,8 +31,7 @@ class ResidualBlock(nn.Layer):
padding="same",
data_format="NLC"),
nn.ReLU(),
nn.BatchNorm1D(
channels, data_format="NLC"), ) for _ in range(n)
nn.BatchNorm1D(channels, data_format="NLC"), ) for _ in range(n)
]
self.blocks = nn.Sequential(*blocks)
@ -95,16 +87,14 @@ class SpeedySpeechEncoder(nn.Layer):
nn.Linear(hidden_size, hidden_size),
nn.ReLU(), )
res_blocks = [
ResidualBlock(
hidden_size, kernel_size, d, n=2) for d in dilations
ResidualBlock(hidden_size, kernel_size, d, n=2) for d in dilations
]
self.res_blocks = nn.Sequential(*res_blocks)
self.postnet1 = nn.Sequential(nn.Linear(hidden_size, hidden_size))
self.postnet2 = nn.Sequential(
nn.ReLU(),
nn.BatchNorm1D(
hidden_size, data_format="NLC"),
nn.BatchNorm1D(hidden_size, data_format="NLC"),
nn.Linear(hidden_size, hidden_size), )
def forward(self, text, tones):
@ -120,13 +110,9 @@ class DurationPredictor(nn.Layer):
def __init__(self, hidden_size):
super().__init__()
self.layers = nn.Sequential(
ResidualBlock(
hidden_size, 4, 1, n=1),
ResidualBlock(
hidden_size, 3, 1, n=1),
ResidualBlock(
hidden_size, 1, 1, n=1),
nn.Linear(hidden_size, 1))
ResidualBlock(hidden_size, 4, 1, n=1),
ResidualBlock(hidden_size, 3, 1, n=1),
ResidualBlock(hidden_size, 1, 1, n=1), nn.Linear(hidden_size, 1))
def forward(self, x):
return paddle.squeeze(self.layers(x), -1)
@ -136,15 +122,13 @@ class SpeedySpeechDecoder(nn.Layer):
def __init__(self, hidden_size, output_size, kernel_size, dilations):
super().__init__()
res_blocks = [
ResidualBlock(
hidden_size, kernel_size, d, n=2) for d in dilations
ResidualBlock(hidden_size, kernel_size, d, n=2) for d in dilations
]
self.res_blocks = nn.Sequential(*res_blocks)
self.postnet1 = nn.Sequential(nn.Linear(hidden_size, hidden_size))
self.postnet2 = nn.Sequential(
ResidualBlock(
hidden_size, kernel_size, 1, n=2),
ResidualBlock(hidden_size, kernel_size, 1, n=2),
nn.Linear(hidden_size, output_size))
def forward(self, x):

View File

@ -11,20 +11,19 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import math
import paddle
from paddle import nn
from paddle.fluid.layers import sequence_mask
from paddle.nn import functional as F
from paddle.nn import initializer as I
from paddle.fluid.layers import sequence_mask
from tqdm import trange
from parakeet.modules.conv import Conv1dBatchNorm
from parakeet.modules.attention import LocationSensitiveAttention
from parakeet.modules.conv import Conv1dBatchNorm
from parakeet.modules.losses import guided_attention_loss
from parakeet.utils import checkpoint
from tqdm import trange
__all__ = ["Tacotron2", "Tacotron2Loss"]
@ -74,8 +73,7 @@ class DecoderPreNet(nn.Layer):
"""
x = F.dropout(
F.relu(self.linear1(x)), self.dropout_rate, training=True)
x = F.dropout(F.relu(self.linear1(x)), self.dropout_rate, training=True)
output = F.dropout(
F.relu(self.linear2(x)), self.dropout_rate, training=True)
return output
@ -745,10 +743,10 @@ class Tacotron2(nn.Layer):
if global_condition is not None:
global_condition = global_condition.unsqueeze(1)
global_condition = paddle.expand(
global_condition, [-1, encoder_outputs.shape[1], -1])
encoder_outputs = paddle.concat(
[encoder_outputs, global_condition], -1)
global_condition = paddle.expand(global_condition,
[-1, encoder_outputs.shape[1], -1])
encoder_outputs = paddle.concat([encoder_outputs, global_condition],
-1)
# [B, T_enc, 1]
mask = sequence_mask(
@ -813,10 +811,10 @@ class Tacotron2(nn.Layer):
if global_condition is not None:
global_condition = global_condition.unsqueeze(1)
global_condition = paddle.expand(
global_condition, [-1, encoder_outputs.shape[1], -1])
encoder_outputs = paddle.concat(
[encoder_outputs, global_condition], -1)
global_condition = paddle.expand(global_condition,
[-1, encoder_outputs.shape[1], -1])
encoder_outputs = paddle.concat([encoder_outputs, global_condition],
-1)
if self.decoder.use_stop_token:
mel_outputs, alignments, stop_logits = self.decoder.infer(
encoder_outputs, max_decoder_steps=max_decoder_steps)

View File

@ -11,22 +11,26 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import math
from tqdm import trange
import paddle
from paddle import nn
from paddle.nn import functional as F
from paddle.nn import initializer as I
from tqdm import trange
import parakeet
from parakeet.modules.attention import _split_heads, _concat_heads, drop_head, scaled_dot_product_attention
from parakeet.modules.transformer import PositionwiseFFN
from parakeet.modules import masking
from parakeet.modules.conv import Conv1dBatchNorm
from parakeet.modules import positional_encoding as pe
from parakeet.modules import losses as L
from parakeet.utils import checkpoint, scheduler
from parakeet.modules import masking
from parakeet.modules import positional_encoding as pe
from parakeet.modules.attention import _concat_heads
from parakeet.modules.attention import _split_heads
from parakeet.modules.attention import drop_head
from parakeet.modules.attention import scaled_dot_product_attention
from parakeet.modules.conv import Conv1dBatchNorm
from parakeet.modules.transformer import PositionwiseFFN
from parakeet.utils import checkpoint
from parakeet.utils import scheduler
__all__ = ["TransformerTTS", "TransformerTTSLoss"]
@ -404,16 +408,14 @@ class TransformerTTS(nn.Layer):
self.toned = False
# position encoding matrix may be extended later
self.encoder_pe = pe.sinusoid_position_encoding(1000, d_encoder)
self.encoder_pe_scalar = self.create_parameter(
[1], attr=I.Constant(1.))
self.encoder_pe_scalar = self.create_parameter([1], attr=I.Constant(1.))
self.encoder = TransformerEncoder(d_encoder, n_heads, d_ffn,
encoder_layers, dropout)
# decoder
self.decoder_prenet = MLPPreNet(d_mel, d_prenet, d_decoder, dropout)
self.decoder_pe = pe.sinusoid_position_encoding(1000, d_decoder)
self.decoder_pe_scalar = self.create_parameter(
[1], attr=I.Constant(1.))
self.decoder_pe_scalar = self.create_parameter([1], attr=I.Constant(1.))
self.decoder = TransformerDecoder(
d_decoder,
n_heads,
@ -470,14 +472,13 @@ class TransformerTTS(nn.Layer):
self.encoder_pe = pe.sinusoid_position_encoding(new_T,
self.d_encoder)
pos_enc = self.encoder_pe[:T_enc, :] # (T, C)
x = embed.scale(math.sqrt(
self.d_encoder)) + pos_enc * self.encoder_pe_scalar
x = embed.scale(
math.sqrt(self.d_encoder)) + pos_enc * self.encoder_pe_scalar
x = F.dropout(x, self.dropout, training=self.training)
# TODO(chenfeiyu): unsqueeze a decoder_time_steps=1 for the mask
encoder_padding_mask = paddle.unsqueeze(
masking.id_mask(
text, self.padding_idx, dtype=x.dtype), 1)
masking.id_mask(text, self.padding_idx, dtype=x.dtype), 1)
x, attention_weights = self.encoder(x, encoder_padding_mask,
self.drop_n_heads)
return x, attention_weights, encoder_padding_mask
@ -492,8 +493,8 @@ class TransformerTTS(nn.Layer):
self.decoder_pe = pe.sinusoid_position_encoding(new_T,
self.d_decoder)
pos_enc = self.decoder_pe[:T_dec * self.r:self.r, :]
x = x.scale(math.sqrt(
self.d_decoder)) + pos_enc * self.decoder_pe_scalar
x = x.scale(
math.sqrt(self.d_decoder)) + pos_enc * self.decoder_pe_scalar
x = F.dropout(x, self.dropout, training=self.training)
no_future_mask = masking.future_mask(T_dec, dtype=input.dtype)
@ -547,9 +548,8 @@ class TransformerTTS(nn.Layer):
# stop condition: (if any output frame of the output multiframes hits the stop condition)
# import pdb; pdb.set_trace()
if paddle.any(
paddle.argmax(
stop_logits[0, -self.r:, :], axis=-1) ==
self.stop_prob_index):
paddle.argmax(stop_logits[0, -self.r:, :],
axis=-1) == self.stop_prob_index):
if verbose:
print("Hits stop condition.")
break
@ -602,8 +602,7 @@ class TransformerTTSLoss(nn.Layer):
def forward(self, mel_output, mel_intermediate, mel_target, stop_logits,
stop_probs):
mask = masking.feature_mask(
mel_target, axis=-1, dtype=mel_target.dtype)
mask = masking.feature_mask(mel_target, axis=-1, dtype=mel_target.dtype)
mask1 = paddle.unsqueeze(mask, -1)
mel_loss1 = L.masked_l1_loss(mel_output, mel_target, mask1)
mel_loss2 = L.masked_l1_loss(mel_intermediate, mel_target, mask1)

View File

@ -11,10 +11,11 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import time
import math
from typing import List, Union, Tuple
import time
from typing import List
from typing import Tuple
from typing import Union
import numpy as np
import paddle
@ -22,8 +23,8 @@ from paddle import nn
from paddle.nn import functional as F
from paddle.nn import initializer as I
from parakeet.utils import checkpoint
from parakeet.modules import geometry as geo
from parakeet.utils import checkpoint
__all__ = ["WaveFlow", "ConditionalWaveFlow", "WaveFlowLoss"]
@ -120,7 +121,7 @@ class UpsampleNet(nn.LayerList):
If trim_conv_artifact is ``True``, the number of output time steps is
less than ``time_steps \* upsample_factors``.
"""
x = paddle.unsqueeze(x, 1) #(B, C, T) -> (B, 1, C, T)
x = paddle.unsqueeze(x, 1) # (B, C, T) -> (B, 1, C, T)
for layer in self:
x = layer(x)
if trim_conv_artifact:
@ -795,7 +796,7 @@ class ConditionalWaveFlow(nn.LayerList):
The synthesized audio, where ``T <= T_mel \* upsample_factors``.
"""
start = time.time()
condition = self.encoder(mel, trim_conv_artifact=True) #(B, C, T)
condition = self.encoder(mel, trim_conv_artifact=True) # (B, C, T)
batch_size, _, time_steps = condition.shape
z = paddle.randn([batch_size, time_steps], dtype=mel.dtype)
x = self.decoder.inverse(z, condition)
@ -893,12 +894,12 @@ class WaveFlowLoss(nn.Layer):
class ConditionalWaveFlow2Infer(ConditionalWaveFlow):
def forward(self, mel):
"""Generate raw audio given mel spectrogram.
Parameters
----------
mel : np.ndarray [shape=(C_mel, T_mel)]
Mel spectrogram of an utterance (in log-magnitude).
Returns
-------
np.ndarray [shape=(T,)]

View File

@ -11,11 +11,3 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from parakeet.modules.attention import *
from parakeet.modules.conv import *
from parakeet.modules.geometry import *
from parakeet.modules.losses import *
from parakeet.modules.masking import *
from parakeet.modules.positional_encoding import *
from parakeet.modules.transformer import *

View File

@ -11,19 +11,15 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import math
import numpy as np
import paddle
from paddle import nn
from paddle.nn import functional as F
def scaled_dot_product_attention(q,
k,
v,
mask=None,
dropout=0.0,
def scaled_dot_product_attention(q, k, v, mask=None, dropout=0.0,
training=True):
r"""Scaled dot product attention with masking.
@ -33,24 +29,19 @@ def scaled_dot_product_attention(q,
Parameters
-----------
q : Tensor [shape=(\*, T_q, d)]
the query tensor.
k : Tensor [shape=(\*, T_k, d)]
the key tensor.
v : Tensor [shape=(\*, T_k, d_v)]
the value tensor.
mask : Tensor, [shape=(\*, T_q, T_k) or broadcastable shape], optional
the mask tensor, zeros correspond to paddings. Defaults to None.
Returns
----------
out : Tensor [shape=(\*, T_q, d_v)]
the context vector.
attn_weights : Tensor [shape=(\*, T_q, T_k)]
the attention weights.
"""
@ -74,10 +65,8 @@ def drop_head(x, drop_n_heads, training=True):
----------
x : Tensor [shape=(batch_size, num_heads, time_steps, channels)]
The input, multiple context vectors.
drop_n_heads : int [0 <= drop_n_heads <= num_heads]
Number of vectors to drop.
training : bool
A flag indicating whether it is in training. If `False`, no dropout is
applied.
@ -127,17 +116,14 @@ class MonoheadAttention(nn.Layer):
----------
model_dim : int
Feature size of the query.
dropout : float, optional
Dropout probability of scaled dot product attention and final context
vector. Defaults to 0.0.
k_dim : int, optional
Feature size of the key of each scaled dot product attention. If not
provided, it is set to `model_dim / num_heads`. Defaults to None.
v_dim : int, optional
Feature size of the value of each scaled dot product attention. If not
provided, it is set to `model_dim / num_heads`. Defaults to None.
"""
@ -162,23 +148,19 @@ class MonoheadAttention(nn.Layer):
Parameters
-----------
q : Tensor [shape=(batch_size, time_steps_q, model_dim)]
The queries.
k : Tensor [shape=(batch_size, time_steps_k, model_dim)]
The keys.
v : Tensor [shape=(batch_size, time_steps_k, model_dim)]
The values.
mask : Tensor [shape=(batch_size, time_steps_q, time_steps_k)] or broadcastable shape
The mask.
Returns
----------
out : Tensor [shape=(batch_size, time_steps_q, model_dim)]
The context vector.
attention_weights : Tensor [shape=(batch_size, time_steps_q, time_steps_k)]
The attention weights.
"""
@ -200,20 +182,16 @@ class MultiheadAttention(nn.Layer):
-----------
model_dim: int
The feature size of query.
num_heads : int
The number of attention heads.
dropout : float, optional
Dropout probability of scaled dot product attention and final context
vector. Defaults to 0.0.
k_dim : int, optional
Feature size of the key of each scaled dot product attention. If not
provided, it is set to ``model_dim / num_heads``. Defaults to None.
v_dim : int, optional
Feature size of the value of each scaled dot product attention. If not
provided, it is set to ``model_dim / num_heads``. Defaults to None.
Raises
@ -248,23 +226,19 @@ class MultiheadAttention(nn.Layer):
Parameters
-----------
q : Tensor [shape=(batch_size, time_steps_q, model_dim)]
The queries.
k : Tensor [shape=(batch_size, time_steps_k, model_dim)]
The keys.
v : Tensor [shape=(batch_size, time_steps_k, model_dim)]
The values.
mask : Tensor [shape=(batch_size, time_steps_q, time_steps_k)] or broadcastable shape
The mask.
Returns
----------
out : Tensor [shape=(batch_size, time_steps_q, model_dim)]
The context vector.
attention_weights : Tensor [shape=(batch_size, time_steps_q, time_steps_k)]
The attention weights.
"""
@ -290,16 +264,12 @@ class LocationSensitiveAttention(nn.Layer):
-----------
d_query: int
The feature size of query.
d_key : int
The feature size of key.
d_attention : int
The feature size of the attention dimension.
location_filters : int
Filter size of attention convolution.
location_kernel_size : int
Kernel size of attention convolution.
"""
@ -337,27 +307,22 @@ class LocationSensitiveAttention(nn.Layer):
Parameters
-----------
query : Tensor [shape=(batch_size, d_query)]
The queries.
processed_key : Tensor [shape=(batch_size, time_steps_k, d_attention)]
The keys after the linear layer.
value : Tensor [shape=(batch_size, time_steps_k, d_key)]
The values.
attention_weights_cat : Tensor [shape=(batch_size, time_steps_k, 2)]
Concatenated attention weights.
mask : Tensor, optional
The mask. Shape should be (batch_size, time_steps_k, 1).
Defaults to None.
Returns
----------
attention_context : Tensor [shape=(batch_size, d_attention)]
The context vector.
attention_weights : Tensor [shape=(batch_size, time_steps_k)]
The attention weights.
"""

View File

@ -11,20 +11,19 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import librosa
import numpy as np
import paddle
from librosa.util import pad_center
from paddle import nn
from paddle.nn import functional as F
from scipy import signal
import librosa
from librosa.util import pad_center
import numpy as np
__all__ = ["quantize", "dequantize", "STFT", "MelScale"]
def quantize(values, n_bands):
"""Linearlly quantize a float Tensor in [-1, 1) to an interger Tensor in
"""Linearlly quantize a float Tensor in [-1, 1) to an interger Tensor in
[0, n_bands).
Parameters
@ -33,7 +32,7 @@ def quantize(values, n_bands):
The floating point value.
n_bands : int
The number of bands. The output integer Tensor's value is in the range
[0, n_bands).
Returns
@ -46,7 +45,7 @@ def quantize(values, n_bands):
def dequantize(quantized, n_bands, dtype=None):
"""Linearlly dequantize an integer Tensor into a float Tensor in the range
"""Linearlly dequantize an integer Tensor into a float Tensor in the range
[-1, 1).
Parameters
@ -55,7 +54,7 @@ def dequantize(quantized, n_bands, dtype=None):
The quantized value in the range [0, n_bands).
n_bands : int
Number of bands. The input integer Tensor's value is in the range
[0, n_bands).
dtype : str, optional
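Hedged reference implementations of the two linear maps described above (casting details in the actual module may differ):

import paddle

def quantize_sketch(values, n_bands):
    # [-1, 1) -> integer band indices in [0, n_bands)
    return paddle.cast((values + 1.0) / 2.0 * n_bands, "int64")

def dequantize_sketch(quantized, n_bands, dtype="float32"):
    # [0, n_bands) -> band centers in [-1, 1)
    return (paddle.cast(quantized, dtype) + 0.5) * (2.0 / n_bands) - 1.0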
@ -73,43 +72,36 @@ def dequantize(quantized, n_bands, dtype=None):
class STFT(nn.Layer):
"""A module for computing stft transformation in a differentiable way.
"""A module for computing stft transformation in a differentiable way.
Parameters
------------
n_fft : int
Number of samples in a frame.
hop_length : int
Number of samples shifted between adjacent frames.
win_length : int
Length of the window.
window : str, optional
Name of window function, see `scipy.signal.get_window` for more
details. Defaults to "hanning".
center : bool
If True, the signal y is padded so that frame D[:, t] is centered
at y[t * hop_length]. If False, then D[:, t] begins at y[t * hop_length].
Defaults to True.
pad_mode : string or function
If center=True, this argument is passed to np.pad for padding the edges
of the signal y. By default (pad_mode="reflect"), y is padded on both
sides with its own reflection, mirrored around its first and last
sample respectively. If center=False, this argument is ignored.
Notes
-----------
It behaves like ``librosa.core.stft``. See ``librosa.core.stft`` for more
details.
Given an audio of ``T`` samples, the STFT transformation outputs a
spectrum with shape (C, frames) and complex dtype, where ``C = 1 + n_fft / 2``
and ``frames = 1 + T // hop_length``.
Only ``center`` and ``reflect`` padding is supported now.
@ -144,19 +136,19 @@ class STFT(nn.Layer):
# pad window to n_fft size
if n_fft != win_length:
window = pad_center(window, n_fft, mode="constant")
#lpad = (n_fft - win_length) // 2
#rpad = n_fft - win_length - lpad
#window = np.pad(window, ((lpad, pad), ), 'constant')
# lpad = (n_fft - win_length) // 2
# rpad = n_fft - win_length - lpad
# window = np.pad(window, ((lpad, pad), ), 'constant')
# calculate weights
#r = np.arange(0, n_fft)
#M = np.expand_dims(r, -1) * np.expand_dims(r, 0)
#w_real = np.reshape(window *
#np.cos(2 * np.pi * M / n_fft)[:self.n_bin],
#(self.n_bin, 1, self.n_fft))
#w_imag = np.reshape(window *
#np.sin(-2 * np.pi * M / n_fft)[:self.n_bin],
#(self.n_bin, 1, self.n_fft))
# r = np.arange(0, n_fft)
# M = np.expand_dims(r, -1) * np.expand_dims(r, 0)
# w_real = np.reshape(window *
# np.cos(2 * np.pi * M / n_fft)[:self.n_bin],
# (self.n_bin, 1, self.n_fft))
# w_imag = np.reshape(window *
# np.sin(-2 * np.pi * M / n_fft)[:self.n_bin],
# (self.n_bin, 1, self.n_fft))
weight = np.fft.fft(np.eye(n_fft))[:self.n_bin]
w_real = weight.real
w_imag = weight.imag
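Why the np.fft.fft(np.eye(n_fft)) line works: the DFT of the identity matrix yields the DFT basis vectors as rows, so a strided 1-D convolution with these rows (scaled by the analysis window) evaluates the STFT frame by frame. A quick standalone numpy check, ignoring the window:

import numpy as np

n_fft = 8
weight = np.fft.fft(np.eye(n_fft))  # (n_fft, n_fft) symmetric DFT matrix
frame = np.random.randn(n_fft)
np.testing.assert_allclose(weight @ frame, np.fft.fft(frame), atol=1e-9)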
@ -174,17 +166,18 @@ class STFT(nn.Layer):
The input waveform.
Returns
------------
real : Tensor [shape=(B, C, frames)]
The real part of the spectrogram.
imag : Tensor [shape=(B, C, frames)]
The imaginary part of the spectrogram.
"""
x = paddle.unsqueeze(x, axis=1)
if self.center:
x = F.pad(x, [self.n_fft // 2, self.n_fft // 2],
data_format='NCL',
mode=self.pad_mode)
x = F.pad(
x, [self.n_fft // 2, self.n_fft // 2],
data_format='NCL',
mode=self.pad_mode)
# to BCT, C=1
out = F.conv1d(x, self.weight, stride=self.hop_length)
@ -199,7 +192,7 @@ class STFT(nn.Layer):
The input waveform.
Returns
------------
Tensor [shape=(B, C, T)]
The power spectrum.
"""
real, imag = self.forward(x)
@ -214,7 +207,7 @@ class STFT(nn.Layer):
The input waveform.
Returns
------------
Tensor [shape=(B, C, T)]
The magnitude of the spectrum.
"""
power = self.power(x)

View File

@ -11,7 +11,6 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle
from paddle import nn
@ -22,48 +21,40 @@ __all__ = [
class Conv1dCell(nn.Conv1D):
"""A subclass of Conv1D layer, which can be used in an autoregressive
"""A subclass of Conv1D layer, which can be used in an autoregressive
decoder like an RNN cell.
When used in autoregressive decoding, it performs causal temporal
convolution incrementally. At each time step, it takes a step input and
returns a step output.
When used in autoregressive decoding, it performs causal temporal
convolution incrementally. At each time step, it takes a step input and
returns a step output.
Notes
------
It is done by caching an internal buffer of length ``receptive_file - 1``.
when adding a step input, the buffer is shited by one step, the latest
input is added to be buffer and the oldest step is discarded. And it
returns a step output. For single step case, convolution is equivalent to a
It is done by caching an internal buffer of length ``receptive_file - 1``.
when adding a step input, the buffer is shited by one step, the latest
input is added to be buffer and the oldest step is discarded. And it
returns a step output. For single step case, convolution is equivalent to a
linear transformation.
That it can be used as a cell depends on several restrictions:
1. stride must be 1;
2. padding must be a causal padding (recpetive_field - 1, 0).
Thus, these arguments are removed from the ``__init__`` method of this
Thus, these arguments are removed from the ``__init__`` method of this
class.
Parameters
----------
in_channels: int
The feature size of the input.
out_channels: int
The feature size of the output.
kernel_size: int or Tuple[int]
The size of the kernel.
dilation: int or Tuple[int]
The dilation of the convolution, by default 1
weight_attr: ParamAttr, Initializer, str or bool, optional
The parameter attribute of the convolution kernel, by default None.
bias_attr: ParamAttr, Initializer, str or bool, optional
The parameter attribute of the bias. If ``False``, this layer does not
have a bias, by default None.
Examples
@ -114,7 +105,7 @@ class Conv1dCell(nn.Conv1D):
Warnings
---------
This method should be called before a sequence of calls to
``add_input``.
Raises
@ -165,12 +156,12 @@ class Conv1dCell(nn.Conv1D):
Parameters
-----------
x_t : Tensor [shape=(batch_size, in_channels)]
The step input.
Returns
-------
y_t : Tensor [shape=(batch_size, out_channels)]
The step output.
"""
batch_size = x_t.shape[0]
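A minimal sketch of the shift-append cache that add_input maintains (hypothetical names; the real layer additionally applies the convolution kernel to the refreshed buffer):

import paddle

class _StepCache:
    def __init__(self, batch_size, receptive_field, in_channels):
        # holds the most recent `receptive_field` step inputs, oldest first
        self.buffer = paddle.zeros([batch_size, receptive_field, in_channels])

    def add(self, x_t):  # x_t: (batch_size, in_channels)
        self.buffer = paddle.concat(
            [self.buffer[:, 1:, :], paddle.unsqueeze(x_t, 1)], axis=1)
        return self.buffer  # the window the single-step convolution acts on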
@ -199,36 +190,27 @@ class Conv1dBatchNorm(nn.Layer):
----------
in_channels : int
The feature size of the input.
out_channels : int
The feature size of the output.
kernel_size : int
The size of the convolution kernel.
stride : int, optional
The stride of the convolution, by default 1.
padding : int, str or Tuple[int], optional
The padding of the convolution.
If int, a symmetrical padding is applied before convolution;
If str, it should be "same" or "valid";
If Tuple[int], its length should be 2, meaning
``(pad_before, pad_after)``, by default 0.
weight_attr : ParamAttr, Initializer, str or bool, optional
The parameter attribute of the convolution kernel, by default None.
bias_attr : ParamAttr, Initializer, str or bool, optional
The parameter attribute of the bias of the convolution, by default
None.
data_format : str ["NCL" or "NLC"], optional
The data layout of the input, by default "NCL"
momentum : float, optional
The momentum of the BatchNorm1D layer, by default 0.9
epsilon : float, optional
The epsilon of the BatchNorm1D layer, by default 1e-05
"""

View File

@ -11,9 +11,7 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
import paddle
from paddle import Tensor

Some files were not shown because too many files have changed in this diff.