format code with pre-commit

chenfeiyu 2021-05-13 16:22:56 +08:00
parent 73ca693395
commit 6a1fb158d9
62 changed files with 1068 additions and 709 deletions

View File

@ -45,7 +45,7 @@ See [install](https://www.paddlepaddle.org.cn/install/quick) for more details. T
pip install -U paddle-parakeet
```
or
```bash
git clone https://github.com/PaddlePaddle/Parakeet
cd Parakeet

View File

@ -68,7 +68,6 @@ exclude_patterns = []
html_theme = "sphinx_rtd_theme"
# Add any paths that contain custom static files (such as style sheets) here,
# relative to this directory. They are copied after the builtin static files,
# so a file named "default.css" will overwrite the builtin "default.css".

View File

@ -127,6 +127,3 @@ python inference.py --input=<input> --output=<output> --checkpoint_path=<checkpo
1. [Generalized End-to-end Loss for Speaker Verification](https://arxiv.org/pdf/1710.10467.pdf)
2. [Transfer Learning from Speaker Verification to Multispeaker Text-To-Speech Synthesis](https://arxiv.org/pdf/1806.04558.pdf)

View File

@ -4,7 +4,7 @@
## Model
The model used in this experiment is the text-independent speaker encoder from [GENERALIZED END-TO-END LOSS FOR SPEAKER VERIFICATION](https://arxiv.org/pdf/1710.10467.pdf), trained with the GE2E softmax loss.
## Directory structure
@ -122,6 +122,3 @@ python inference.py --input=<input> --output=<output> --checkpoint_path=<checkpo
1. [GENERALIZED END-TO-END LOSS FOR SPEAKER VERIFICATION](https://arxiv.org/pdf/1710.10467.pdf)
2. [Transfer Learning from Speaker Verification to Multispeaker Text-To-Speech Synthesis](https://arxiv.org/pdf/1806.04558.pdf)
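The GE2E softmax loss referenced above can be summarized with a small NumPy sketch. This is an illustration of the loss from the paper, not the repository's Paddle implementation; the helper name, the scale/bias values `w` and `b`, and the toy shapes are made up for the example.
```python
import numpy as np

def ge2e_softmax_loss(embeds, w=10.0, b=-5.0):
    """embeds: [n_speakers, n_utterances, dim], rows assumed L2-normalized."""
    n_spk, n_utt, _ = embeds.shape
    centroids = embeds.mean(axis=1)
    centroids /= np.linalg.norm(centroids, axis=-1, keepdims=True)
    loss = 0.0
    for j in range(n_spk):
        for i in range(n_utt):
            e = embeds[j, i]
            # the centroid of the true speaker excludes the utterance itself
            own = (embeds[j].sum(axis=0) - e) / (n_utt - 1)
            own /= np.linalg.norm(own)
            sims = centroids @ e          # cosine similarity to every centroid
            sims[j] = own @ e
            sims = w * sims + b           # learned scale and bias
            loss += -sims[j] + np.log(np.exp(sims).sum())
    return loss / (n_spk * n_utt)

# toy usage with random, normalized embeddings
rng = np.random.default_rng(0)
x = rng.normal(size=(4, 5, 256))
x /= np.linalg.norm(x, axis=-1, keepdims=True)
print(ge2e_softmax_loss(x))
```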

View File

@ -1,3 +1,17 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from pathlib import Path
from warnings import warn
import struct
@ -30,16 +44,18 @@ def normalize_volume(wav,
if increase_only and decrease_only:
raise ValueError("Both increase only and decrease only are set")
dBFS_change = target_dBFS - 10 * np.log10(np.mean(wav**2))
if ((dBFS_change < 0 and increase_only)
or (dBFS_change > 0 and decrease_only)):
if ((dBFS_change < 0 and increase_only) or
(dBFS_change > 0 and decrease_only)):
return wav
gain = 10**(dBFS_change / 20)
return wav * gain
def trim_long_silences(wav, vad_window_length: int,
def trim_long_silences(wav,
vad_window_length: int,
vad_moving_average_width: int,
vad_max_silence_length: int, sampling_rate: int):
vad_max_silence_length: int,
sampling_rate: int):
"""
Ensures that segments without voice in the waveform remain no longer than a
threshold determined by the VAD parameters in params.py.
@ -63,14 +79,15 @@ def trim_long_silences(wav, vad_window_length: int,
for window_start in range(0, len(wav), samples_per_window):
window_end = window_start + samples_per_window
voice_flags.append(
vad.is_speech(pcm_wave[window_start * 2:window_end * 2],
sample_rate=sampling_rate))
vad.is_speech(
pcm_wave[window_start * 2:window_end * 2],
sample_rate=sampling_rate))
voice_flags = np.array(voice_flags)
# Smooth the voice detection with a moving average
def moving_average(array, width):
array_padded = np.concatenate((np.zeros(
(width - 1) // 2), array, np.zeros(width // 2)))
array_padded = np.concatenate((np.zeros((width - 1) // 2), array,
np.zeros(width // 2)))
ret = np.cumsum(array_padded, dtype=float)
ret[width:] = ret[width:] - ret[:-width]
return ret[width - 1:] / width
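As a quick sanity check of the smoothing above, here is the same centered moving average applied to a toy voice-flag sequence (values chosen only for illustration):
```python
import numpy as np

def moving_average(array, width):
    # same centered moving average as used in trim_long_silences above
    array_padded = np.concatenate((np.zeros((width - 1) // 2), array,
                                   np.zeros(width // 2)))
    ret = np.cumsum(array_padded, dtype=float)
    ret[width:] = ret[width:] - ret[:-width]
    return ret[width - 1:] / width

print(moving_average(np.array([0., 1., 1., 1., 0.]), 3))
# approximately [0.33, 0.67, 1.0, 0.67, 0.33]
```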
@ -89,8 +106,8 @@ def trim_long_silences(wav, vad_window_length: int,
def compute_partial_slices(n_samples: int,
partial_utterance_n_frames: int,
hop_length: int,
min_pad_coverage: float = 0.75,
overlap: float = 0.5):
min_pad_coverage: float=0.75,
overlap: float=0.5):
"""
Computes where to split an utterance waveform and its corresponding mel spectrogram to obtain
partial utterances of <partial_utterance_n_frames> each. Both the waveform and the mel
@ -121,8 +138,8 @@ def compute_partial_slices(n_samples: int,
# librosa's function to compute num_frames from num_samples
n_frames = int(np.ceil((n_samples + 1) / hop_length))
# frame shift between adjacent partials
frame_step = max(1,
int(np.round(partial_utterance_n_frames * (1 - overlap))))
frame_step = max(
1, int(np.round(partial_utterance_n_frames * (1 - overlap))))
# Compute the slices
wav_slices, mel_slices = [], []
@ -135,8 +152,8 @@ def compute_partial_slices(n_samples: int,
# Evaluate whether extra padding is warranted or not
last_wav_range = wav_slices[-1]
coverage = (n_samples - last_wav_range.start) / (last_wav_range.stop -
last_wav_range.start)
coverage = (n_samples - last_wav_range.start) / (
last_wav_range.stop - last_wav_range.start)
if coverage < min_pad_coverage and len(mel_slices) > 1:
mel_slices = mel_slices[:-1]
wav_slices = wav_slices[:-1]
@ -155,8 +172,8 @@ class SpeakerVerificationPreprocessor(object):
mel_window_step,
n_mels,
partial_n_frames: int,
min_pad_coverage: float = 0.75,
partial_overlap_ratio: float = 0.5):
min_pad_coverage: float=0.75,
partial_overlap_ratio: float=0.5):
self.sampling_rate = sampling_rate
self.audio_norm_target_dBFS = audio_norm_target_dBFS
@ -184,24 +201,23 @@ class SpeakerVerificationPreprocessor(object):
wav = librosa.resample(wav, source_sr, self.sampling_rate)
# loudness normalization
wav = normalize_volume(wav,
self.audio_norm_target_dBFS,
increase_only=True)
wav = normalize_volume(
wav, self.audio_norm_target_dBFS, increase_only=True)
# trim long silence
if webrtcvad:
wav = trim_long_silences(wav, self.vad_window_length,
self.vad_moving_average_width,
self.vad_max_silence_length,
self.sampling_rate)
wav = trim_long_silences(
wav, self.vad_window_length, self.vad_moving_average_width,
self.vad_max_silence_length, self.sampling_rate)
return wav
def melspectrogram(self, wav):
mel = librosa.feature.melspectrogram(wav,
sr=self.sampling_rate,
n_fft=self.n_fft,
hop_length=self.hop_length,
n_mels=self.n_mels)
mel = librosa.feature.melspectrogram(
wav,
sr=self.sampling_rate,
n_fft=self.n_fft,
hop_length=self.hop_length,
n_mels=self.n_mels)
mel = mel.astype(np.float32).T
return mel

View File

@ -1,3 +1,17 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from yacs.config import CfgNode
_C = CfgNode()

View File

@ -1,3 +1,17 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from functools import partial
from typing import List
from pathlib import Path
@ -29,7 +43,7 @@ def _process_speaker(speaker_dir: Path,
datasets_root: Path,
output_dir: Path,
pattern: str,
skip_existing: bool = False):
skip_existing: bool=False):
# datasets root: a reference path to compute speaker_name
# we prepend the dataset name to speaker_id because we are mixing several
# multispeaker datasets together
@ -67,24 +81,25 @@ def _process_dataset(processor: SpeakerVerificationPreprocessor,
dataset_name: str,
output_dir: Path,
pattern: str,
skip_existing: bool = False):
skip_existing: bool=False):
print(
f"{dataset_name}: Preprocessing data for {len(speaker_dirs)} speakers."
)
f"{dataset_name}: Preprocessing data for {len(speaker_dirs)} speakers.")
_func = partial(_process_speaker,
processor=processor,
datasets_root=datasets_root,
output_dir=output_dir,
pattern=pattern,
skip_existing=skip_existing)
_func = partial(
_process_speaker,
processor=processor,
datasets_root=datasets_root,
output_dir=output_dir,
pattern=pattern,
skip_existing=skip_existing)
with mp.Pool(16) as pool:
list(
tqdm(pool.imap(_func, speaker_dirs),
dataset_name,
len(speaker_dirs),
unit="speakers"))
tqdm(
pool.imap(_func, speaker_dirs),
dataset_name,
len(speaker_dirs),
unit="speakers"))
print(f"Done preprocessing {dataset_name}.")

View File

@ -1,3 +1,17 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
from pathlib import Path
@ -26,7 +40,9 @@ def embed_utterance(processor, model, fpath_or_wav):
return embed
def _process_utterance(ifpath: Path, input_dir: Path, output_dir: Path,
def _process_utterance(ifpath: Path,
input_dir: Path,
output_dir: Path,
processor: SpeakerVerificationPreprocessor,
model: LSTMSpeakerEncoder):
rel_path = ifpath.relative_to(input_dir)
@ -62,8 +78,7 @@ def main(config, args):
n_mels=c.n_mels,
partial_n_frames=c.partial_n_frames,
min_pad_coverage=c.min_pad_coverage,
partial_overlap_ratio=c.min_pad_coverage,
)
partial_overlap_ratio=c.min_pad_coverage, )
# input output preparation
input_dir = Path(args.input).expanduser()
@ -83,34 +98,34 @@ if __name__ == "__main__":
"--config",
metavar="FILE",
help="path of the config file to overwrite to default config with.")
parser.add_argument("--input",
type=str,
help="path of the audio_file folder.")
parser.add_argument("--pattern",
type=str,
default="*.wav",
help="pattern to filter audio files.")
parser.add_argument("--output",
metavar="OUTPUT_DIR",
help="path to save checkpoint and logs.")
parser.add_argument(
"--input", type=str, help="path of the audio_file folder.")
parser.add_argument(
"--pattern",
type=str,
default="*.wav",
help="pattern to filter audio files.")
parser.add_argument(
"--output",
metavar="OUTPUT_DIR",
help="path to save checkpoint and logs.")
# load from saved checkpoint
parser.add_argument("--checkpoint_path",
type=str,
help="path of the checkpoint to load")
parser.add_argument(
"--checkpoint_path", type=str, help="path of the checkpoint to load")
# running
parser.add_argument("--device",
type=str,
choices=["cpu", "gpu"],
help="device type to use, cpu and gpu are supported.")
parser.add_argument(
"--device",
type=str,
choices=["cpu", "gpu"],
help="device type to use, cpu and gpu are supported.")
# overwrite extra config and default config
parser.add_argument(
"--opts",
nargs=argparse.REMAINDER,
help=
"options to overwrite --config file and the default config, passing in KEY VALUE pairs"
help="options to overwrite --config file and the default config, passing in KEY VALUE pairs"
)
args = parser.parse_args()

View File

@ -1,3 +1,17 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
from pathlib import Path
from config import get_cfg_defaults
@ -12,25 +26,21 @@ if __name__ == "__main__":
parser.add_argument(
"--datasets_root",
type=Path,
help=
"Path to the directory containing your LibriSpeech, LibriTTS and VoxCeleb datasets."
help="Path to the directory containing your LibriSpeech, LibriTTS and VoxCeleb datasets."
)
parser.add_argument("--output_dir",
type=Path,
help="Path to save processed dataset.")
parser.add_argument(
"--output_dir", type=Path, help="Path to save processed dataset.")
parser.add_argument(
"--dataset_names",
type=str,
default="librispeech_other,voxceleb1,voxceleb2",
help=
"comma-separated list of names of the datasets you want to preprocess. only "
help="comma-separated list of names of the datasets you want to preprocess. only "
"the train set of these datastes will be used. Possible names: librispeech_other, "
"voxceleb1, voxceleb2, aidatatang_200zh, magicdata.")
parser.add_argument(
"--skip_existing",
action="store_true",
help=
"Whether to skip ouput files with the same name. Useful if this script was interrupted."
help="Whether to skip ouput files with the same name. Useful if this script was interrupted."
)
parser.add_argument(
"--no_trim",
@ -74,8 +84,7 @@ if __name__ == "__main__":
n_mels=c.n_mels,
partial_n_frames=c.partial_n_frames,
min_pad_coverage=c.min_pad_coverage,
partial_overlap_ratio=c.min_pad_coverage,
)
partial_overlap_ratio=c.min_pad_coverage, )
preprocess_func = {
"librispeech_other": process_librispeech,

View File

@ -1,3 +1,17 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import random

View File

@ -1,3 +1,17 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import random
from pathlib import Path
@ -22,6 +36,7 @@ class MultiSpeakerMelDataset(Dataset):
utterance2.npy
utterance3.npy
"""
def __init__(self, dataset_root: Path):
self.root = Path(dataset_root).expanduser()
speaker_dirs = [f for f in self.root.glob("*") if f.is_dir()]
@ -57,8 +72,11 @@ class MultiSpeakerSampler(BatchSampler):
First, N speakers from all speakers are sampled randomly. Then, for each
speaker, randomly sample M utterances from their corresponding utterances.
"""
def __init__(self, dataset: MultiSpeakerMelDataset,
speakers_per_batch: int, utterances_per_speaker: int):
def __init__(self,
dataset: MultiSpeakerMelDataset,
speakers_per_batch: int,
utterances_per_speaker: int):
self._speakers = list(dataset.speaker_dirs)
self._speaker_to_utterances = dataset.speaker_to_utterances
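The sampling rule in the docstring above (first N speakers, then M utterances per speaker) can be sketched without the BatchSampler machinery. The helper and its toy data below are hypothetical, for illustration only, and are not the repository's MultiSpeakerSampler code.
```python
import random

def sample_batch(speaker_to_utterances, speakers_per_batch, utterances_per_speaker):
    # pick N speakers at random, then M utterances from each chosen speaker
    speakers = random.sample(list(speaker_to_utterances), speakers_per_batch)
    batch = []
    for spk in speakers:
        batch.extend(random.sample(speaker_to_utterances[spk], utterances_per_speaker))
    return batch

# toy usage
utts = {"spk1": ["a.npy", "b.npy", "c.npy"], "spk2": ["d.npy", "e.npy", "f.npy"]}
print(sample_batch(utts, speakers_per_batch=2, utterances_per_speaker=2))
```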

View File

@ -1,3 +1,17 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import time
from paddle import distributed as dist
@ -22,9 +36,10 @@ class Ge2eExperiment(ExperimentBase):
model = LSTMSpeakerEncoder(config.data.n_mels, config.model.num_layers,
config.model.hidden_size,
config.model.embedding_size)
optimizer = Adam(config.training.learning_rate_init,
parameters=model.parameters(),
grad_clip=ClipGradByGlobalNorm(3))
optimizer = Adam(
config.training.learning_rate_init,
parameters=model.parameters(),
grad_clip=ClipGradByGlobalNorm(3))
self.model = DataParallel(model) if self.parallel else model
self.model_core = model
self.optimizer = optimizer
@ -35,11 +50,11 @@ class Ge2eExperiment(ExperimentBase):
sampler = MultiSpeakerSampler(train_dataset,
config.training.speakers_per_batch,
config.training.utterances_per_speaker)
train_loader = DataLoader(train_dataset,
batch_sampler=sampler,
collate_fn=Collate(
config.data.partial_n_frames),
num_workers=16)
train_loader = DataLoader(
train_dataset,
batch_sampler=sampler,
collate_fn=Collate(config.data.partial_n_frames),
num_workers=16)
self.train_dataset = train_dataset
self.train_loader = train_loader
@ -72,8 +87,8 @@ class Ge2eExperiment(ExperimentBase):
self.iteration)
self.visualizer.add_scalar("train/eer", eer, self.iteration)
self.visualizer.add_scalar(
"param/w", float(self.model_core.similarity_weight),
self.iteration)
"param/w",
float(self.model_core.similarity_weight), self.iteration)
self.visualizer.add_scalar("param/b",
float(self.model_core.similarity_bias),
self.iteration)

View File

@ -87,7 +87,6 @@ Pretrained Models can be downloaded from links below. We provide 2 models with d
2. This model does not have a stop token predictor. It uses the attention peak position to decide whether all the content has been uttered. Guided attention loss is also used to speed up training. This model is trained with `configs/alternative.yaml`. [tacotron2_ljspeech_ckpt_0.3_alternative.zip](https://paddlespeech.bj.bcebos.com/Parakeet/tacotron2_ljspeech_ckpt_0.3_alternative.zip)
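A minimal sketch of the stopping rule described above, assuming the decoder exposes its per-step alignment; the helper name and shapes are illustrative, not the repository's API:
```python
import numpy as np

def should_stop(attention_weights, encoder_length):
    """attention_weights: [T_enc] alignment for the current decoder step.
    Stop decoding once the attention peak reaches the last encoder position."""
    return int(np.argmax(attention_weights)) >= encoder_length - 1

# toy usage: the peak has moved onto the last input symbol -> stop
print(should_stop(np.array([0.01, 0.04, 0.15, 0.80]), encoder_length=4))  # True
```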
## Notebook: End-to-end TTS
See [synthesize.ipynb](./synthesize.ipynb) for details about end-to-end TTS with tacotron2 and waveflow.

View File

@ -32,16 +32,14 @@ _C.data = CN(
_C.model = CN(
dict(
vocab_size=37, # set this according to the frontend's vocab size
n_tones=None,
reduction_factor=1, # reduction factor
d_encoder=512, # embedding & encoder's internal size
encoder_conv_layers=3, # number of conv layer in tacotron2 encoder
encoder_kernel_size=5, # kernel size of conv layers in tacotron2 encoder
d_prenet=256, # hidden size of decoder prenet
d_attention_rnn=
1024, # hidden size of the first rnn layer in tacotron2 decoder
d_decoder_rnn=
1024, # hidden size of the second rnn layer in tacotron2 decoder
d_attention_rnn=1024, # hidden size of the first rnn layer in tacotron2 decoder
d_decoder_rnn=1024, # hidden size of the second rnn layer in tacotron2 decoder
d_attention=128, # hidden size of decoder location linear layer
attention_filters=32, # number of filter in decoder location conv layer
attention_kernel_size=31, # kernel size of decoder location conv layer
@ -50,14 +48,11 @@ _C.model = CN(
postnet_conv_layers=5, # number of conv layer in decoder postnet
p_encoder_dropout=0.5, # dropout probability in encoder
p_prenet_dropout=0.5, # dropout probability in decoder prenet
p_attention_dropout=
0.1, # dropout probability of first rnn layer in decoder
p_decoder_dropout=
0.1, # dropout probability of second rnn layer in decoder
p_attention_dropout=0.1, # dropout probability of first rnn layer in decoder
p_decoder_dropout=0.1, # dropout probability of second rnn layer in decoder
p_postnet_dropout=0.5, # dropout probability in decoder postnet
d_global_condition=None,
use_stop_token=
True, # whether to use binary classifier to predict when to stop
use_stop_token=True, # whether to use binary classifier to predict when to stop
use_guided_attention_loss=False, # whether to use guided attention loss
guided_attention_loss_sigma=0.2 # sigma in guided attention loss
))

View File

@ -23,6 +23,7 @@ from parakeet.data.batch import batch_spec, batch_text_id
class LJSpeech(Dataset):
"""A simple dataset adaptor for the processed ljspeech dataset."""
def __init__(self, root):
self.root = Path(root).expanduser()
records = []
@ -44,9 +45,8 @@ class LJSpeech(Dataset):
class LJSpeechCollector(object):
"""A simple callable to batch LJSpeech examples."""
def __init__(self,
padding_idx=0,
padding_value=0.,
def __init__(self, padding_idx=0, padding_value=0.,
padding_stop_token=1.0):
self.padding_idx = padding_idx
self.padding_value = padding_value
@ -68,16 +68,19 @@ class LJSpeechCollector(object):
# Sort by text_len in descending order
texts = [
i for i, _ in sorted(
i
for i, _ in sorted(
zip(texts, text_lens), key=lambda x: x[1], reverse=True)
]
mels = [
i for i, _ in sorted(
i
for i, _ in sorted(
zip(mels, text_lens), key=lambda x: x[1], reverse=True)
]
mel_lens = [
i for i, _ in sorted(
i
for i, _ in sorted(
zip(mel_lens, text_lens), key=lambda x: x[1], reverse=True)
]

View File

@ -35,13 +35,14 @@ def create_dataset(config, source_path, target_path, verbose=False):
meta_data = LJSpeechMetaData(source_path)
frontend = EnglishCharacter()
processor = AudioProcessor(sample_rate=config.data.sample_rate,
n_fft=config.data.n_fft,
n_mels=config.data.n_mels,
win_length=config.data.win_length,
hop_length=config.data.hop_length,
fmax=config.data.fmax,
fmin=config.data.fmin)
processor = AudioProcessor(
sample_rate=config.data.sample_rate,
n_fft=config.data.n_fft,
n_mels=config.data.n_mels,
win_length=config.data.win_length,
hop_length=config.data.hop_length,
fmax=config.data.fmax,
fmin=config.data.fmin)
normalizer = LogMagnitude()
records = []
@ -70,26 +71,22 @@ def create_dataset(config, source_path, target_path, verbose=False):
if __name__ == "__main__":
parser = argparse.ArgumentParser(description="create dataset")
parser.add_argument("--config",
type=str,
metavar="FILE",
help="extra config to overwrite the default config")
parser.add_argument("--input",
type=str,
help="path of the ljspeech dataset")
parser.add_argument("--output",
type=str,
help="path to save output dataset")
parser.add_argument(
"--config",
type=str,
metavar="FILE",
help="extra config to overwrite the default config")
parser.add_argument(
"--input", type=str, help="path of the ljspeech dataset")
parser.add_argument(
"--output", type=str, help="path to save output dataset")
parser.add_argument(
"--opts",
nargs=argparse.REMAINDER,
help=
"options to overwrite --config file and the default config, passing in KEY VALUE pairs"
help="options to overwrite --config file and the default config, passing in KEY VALUE pairs"
)
parser.add_argument("-v",
"--verbose",
action="store_true",
help="print msg")
parser.add_argument(
"-v", "--verbose", action="store_true", help="print msg")
config = get_cfg_defaults()
args = parser.parse_args()

View File

@ -65,29 +65,24 @@ if __name__ == "__main__":
parser = argparse.ArgumentParser(
description="generate mel spectrogram with TransformerTTS.")
parser.add_argument("--config",
type=str,
metavar="FILE",
help="extra config to overwrite the default config")
parser.add_argument("--checkpoint_path",
type=str,
help="path of the checkpoint to load.")
parser.add_argument(
"--config",
type=str,
metavar="FILE",
help="extra config to overwrite the default config")
parser.add_argument(
"--checkpoint_path", type=str, help="path of the checkpoint to load.")
parser.add_argument("--input", type=str, help="path of the text sentences")
parser.add_argument("--output", type=str, help="path to save outputs")
parser.add_argument("--device",
type=str,
default="cpu",
help="device type to use.")
parser.add_argument(
"--device", type=str, default="cpu", help="device type to use.")
parser.add_argument(
"--opts",
nargs=argparse.REMAINDER,
help=
"options to overwrite --config file and the default config, passing in KEY VALUE pairs"
help="options to overwrite --config file and the default config, passing in KEY VALUE pairs"
)
parser.add_argument("-v",
"--verbose",
action="store_true",
help="print msg")
parser.add_argument(
"-v", "--verbose", action="store_true", help="print msg")
args = parser.parse_args()
if args.config:

View File

@ -98,9 +98,8 @@ class Experiment(ExperimentBase):
display.plot_spectrogram(mels[0].numpy().T), self.iteration)
self.visualizer.add_figure(
f"valid_sentence_{i}_predicted_spectrogram",
display.plot_spectrogram(
outputs['mel_outputs_postnet'][0].numpy().T),
self.iteration)
display.plot_spectrogram(outputs['mel_outputs_postnet'][0]
.numpy().T), self.iteration)
# write visual log
valid_losses = {k: np.mean(v) for k, v in valid_losses.items()}
@ -169,26 +168,27 @@ class Experiment(ExperimentBase):
batch_fn = LJSpeechCollector(padding_idx=config.data.padding_idx)
if not self.parallel:
self.train_loader = DataLoader(train_set,
batch_size=config.data.batch_size,
shuffle=True,
drop_last=True,
collate_fn=batch_fn)
self.train_loader = DataLoader(
train_set,
batch_size=config.data.batch_size,
shuffle=True,
drop_last=True,
collate_fn=batch_fn)
else:
sampler = DistributedBatchSampler(
train_set,
batch_size=config.data.batch_size,
shuffle=True,
drop_last=True)
self.train_loader = DataLoader(train_set,
batch_sampler=sampler,
collate_fn=batch_fn)
self.train_loader = DataLoader(
train_set, batch_sampler=sampler, collate_fn=batch_fn)
self.valid_loader = DataLoader(valid_set,
batch_size=config.data.batch_size,
shuffle=False,
drop_last=False,
collate_fn=batch_fn)
self.valid_loader = DataLoader(
valid_set,
batch_size=config.data.batch_size,
shuffle=False,
drop_last=False,
collate_fn=batch_fn)
def main_sp(config, args):

View File

@ -80,7 +80,7 @@ `input` is the folder containing the processed audio, and `output` is where the output spectrograms are saved
Run the script to train.
```python
python train.py --data=<data> --output=<output> --device="gpu"
```
Our model removes the stop token prediction used in tacotron2. In practice, stop token prediction is a severely imbalanced classification problem: each sentence may contain hundreds of frames as negative examples and only a single frame as a positive example, and the prediction is very sensitive to how silence is trimmed from the audio. Instead, decoding stops once the attention peak reaches the last symbol on the encoder side.
@ -90,7 +90,7 @@ python train.py --data=<data> --output=<output> --device="gpu"
You can use visualdl to view the training logs.
```bash
visualdl --logdir=<output> --host=$HOSTNAME
```
Example training loss / validation loss curves are shown below.
@ -109,4 +109,4 @@ visualdl --logdir=<output> --host=$HOSTNAME
## Usage
This experiment includes a simple usage example: replace the reference audio and the text, then synthesize speech with the trained model. See the usage instructions in the [notebook](./voice_cloning.ipynb).

View File

@ -1,3 +1,17 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import pickle
from pathlib import Path
@ -16,6 +30,7 @@ print("vocab_tones:\n", voc_tones)
class AiShell3(Dataset):
"""Processed AiShell3 dataset."""
def __init__(self, root):
super().__init__()
self.root = Path(root).expanduser()
@ -31,10 +46,10 @@ class AiShell3(Dataset):
speaker_id = sentence_id[:7]
phones = metadatum["phones"]
tones = metadatum["tones"]
phones = np.array([voc_phones.lookup(item) for item in phones],
dtype=np.int64)
tones = np.array([voc_tones.lookup(item) for item in tones],
dtype=np.int64)
phones = np.array(
[voc_phones.lookup(item) for item in phones], dtype=np.int64)
tones = np.array(
[voc_tones.lookup(item) for item in tones], dtype=np.int64)
mel = np.load(str(self.mel_dir / speaker_id / (sentence_id + ".npy")))
embed = np.load(
str(self.embed_dir / speaker_id / (sentence_id + ".npy")))
@ -50,8 +65,8 @@ def collate_aishell3_examples(examples):
text_lengths = np.array([item.shape[0] for item in phones], dtype=np.int64)
spec_lengths = np.array([item.shape[1] for item in mel], dtype=np.int64)
T_dec = np.max(spec_lengths)
stop_tokens = (np.arange(T_dec) >= np.expand_dims(spec_lengths,
-1)).astype(np.float32)
stop_tokens = (np.arange(T_dec) >= np.expand_dims(spec_lengths, -1)
).astype(np.float32)
phones, _ = batch_text_id(phones)
tones, _ = batch_text_id(tones)
mel, _ = batch_spec(mel)
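For reference, a small worked example of the stop-token mask built in collate_aishell3_examples above, with illustrative spectrogram lengths:
```python
import numpy as np

# illustrative lengths: two utterances with 3 and 5 decoder frames
spec_lengths = np.array([3, 5], dtype=np.int64)
T_dec = np.max(spec_lengths)
stop_tokens = (np.arange(T_dec) >= np.expand_dims(spec_lengths, -1)).astype(np.float32)
print(stop_tokens)
# [[0. 0. 0. 1. 1.]
#  [0. 0. 0. 0. 0.]]
```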

View File

@ -1,3 +1,17 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import List, Tuple
from chinese_text_to_pinyin import convert_to_pinyin

View File

@ -1,3 +1,16 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
A pinyin-to-phone transcription system for Chinese.
Syllables are split into an initial and a final. 'er' is also treated as a special symbol.
@ -96,9 +109,8 @@ def convert(syllable):
syllable = syllable.replace("ing", "ieng").replace("in", "ien")
# expansion for un, ui, iu
syllable = syllable.replace("un",
"uen").replace("ui",
"uei").replace("iu", "iou")
syllable = syllable.replace("un", "uen").replace(
"ui", "uei").replace("iu", "iou")
# rule for variants of i
syllable = syllable.replace("zi", "zii").replace("ci", "cii").replace("si", "sii")\

View File

@ -1,3 +1,17 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import List
from pypinyin import lazy_pinyin, Style
@ -7,7 +21,6 @@ def convert_to_pinyin(text: str) -> List[str]:
"""convert text into list of syllables, other characters that are not chinese, thus
cannot be converted to pinyin are splited.
"""
syllables = lazy_pinyin(text,
style=Style.TONE3,
neutral_tone_with_five=True)
syllables = lazy_pinyin(
text, style=Style.TONE3, neutral_tone_with_five=True)
return syllables
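A quick usage illustration of the `lazy_pinyin` call above (assuming pypinyin is installed; the input string is arbitrary):
```python
from pypinyin import lazy_pinyin, Style

# tone numbers are appended to each syllable with Style.TONE3
print(lazy_pinyin("你好", style=Style.TONE3, neutral_tone_with_five=True))
# ['ni3', 'hao3']
```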

View File

@ -62,8 +62,7 @@ _C.model = CN(
# whether to use a classifier to predict stop probability
use_stop_token=False,
# whether to use guided attention loss in training
use_guided_attention_loss=True,
))
use_guided_attention_loss=True, ))
_C.training = CN(
dict(

View File

@ -1,3 +1,17 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import multiprocessing as mp
from functools import partial
@ -12,8 +26,11 @@ import tqdm
from config import get_cfg_defaults
def extract_mel(fname: Path, input_dir: Path, output_dir: Path,
p: AudioProcessor, n: NormalizerBase):
def extract_mel(fname: Path,
input_dir: Path,
output_dir: Path,
p: AudioProcessor,
n: NormalizerBase):
relative_path = fname.relative_to(input_dir)
out_path = (output_dir / relative_path).with_suffix(".npy")
out_path.parent.mkdir(parents=True, exist_ok=True)
@ -34,41 +51,37 @@ def extract_mel_multispeaker(config, input_dir, output_dir, extension=".wav"):
config.fmax)
n = LogMagnitude(1e-5)
func = partial(extract_mel,
input_dir=input_dir,
output_dir=output_dir,
p=p,
n=n)
func = partial(
extract_mel, input_dir=input_dir, output_dir=output_dir, p=p, n=n)
with mp.Pool(16) as pool:
list(
tqdm.tqdm(pool.imap(func, fnames),
total=len(fnames),
unit="utterance"))
tqdm.tqdm(
pool.imap(func, fnames), total=len(fnames), unit="utterance"))
if __name__ == "__main__":
parser = argparse.ArgumentParser(
description=
"Extract mel spectrogram from processed wav in AiShell3 training dataset."
description="Extract mel spectrogram from processed wav in AiShell3 training dataset."
)
parser.add_argument(
"--config",
type=str,
help="yaml config file to overwrite the default config")
parser.add_argument("--input",
type=str,
default="~/datasets/aishell3/train/normalized_wav",
help="path of the processed wav folder")
parser.add_argument("--output",
type=str,
default="~/datasets/aishell3/train/mel",
help="path of the folder to save mel spectrograms")
parser.add_argument(
"--input",
type=str,
default="~/datasets/aishell3/train/normalized_wav",
help="path of the processed wav folder")
parser.add_argument(
"--output",
type=str,
default="~/datasets/aishell3/train/mel",
help="path of the folder to save mel spectrograms")
parser.add_argument(
"--opts",
nargs=argparse.REMAINDER,
help=
"options to overwrite --config file and the default config, passing in KEY VALUE pairs"
help="options to overwrite --config file and the default config, passing in KEY VALUE pairs"
)
default_config = get_cfg_defaults()

View File

@ -1,3 +1,17 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
from pathlib import Path
import re
@ -107,9 +121,8 @@ def convert(syllable):
syllable = syllable.replace("ing", "ieng").replace("in", "ien")
# expansion for un, ui, iu
syllable = syllable.replace("un",
"uen").replace("ui",
"uei").replace("iu", "iou")
syllable = syllable.replace("un", "uen").replace(
"ui", "uei").replace("iu", "iou")
# rule for variants of i
syllable = syllable.replace("zi", "zii").replace("ci", "cii").replace("si", "sii")\
@ -218,18 +231,15 @@ def process_aishell3(dataset_root, output_dir):
pickle.dump(processed_records, f)
with open(output_dir / "metadata.yaml", 'wt', encoding="utf-8") as f:
yaml.safe_dump(processed_records,
f,
default_flow_style=None,
allow_unicode=True)
yaml.safe_dump(
processed_records, f, default_flow_style=None, allow_unicode=True)
print("metadata done!")
if __name__ == "__main__":
parser = argparse.ArgumentParser(
description=
"Preprocess transcription of AiShell3 and save them in a compact file(yaml and pickle)."
description="Preprocess transcription of AiShell3 and save them in a compact file(yaml and pickle)."
)
parser.add_argument(
"--input",

View File

@ -1,3 +1,17 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
from pathlib import Path
from multiprocessing import Pool
@ -47,34 +61,36 @@ def preprocess_aishell3(source_dir, target_dir, alignment_dir):
wav_paths = list(source_dir.rglob("*.wav"))
print(f"there are {len(wav_paths)} audio files in total")
fx = partial(process_utterance,
source_dir=source_dir,
target_dir=target_dir,
alignment_dir=alignment_dir)
fx = partial(
process_utterance,
source_dir=source_dir,
target_dir=target_dir,
alignment_dir=alignment_dir)
with Pool(16) as p:
list(
tqdm(p.imap(fx, wav_paths), total=len(wav_paths),
unit="utterance"))
tqdm(
p.imap(fx, wav_paths), total=len(wav_paths), unit="utterance"))
if __name__ == "__main__":
parser = argparse.ArgumentParser(
description=
"Process audio in AiShell3, trim silence according to the alignment "
description="Process audio in AiShell3, trim silence according to the alignment "
"files generated by MFA, and normalize volume by peak.")
parser.add_argument("--input",
type=str,
default="~/datasets/aishell3/train/wav",
help="path of the original audio folder in aishell3.")
parser.add_argument(
"--input",
type=str,
default="~/datasets/aishell3/train/wav",
help="path of the original audio folder in aishell3.")
parser.add_argument(
"--output",
type=str,
default="~/datasets/aishell3/train/normalized_wav",
help="path of the folder to save the processed audio files.")
parser.add_argument("--alignment",
type=str,
default="~/datasets/aishell3/train/alignment",
help="path of the alignment files.")
parser.add_argument(
"--alignment",
type=str,
default="~/datasets/aishell3/train/alignment",
help="path of the alignment files.")
args = parser.parse_args()
preprocess_aishell3(args.input, args.output, args.alignment)

View File

@ -53,12 +53,13 @@ class Experiment(ExperimentBase):
self.optimizer.clear_grad()
self.model.train()
texts, tones, mels, utterance_embeds, text_lens, output_lens, stop_tokens = batch
outputs = self.model(texts,
text_lens,
mels,
output_lens,
tones=tones,
global_condition=utterance_embeds)
outputs = self.model(
texts,
text_lens,
mels,
output_lens,
tones=tones,
global_condition=utterance_embeds)
losses = self.compute_losses(batch, outputs)
loss = losses["loss"]
loss.backward()
@ -86,12 +87,13 @@ class Experiment(ExperimentBase):
valid_losses = defaultdict(list)
for i, batch in enumerate(self.valid_loader):
texts, tones, mels, utterance_embeds, text_lens, output_lens, stop_tokens = batch
outputs = self.model(texts,
text_lens,
mels,
output_lens,
tones=tones,
global_condition=utterance_embeds)
outputs = self.model(
texts,
text_lens,
mels,
output_lens,
tones=tones,
global_condition=utterance_embeds)
losses = self.compute_losses(batch, outputs)
for key, value in losses.items():
valid_losses[key].append(float(value))
@ -132,9 +134,8 @@ class Experiment(ExperimentBase):
mel_dir.mkdir(parents=True, exist_ok=True)
for i, batch in enumerate(self.test_loader):
texts, tones, mels, utterance_embeds, *_ = batch
outputs = self.model.infer(texts,
tones=tones,
global_condition=utterance_embeds)
outputs = self.model.infer(
texts, tones=tones, global_condition=utterance_embeds)
display.plot_alignment(outputs["alignments"][0].numpy().T)
plt.savefig(mel_dir / f"sentence_{i}.png")
@ -168,8 +169,7 @@ class Experiment(ExperimentBase):
p_decoder_dropout=config.model.p_decoder_dropout,
p_postnet_dropout=config.model.p_postnet_dropout,
d_global_condition=config.model.d_global_condition,
use_stop_token=config.model.use_stop_token,
)
use_stop_token=config.model.use_stop_token, )
if self.parallel:
model = paddle.DataParallel(model)
@ -200,32 +200,34 @@ class Experiment(ExperimentBase):
batch_fn = collate_aishell3_examples
if not self.parallel:
self.train_loader = DataLoader(train_set,
batch_size=config.data.batch_size,
shuffle=True,
drop_last=True,
collate_fn=batch_fn)
self.train_loader = DataLoader(
train_set,
batch_size=config.data.batch_size,
shuffle=True,
drop_last=True,
collate_fn=batch_fn)
else:
sampler = DistributedBatchSampler(
train_set,
batch_size=config.data.batch_size,
shuffle=True,
drop_last=True)
self.train_loader = DataLoader(train_set,
batch_sampler=sampler,
collate_fn=batch_fn)
self.train_loader = DataLoader(
train_set, batch_sampler=sampler, collate_fn=batch_fn)
self.valid_loader = DataLoader(valid_set,
batch_size=config.data.batch_size,
shuffle=False,
drop_last=False,
collate_fn=batch_fn)
self.valid_loader = DataLoader(
valid_set,
batch_size=config.data.batch_size,
shuffle=False,
drop_last=False,
collate_fn=batch_fn)
self.test_loader = DataLoader(valid_set,
batch_size=1,
shuffle=False,
drop_last=False,
collate_fn=batch_fn)
self.test_loader = DataLoader(
valid_set,
batch_size=1,
shuffle=False,
drop_last=False,
collate_fn=batch_fn)
def main_sp(config, args):

View File

@ -14,7 +14,7 @@ wget https://data.keithito.com/data/speech/LJSpeech-1.1.tar.bz2
tar xjvf LJSpeech-1.1.tar.bz2
```
### Preprocess the dataset.
Assume the path to save the preprocessed dataset is `ljspeech_transformer_tts`. Run the command below to preprocess the dataset.
@ -49,4 +49,4 @@ python synthesize.py --input=sentence.txt --output=mels/ --checkpoint_path='step
## Pretrained Model
Pretrained model can be downloaded here. [transformer_tts_ljspeech_ckpt_0.3.zip](https://paddlespeech.bj.bcebos.com/Parakeet/transformer_tts_ljspeech_ckpt_0.3.zip).

View File

@ -23,6 +23,7 @@ from parakeet.data.batch import batch_spec, batch_text_id
class LJSpeech(Dataset):
"""A simple dataset adaptor for the processed ljspeech dataset."""
def __init__(self, root):
self.root = Path(root).expanduser()
records = []
@ -64,6 +65,7 @@ class Transform(object):
class LJSpeechCollector(object):
"""A simple callable to batch LJSpeech examples."""
def __init__(self, padding_idx=0, padding_value=0.):
self.padding_idx = padding_idx
self.padding_value = padding_value

View File

@ -35,13 +35,14 @@ def create_dataset(config, source_path, target_path, verbose=False):
meta_data = LJSpeechMetaData(source_path)
frontend = English()
processor = AudioProcessor(sample_rate=config.data.sample_rate,
n_fft=config.data.n_fft,
n_mels=config.data.d_mel,
win_length=config.data.win_length,
hop_length=config.data.hop_length,
fmax=config.data.fmax,
fmin=config.data.fmin)
processor = AudioProcessor(
sample_rate=config.data.sample_rate,
n_fft=config.data.n_fft,
n_mels=config.data.d_mel,
win_length=config.data.win_length,
hop_length=config.data.hop_length,
fmax=config.data.fmax,
fmin=config.data.fmin)
normalizer = LogMagnitude()
records = []
@ -80,26 +81,22 @@ def create_dataset(config, source_path, target_path, verbose=False):
if __name__ == "__main__":
parser = argparse.ArgumentParser(description="create dataset")
parser.add_argument("--config",
type=str,
metavar="FILE",
help="extra config to overwrite the default config")
parser.add_argument("--input",
type=str,
help="path of the ljspeech dataset")
parser.add_argument("--output",
type=str,
help="path to save output dataset")
parser.add_argument(
"--config",
type=str,
metavar="FILE",
help="extra config to overwrite the default config")
parser.add_argument(
"--input", type=str, help="path of the ljspeech dataset")
parser.add_argument(
"--output", type=str, help="path to save output dataset")
parser.add_argument(
"--opts",
nargs=argparse.REMAINDER,
help=
"options to overwrite --config file and the default config, passing in KEY VALUE pairs"
help="options to overwrite --config file and the default config, passing in KEY VALUE pairs"
)
parser.add_argument("-v",
"--verbose",
action="store_true",
help="print msg")
parser.add_argument(
"-v", "--verbose", action="store_true", help="print msg")
config = get_cfg_defaults()
args = parser.parse_args()

View File

@ -73,29 +73,24 @@ if __name__ == "__main__":
parser = argparse.ArgumentParser(
description="generate mel spectrogram with TransformerTTS.")
parser.add_argument("--config",
type=str,
metavar="FILE",
help="extra config to overwrite the default config")
parser.add_argument("--checkpoint_path",
type=str,
help="path of the checkpoint to load.")
parser.add_argument(
"--config",
type=str,
metavar="FILE",
help="extra config to overwrite the default config")
parser.add_argument(
"--checkpoint_path", type=str, help="path of the checkpoint to load.")
parser.add_argument("--input", type=str, help="path of the text sentences")
parser.add_argument("--output", type=str, help="path to save outputs")
parser.add_argument("--device",
type=str,
default="cpu",
help="device type to use.")
parser.add_argument(
"--device", type=str, default="cpu", help="device type to use.")
parser.add_argument(
"--opts",
nargs=argparse.REMAINDER,
help=
"options to overwrite --config file and the default config, passing in KEY VALUE pairs"
help="options to overwrite --config file and the default config, passing in KEY VALUE pairs"
)
parser.add_argument("-v",
"--verbose",
action="store_true",
help="print msg")
parser.add_argument(
"-v", "--verbose", action="store_true", help="print msg")
args = parser.parse_args()
if args.config:

View File

@ -53,11 +53,12 @@ class TransformerTTSExperiment(ExperimentBase):
dropout=config.model.dropout)
if self.parallel:
model = paddle.DataParallel(model)
optimizer = paddle.optimizer.Adam(learning_rate=config.training.lr,
beta1=0.9,
beta2=0.98,
epsilon=1e-9,
parameters=model.parameters())
optimizer = paddle.optimizer.Adam(
learning_rate=config.training.lr,
beta1=0.9,
beta2=0.98,
epsilon=1e-9,
parameters=model.parameters())
criterion = TransformerTTSLoss(config.model.stop_loss_scale)
drop_n_heads = scheduler.StepWise(config.training.drop_n_heads)
reduction_factor = scheduler.StepWise(config.training.reduction_factor)
@ -82,11 +83,12 @@ class TransformerTTSExperiment(ExperimentBase):
batch_fn = LJSpeechCollector(padding_idx=config.data.padding_idx)
if not self.parallel:
train_loader = DataLoader(train_set,
batch_size=config.data.batch_size,
shuffle=True,
drop_last=True,
collate_fn=batch_fn)
train_loader = DataLoader(
train_set,
batch_size=config.data.batch_size,
shuffle=True,
drop_last=True,
collate_fn=batch_fn)
else:
sampler = DistributedBatchSampler(
train_set,
@ -95,21 +97,20 @@ class TransformerTTSExperiment(ExperimentBase):
rank=dist.get_rank(),
shuffle=True,
drop_last=True)
train_loader = DataLoader(train_set,
batch_sampler=sampler,
collate_fn=batch_fn)
train_loader = DataLoader(
train_set, batch_sampler=sampler, collate_fn=batch_fn)
valid_loader = DataLoader(valid_set,
batch_size=config.data.batch_size,
collate_fn=batch_fn)
valid_loader = DataLoader(
valid_set, batch_size=config.data.batch_size, collate_fn=batch_fn)
self.train_loader = train_loader
self.valid_loader = valid_loader
def compute_outputs(self, text, mel):
model_core = self.model._layers if self.parallel else self.model
model_core.set_constants(self.reduction_factor(self.iteration),
self.drop_n_heads(self.iteration))
model_core.set_constants(
self.reduction_factor(self.iteration),
self.drop_n_heads(self.iteration))
mel_input = mel[:, :-1, :]
reduced_mel_input = mel_input[:, ::model_core.r, :]
@ -126,10 +127,9 @@ class TransformerTTSExperiment(ExperimentBase):
stop_logits = outputs["stop_logits"]
time_steps = mel_target.shape[1]
losses = self.criterion(mel_output[:, :time_steps, :],
mel_intermediate[:, :time_steps, :],
mel_target, stop_logits[:, :time_steps, :],
stop_label_target)
losses = self.criterion(
mel_output[:, :time_steps, :], mel_intermediate[:, :time_steps, :],
mel_target, stop_logits[:, :time_steps, :], stop_label_target)
return losses
def train_batch(self):

View File

@ -14,7 +14,7 @@ wget https://data.keithito.com/data/speech/LJSpeech-1.1.tar.bz2
tar xjvf LJSpeech-1.1.tar.bz2
```
### Preprocess the dataset.
Assume the path to save the preprocessed dataset is `ljspeech_waveflow`. Run the command below to preprocess the dataset.
@ -49,4 +49,4 @@ python synthesize.py --input=mels/ --output=wavs/ --checkpoint_path='step-200000
## Pretrained Model
A pretrained model with 128 residual channels can be downloaded here. [waveflow_ljspeech_ckpt_0.3.zip](https://paddlespeech.bj.bcebos.com/Parakeet/waveflow_ljspeech_ckpt_0.3.zip).

View File

@ -23,12 +23,14 @@ from parakeet.data.batch import batch_spec, batch_wav
class LJSpeech(Dataset):
"""A simple dataset adaptor for the processed ljspeech dataset."""
def __init__(self, root):
self.root = Path(root).expanduser()
meta_data = pandas.read_csv(str(self.root / "metadata.csv"),
sep="\t",
header=None,
names=["fname", "frames", "samples"])
meta_data = pandas.read_csv(
str(self.root / "metadata.csv"),
sep="\t",
header=None,
names=["fname", "frames", "samples"])
records = []
for row in meta_data.itertuples():
@ -49,6 +51,7 @@ class LJSpeech(Dataset):
class LJSpeechCollector(object):
"""A simple callable to batch LJSpeech examples."""
def __init__(self, padding_value=0.):
self.padding_value = padding_value

View File

@ -70,11 +70,12 @@ class Transform(object):
# Compute mel-spectrogram.
# Turn center to False to prevent internal padding.
spectrogram = librosa.core.stft(wav,
hop_length=hop_length,
win_length=win_length,
n_fft=n_fft,
center=False)
spectrogram = librosa.core.stft(
wav,
hop_length=hop_length,
win_length=win_length,
n_fft=n_fft,
center=False)
spectrogram_magnitude = np.abs(spectrogram)
# Compute mel-spectrograms.
@ -123,10 +124,8 @@ def create_dataset(config, input_dir, output_dir):
file_names.append((base_name, mel.shape[-1], audio.shape[-1]))
meta_data = pd.DataFrame.from_records(file_names)
meta_data.to_csv(str(output_dir / "metadata.csv"),
sep="\t",
index=None,
header=None)
meta_data.to_csv(
str(output_dir / "metadata.csv"), sep="\t", index=None, header=None)
print("saved meta data in to {}".format(
os.path.join(output_dir, "metadata.csv")))
@ -135,26 +134,22 @@ def create_dataset(config, input_dir, output_dir):
if __name__ == "__main__":
parser = argparse.ArgumentParser(description="create dataset")
parser.add_argument("--config",
type=str,
metavar="FILE",
help="extra config to overwrite the default config")
parser.add_argument("--input",
type=str,
help="path of the ljspeech dataset")
parser.add_argument("--output",
type=str,
help="path to save output dataset")
parser.add_argument(
"--config",
type=str,
metavar="FILE",
help="extra config to overwrite the default config")
parser.add_argument(
"--input", type=str, help="path of the ljspeech dataset")
parser.add_argument(
"--output", type=str, help="path to save output dataset")
parser.add_argument(
"--opts",
nargs=argparse.REMAINDER,
help=
"options to overwrite --config file and the default config, passing in KEY VALUE pairs"
help="options to overwrite --config file and the default config, passing in KEY VALUE pairs"
)
parser.add_argument("-v",
"--verbose",
action="store_true",
help="print msg")
parser.add_argument(
"-v", "--verbose", action="store_true", help="print msg")
config = get_cfg_defaults()
args = parser.parse_args()

View File

@ -39,8 +39,8 @@ def main(config, args):
mel = np.load(str(file_path))
with paddle.amp.auto_cast():
audio = model.predict(mel)
audio_path = output_dir / (os.path.splitext(file_path.name)[0] +
".wav")
audio_path = output_dir / (
os.path.splitext(file_path.name)[0] + ".wav")
sf.write(audio_path, audio, config.data.sample_rate)
print("[synthesize] {} -> {}".format(file_path, audio_path))
@ -50,32 +50,27 @@ if __name__ == "__main__":
parser = argparse.ArgumentParser(
description="generate mel spectrogram with TransformerTTS.")
parser.add_argument("--config",
type=str,
metavar="FILE",
help="extra config to overwrite the default config")
parser.add_argument("--checkpoint_path",
type=str,
help="path of the checkpoint to load.")
parser.add_argument(
"--config",
type=str,
metavar="FILE",
help="extra config to overwrite the default config")
parser.add_argument(
"--checkpoint_path", type=str, help="path of the checkpoint to load.")
parser.add_argument(
"--input",
type=str,
help="path of directory containing mel spectrogram (in .npy format)")
parser.add_argument("--output", type=str, help="path to save outputs")
parser.add_argument("--device",
type=str,
default="cpu",
help="device type to use.")
parser.add_argument(
"--device", type=str, default="cpu", help="device type to use.")
parser.add_argument(
"--opts",
nargs=argparse.REMAINDER,
help=
"options to overwrite --config file and the default config, passing in KEY VALUE pairs"
help="options to overwrite --config file and the default config, passing in KEY VALUE pairs"
)
parser.add_argument("-v",
"--verbose",
action="store_true",
help="print msg")
parser.add_argument(
"-v", "--verbose", action="store_true", help="print msg")
args = parser.parse_args()
if args.config:

View File

@ -43,8 +43,8 @@ class Experiment(ExperimentBase):
if self.parallel:
model = paddle.DataParallel(model)
optimizer = paddle.optimizer.Adam(config.training.lr,
parameters=model.parameters())
optimizer = paddle.optimizer.Adam(
config.training.lr, parameters=model.parameters())
criterion = WaveFlowLoss(sigma=config.model.sigma)
self.model = model
@ -63,11 +63,12 @@ class Experiment(ExperimentBase):
config.data.hop_length)
if not self.parallel:
train_loader = DataLoader(train_set,
batch_size=config.data.batch_size,
shuffle=True,
drop_last=True,
collate_fn=batch_fn)
train_loader = DataLoader(
train_set,
batch_size=config.data.batch_size,
shuffle=True,
drop_last=True,
collate_fn=batch_fn)
else:
sampler = DistributedBatchSampler(
train_set,
@ -76,14 +77,12 @@ class Experiment(ExperimentBase):
rank=dist.get_rank(),
shuffle=True,
drop_last=True)
train_loader = DataLoader(train_set,
batch_sampler=sampler,
collate_fn=batch_fn)
train_loader = DataLoader(
train_set, batch_sampler=sampler, collate_fn=batch_fn)
valid_batch_fn = LJSpeechCollector()
valid_loader = DataLoader(valid_set,
batch_size=1,
collate_fn=valid_batch_fn)
valid_loader = DataLoader(
valid_set, batch_size=1, collate_fn=valid_batch_fn)
self.train_loader = train_loader
self.valid_loader = valid_loader

View File

@ -25,9 +25,9 @@ class AudioProcessor(object):
n_fft: int,
win_length: int,
hop_length: int,
n_mels: int = 80,
fmin: int = 0,
fmax: int = None,
n_mels: int=80,
fmin: int=0,
fmax: int=None,
window="hann",
center=True,
pad_mode="reflect",
@ -73,21 +73,23 @@ class AudioProcessor(object):
sf.write(path, wav, samplerate=self.sample_rate)
def stft(self, wav):
D = librosa.core.stft(wav,
n_fft=self.n_fft,
hop_length=self.hop_length,
win_length=self.win_length,
window=self.window,
center=self.center,
pad_mode=self.pad_mode)
D = librosa.core.stft(
wav,
n_fft=self.n_fft,
hop_length=self.hop_length,
win_length=self.win_length,
window=self.window,
center=self.center,
pad_mode=self.pad_mode)
return D
def istft(self, D):
wav = librosa.core.istft(D,
hop_length=self.hop_length,
win_length=self.win_length,
window=self.window,
center=self.center)
wav = librosa.core.istft(
D,
hop_length=self.hop_length,
win_length=self.win_length,
window=self.window,
center=self.center)
return wav
def spectrogram(self, wav):

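The `stft`/`istft` pair reformatted above is a thin wrapper around librosa. A minimal round-trip sketch, with illustrative parameter values rather than values taken from any Parakeet config, would look like this:

```python
import numpy as np
import librosa

sr = 22050
# a 1-second sine wave standing in for real speech
wav = np.sin(2 * np.pi * 220 * np.arange(sr) / sr).astype(np.float32)

# forward STFT with the same keyword arguments AudioProcessor forwards to librosa
D = librosa.stft(wav, n_fft=1024, hop_length=256, win_length=1024,
                 window="hann", center=True, pad_mode="reflect")
# inverse STFT reconstructs the waveform up to edge effects
recon = librosa.istft(D, hop_length=256, win_length=1024, window="hann", center=True)
print(np.abs(wav[:len(recon)] - recon).max())  # close to 0
```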
View File

@@ -11,7 +11,6 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
This module contains normalizers for spectrogram magnitude.
Normalizers are invertible transformations. They can be used to process

View File

@@ -1,3 +1,16 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Parakeet's infrastructure for data processing.
"""
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.

View File

@@ -26,7 +26,11 @@ class AudioSegmentDataset(Dataset):
"""A simple dataset adaptor for audio files to train vocoders.
Read -> trim silence -> normalize -> extract a segment
"""
def __init__(self, file_paths: List[Path], sample_rate: int, length: int,
def __init__(self,
file_paths: List[Path],
sample_rate: int,
length: int,
top_db: float):
self.file_paths = file_paths
self.sr = sample_rate
@@ -56,10 +60,11 @@ class AudioDataset(Dataset):
"""A simple dataset adaptor for the audio files.
Read -> trim silence -> normalize
"""
def __init__(self,
file_paths: List[Path],
sample_rate: int,
top_db: float = 60):
top_db: float=60):
self.file_paths = file_paths
self.sr = sample_rate
self.top_db = top_db
@@ -78,12 +83,11 @@ class AudioDataset(Dataset):
class AudioFolderDataset(AudioDataset):
def __init__(
self,
root,
sample_rate,
top_db=60,
extension=".wav",
):
self,
root,
sample_rate,
top_db=60,
extension=".wav", ):
root = Path(root).expanduser()
file_paths = sorted(list(root.rglob("*{}".format(extension))))
super().__init__(file_paths, sample_rate, top_db)

View File

@@ -1,3 +1,17 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from parakeet.frontend.phonectic import Phonetics
"""
A phonology system with ARPABET symbols and limited punctuations. The G2P
@@ -10,7 +24,6 @@ from parakeet.frontend.vocab import Vocab
from g2p_en import G2p
class ARPABET(Phonetics):
"""A phonology for English that uses ARPABET as the phoneme vocabulary.
See http://www.speech.cs.cmu.edu/cgi-bin/cmudict for more details.
@@ -132,7 +145,9 @@ class ARPABET(Phonetics):
List[str]
The list of pronunciation sequence.
"""
phonemes = [self._remove_vowels(item) for item in self.backend(sentence)]
phonemes = [
self._remove_vowels(item) for item in self.backend(sentence)
]
if add_start_end:
start = self.vocab.start_symbol
end = self.vocab.end_symbol
@@ -184,7 +199,9 @@ class ARPABET(Phonetics):
List[str]
The list of pronunciation id sequence.
"""
return self.numericalize(self.phoneticize(sentence, add_start_end=add_start_end))
return self.numericalize(
self.phoneticize(
sentence, add_start_end=add_start_end))
@property
def vocab_size(self):
@@ -206,7 +223,7 @@ class ARPABETWithStress(Phonetics):
]
punctuations = [',', '.', '?', '!']
symbols = phonemes + punctuations
def __init__(self):
self.backend = G2p()
self.vocab = Vocab(self.phonemes + self.punctuations)
@@ -276,11 +293,13 @@ class ARPABETWithStress(Phonetics):
List[str]
The list of pronunciation id sequence.
"""
return self.numericalize(self.phoneticize(sentence, add_start_end=add_start_end))
return self.numericalize(
self.phoneticize(
sentence, add_start_end=add_start_end))
@property
def vocab_size(self):
""" Vocab size.
"""
# 77 = 69 phones + 4 punctuations + 4 special tokens
return len(self.vocab)
return len(self.vocab)

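For reference, the `g2p_en` backend that ARPABET wraps is simply callable on raw text. A small standalone sketch (the printed output is approximate):

```python
from g2p_en import G2p

g2p = G2p()
phones = g2p("How are you?")
# roughly: ['HH', 'AW1', ' ', 'AA1', 'R', ' ', 'Y', 'UW1', ' ', '?']
print(phones)
```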
View File

@@ -11,4 +11,3 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

View File

@@ -11,4 +11,3 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

View File

@@ -12,6 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
def full2half_width(ustr):
half = []
for u in ustr:

View File

@@ -1,3 +1,16 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
A Simple Chinese Phonology using pinyin symbols.
The G2P conversion converts pinyin string to symbols. Also it can handle string
@@ -32,6 +45,7 @@ _tones = ['0', '1', '2', '3', '4', '5']
_toned_finals = [final + tone for final, tone in product(_finals, _tones[1:])]
_toned_phonems = _initials + _toned_finals + _ernized_symbol + _punctuations
class ParakeetConverter(NeutralToneWith5Mixin, DefaultConverter):
pass
@@ -41,7 +55,7 @@ class ParakeetPinyin(Phonetics):
self.vocab_phonemes = Vocab(_phones)
self.vocab_tones = Vocab(_tones)
self.pinyin_backend = Pinyin(ParakeetConverter())
def convert_pypinyin_tone3(self, syllables, add_start_end=False):
phonemes, tones = _convert_to_parakeet_style_pinyin(syllables)
@@ -58,8 +72,7 @@ class ParakeetPinyin(Phonetics):
item for item in phonemes if item in self.vocab_phonemes.stoi
]
tones = [item for item in tones if item in self.vocab_tones.stoi]
return phonemes, tones
return phonemes, tones
def phoneticize(self, sentence, add_start_end=False):
""" Normalize the input text sequence and convert it into pronunciation sequence.
@@ -74,10 +87,10 @@ class ParakeetPinyin(Phonetics):
List[str]
The list of pronunciation sequence.
"""
syllables = self.pinyin_backend.lazy_pinyin(sentence,
style=Style.TONE3,
strict=True)
phonemes, tones = self.convert_pypinyin_tone3(syllables, add_start_end=add_start_end)
syllables = self.pinyin_backend.lazy_pinyin(
sentence, style=Style.TONE3, strict=True)
phonemes, tones = self.convert_pypinyin_tone3(
syllables, add_start_end=add_start_end)
return phonemes, tones
def numericalize(self, phonemes, tones):
@@ -110,8 +123,8 @@ class ParakeetPinyin(Phonetics):
List[str]
The list of pronunciation id sequence.
"""
phonemes, tones = self.phoneticize(sentence,
add_start_end=add_start_end)
phonemes, tones = self.phoneticize(
sentence, add_start_end=add_start_end)
phoneme_ids, tone_ids = self.numericalize(phonemes, tones)
return phoneme_ids, tone_ids
@@ -128,12 +141,11 @@ class ParakeetPinyin(Phonetics):
return len(self.vocab_tones)
class ParakeetPinyinWithTone(Phonetics):
def __init__(self):
self.vocab = Vocab(_toned_phonems)
self.pinyin_backend = Pinyin(ParakeetConverter())
def convert_pypinyin_tone3(self, syllables, add_start_end=False):
phonemes = _convert_to_parakeet_style_pinyin_with_tone(syllables)
@@ -142,11 +154,9 @@ class ParakeetPinyinWithTone(Phonetics):
end = self.vocab_phonemes.end_symbol
phonemes = [start] + phonemes + [end]
phonemes = [
item for item in phonemes if item in self.vocab.stoi
]
phonemes = [item for item in phonemes if item in self.vocab.stoi]
return phonemes
def phoneticize(self, sentence, add_start_end=False):
""" Normalize the input text sequence and convert it into pronunciation sequence.
@@ -160,10 +170,10 @@ class ParakeetPinyinWithTone(Phonetics):
List[str]
The list of pronunciation sequence.
"""
syllables = self.pinyin_backend.lazy_pinyin(sentence,
style=Style.TONE3,
strict=True)
phonemes = self.convert_pypinyin_tone3(syllables, add_start_end=add_start_end)
syllables = self.pinyin_backend.lazy_pinyin(
sentence, style=Style.TONE3, strict=True)
phonemes = self.convert_pypinyin_tone3(
syllables, add_start_end=add_start_end)
return phonemes
def numericalize(self, phonemes):
@@ -289,6 +299,7 @@ def _convert_to_parakeet_style_pinyin(syllables):
tones.extend(t)
return phones, tones
def _split_syllable_with_tone(syllable: str):
global _punctuations
@@ -311,10 +322,10 @@ def _split_syllable_with_tone(syllable: str):
phones.append(syllable)
return phones
def _convert_to_parakeet_style_pinyin_with_tone(syllables):
phones = []
for syllable in syllables:
p = _split_syllable_with_tone(syllable)
phones.extend(p)
return phones

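The pypinyin calls reformatted above use `Style.TONE3`, which appends the tone digit to each syllable. A minimal standalone sketch using the module-level helper (the diff itself goes through a customized `Pinyin(ParakeetConverter())` backend):

```python
from pypinyin import lazy_pinyin, Style

# "你好" -> ['ni3', 'hao3'] with TONE3; strict=True keeps standard initial/final splitting
print(lazy_pinyin("你好", style=Style.TONE3, strict=True))
```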
View File

@@ -1,3 +1,17 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
import paddle
from paddle import nn
@@ -23,9 +37,9 @@ class LSTMSpeakerEncoder(nn.Layer):
def forward(self, utterances, num_speakers, initial_states=None):
normalized_embeds = self.embed_sequences(utterances, initial_states)
embeds = normalized_embeds.reshape([num_speakers, -1, num_speakers])
loss, eer = self.loss(embeds)
loss, eer = self.loss(embeds)
return loss, eer
def embed_sequences(self, utterances, initial_states=None, reduce=False):
out, (h, c) = self.lstm(utterances, initial_states)
embeds = F.relu(self.linear(h[-1]))
@@ -35,7 +49,7 @@ class LSTMSpeakerEncoder(nn.Layer):
embed = F.normalize(embed, axis=0)
return embed
return normalized_embeds
def embed_utterance(self, utterances, initial_states=None):
# utterances: [B, T, C] -> embed [C']
embed = self.embed_sequences(utterances, initial_states, reduce=True)
@@ -47,37 +61,51 @@ class LSTMSpeakerEncoder(nn.Layer):
# Inclusive centroids (1 per speaker). Cloning is needed for reverse differentiation
centroids_incl = paddle.mean(embeds, axis=1)
centroids_incl_norm = paddle.norm(centroids_incl, p=2, axis=1, keepdim=True)
centroids_incl_norm = paddle.norm(
centroids_incl, p=2, axis=1, keepdim=True)
normalized_centroids_incl = centroids_incl / centroids_incl_norm
# Exclusive centroids (1 per utterance)
centroids_excl = paddle.broadcast_to(paddle.sum(embeds, axis=1, keepdim=True), embeds.shape) - embeds
centroids_excl = paddle.broadcast_to(
paddle.sum(embeds, axis=1, keepdim=True), embeds.shape) - embeds
centroids_excl /= (utterances_per_speaker - 1)
centroids_excl_norm = paddle.norm(centroids_excl, p=2, axis=2, keepdim=True)
centroids_excl_norm = paddle.norm(
centroids_excl, p=2, axis=2, keepdim=True)
normalized_centroids_excl = centroids_excl / centroids_excl_norm
p1 = paddle.matmul(embeds.reshape([-1, embed_dim]),
normalized_centroids_incl, transpose_y=True) # (NMN)
p1 = paddle.matmul(
embeds.reshape([-1, embed_dim]),
normalized_centroids_incl,
transpose_y=True) # (NMN)
p1 = p1.reshape([-1])
# print("p1: ", p1.shape)
p2 = paddle.bmm(embeds.reshape([-1, 1, embed_dim]),
normalized_centroids_excl.reshape([-1, embed_dim, 1])) # (NM, 1, 1)
p2 = p2.reshape([-1]) # NM)
p2 = paddle.bmm(
embeds.reshape([-1, 1, embed_dim]),
normalized_centroids_excl.reshape(
[-1, embed_dim, 1])) # (NM, 1, 1)
p2 = p2.reshape([-1]) # NM)
# begin: alternative implementation for scatter
with paddle.no_grad():
index = paddle.arange(0, speakers_per_batch * utterances_per_speaker, dtype="int64").reshape([speakers_per_batch, utterances_per_speaker])
index = index * speakers_per_batch + paddle.arange(0, speakers_per_batch, dtype="int64").unsqueeze(-1)
index = paddle.arange(
0, speakers_per_batch * utterances_per_speaker,
dtype="int64").reshape(
[speakers_per_batch, utterances_per_speaker])
index = index * speakers_per_batch + paddle.arange(
0, speakers_per_batch, dtype="int64").unsqueeze(-1)
index = paddle.reshape(index, [-1])
ones = paddle.ones([speakers_per_batch * utterances_per_speaker * speakers_per_batch])
ones = paddle.ones([
speakers_per_batch * utterances_per_speaker * speakers_per_batch
])
zeros = paddle.zeros_like(index, dtype=ones.dtype)
mask_p1 = paddle.scatter(ones, index, zeros)
p = p1 * mask_p1 + (1 - mask_p1) * paddle.scatter(ones, index, p2)
# end: alternative implementation for scatter
# p = paddle.scatter(p1, index, p2)
p = p * self.similarity_weight + self.similarity_bias # neg
p = p.reshape([speakers_per_batch * utterances_per_speaker, speakers_per_batch])
p = p * self.similarity_weight + self.similarity_bias # neg
p = p.reshape(
[speakers_per_batch * utterances_per_speaker, speakers_per_batch])
return p, p1, p2
def do_gradient_ops(self):
@@ -99,8 +127,10 @@ class LSTMSpeakerEncoder(nn.Layer):
sim_matrix, *_ = self.similarity_matrix(embeds)
sim_matrix = sim_matrix.reshape(
[speakers_per_batch * utterances_per_speaker, speakers_per_batch])
target = paddle.arange(0, speakers_per_batch, dtype="int64").unsqueeze(-1)
target = paddle.expand(target, [speakers_per_batch, utterances_per_speaker])
target = paddle.arange(
0, speakers_per_batch, dtype="int64").unsqueeze(-1)
target = paddle.expand(target,
[speakers_per_batch, utterances_per_speaker])
target = paddle.reshape(target, [-1])
loss = nn.CrossEntropyLoss()(sim_matrix, target)
@@ -113,9 +143,7 @@ class LSTMSpeakerEncoder(nn.Layer):
preds = sim_matrix.numpy()
# Snippet from https://yangcha.github.io/EER-ROC/
fpr, tpr, thresholds = roc_curve(labels.flatten(), preds.flatten())
fpr, tpr, thresholds = roc_curve(labels.flatten(), preds.flatten())
eer = brentq(lambda x: 1. - x - interp1d(fpr, tpr)(x), 0., 1.)
return loss, eer

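The similarity-matrix code reformatted above interleaves reshapes with a scatter trick, which obscures the underlying GE2E centroid arithmetic. A plain NumPy sketch of just that arithmetic (shapes and names are illustrative, not part of the repo):

```python
import numpy as np

N, M, D = 4, 5, 8                       # speakers, utterances per speaker, embedding dim
rng = np.random.default_rng(0)
embeds = rng.normal(size=(N, M, D))
embeds /= np.linalg.norm(embeds, axis=-1, keepdims=True)

# inclusive centroid: mean over all M utterances of a speaker
c_incl = embeds.mean(axis=1)                                      # (N, D)
# exclusive centroid: mean over the other M-1 utterances of the same speaker
c_excl = (embeds.sum(axis=1, keepdims=True) - embeds) / (M - 1)   # (N, M, D)

c_incl /= np.linalg.norm(c_incl, axis=-1, keepdims=True)
c_excl /= np.linalg.norm(c_excl, axis=-1, keepdims=True)

# p1: similarity of every utterance to every speaker's inclusive centroid -> (N*M, N)
p1 = embeds.reshape(-1, D) @ c_incl.T
# p2: similarity of every utterance to its own exclusive centroid -> (N*M,)
p2 = np.einsum("nmd,nmd->nm", embeds, c_excl).reshape(-1)
```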
View File

@@ -47,7 +47,11 @@ class DecoderPreNet(nn.Layer):
The dropout probability.
"""
def __init__(self, d_input: int, d_hidden: int, d_output: int,
def __init__(self,
d_input: int,
d_hidden: int,
d_output: int,
dropout_rate: float):
super().__init__()
@@ -70,12 +74,10 @@
"""
x = F.dropout(F.relu(self.linear1(x)),
self.dropout_rate,
training=True)
output = F.dropout(F.relu(self.linear2(x)),
self.dropout_rate,
training=True)
x = F.dropout(
F.relu(self.linear1(x)), self.dropout_rate, training=True)
output = F.dropout(
F.relu(self.linear2(x)), self.dropout_rate, training=True)
return output
@@ -100,8 +102,13 @@ class DecoderPostNet(nn.Layer):
The dropout probability.
"""
def __init__(self, d_mels: int, d_hidden: int, kernel_size: int,
num_layers: int, dropout: float):
def __init__(self,
d_mels: int,
d_hidden: int,
kernel_size: int,
num_layers: int,
dropout: float):
super().__init__()
self.dropout = dropout
self.num_layers = num_layers
@@ -111,31 +118,33 @@
self.conv_batchnorms = nn.LayerList()
k = math.sqrt(1.0 / (d_mels * kernel_size))
self.conv_batchnorms.append(
Conv1dBatchNorm(d_mels,
d_hidden,
kernel_size=kernel_size,
padding=padding,
bias_attr=I.Uniform(-k, k),
data_format='NLC'))
Conv1dBatchNorm(
d_mels,
d_hidden,
kernel_size=kernel_size,
padding=padding,
bias_attr=I.Uniform(-k, k),
data_format='NLC'))
k = math.sqrt(1.0 / (d_hidden * kernel_size))
self.conv_batchnorms.extend([
Conv1dBatchNorm(d_hidden,
d_hidden,
kernel_size=kernel_size,
padding=padding,
bias_attr=I.Uniform(-k, k),
data_format='NLC')
for i in range(1, num_layers - 1)
Conv1dBatchNorm(
d_hidden,
d_hidden,
kernel_size=kernel_size,
padding=padding,
bias_attr=I.Uniform(-k, k),
data_format='NLC') for i in range(1, num_layers - 1)
])
self.conv_batchnorms.append(
Conv1dBatchNorm(d_hidden,
d_mels,
kernel_size=kernel_size,
padding=padding,
bias_attr=I.Uniform(-k, k),
data_format='NLC'))
Conv1dBatchNorm(
d_hidden,
d_mels,
kernel_size=kernel_size,
padding=padding,
bias_attr=I.Uniform(-k, k),
data_format='NLC'))
def forward(self, x):
"""Calculate forward propagation.
@@ -153,12 +162,14 @@ class DecoderPostNet(nn.Layer):
"""
for i in range(len(self.conv_batchnorms) - 1):
x = F.dropout(F.tanh(self.conv_batchnorms[i](x)),
self.dropout,
training=self.training)
output = F.dropout(self.conv_batchnorms[self.num_layers - 1](x),
self.dropout,
training=self.training)
x = F.dropout(
F.tanh(self.conv_batchnorms[i](x)),
self.dropout,
training=self.training)
output = F.dropout(
self.conv_batchnorms[self.num_layers - 1](x),
self.dropout,
training=self.training)
return output
@@ -179,26 +190,30 @@ class Tacotron2Encoder(nn.Layer):
p_dropout: float
The dropout probability.
"""
def __init__(self, d_hidden: int, conv_layers: int, kernel_size: int,
def __init__(self,
d_hidden: int,
conv_layers: int,
kernel_size: int,
p_dropout: float):
super().__init__()
k = math.sqrt(1.0 / (d_hidden * kernel_size))
self.conv_batchnorms = paddle.nn.LayerList([
Conv1dBatchNorm(d_hidden,
d_hidden,
kernel_size,
stride=1,
padding=int((kernel_size - 1) / 2),
bias_attr=I.Uniform(-k, k),
data_format='NLC') for i in range(conv_layers)
Conv1dBatchNorm(
d_hidden,
d_hidden,
kernel_size,
stride=1,
padding=int((kernel_size - 1) / 2),
bias_attr=I.Uniform(-k, k),
data_format='NLC') for i in range(conv_layers)
])
self.p_dropout = p_dropout
self.hidden_size = int(d_hidden / 2)
self.lstm = nn.LSTM(d_hidden,
self.hidden_size,
direction="bidirectional")
self.lstm = nn.LSTM(
d_hidden, self.hidden_size, direction="bidirectional")
def forward(self, x, input_lens=None):
"""Calculate forward propagation of tacotron2 encoder.
@@ -218,9 +233,10 @@
"""
for conv_batchnorm in self.conv_batchnorms:
x = F.dropout(F.relu(conv_batchnorm(x)),
self.p_dropout,
training=self.training)
x = F.dropout(
F.relu(conv_batchnorm(x)),
self.p_dropout,
training=self.training)
output, _ = self.lstm(inputs=x, sequence_length=input_lens)
return output
@@ -271,6 +287,7 @@ class Tacotron2Decoder(nn.Layer):
Whether to use a binary classifier for stop token prediction.
Defaults to False
"""
def __init__(self,
d_mels: int,
reduction_factor: int,
@@ -284,7 +301,7 @@
p_prenet_dropout: float,
p_attention_dropout: float,
p_decoder_dropout: float,
use_stop_token: bool = False):
use_stop_token: bool=False):
super().__init__()
self.d_mels = d_mels
self.reduction_factor = reduction_factor
@@ -294,10 +311,11 @@
self.p_attention_dropout = p_attention_dropout
self.p_decoder_dropout = p_decoder_dropout
self.prenet = DecoderPreNet(d_mels * reduction_factor,
d_prenet,
d_prenet,
dropout_rate=p_prenet_dropout)
self.prenet = DecoderPreNet(
d_mels * reduction_factor,
d_prenet,
d_prenet,
dropout_rate=p_prenet_dropout)
# attention_rnn takes attention's context vector as an
# auxiliary input
@@ -367,9 +385,10 @@
# The first lstm layer (or spec encoder lstm)
_, (self.attention_hidden, self.attention_cell) = self.attention_rnn(
cell_input, (self.attention_hidden, self.attention_cell))
self.attention_hidden = F.dropout(self.attention_hidden,
self.p_attention_dropout,
training=self.training)
self.attention_hidden = F.dropout(
self.attention_hidden,
self.p_attention_dropout,
training=self.training)
# Location sensitive attention
attention_weights_cat = paddle.stack(
@@ -384,9 +403,10 @@
[self.attention_hidden, self.attention_context], axis=-1)
_, (self.decoder_hidden, self.decoder_cell) = self.decoder_rnn(
decoder_input, (self.decoder_hidden, self.decoder_cell))
self.decoder_hidden = F.dropout(self.decoder_hidden,
p=self.p_decoder_dropout,
training=self.training)
self.decoder_hidden = F.dropout(
self.decoder_hidden,
p=self.p_decoder_dropout,
training=self.training)
# decode output one step
decoder_hidden_attention_context = paddle.concat(
@@ -426,8 +446,8 @@
querys = paddle.reshape(
querys,
[querys.shape[0], querys.shape[1] // self.reduction_factor, -1])
start_step = paddle.zeros(shape=[querys.shape[0], 1, querys.shape[-1]],
dtype=querys.dtype)
start_step = paddle.zeros(
shape=[querys.shape[0], 1, querys.shape[-1]], dtype=querys.dtype)
querys = paddle.concat([start_step, querys], axis=1)
querys = self.prenet(querys)
@@ -604,43 +624,43 @@ class Tacotron2(nn.Layer):
outputs.
"""
def __init__(self,
vocab_size,
n_tones=None,
d_mels: int = 80,
d_encoder: int = 512,
encoder_conv_layers: int = 3,
encoder_kernel_size: int = 5,
d_prenet: int = 256,
d_attention_rnn: int = 1024,
d_decoder_rnn: int = 1024,
attention_filters: int = 32,
attention_kernel_size: int = 31,
d_attention: int = 128,
d_postnet: int = 512,
postnet_kernel_size: int = 5,
postnet_conv_layers: int = 5,
reduction_factor: int = 1,
p_encoder_dropout: float = 0.5,
p_prenet_dropout: float = 0.5,
p_attention_dropout: float = 0.1,
p_decoder_dropout: float = 0.1,
p_postnet_dropout: float = 0.5,
d_mels: int=80,
d_encoder: int=512,
encoder_conv_layers: int=3,
encoder_kernel_size: int=5,
d_prenet: int=256,
d_attention_rnn: int=1024,
d_decoder_rnn: int=1024,
attention_filters: int=32,
attention_kernel_size: int=31,
d_attention: int=128,
d_postnet: int=512,
postnet_kernel_size: int=5,
postnet_conv_layers: int=5,
reduction_factor: int=1,
p_encoder_dropout: float=0.5,
p_prenet_dropout: float=0.5,
p_attention_dropout: float=0.1,
p_decoder_dropout: float=0.1,
p_postnet_dropout: float=0.5,
d_global_condition=None,
use_stop_token=False):
super().__init__()
std = math.sqrt(2.0 / (vocab_size + d_encoder))
val = math.sqrt(3.0) * std # uniform bounds for std
self.embedding = nn.Embedding(vocab_size,
d_encoder,
weight_attr=I.Uniform(-val, val))
self.embedding = nn.Embedding(
vocab_size, d_encoder, weight_attr=I.Uniform(-val, val))
if n_tones:
self.embedding_tones = nn.Embedding(n_tones,
d_encoder,
padding_idx=0,
weight_attr=I.Uniform(
-0.1 * val, 0.1 * val))
self.embedding_tones = nn.Embedding(
n_tones,
d_encoder,
padding_idx=0,
weight_attr=I.Uniform(-0.1 * val, 0.1 * val))
self.toned = n_tones is not None
self.encoder = Tacotron2Encoder(d_encoder, encoder_conv_layers,
@@ -649,24 +669,26 @@ class Tacotron2(nn.Layer):
# input augmentation scheme: concat global condition to the encoder output
if d_global_condition is not None:
d_encoder += d_global_condition
self.decoder = Tacotron2Decoder(d_mels,
reduction_factor,
d_encoder,
d_prenet,
d_attention_rnn,
d_decoder_rnn,
d_attention,
attention_filters,
attention_kernel_size,
p_prenet_dropout,
p_attention_dropout,
p_decoder_dropout,
use_stop_token=use_stop_token)
self.postnet = DecoderPostNet(d_mels=d_mels * reduction_factor,
d_hidden=d_postnet,
kernel_size=postnet_kernel_size,
num_layers=postnet_conv_layers,
dropout=p_postnet_dropout)
self.decoder = Tacotron2Decoder(
d_mels,
reduction_factor,
d_encoder,
d_prenet,
d_attention_rnn,
d_decoder_rnn,
d_attention,
attention_filters,
attention_kernel_size,
p_prenet_dropout,
p_attention_dropout,
p_decoder_dropout,
use_stop_token=use_stop_token)
self.postnet = DecoderPostNet(
d_mels=d_mels * reduction_factor,
d_hidden=d_postnet,
kernel_size=postnet_kernel_size,
num_layers=postnet_conv_layers,
dropout=p_postnet_dropout)
def forward(self,
text_inputs,
@@ -729,15 +751,14 @@
[encoder_outputs, global_condition], -1)
# [B, T_enc, 1]
mask = sequence_mask(text_lens,
dtype=encoder_outputs.dtype).unsqueeze(-1)
mask = sequence_mask(
text_lens, dtype=encoder_outputs.dtype).unsqueeze(-1)
if self.decoder.use_stop_token:
mel_outputs, alignments, stop_logits = self.decoder(
encoder_outputs, mels, mask=mask)
else:
mel_outputs, alignments = self.decoder(encoder_outputs,
mels,
mask=mask)
mel_outputs, alignments = self.decoder(
encoder_outputs, mels, mask=mask)
mel_outputs_postnet = self.postnet(mel_outputs)
mel_outputs_postnet = mel_outputs + mel_outputs_postnet
@@ -863,6 +884,7 @@ class Tacotron2(nn.Layer):
class Tacotron2Loss(nn.Layer):
""" Tacotron2 Loss module
"""
def __init__(self,
use_stop_token_loss=True,
use_guided_attention_loss=False,

View File

@@ -321,10 +321,8 @@ class MLPPreNet(nn.Layer):
self.dropout = dropout
def forward(self, x, dropout):
l1 = F.dropout(
F.relu(self.lin1(x)), self.dropout, training=True)
l2 = F.dropout(
F.relu(self.lin2(l1)), self.dropout, training=True)
l1 = F.dropout(F.relu(self.lin1(x)), self.dropout, training=True)
l2 = F.dropout(F.relu(self.lin2(l1)), self.dropout, training=True)
l3 = self.lin3(l2)
return l3
@@ -403,7 +401,7 @@ class TransformerTTS(nn.Layer):
padding_idx=0,
weight_attr=I.Uniform(-0.005, 0.005))
else:
self.toned = False
self.toned = False
# position encoding matrix may be extended later
self.encoder_pe = pe.sinusoid_positional_encoding(0, 1000, d_encoder)
self.encoder_pe_scalar = self.create_parameter(
@@ -449,7 +447,8 @@
self.drop_n_heads = 0
def forward(self, text, mel, tones=None):
encoded, encoder_attention_weights, encoder_mask = self.encode(text, tones=tones)
encoded, encoder_attention_weights, encoder_mask = self.encode(
text, tones=tones)
mel_output, mel_intermediate, cross_attention_weights, stop_logits = self.decode(
encoded, mel, encoder_mask)
outputs = {
@@ -489,7 +488,8 @@
# twice its length if needed
if x.shape[1] * self.r > self.decoder_pe.shape[0]:
new_T = max(x.shape[1] * self.r, self.decoder_pe.shape[0] * 2)
self.decoder_pe = pe.sinusoid_positional_encoding(0, new_T, self.d_decoder)
self.decoder_pe = pe.sinusoid_positional_encoding(0, new_T,
self.d_decoder)
pos_enc = self.decoder_pe[:T_dec * self.r:self.r, :]
x = x.scale(math.sqrt(
self.d_decoder)) + pos_enc * self.decoder_pe_scalar

View File

@@ -78,6 +78,7 @@ class UpsampleNet(nn.LayerList):
---------
``librosa.core.stft``
"""
def __init__(self, upsample_factors):
super().__init__()
for factor in upsample_factors:
@@ -85,12 +86,13 @@
init = I.Uniform(-std, std)
self.append(
nn.utils.weight_norm(
nn.Conv2DTranspose(1,
1, (3, 2 * factor),
padding=(1, factor // 2),
stride=(1, factor),
weight_attr=init,
bias_attr=init)))
nn.Conv2DTranspose(
1,
1, (3, 2 * factor),
padding=(1, factor // 2),
stride=(1, factor),
weight_attr=init,
bias_attr=init)))
# upsample factors
self.upsample_factor = np.prod(upsample_factors)
@@ -149,6 +151,7 @@ class ResidualBlock(nn.Layer):
dilations : int
Dilations of the Convolution2d applied to the input.
"""
def __init__(self, channels, cond_channels, kernel_size, dilations):
super().__init__()
# input conv
@@ -159,13 +162,14 @@
]
rh, rw = receptive_field
paddings = [rh - 1, 0, rw // 2, (rw - 1) // 2] # causal & same
conv = nn.Conv2D(channels,
2 * channels,
kernel_size,
padding=paddings,
dilation=dilations,
weight_attr=init,
bias_attr=init)
conv = nn.Conv2D(
channels,
2 * channels,
kernel_size,
padding=paddings,
dilation=dilations,
weight_attr=init,
bias_attr=init)
self.conv = nn.utils.weight_norm(conv)
self.rh = rh
self.rw = rw
@@ -174,19 +178,18 @@
# condition projection
std = math.sqrt(1 / cond_channels)
init = I.Uniform(-std, std)
condition_proj = nn.Conv2D(cond_channels,
2 * channels, (1, 1),
weight_attr=init,
bias_attr=init)
condition_proj = nn.Conv2D(
cond_channels,
2 * channels, (1, 1),
weight_attr=init,
bias_attr=init)
self.condition_proj = nn.utils.weight_norm(condition_proj)
# parametric residual & skip connection
std = math.sqrt(1 / channels)
init = I.Uniform(-std, std)
out_proj = nn.Conv2D(channels,
2 * channels, (1, 1),
weight_attr=init,
bias_attr=init)
out_proj = nn.Conv2D(
channels, 2 * channels, (1, 1), weight_attr=init, bias_attr=init)
self.out_proj = nn.utils.weight_norm(out_proj)
def forward(self, x, condition):
@@ -265,11 +268,12 @@ class ResidualBlock(nn.Layer):
self._update_buffer(x_row)
rw = self.rw
x_row = F.conv2d(self._conv_buffer,
self.conv.weight,
self.conv.bias,
padding=[0, 0, rw // 2, (rw - 1) // 2],
dilation=self.dilations)
x_row = F.conv2d(
self._conv_buffer,
self.conv.weight,
self.conv.bias,
padding=[0, 0, rw // 2, (rw - 1) // 2],
dilation=self.dilations)
x_row += self.condition_proj(condition_row)
content, gate = paddle.chunk(x_row, 2, axis=1)
@@ -315,8 +319,12 @@ class ResidualNet(nn.LayerList):
ValueError
If the length of dilations_h does not equals n_layers.
"""
def __init__(self, n_layer: int, residual_channels: int,
condition_channels: int, kernel_size: Tuple[int],
def __init__(self,
n_layer: int,
residual_channels: int,
condition_channels: int,
kernel_size: Tuple[int],
dilations_h: List[int]):
if len(dilations_h) != n_layer:
raise ValueError(
@@ -421,20 +429,22 @@ class Flow(nn.Layer):
super().__init__()
# input projection
self.input_proj = nn.utils.weight_norm(
nn.Conv2D(1,
channels, (1, 1),
weight_attr=I.Uniform(-1., 1.),
bias_attr=I.Uniform(-1., 1.)))
nn.Conv2D(
1,
channels, (1, 1),
weight_attr=I.Uniform(-1., 1.),
bias_attr=I.Uniform(-1., 1.)))
# residual net
self.resnet = ResidualNet(n_layers, channels, mel_bands, kernel_size,
self.dilations_dict[n_group])
# output projection
self.output_proj = nn.Conv2D(channels,
2, (1, 1),
weight_attr=I.Constant(0.),
bias_attr=I.Constant(0.))
self.output_proj = nn.Conv2D(
channels,
2, (1, 1),
weight_attr=I.Constant(0.),
bias_attr=I.Constant(0.))
# specs
self.n_group = n_group
@@ -478,8 +488,8 @@ class Flow(nn.Layer):
transformation from x to z.
"""
# (B, C, H-1, W)
logs, b = self._predict_parameters(x[:, :, :-1, :], condition[:, :,
1:, :])
logs, b = self._predict_parameters(x[:, :, :-1, :],
condition[:, :, 1:, :])
z = self._transform(x, logs, b)
return z, (logs, b)
@@ -576,6 +586,7 @@ class WaveFlow(nn.LayerList):
kernel_size : Union[int, List[int]]
Kernel size of the convolution layer in each ResidualBlock.
"""
def __init__(self, n_flows, n_layers, n_group, channels, mel_bands,
kernel_size):
if n_group % 2 or n_flows % 2:
@@ -645,8 +656,8 @@ class WaveFlow(nn.LayerList):
# to (B, C, h, T//h) layout
x = paddle.unsqueeze(
paddle.transpose(fold(x, self.n_group), [0, 2, 1]), 1)
condition = paddle.transpose(fold(condition, self.n_group),
[0, 1, 3, 2])
condition = paddle.transpose(
fold(condition, self.n_group), [0, 1, 3, 2])
# flows
logs_list = []
@@ -689,8 +700,8 @@ class WaveFlow(nn.LayerList):
# to (B, C, h, T//h) layout
z = paddle.unsqueeze(
paddle.transpose(fold(z, self.n_group), [0, 2, 1]), 1)
condition = paddle.transpose(fold(condition, self.n_group),
[0, 1, 3, 2])
condition = paddle.transpose(
fold(condition, self.n_group), [0, 1, 3, 2])
# reverse it flow by flow
for i in reversed(range(self.n_flows)):
@@ -730,17 +741,24 @@ class ConditionalWaveFlow(nn.LayerList):
kernel_size : Union[int, List[int]]
Kernel size of the convolution layer in each ResidualBlock.
"""
def __init__(self, upsample_factors: List[int], n_flows: int,
n_layers: int, n_group: int, channels: int, n_mels: int,
def __init__(self,
upsample_factors: List[int],
n_flows: int,
n_layers: int,
n_group: int,
channels: int,
n_mels: int,
kernel_size: Union[int, List[int]]):
super().__init__()
self.encoder = UpsampleNet(upsample_factors)
self.decoder = WaveFlow(n_flows=n_flows,
n_layers=n_layers,
n_group=n_group,
channels=channels,
mel_bands=n_mels,
kernel_size=kernel_size)
self.decoder = WaveFlow(
n_flows=n_flows,
n_layers=n_layers,
n_group=n_group,
channels=channels,
mel_bands=n_mels,
kernel_size=kernel_size)
def forward(self, audio, mel):
"""Compute the transformed random variable z (x to z) and the log of
@@ -847,6 +865,7 @@ class WaveFlowLoss(nn.Layer):
The standard deviation of the gaussian noise used in WaveFlow, by
default 1.0.
"""
def __init__(self, sigma=1.0):
super().__init__()
self.sigma = sigma
@@ -870,7 +889,7 @@
Tensor [shape=(1,)]
The loss.
"""
loss = paddle.sum(
z * z) / (2 * self.sigma * self.sigma) - log_det_jacobian
loss = paddle.sum(z * z) / (2 * self.sigma * self.sigma
) - log_det_jacobian
loss = loss / np.prod(z.shape)
return loss + self.const

View File

@@ -143,9 +143,9 @@ class MonoheadAttention(nn.Layer):
def __init__(self,
model_dim: int,
dropout: float = 0.0,
k_dim: int = None,
v_dim: int = None):
dropout: float=0.0,
k_dim: int=None,
v_dim: int=None):
super(MonoheadAttention, self).__init__()
k_dim = k_dim or model_dim
v_dim = v_dim or model_dim
@@ -225,9 +225,9 @@ class MultiheadAttention(nn.Layer):
def __init__(self,
model_dim: int,
num_heads: int,
dropout: float = 0.0,
k_dim: int = None,
v_dim: int = None):
dropout: float=0.0,
k_dim: int=None,
v_dim: int=None):
super(MultiheadAttention, self).__init__()
if model_dim % num_heads != 0:
raise ValueError("model_dim must be divisible by num_heads")
@@ -318,7 +318,8 @@ class LocationSensitiveAttention(nn.Layer):
# Location Layer
self.location_conv = nn.Conv1D(
2, location_filters,
2,
location_filters,
kernel_size=location_kernel_size,
padding=int((location_kernel_size - 1) / 2),
bias_attr=False,

View File

@@ -116,16 +116,22 @@ class STFT(nn.Layer):
"""
def __init__(self, n_fft, hop_length=None, win_length=None, window="hanning", center=True, pad_mode="reflect"):
def __init__(self,
n_fft,
hop_length=None,
win_length=None,
window="hanning",
center=True,
pad_mode="reflect"):
super().__init__()
# By default, use the entire frame
if win_length is None:
win_length = n_fft
# Set the default hop, if it's not already specified
if hop_length is None:
hop_length = int(win_length // 4)
self.hop_length = hop_length
self.n_bin = 1 + n_fft // 2
self.n_fft = n_fft
@@ -134,7 +140,7 @@
# calculate window
window = signal.get_window(window, win_length, fftbins=True)
# pad window to n_fft size
if n_fft != win_length:
window = pad_center(window, n_fft, mode="constant")
@@ -146,11 +152,11 @@
#r = np.arange(0, n_fft)
#M = np.expand_dims(r, -1) * np.expand_dims(r, 0)
#w_real = np.reshape(window *
#np.cos(2 * np.pi * M / n_fft)[:self.n_bin],
#(self.n_bin, 1, self.n_fft))
#np.cos(2 * np.pi * M / n_fft)[:self.n_bin],
#(self.n_bin, 1, self.n_fft))
#w_imag = np.reshape(window *
#np.sin(-2 * np.pi * M / n_fft)[:self.n_bin],
#(self.n_bin, 1, self.n_fft))
#np.sin(-2 * np.pi * M / n_fft)[:self.n_bin],
#(self.n_bin, 1, self.n_fft))
weight = np.fft.fft(np.eye(n_fft))[:self.n_bin]
w_real = weight.real
w_imag = weight.imag
@@ -178,8 +184,9 @@
"""
x = paddle.unsqueeze(x, axis=1)
if self.center:
x = F.pad(x, [self.n_fft // 2, self.n_fft // 2],
data_format='NCL', mode=self.pad_mode)
x = F.pad(x, [self.n_fft // 2, self.n_fft // 2],
data_format='NCL',
mode=self.pad_mode)
# to BCT, C=1
out = F.conv1d(x, self.weight, stride=self.hop_length)
@@ -226,8 +233,8 @@ class MelScale(nn.Layer):
super().__init__()
mel_basis = librosa.filters.mel(sr, n_fft, n_mels, fmin, fmax)
self.weight = paddle.to_tensor(mel_basis)
def forward(self, spec):
# (n_mels, n_freq) * (batch_size, n_freq, n_frames)
mel = paddle.matmul(self.weight, spec)
return mel
return mel

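MelScale above simply caches a librosa mel filterbank and matrix-multiplies it with a magnitude spectrogram. A small sketch with illustrative parameter values:

```python
import numpy as np
import librosa

sr, n_fft, n_mels = 22050, 1024, 80
mel_basis = librosa.filters.mel(sr=sr, n_fft=n_fft, n_mels=n_mels, fmin=0, fmax=8000)
print(mel_basis.shape)                  # (80, 513) == (n_mels, 1 + n_fft // 2)

spec = np.abs(np.random.randn(1 + n_fft // 2, 100))  # toy magnitude spectrogram
mel = mel_basis @ spec                  # (n_mels, n_frames), the product MelScale computes
```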
View File

@@ -35,12 +35,12 @@ def attention_guide(dec_lens, enc_lens, N, T, g, dtype=None):
Tachibana, Hideyuki, Katsuya Uenoyama, and Shunsuke Aihara. 2017. Efficiently Trainable Text-to-Speech System Based on Deep Convolutional Networks with Guided Attention. ArXiv:1710.08969 [Cs, Eess], October. http://arxiv.org/abs/1710.08969.
"""
dtype = dtype or paddle.get_default_dtype()
dec_pos = paddle.arange(0, N).astype(
dtype) / dec_lens.unsqueeze(-1) # n/N # shape(B, T_dec)
enc_pos = paddle.arange(0, T).astype(
dtype) / enc_lens.unsqueeze(-1) # t/T # shape(B, T_enc)
W = 1 - paddle.exp(-(dec_pos.unsqueeze(-1) -
enc_pos.unsqueeze(1))**2 / (2 * g ** 2))
dec_pos = paddle.arange(0, N).astype(dtype) / dec_lens.unsqueeze(
-1) # n/N # shape(B, T_dec)
enc_pos = paddle.arange(0, T).astype(dtype) / enc_lens.unsqueeze(
-1) # t/T # shape(B, T_enc)
W = 1 - paddle.exp(-(dec_pos.unsqueeze(-1) - enc_pos.unsqueeze(1))**2 /
(2 * g**2))
dec_mask = sequence_mask(dec_lens, maxlen=N)
enc_mask = sequence_mask(enc_lens, maxlen=T)
@@ -57,8 +57,7 @@ def guided_attention_loss(attention_weight, dec_lens, enc_lens, g):
W = attention_guide(dec_lens, enc_lens, N, T, g, attention_weight.dtype)
total_tokens = (dec_lens * enc_lens).astype(W.dtype)
loss = paddle.mean(paddle.sum(
W * attention_weight, [1, 2]) / total_tokens)
loss = paddle.mean(paddle.sum(W * attention_weight, [1, 2]) / total_tokens)
return loss

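For a single utterance whose decoder and encoder lengths equal N and T, the attention guide computed above reduces to the following NumPy expression (the values of N, T and g are illustrative only):

```python
import numpy as np

N, T, g = 4, 6, 0.2                    # decoder steps, encoder steps, guide width
n = np.arange(N)[:, None] / N          # n / N, shape (N, 1)
t = np.arange(T)[None, :] / T          # t / T, shape (1, T)
W = 1 - np.exp(-((n - t) ** 2) / (2 * g ** 2))  # penalty is small near the diagonal
```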
View File

@@ -87,6 +87,7 @@ class ExperimentBase(object):
>>> else:
>>> main_sp(config, args)
"""
def __init__(self, config, args):
self.config = config
self.args = args

View File

@@ -1,7 +1,22 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import contextlib
OBSERVATIONS = None
@contextlib.contextmanager
def scope(observations):
# make `observation` the target to report to.
@@ -13,12 +28,14 @@ def scope(observations):
try:
yield
finally:
OBSERVATIONS = old
OBSERVATIONS = old
def get_observations():
global OBSERVATIONS
return OBSERVATIONS
def report(name, value):
# a simple function to report named value
# you can use it everywhere, it will get the default target and write to it

View File

@@ -1,3 +1,17 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from pathlib import Path
import tqdm
from dataclasses import dataclass
@@ -25,7 +39,7 @@ class Trainer(object):
self.stop_trigger = get_trigger(stop_trigger)
self.out = Path(out)
self.observation = {}
def setup(self):
pass
@@ -38,8 +52,8 @@
ordinal += 1
modified_name = f"{name}_{ordinal}"
self.extensions[modified_name] = ExtensionEntry(
extension, trigger, priority)
self.extensions[modified_name] = ExtensionEntry(extension, trigger,
priority)
def run(self):
# sort extensions by priorities once
@@ -61,7 +75,7 @@ class Trainer(object):
max_epoch = self.stop_trigger.period
else:
max_iteration = self.stop_trigger.period
while not stop_trigger(self):
self.observation = {}
# set observation as the report target
@@ -75,4 +89,3 @@ class Trainer(object):
for name, entry in extensions:
if entry.trigger(self):
entry.extension(self)

View File

@@ -1,10 +1,25 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
class IntervalTrigger(object):
def __init__(self, period: int , unit: str):
def __init__(self, period: int, unit: str):
if unit not in ("iteration", "epoch"):
raise ValueError("unit should be 'iteration' or 'epoch'")
self.period = period
self.unit = unit
def __call__(self, trainer):
state = trainer.updater.state
if self.unit == "epoch":
@@ -13,7 +28,7 @@ class IntervalTrigger(object):
fire = not (state.iteration % self.iteration)
return fire
def never_file_trigger(trainer):
return False
@@ -25,4 +40,4 @@ def get_trigger(trigger):
return trigger
else:
trigger = IntervalTrigger(*trigger)
return trigger
return trigger

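As a usage note, `get_trigger` promotes a `(period, unit)` tuple to an `IntervalTrigger`; only that call pattern comes from the code above, and the import path below is an assumption for illustration:

```python
# hypothetical import path; IntervalTrigger and get_trigger themselves appear in the diff above
from parakeet.training.triggers.interval_trigger import IntervalTrigger, get_trigger

trig = get_trigger((1000, "iteration"))  # tuple -> IntervalTrigger(1000, "iteration")
assert isinstance(trig, IntervalTrigger)
```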
View File

@@ -1,3 +1,17 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from dataclasses import dataclass
from typing import Optional
@@ -41,6 +55,7 @@ class UpdaterBase(object):
So the best practice is to define a model and define a updater for it.
"""
def update(self):
pass
@@ -52,13 +67,14 @@ class StandardUpdater(UpdaterBase):
"""An example of over-simplification. Things may not be that simple, but
you can subclass it to fit your need.
"""
def __init__(self,
model: Layer,
dataloader: DataLoader,
optimizer: Optimizer,
loss_func=None,
auto_new_epoch: bool = True,
init_state: Optional[UpdaterState] = None):
auto_new_epoch: bool=True,
init_state: Optional[UpdaterState]=None):
self.model = model
self.dataloader = dataloader
self.optimizer = optimizer

View File

@@ -31,10 +31,8 @@ __all__ = [
def plot_alignment(alignment, title=None):
# alignment: [encoder_steps, decoder_steps)
fig, ax = plt.subplots(figsize=(6, 4))
im = ax.imshow(alignment,
aspect='auto',
origin='lower',
interpolation='none')
im = ax.imshow(
alignment, aspect='auto', origin='lower', interpolation='none')
fig.colorbar(im, ax=ax)
xlabel = 'Decoder timestep'
if title is not None:
@@ -49,15 +47,14 @@ def plot_multihead_alignments(alignments, title=None):
# alignments: [N, encoder_steps, decoder_steps)
num_subplots = alignments.shape[0]
fig, axes = plt.subplots(figsize=(6 * num_subplots, 4),
ncols=num_subplots,
sharey=True,
squeeze=True)
fig, axes = plt.subplots(
figsize=(6 * num_subplots, 4),
ncols=num_subplots,
sharey=True,
squeeze=True)
for i, ax in enumerate(axes):
im = ax.imshow(alignments[i],
aspect='auto',
origin='lower',
interpolation='none')
im = ax.imshow(
alignments[i], aspect='auto', origin='lower', interpolation='none')
fig.colorbar(im, ax=ax)
xlabel = 'Decoder timestep'
if title is not None:
@@ -73,18 +70,20 @@ def plot_multilayer_multihead_alignments(alignments, title=None):
# alignments: [num_layers, num_heads, encoder_steps, decoder_steps)
num_layers, num_heads, *_ = alignments.shape
fig, axes = plt.subplots(figsize=(6 * num_heads, 4 * num_layers),
nrows=num_layers,
ncols=num_heads,
sharex=True,
sharey=True,
squeeze=True)
fig, axes = plt.subplots(
figsize=(6 * num_heads, 4 * num_layers),
nrows=num_layers,
ncols=num_heads,
sharex=True,
sharey=True,
squeeze=True)
for i, row in enumerate(axes):
for j, ax in enumerate(row):
im = ax.imshow(alignments[i, j],
aspect='auto',
origin='lower',
interpolation='none')
im = ax.imshow(
alignments[i, j],
aspect='auto',
origin='lower',
interpolation='none')
fig.colorbar(im, ax=ax)
xlabel = 'Decoder timestep'
if title is not None:

View File

@@ -20,7 +20,6 @@ __all__ = ["rank_zero_only"]
def rank_zero_only(func):
@wraps(func)
def wrapper(*args, **kwargs):
if dist.get_rank() != 0:

View File

@@ -20,8 +20,9 @@ from setuptools import setup, find_packages
def read(*names, **kwargs):
with io.open(os.path.join(os.path.dirname(__file__), *names),
encoding=kwargs.get("encoding", "utf8")) as fp:
with io.open(
os.path.join(os.path.dirname(__file__), *names),
encoding=kwargs.get("encoding", "utf8")) as fp:
return fp.read()
@@ -73,9 +74,7 @@ setup_info = dict(
'g2pM',
'praatio',
],
extras_require={
'doc': ["sphinx", "sphinx-rtd-theme", "numpydoc"],
},
extras_require={'doc': ["sphinx", "sphinx-rtd-theme", "numpydoc"], },
# Package info
packages=find_packages(exclude=('tests', 'tests.*')),
@@ -88,7 +87,6 @@ setup_info = dict(
'License :: OSI Approved :: Apache Software License',
'Programming Language :: Python :: 3.6',
'Programming Language :: Python :: 3.7',
],
)
], )
setup(**setup_info)