format code with pre-commit
This commit is contained in:
parent 73ca693395
commit 6a1fb158d9
@@ -45,7 +45,7 @@ See [install](https://www.paddlepaddle.org.cn/install/quick) for more details. T
pip install -U paddle-parakeet
```

or
```bash
git clone https://github.com/PaddlePaddle/Parakeet
cd Parakeet
@@ -68,7 +68,6 @@ exclude_patterns = []

html_theme = "sphinx_rtd_theme"


# Add any paths that contain custom static files (such as style sheets) here,
# relative to this directory. They are copied after the builtin static files,
# so a file named "default.css" will overwrite the builtin "default.css".
@@ -127,6 +127,3 @@ python inference.py --input=<input> --output=<output> --checkpoint_path=<checkpo

1. [Generalized End-to-end Loss for Speaker Verification](https://arxiv.org/pdf/1710.10467.pdf)
2. [Transfer Learning from Speaker Verification to Multispeaker Text-To-Speech Synthesis](https://arxiv.org/pdf/1806.04558.pdf)
@@ -4,7 +4,7 @@

## Model

The model used in this example is the text-independent speaker encoder from [GENERALIZED END-TO-END LOSS FOR SPEAKER VERIFICATION](https://arxiv.org/pdf/1710.10467.pdf), trained with the GE2E softmax loss.
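As a rough illustration of the GE2E softmax loss mentioned above: each utterance embedding is scored against every speaker centroid (using an exclusive centroid for its own speaker), and a softmax cross-entropy over speakers is applied. The NumPy sketch below is only illustrative; the function name and the `w`/`b` defaults are assumptions, not this repository's implementation.

```python
import numpy as np

def ge2e_softmax_loss(embeds, w=10.0, b=-5.0):
    """embeds: [n_speakers, n_utterances, dim] of L2-normalized utterance
    embeddings, with n_utterances >= 2. Illustrative sketch only."""
    n_spk, n_utt, _ = embeds.shape
    centroids = embeds.mean(axis=1)                                    # [n_spk, dim]
    centroids /= np.linalg.norm(centroids, axis=1, keepdims=True)
    # "exclusive" centroid of speaker j, computed without utterance i itself
    excl = (embeds.sum(axis=1, keepdims=True) - embeds) / (n_utt - 1)  # [n_spk, n_utt, dim]
    excl /= np.linalg.norm(excl, axis=2, keepdims=True)
    loss = 0.0
    for j in range(n_spk):
        for i in range(n_utt):
            e = embeds[j, i]
            sims = centroids @ e               # cosine similarity to every speaker centroid
            sims[j] = excl[j, i] @ e           # own speaker uses the exclusive centroid
            logits = w * sims + b              # w, b are the learned scale and bias
            loss += np.log(np.exp(logits).sum()) - logits[j]  # softmax cross-entropy
    return loss / (n_spk * n_utt)
```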
## Directory Structure

@@ -122,6 +122,3 @@ python inference.py --input=<input> --output=<output> --checkpoint_path=<checkpo

1. [GENERALIZED END-TO-END LOSS FOR SPEAKER VERIFICATION](https://arxiv.org/pdf/1710.10467.pdf)
2. [Transfer Learning from Speaker Verification to Multispeaker Text-To-Speech Synthesis](https://arxiv.org/pdf/1806.04558.pdf)
@@ -1,3 +1,17 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from pathlib import Path
from warnings import warn
import struct
@ -30,16 +44,18 @@ def normalize_volume(wav,
|
|||
if increase_only and decrease_only:
|
||||
raise ValueError("Both increase only and decrease only are set")
|
||||
dBFS_change = target_dBFS - 10 * np.log10(np.mean(wav**2))
|
||||
if ((dBFS_change < 0 and increase_only)
|
||||
or (dBFS_change > 0 and decrease_only)):
|
||||
if ((dBFS_change < 0 and increase_only) or
|
||||
(dBFS_change > 0 and decrease_only)):
|
||||
return wav
|
||||
gain = 10**(dBFS_change / 20)
|
||||
return wav * gain
|
||||
|
||||
|
||||
def trim_long_silences(wav, vad_window_length: int,
|
||||
def trim_long_silences(wav,
|
||||
vad_window_length: int,
|
||||
vad_moving_average_width: int,
|
||||
vad_max_silence_length: int, sampling_rate: int):
|
||||
vad_max_silence_length: int,
|
||||
sampling_rate: int):
|
||||
"""
|
||||
Ensures that segments without voice in the waveform remain no longer than a
|
||||
threshold determined by the VAD parameters in params.py.
|
||||
|
@ -63,14 +79,15 @@ def trim_long_silences(wav, vad_window_length: int,
|
|||
for window_start in range(0, len(wav), samples_per_window):
|
||||
window_end = window_start + samples_per_window
|
||||
voice_flags.append(
|
||||
vad.is_speech(pcm_wave[window_start * 2:window_end * 2],
|
||||
sample_rate=sampling_rate))
|
||||
vad.is_speech(
|
||||
pcm_wave[window_start * 2:window_end * 2],
|
||||
sample_rate=sampling_rate))
|
||||
voice_flags = np.array(voice_flags)
|
||||
|
||||
# Smooth the voice detection with a moving average
|
||||
def moving_average(array, width):
|
||||
array_padded = np.concatenate((np.zeros(
|
||||
(width - 1) // 2), array, np.zeros(width // 2)))
|
||||
array_padded = np.concatenate((np.zeros((width - 1) // 2), array,
|
||||
np.zeros(width // 2)))
|
||||
ret = np.cumsum(array_padded, dtype=float)
|
||||
ret[width:] = ret[width:] - ret[:-width]
|
||||
return ret[width - 1:] / width
|
||||
|
@ -89,8 +106,8 @@ def trim_long_silences(wav, vad_window_length: int,
|
|||
def compute_partial_slices(n_samples: int,
|
||||
partial_utterance_n_frames: int,
|
||||
hop_length: int,
|
||||
min_pad_coverage: float = 0.75,
|
||||
overlap: float = 0.5):
|
||||
min_pad_coverage: float=0.75,
|
||||
overlap: float=0.5):
|
||||
"""
|
||||
Computes where to split an utterance waveform and its corresponding mel spectrogram to obtain
|
||||
partial utterances of <partial_utterance_n_frames> each. Both the waveform and the mel
|
||||
|
@ -121,8 +138,8 @@ def compute_partial_slices(n_samples: int,
|
|||
# librosa's function to compute num_frames from num_samples
|
||||
n_frames = int(np.ceil((n_samples + 1) / hop_length))
|
||||
# frame shift between ajacent partials
|
||||
frame_step = max(1,
|
||||
int(np.round(partial_utterance_n_frames * (1 - overlap))))
|
||||
frame_step = max(
|
||||
1, int(np.round(partial_utterance_n_frames * (1 - overlap))))
|
||||
|
||||
# Compute the slices
|
||||
wav_slices, mel_slices = [], []
|
||||
|
@ -135,8 +152,8 @@ def compute_partial_slices(n_samples: int,
|
|||
|
||||
# Evaluate whether extra padding is warranted or not
|
||||
last_wav_range = wav_slices[-1]
|
||||
coverage = (n_samples - last_wav_range.start) / (last_wav_range.stop -
|
||||
last_wav_range.start)
|
||||
coverage = (n_samples - last_wav_range.start) / (
|
||||
last_wav_range.stop - last_wav_range.start)
|
||||
if coverage < min_pad_coverage and len(mel_slices) > 1:
|
||||
mel_slices = mel_slices[:-1]
|
||||
wav_slices = wav_slices[:-1]
|
||||
|
@ -155,8 +172,8 @@ class SpeakerVerificationPreprocessor(object):
|
|||
mel_window_step,
|
||||
n_mels,
|
||||
partial_n_frames: int,
|
||||
min_pad_coverage: float = 0.75,
|
||||
partial_overlap_ratio: float = 0.5):
|
||||
min_pad_coverage: float=0.75,
|
||||
partial_overlap_ratio: float=0.5):
|
||||
self.sampling_rate = sampling_rate
|
||||
self.audio_norm_target_dBFS = audio_norm_target_dBFS
|
||||
|
||||
|
@ -184,24 +201,23 @@ class SpeakerVerificationPreprocessor(object):
|
|||
wav = librosa.resample(wav, source_sr, self.sampling_rate)
|
||||
|
||||
# loudness normalization
|
||||
wav = normalize_volume(wav,
|
||||
self.audio_norm_target_dBFS,
|
||||
increase_only=True)
|
||||
wav = normalize_volume(
|
||||
wav, self.audio_norm_target_dBFS, increase_only=True)
|
||||
|
||||
# trim long silence
|
||||
if webrtcvad:
|
||||
wav = trim_long_silences(wav, self.vad_window_length,
|
||||
self.vad_moving_average_width,
|
||||
self.vad_max_silence_length,
|
||||
self.sampling_rate)
|
||||
wav = trim_long_silences(
|
||||
wav, self.vad_window_length, self.vad_moving_average_width,
|
||||
self.vad_max_silence_length, self.sampling_rate)
|
||||
return wav
|
||||
|
||||
def melspectrogram(self, wav):
|
||||
mel = librosa.feature.melspectrogram(wav,
|
||||
sr=self.sampling_rate,
|
||||
n_fft=self.n_fft,
|
||||
hop_length=self.hop_length,
|
||||
n_mels=self.n_mels)
|
||||
mel = librosa.feature.melspectrogram(
|
||||
wav,
|
||||
sr=self.sampling_rate,
|
||||
n_fft=self.n_fft,
|
||||
hop_length=self.hop_length,
|
||||
n_mels=self.n_mels)
|
||||
mel = mel.astype(np.float32).T
|
||||
return mel
|
||||
|
||||
|
|
|
@ -1,3 +1,17 @@
|
|||
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from yacs.config import CfgNode
|
||||
|
||||
_C = CfgNode()
|
||||
|
|
|
@ -1,3 +1,17 @@
|
|||
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from functools import partial
|
||||
from typing import List
|
||||
from pathlib import Path
|
||||
|
@ -29,7 +43,7 @@ def _process_speaker(speaker_dir: Path,
|
|||
datasets_root: Path,
|
||||
output_dir: Path,
|
||||
pattern: str,
|
||||
skip_existing: bool = False):
|
||||
skip_existing: bool=False):
|
||||
# datastes root: a reference path to compute speaker_name
|
||||
# we prepand dataset name to speaker_id becase we are mixing serveal
|
||||
# multispeaker datasets together
|
||||
|
@ -67,24 +81,25 @@ def _process_dataset(processor: SpeakerVerificationPreprocessor,
|
|||
dataset_name: str,
|
||||
output_dir: Path,
|
||||
pattern: str,
|
||||
skip_existing: bool = False):
|
||||
skip_existing: bool=False):
|
||||
print(
|
||||
f"{dataset_name}: Preprocessing data for {len(speaker_dirs)} speakers."
|
||||
)
|
||||
f"{dataset_name}: Preprocessing data for {len(speaker_dirs)} speakers.")
|
||||
|
||||
_func = partial(_process_speaker,
|
||||
processor=processor,
|
||||
datasets_root=datasets_root,
|
||||
output_dir=output_dir,
|
||||
pattern=pattern,
|
||||
skip_existing=skip_existing)
|
||||
_func = partial(
|
||||
_process_speaker,
|
||||
processor=processor,
|
||||
datasets_root=datasets_root,
|
||||
output_dir=output_dir,
|
||||
pattern=pattern,
|
||||
skip_existing=skip_existing)
|
||||
|
||||
with mp.Pool(16) as pool:
|
||||
list(
|
||||
tqdm(pool.imap(_func, speaker_dirs),
|
||||
dataset_name,
|
||||
len(speaker_dirs),
|
||||
unit="speakers"))
|
||||
tqdm(
|
||||
pool.imap(_func, speaker_dirs),
|
||||
dataset_name,
|
||||
len(speaker_dirs),
|
||||
unit="speakers"))
|
||||
print(f"Done preprocessing {dataset_name}.")
|
||||
|
||||
|
||||
|
|
|
@ -1,3 +1,17 @@
|
|||
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import argparse
|
||||
from pathlib import Path
|
||||
|
||||
|
@ -26,7 +40,9 @@ def embed_utterance(processor, model, fpath_or_wav):
|
|||
return embed
|
||||
|
||||
|
||||
def _process_utterance(ifpath: Path, input_dir: Path, output_dir: Path,
|
||||
def _process_utterance(ifpath: Path,
|
||||
input_dir: Path,
|
||||
output_dir: Path,
|
||||
processor: SpeakerVerificationPreprocessor,
|
||||
model: LSTMSpeakerEncoder):
|
||||
rel_path = ifpath.relative_to(input_dir)
|
||||
|
@ -62,8 +78,7 @@ def main(config, args):
|
|||
n_mels=c.n_mels,
|
||||
partial_n_frames=c.partial_n_frames,
|
||||
min_pad_coverage=c.min_pad_coverage,
|
||||
partial_overlap_ratio=c.min_pad_coverage,
|
||||
)
|
||||
partial_overlap_ratio=c.min_pad_coverage, )
|
||||
|
||||
# input output preparation
|
||||
input_dir = Path(args.input).expanduser()
|
||||
|
@ -83,34 +98,34 @@ if __name__ == "__main__":
|
|||
"--config",
|
||||
metavar="FILE",
|
||||
help="path of the config file to overwrite to default config with.")
|
||||
parser.add_argument("--input",
|
||||
type=str,
|
||||
help="path of the audio_file folder.")
|
||||
parser.add_argument("--pattern",
|
||||
type=str,
|
||||
default="*.wav",
|
||||
help="pattern to filter audio files.")
|
||||
parser.add_argument("--output",
|
||||
metavar="OUTPUT_DIR",
|
||||
help="path to save checkpoint and logs.")
|
||||
parser.add_argument(
|
||||
"--input", type=str, help="path of the audio_file folder.")
|
||||
parser.add_argument(
|
||||
"--pattern",
|
||||
type=str,
|
||||
default="*.wav",
|
||||
help="pattern to filter audio files.")
|
||||
parser.add_argument(
|
||||
"--output",
|
||||
metavar="OUTPUT_DIR",
|
||||
help="path to save checkpoint and logs.")
|
||||
|
||||
# load from saved checkpoint
|
||||
parser.add_argument("--checkpoint_path",
|
||||
type=str,
|
||||
help="path of the checkpoint to load")
|
||||
parser.add_argument(
|
||||
"--checkpoint_path", type=str, help="path of the checkpoint to load")
|
||||
|
||||
# running
|
||||
parser.add_argument("--device",
|
||||
type=str,
|
||||
choices=["cpu", "gpu"],
|
||||
help="device type to use, cpu and gpu are supported.")
|
||||
parser.add_argument(
|
||||
"--device",
|
||||
type=str,
|
||||
choices=["cpu", "gpu"],
|
||||
help="device type to use, cpu and gpu are supported.")
|
||||
|
||||
# overwrite extra config and default config
|
||||
parser.add_argument(
|
||||
"--opts",
|
||||
nargs=argparse.REMAINDER,
|
||||
help=
|
||||
"options to overwrite --config file and the default config, passing in KEY VALUE pairs"
|
||||
help="options to overwrite --config file and the default config, passing in KEY VALUE pairs"
|
||||
)
|
||||
|
||||
args = parser.parse_args()
|
||||
|
|
|
@ -1,3 +1,17 @@
|
|||
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import argparse
|
||||
from pathlib import Path
|
||||
from config import get_cfg_defaults
|
||||
|
@ -12,25 +26,21 @@ if __name__ == "__main__":
|
|||
parser.add_argument(
|
||||
"--datasets_root",
|
||||
type=Path,
|
||||
help=
|
||||
"Path to the directory containing your LibriSpeech, LibriTTS and VoxCeleb datasets."
|
||||
help="Path to the directory containing your LibriSpeech, LibriTTS and VoxCeleb datasets."
|
||||
)
|
||||
parser.add_argument("--output_dir",
|
||||
type=Path,
|
||||
help="Path to save processed dataset.")
|
||||
parser.add_argument(
|
||||
"--output_dir", type=Path, help="Path to save processed dataset.")
|
||||
parser.add_argument(
|
||||
"--dataset_names",
|
||||
type=str,
|
||||
default="librispeech_other,voxceleb1,voxceleb2",
|
||||
help=
|
||||
"comma-separated list of names of the datasets you want to preprocess. only "
|
||||
help="comma-separated list of names of the datasets you want to preprocess. only "
|
||||
"the train set of these datastes will be used. Possible names: librispeech_other, "
|
||||
"voxceleb1, voxceleb2, aidatatang_200zh, magicdata.")
|
||||
parser.add_argument(
|
||||
"--skip_existing",
|
||||
action="store_true",
|
||||
help=
|
||||
"Whether to skip ouput files with the same name. Useful if this script was interrupted."
|
||||
help="Whether to skip ouput files with the same name. Useful if this script was interrupted."
|
||||
)
|
||||
parser.add_argument(
|
||||
"--no_trim",
|
||||
|
@ -74,8 +84,7 @@ if __name__ == "__main__":
|
|||
n_mels=c.n_mels,
|
||||
partial_n_frames=c.partial_n_frames,
|
||||
min_pad_coverage=c.min_pad_coverage,
|
||||
partial_overlap_ratio=c.min_pad_coverage,
|
||||
)
|
||||
partial_overlap_ratio=c.min_pad_coverage, )
|
||||
|
||||
preprocess_func = {
|
||||
"librispeech_other": process_librispeech,
|
||||
|
|
|
@ -1,3 +1,17 @@
|
|||
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import random
|
||||
|
||||
|
||||
|
|
|
@ -1,3 +1,17 @@
|
|||
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import random
|
||||
from pathlib import Path
|
||||
|
||||
|
@ -22,6 +36,7 @@ class MultiSpeakerMelDataset(Dataset):
|
|||
utterance2.npy
|
||||
utterance3.npy
|
||||
"""
|
||||
|
||||
def __init__(self, dataset_root: Path):
|
||||
self.root = Path(dataset_root).expanduser()
|
||||
speaker_dirs = [f for f in self.root.glob("*") if f.is_dir()]
|
||||
|
@ -57,8 +72,11 @@ class MultiSpeakerSampler(BatchSampler):
|
|||
First, N speakers from all speakers are sampled randomly. Then, for each
|
||||
speaker, randomly sample M utterances from their corresponding utterances.
|
||||
"""
|
||||
def __init__(self, dataset: MultiSpeakerMelDataset,
|
||||
speakers_per_batch: int, utterances_per_speaker: int):
|
||||
|
||||
def __init__(self,
|
||||
dataset: MultiSpeakerMelDataset,
|
||||
speakers_per_batch: int,
|
||||
utterances_per_speaker: int):
|
||||
self._speakers = list(dataset.speaker_dirs)
|
||||
self._speaker_to_utterances = dataset.speaker_to_utterances
|
||||
|
||||
|
|
|
@ -1,3 +1,17 @@
|
|||
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import time
|
||||
|
||||
from paddle import distributed as dist
|
||||
|
@ -22,9 +36,10 @@ class Ge2eExperiment(ExperimentBase):
|
|||
model = LSTMSpeakerEncoder(config.data.n_mels, config.model.num_layers,
|
||||
config.model.hidden_size,
|
||||
config.model.embedding_size)
|
||||
optimizer = Adam(config.training.learning_rate_init,
|
||||
parameters=model.parameters(),
|
||||
grad_clip=ClipGradByGlobalNorm(3))
|
||||
optimizer = Adam(
|
||||
config.training.learning_rate_init,
|
||||
parameters=model.parameters(),
|
||||
grad_clip=ClipGradByGlobalNorm(3))
|
||||
self.model = DataParallel(model) if self.parallel else model
|
||||
self.model_core = model
|
||||
self.optimizer = optimizer
|
||||
|
@ -35,11 +50,11 @@ class Ge2eExperiment(ExperimentBase):
|
|||
sampler = MultiSpeakerSampler(train_dataset,
|
||||
config.training.speakers_per_batch,
|
||||
config.training.utterances_per_speaker)
|
||||
train_loader = DataLoader(train_dataset,
|
||||
batch_sampler=sampler,
|
||||
collate_fn=Collate(
|
||||
config.data.partial_n_frames),
|
||||
num_workers=16)
|
||||
train_loader = DataLoader(
|
||||
train_dataset,
|
||||
batch_sampler=sampler,
|
||||
collate_fn=Collate(config.data.partial_n_frames),
|
||||
num_workers=16)
|
||||
|
||||
self.train_dataset = train_dataset
|
||||
self.train_loader = train_loader
|
||||
|
@ -72,8 +87,8 @@ class Ge2eExperiment(ExperimentBase):
|
|||
self.iteration)
|
||||
self.visualizer.add_scalar("train/eer", eer, self.iteration)
|
||||
self.visualizer.add_scalar(
|
||||
"param/w", float(self.model_core.similarity_weight),
|
||||
self.iteration)
|
||||
"param/w",
|
||||
float(self.model_core.similarity_weight), self.iteration)
|
||||
self.visualizer.add_scalar("param/b",
|
||||
float(self.model_core.similarity_bias),
|
||||
self.iteration)
|
||||
|
|
|
@@ -87,7 +87,6 @@ Pretrained Models can be downloaded from links below. We provide 2 models with d
2. This model does not have a stop token predictor. It uses the attention peak position to decide whether all the contents have been uttered. Also, guided attention loss is used to speed up training. This model is trained with `configs/alternative.yaml`. [tacotron2_ljspeech_ckpt_0.3_alternative.zip](https://paddlespeech.bj.bcebos.com/Parakeet/tacotron2_ljspeech_ckpt_0.3_alternative.zip)
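For reference, guided attention loss penalizes attention weights that lie far from the diagonal between normalized text positions and decoder steps. The sketch below shows one common formulation; the function names, the normalization details, and the `sigma` default are assumptions for illustration and may differ from the loss actually used in this example.

```python
import numpy as np

def guided_attention_weight(n_text, n_frames, sigma=0.2):
    """Soft diagonal penalty mask W[t, n]: near-diagonal positions cost ~0."""
    n = np.arange(n_text) / max(n_text - 1, 1)      # normalized text positions
    t = np.arange(n_frames) / max(n_frames - 1, 1)  # normalized decoder steps
    return 1.0 - np.exp(-((t[:, None] - n[None, :]) ** 2) / (2 * sigma ** 2))

def guided_attention_loss(attention, sigma=0.2):
    """attention: [n_frames, n_text] alignment weights from the decoder."""
    w = guided_attention_weight(attention.shape[1], attention.shape[0], sigma)
    return float((attention * w).mean())
```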

## Notebook: End-to-end TTS

See [synthesize.ipynb](./synthesize.ipynb) for details about end-to-end TTS with tacotron2 and waveflow.
@ -32,16 +32,14 @@ _C.data = CN(
|
|||
_C.model = CN(
|
||||
dict(
|
||||
vocab_size=37, # set this according to the frontend's vocab size
|
||||
n_tones=None,
|
||||
n_tones=None,
|
||||
reduction_factor=1, # reduction factor
|
||||
d_encoder=512, # embedding & encoder's internal size
|
||||
encoder_conv_layers=3, # number of conv layer in tacotron2 encoder
|
||||
encoder_kernel_size=5, # kernel size of conv layers in tacotron2 encoder
|
||||
d_prenet=256, # hidden size of decoder prenet
|
||||
d_attention_rnn=
|
||||
1024, # hidden size of the first rnn layer in tacotron2 decoder
|
||||
d_decoder_rnn=
|
||||
1024, # hidden size of the second rnn layer in tacotron2 decoder
|
||||
d_attention_rnn=1024, # hidden size of the first rnn layer in tacotron2 decoder
|
||||
d_decoder_rnn=1024, # hidden size of the second rnn layer in tacotron2 decoder
|
||||
d_attention=128, # hidden size of decoder location linear layer
|
||||
attention_filters=32, # number of filter in decoder location conv layer
|
||||
attention_kernel_size=31, # kernel size of decoder location conv layer
|
||||
|
@ -50,14 +48,11 @@ _C.model = CN(
|
|||
postnet_conv_layers=5, # number of conv layer in decoder postnet
|
||||
p_encoder_dropout=0.5, # droput probability in encoder
|
||||
p_prenet_dropout=0.5, # droput probability in decoder prenet
|
||||
p_attention_dropout=
|
||||
0.1, # droput probability of first rnn layer in decoder
|
||||
p_decoder_dropout=
|
||||
0.1, # droput probability of second rnn layer in decoder
|
||||
p_attention_dropout=0.1, # droput probability of first rnn layer in decoder
|
||||
p_decoder_dropout=0.1, # droput probability of second rnn layer in decoder
|
||||
p_postnet_dropout=0.5, # droput probability in decoder postnet
|
||||
d_global_condition=None,
|
||||
use_stop_token=
|
||||
True, # wherther to use binary classifier to predict when to stop
|
||||
use_stop_token=True, # wherther to use binary classifier to predict when to stop
|
||||
use_guided_attention_loss=False, # whether to use guided attention loss
|
||||
guided_attention_loss_sigma=0.2 # sigma in guided attention loss
|
||||
))
|
||||
|
|
|
@ -23,6 +23,7 @@ from parakeet.data.batch import batch_spec, batch_text_id
|
|||
|
||||
class LJSpeech(Dataset):
|
||||
"""A simple dataset adaptor for the processed ljspeech dataset."""
|
||||
|
||||
def __init__(self, root):
|
||||
self.root = Path(root).expanduser()
|
||||
records = []
|
||||
|
@ -44,9 +45,8 @@ class LJSpeech(Dataset):
|
|||
|
||||
class LJSpeechCollector(object):
|
||||
"""A simple callable to batch LJSpeech examples."""
|
||||
def __init__(self,
|
||||
padding_idx=0,
|
||||
padding_value=0.,
|
||||
|
||||
def __init__(self, padding_idx=0, padding_value=0.,
|
||||
padding_stop_token=1.0):
|
||||
self.padding_idx = padding_idx
|
||||
self.padding_value = padding_value
|
||||
|
@ -68,16 +68,19 @@ class LJSpeechCollector(object):
|
|||
|
||||
# Sort by text_len in descending order
|
||||
texts = [
|
||||
i for i, _ in sorted(
|
||||
i
|
||||
for i, _ in sorted(
|
||||
zip(texts, text_lens), key=lambda x: x[1], reverse=True)
|
||||
]
|
||||
mels = [
|
||||
i for i, _ in sorted(
|
||||
i
|
||||
for i, _ in sorted(
|
||||
zip(mels, text_lens), key=lambda x: x[1], reverse=True)
|
||||
]
|
||||
|
||||
mel_lens = [
|
||||
i for i, _ in sorted(
|
||||
i
|
||||
for i, _ in sorted(
|
||||
zip(mel_lens, text_lens), key=lambda x: x[1], reverse=True)
|
||||
]
|
||||
|
||||
|
|
|
@ -35,13 +35,14 @@ def create_dataset(config, source_path, target_path, verbose=False):
|
|||
|
||||
meta_data = LJSpeechMetaData(source_path)
|
||||
frontend = EnglishCharacter()
|
||||
processor = AudioProcessor(sample_rate=config.data.sample_rate,
|
||||
n_fft=config.data.n_fft,
|
||||
n_mels=config.data.n_mels,
|
||||
win_length=config.data.win_length,
|
||||
hop_length=config.data.hop_length,
|
||||
fmax=config.data.fmax,
|
||||
fmin=config.data.fmin)
|
||||
processor = AudioProcessor(
|
||||
sample_rate=config.data.sample_rate,
|
||||
n_fft=config.data.n_fft,
|
||||
n_mels=config.data.n_mels,
|
||||
win_length=config.data.win_length,
|
||||
hop_length=config.data.hop_length,
|
||||
fmax=config.data.fmax,
|
||||
fmin=config.data.fmin)
|
||||
normalizer = LogMagnitude()
|
||||
|
||||
records = []
|
||||
|
@ -70,26 +71,22 @@ def create_dataset(config, source_path, target_path, verbose=False):
|
|||
|
||||
if __name__ == "__main__":
|
||||
parser = argparse.ArgumentParser(description="create dataset")
|
||||
parser.add_argument("--config",
|
||||
type=str,
|
||||
metavar="FILE",
|
||||
help="extra config to overwrite the default config")
|
||||
parser.add_argument("--input",
|
||||
type=str,
|
||||
help="path of the ljspeech dataset")
|
||||
parser.add_argument("--output",
|
||||
type=str,
|
||||
help="path to save output dataset")
|
||||
parser.add_argument(
|
||||
"--config",
|
||||
type=str,
|
||||
metavar="FILE",
|
||||
help="extra config to overwrite the default config")
|
||||
parser.add_argument(
|
||||
"--input", type=str, help="path of the ljspeech dataset")
|
||||
parser.add_argument(
|
||||
"--output", type=str, help="path to save output dataset")
|
||||
parser.add_argument(
|
||||
"--opts",
|
||||
nargs=argparse.REMAINDER,
|
||||
help=
|
||||
"options to overwrite --config file and the default config, passing in KEY VALUE pairs"
|
||||
help="options to overwrite --config file and the default config, passing in KEY VALUE pairs"
|
||||
)
|
||||
parser.add_argument("-v",
|
||||
"--verbose",
|
||||
action="store_true",
|
||||
help="print msg")
|
||||
parser.add_argument(
|
||||
"-v", "--verbose", action="store_true", help="print msg")
|
||||
|
||||
config = get_cfg_defaults()
|
||||
args = parser.parse_args()
|
||||
|
|
|
@ -65,29 +65,24 @@ if __name__ == "__main__":
|
|||
|
||||
parser = argparse.ArgumentParser(
|
||||
description="generate mel spectrogram with TransformerTTS.")
|
||||
parser.add_argument("--config",
|
||||
type=str,
|
||||
metavar="FILE",
|
||||
help="extra config to overwrite the default config")
|
||||
parser.add_argument("--checkpoint_path",
|
||||
type=str,
|
||||
help="path of the checkpoint to load.")
|
||||
parser.add_argument(
|
||||
"--config",
|
||||
type=str,
|
||||
metavar="FILE",
|
||||
help="extra config to overwrite the default config")
|
||||
parser.add_argument(
|
||||
"--checkpoint_path", type=str, help="path of the checkpoint to load.")
|
||||
parser.add_argument("--input", type=str, help="path of the text sentences")
|
||||
parser.add_argument("--output", type=str, help="path to save outputs")
|
||||
parser.add_argument("--device",
|
||||
type=str,
|
||||
default="cpu",
|
||||
help="device type to use.")
|
||||
parser.add_argument(
|
||||
"--device", type=str, default="cpu", help="device type to use.")
|
||||
parser.add_argument(
|
||||
"--opts",
|
||||
nargs=argparse.REMAINDER,
|
||||
help=
|
||||
"options to overwrite --config file and the default config, passing in KEY VALUE pairs"
|
||||
help="options to overwrite --config file and the default config, passing in KEY VALUE pairs"
|
||||
)
|
||||
parser.add_argument("-v",
|
||||
"--verbose",
|
||||
action="store_true",
|
||||
help="print msg")
|
||||
parser.add_argument(
|
||||
"-v", "--verbose", action="store_true", help="print msg")
|
||||
|
||||
args = parser.parse_args()
|
||||
if args.config:
|
||||
|
|
|
@ -98,9 +98,8 @@ class Experiment(ExperimentBase):
|
|||
display.plot_spectrogram(mels[0].numpy().T), self.iteration)
|
||||
self.visualizer.add_figure(
|
||||
f"valid_sentence_{i}_predicted_spectrogram",
|
||||
display.plot_spectrogram(
|
||||
outputs['mel_outputs_postnet'][0].numpy().T),
|
||||
self.iteration)
|
||||
display.plot_spectrogram(outputs['mel_outputs_postnet'][0]
|
||||
.numpy().T), self.iteration)
|
||||
|
||||
# write visual log
|
||||
valid_losses = {k: np.mean(v) for k, v in valid_losses.items()}
|
||||
|
@ -169,26 +168,27 @@ class Experiment(ExperimentBase):
|
|||
batch_fn = LJSpeechCollector(padding_idx=config.data.padding_idx)
|
||||
|
||||
if not self.parallel:
|
||||
self.train_loader = DataLoader(train_set,
|
||||
batch_size=config.data.batch_size,
|
||||
shuffle=True,
|
||||
drop_last=True,
|
||||
collate_fn=batch_fn)
|
||||
self.train_loader = DataLoader(
|
||||
train_set,
|
||||
batch_size=config.data.batch_size,
|
||||
shuffle=True,
|
||||
drop_last=True,
|
||||
collate_fn=batch_fn)
|
||||
else:
|
||||
sampler = DistributedBatchSampler(
|
||||
train_set,
|
||||
batch_size=config.data.batch_size,
|
||||
shuffle=True,
|
||||
drop_last=True)
|
||||
self.train_loader = DataLoader(train_set,
|
||||
batch_sampler=sampler,
|
||||
collate_fn=batch_fn)
|
||||
self.train_loader = DataLoader(
|
||||
train_set, batch_sampler=sampler, collate_fn=batch_fn)
|
||||
|
||||
self.valid_loader = DataLoader(valid_set,
|
||||
batch_size=config.data.batch_size,
|
||||
shuffle=False,
|
||||
drop_last=False,
|
||||
collate_fn=batch_fn)
|
||||
self.valid_loader = DataLoader(
|
||||
valid_set,
|
||||
batch_size=config.data.batch_size,
|
||||
shuffle=False,
|
||||
drop_last=False,
|
||||
collate_fn=batch_fn)
|
||||
|
||||
|
||||
def main_sp(config, args):
|
||||
|
|
|
@@ -80,7 +80,7 @@ input is the folder of processed audio and output is the folder for the output spectrograms
Run the script to start training.

```python
python train.py --data=<data> --output=<output> --device="gpu"
```

Our model removes the stop token prediction used in the tacotron2 model. In practice, stop token prediction is a severely imbalanced classification problem: a sentence may contribute several hundred frames of negative examples but only a single positive frame, and the prediction is also very sensitive to how silence is trimmed from the audio. Instead, we stop decoding once the attention peak reaches the last symbol on the encoder side.
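A stop condition of this kind can be sketched in a few lines: decoding ends once the attention peak has rested on the last encoder symbol for a few consecutive frames. This is only an illustration of the idea; the `patience` parameter and the function name are assumptions, not the code used in this example.

```python
import numpy as np

def should_stop(attention_weights, patience=3):
    """attention_weights: [n_frames_so_far, n_text]; stop when the attention
    peak has stayed on the last text position for `patience` frames."""
    if attention_weights.shape[0] < patience:
        return False
    last_symbol = attention_weights.shape[1] - 1
    recent_peaks = attention_weights[-patience:].argmax(axis=1)
    return bool(np.all(recent_peaks == last_symbol))
```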
@@ -90,7 +90,7 @@ python train.py --data=<data> --output=<output> --device="gpu"
You can use visualdl to inspect the training logs.

```bash
visualdl --logdir=<output> --host=$HOSTNAME
```

Example training loss / validation loss curves are shown below.

@@ -109,4 +109,4 @@ visualdl --logdir=<output> --host=$HOSTNAME

## Usage

This example also includes a simple usage demo: you can replace the reference audio and the text, and synthesize speech with the trained model. See the instructions in the [notebook](./voice_cloning.ipynb) for how to use it.
@ -1,3 +1,17 @@
|
|||
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import pickle
|
||||
from pathlib import Path
|
||||
|
||||
|
@ -16,6 +30,7 @@ print("vocab_tones:\n", voc_tones)
|
|||
|
||||
class AiShell3(Dataset):
|
||||
"""Processed AiShell3 dataset."""
|
||||
|
||||
def __init__(self, root):
|
||||
super().__init__()
|
||||
self.root = Path(root).expanduser()
|
||||
|
@ -31,10 +46,10 @@ class AiShell3(Dataset):
|
|||
speaker_id = sentence_id[:7]
|
||||
phones = metadatum["phones"]
|
||||
tones = metadatum["tones"]
|
||||
phones = np.array([voc_phones.lookup(item) for item in phones],
|
||||
dtype=np.int64)
|
||||
tones = np.array([voc_tones.lookup(item) for item in tones],
|
||||
dtype=np.int64)
|
||||
phones = np.array(
|
||||
[voc_phones.lookup(item) for item in phones], dtype=np.int64)
|
||||
tones = np.array(
|
||||
[voc_tones.lookup(item) for item in tones], dtype=np.int64)
|
||||
mel = np.load(str(self.mel_dir / speaker_id / (sentence_id + ".npy")))
|
||||
embed = np.load(
|
||||
str(self.embed_dir / speaker_id / (sentence_id + ".npy")))
|
||||
|
@ -50,8 +65,8 @@ def collate_aishell3_examples(examples):
|
|||
text_lengths = np.array([item.shape[0] for item in phones], dtype=np.int64)
|
||||
spec_lengths = np.array([item.shape[1] for item in mel], dtype=np.int64)
|
||||
T_dec = np.max(spec_lengths)
|
||||
stop_tokens = (np.arange(T_dec) >= np.expand_dims(spec_lengths,
|
||||
-1)).astype(np.float32)
|
||||
stop_tokens = (np.arange(T_dec) >= np.expand_dims(spec_lengths, -1)
|
||||
).astype(np.float32)
|
||||
phones, _ = batch_text_id(phones)
|
||||
tones, _ = batch_text_id(tones)
|
||||
mel, _ = batch_spec(mel)
|
||||
|
|
|
@ -1,3 +1,17 @@
|
|||
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from typing import List, Tuple
|
||||
|
||||
from chinese_text_to_pinyin import convert_to_pinyin
|
||||
|
|
|
@ -1,3 +1,16 @@
|
|||
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
"""
|
||||
A pinyin to phone transcription system for chinese.
|
||||
Syllables are splited as initial and final. 'er' is also treated as s special symbol.
|
||||
|
@ -96,9 +109,8 @@ def convert(syllable):
|
|||
syllable = syllable.replace("ing", "ieng").replace("in", "ien")
|
||||
|
||||
# expansion for un, ui, iu
|
||||
syllable = syllable.replace("un",
|
||||
"uen").replace("ui",
|
||||
"uei").replace("iu", "iou")
|
||||
syllable = syllable.replace("un", "uen").replace(
|
||||
"ui", "uei").replace("iu", "iou")
|
||||
|
||||
# rule for variants of i
|
||||
syllable = syllable.replace("zi", "zii").replace("ci", "cii").replace("si", "sii")\
|
||||
|
|
|
@ -1,3 +1,17 @@
|
|||
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from typing import List
|
||||
|
||||
from pypinyin import lazy_pinyin, Style
|
||||
|
@ -7,7 +21,6 @@ def convert_to_pinyin(text: str) -> List[str]:
|
|||
"""convert text into list of syllables, other characters that are not chinese, thus
|
||||
cannot be converted to pinyin are splited.
|
||||
"""
|
||||
syllables = lazy_pinyin(text,
|
||||
style=Style.TONE3,
|
||||
neutral_tone_with_five=True)
|
||||
syllables = lazy_pinyin(
|
||||
text, style=Style.TONE3, neutral_tone_with_five=True)
|
||||
return syllables
|
||||
|
|
|
@ -62,8 +62,7 @@ _C.model = CN(
|
|||
# whether to use a classifier to predict stop probability
|
||||
use_stop_token=False,
|
||||
# whether to use guided attention loss in training
|
||||
use_guided_attention_loss=True,
|
||||
))
|
||||
use_guided_attention_loss=True, ))
|
||||
|
||||
_C.training = CN(
|
||||
dict(
|
||||
|
|
|
@ -1,3 +1,17 @@
|
|||
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import argparse
|
||||
import multiprocessing as mp
|
||||
from functools import partial
|
||||
|
@ -12,8 +26,11 @@ import tqdm
|
|||
from config import get_cfg_defaults
|
||||
|
||||
|
||||
def extract_mel(fname: Path, input_dir: Path, output_dir: Path,
|
||||
p: AudioProcessor, n: NormalizerBase):
|
||||
def extract_mel(fname: Path,
|
||||
input_dir: Path,
|
||||
output_dir: Path,
|
||||
p: AudioProcessor,
|
||||
n: NormalizerBase):
|
||||
relative_path = fname.relative_to(input_dir)
|
||||
out_path = (output_dir / relative_path).with_suffix(".npy")
|
||||
out_path.parent.mkdir(parents=True, exist_ok=True)
|
||||
|
@ -34,41 +51,37 @@ def extract_mel_multispeaker(config, input_dir, output_dir, extension=".wav"):
|
|||
config.fmax)
|
||||
n = LogMagnitude(1e-5)
|
||||
|
||||
func = partial(extract_mel,
|
||||
input_dir=input_dir,
|
||||
output_dir=output_dir,
|
||||
p=p,
|
||||
n=n)
|
||||
func = partial(
|
||||
extract_mel, input_dir=input_dir, output_dir=output_dir, p=p, n=n)
|
||||
|
||||
with mp.Pool(16) as pool:
|
||||
list(
|
||||
tqdm.tqdm(pool.imap(func, fnames),
|
||||
total=len(fnames),
|
||||
unit="utterance"))
|
||||
tqdm.tqdm(
|
||||
pool.imap(func, fnames), total=len(fnames), unit="utterance"))
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
parser = argparse.ArgumentParser(
|
||||
description=
|
||||
"Extract mel spectrogram from processed wav in AiShell3 training dataset."
|
||||
description="Extract mel spectrogram from processed wav in AiShell3 training dataset."
|
||||
)
|
||||
parser.add_argument(
|
||||
"--config",
|
||||
type=str,
|
||||
help="yaml config file to overwrite the default config")
|
||||
parser.add_argument("--input",
|
||||
type=str,
|
||||
default="~/datasets/aishell3/train/normalized_wav",
|
||||
help="path of the processed wav folder")
|
||||
parser.add_argument("--output",
|
||||
type=str,
|
||||
default="~/datasets/aishell3/train/mel",
|
||||
help="path of the folder to save mel spectrograms")
|
||||
parser.add_argument(
|
||||
"--input",
|
||||
type=str,
|
||||
default="~/datasets/aishell3/train/normalized_wav",
|
||||
help="path of the processed wav folder")
|
||||
parser.add_argument(
|
||||
"--output",
|
||||
type=str,
|
||||
default="~/datasets/aishell3/train/mel",
|
||||
help="path of the folder to save mel spectrograms")
|
||||
parser.add_argument(
|
||||
"--opts",
|
||||
nargs=argparse.REMAINDER,
|
||||
help=
|
||||
"options to overwrite --config file and the default config, passing in KEY VALUE pairs"
|
||||
help="options to overwrite --config file and the default config, passing in KEY VALUE pairs"
|
||||
)
|
||||
default_config = get_cfg_defaults()
|
||||
|
||||
|
|
|
@ -1,3 +1,17 @@
|
|||
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import argparse
|
||||
from pathlib import Path
|
||||
import re
|
||||
|
@ -107,9 +121,8 @@ def convert(syllable):
|
|||
syllable = syllable.replace("ing", "ieng").replace("in", "ien")
|
||||
|
||||
# expansion for un, ui, iu
|
||||
syllable = syllable.replace("un",
|
||||
"uen").replace("ui",
|
||||
"uei").replace("iu", "iou")
|
||||
syllable = syllable.replace("un", "uen").replace(
|
||||
"ui", "uei").replace("iu", "iou")
|
||||
|
||||
# rule for variants of i
|
||||
syllable = syllable.replace("zi", "zii").replace("ci", "cii").replace("si", "sii")\
|
||||
|
@ -218,18 +231,15 @@ def process_aishell3(dataset_root, output_dir):
|
|||
pickle.dump(processed_records, f)
|
||||
|
||||
with open(output_dir / "metadata.yaml", 'wt', encoding="utf-8") as f:
|
||||
yaml.safe_dump(processed_records,
|
||||
f,
|
||||
default_flow_style=None,
|
||||
allow_unicode=True)
|
||||
yaml.safe_dump(
|
||||
processed_records, f, default_flow_style=None, allow_unicode=True)
|
||||
|
||||
print("metadata done!")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
parser = argparse.ArgumentParser(
|
||||
description=
|
||||
"Preprocess transcription of AiShell3 and save them in a compact file(yaml and pickle)."
|
||||
description="Preprocess transcription of AiShell3 and save them in a compact file(yaml and pickle)."
|
||||
)
|
||||
parser.add_argument(
|
||||
"--input",
|
||||
|
|
|
@ -1,3 +1,17 @@
|
|||
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import argparse
|
||||
from pathlib import Path
|
||||
from multiprocessing import Pool
|
||||
|
@ -47,34 +61,36 @@ def preprocess_aishell3(source_dir, target_dir, alignment_dir):
|
|||
|
||||
wav_paths = list(source_dir.rglob("*.wav"))
|
||||
print(f"there are {len(wav_paths)} audio files in total")
|
||||
fx = partial(process_utterance,
|
||||
source_dir=source_dir,
|
||||
target_dir=target_dir,
|
||||
alignment_dir=alignment_dir)
|
||||
fx = partial(
|
||||
process_utterance,
|
||||
source_dir=source_dir,
|
||||
target_dir=target_dir,
|
||||
alignment_dir=alignment_dir)
|
||||
with Pool(16) as p:
|
||||
list(
|
||||
tqdm(p.imap(fx, wav_paths), total=len(wav_paths),
|
||||
unit="utterance"))
|
||||
tqdm(
|
||||
p.imap(fx, wav_paths), total=len(wav_paths), unit="utterance"))
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
parser = argparse.ArgumentParser(
|
||||
description=
|
||||
"Process audio in AiShell3, trim silence according to the alignment "
|
||||
description="Process audio in AiShell3, trim silence according to the alignment "
|
||||
"files generated by MFA, and normalize volume by peak.")
|
||||
parser.add_argument("--input",
|
||||
type=str,
|
||||
default="~/datasets/aishell3/train/wav",
|
||||
help="path of the original audio folder in aishell3.")
|
||||
parser.add_argument(
|
||||
"--input",
|
||||
type=str,
|
||||
default="~/datasets/aishell3/train/wav",
|
||||
help="path of the original audio folder in aishell3.")
|
||||
parser.add_argument(
|
||||
"--output",
|
||||
type=str,
|
||||
default="~/datasets/aishell3/train/normalized_wav",
|
||||
help="path of the folder to save the processed audio files.")
|
||||
parser.add_argument("--alignment",
|
||||
type=str,
|
||||
default="~/datasets/aishell3/train/alignment",
|
||||
help="path of the alignment files.")
|
||||
parser.add_argument(
|
||||
"--alignment",
|
||||
type=str,
|
||||
default="~/datasets/aishell3/train/alignment",
|
||||
help="path of the alignment files.")
|
||||
args = parser.parse_args()
|
||||
|
||||
preprocess_aishell3(args.input, args.output, args.alignment)
|
||||
|
|
|
@ -53,12 +53,13 @@ class Experiment(ExperimentBase):
|
|||
self.optimizer.clear_grad()
|
||||
self.model.train()
|
||||
texts, tones, mels, utterance_embeds, text_lens, output_lens, stop_tokens = batch
|
||||
outputs = self.model(texts,
|
||||
text_lens,
|
||||
mels,
|
||||
output_lens,
|
||||
tones=tones,
|
||||
global_condition=utterance_embeds)
|
||||
outputs = self.model(
|
||||
texts,
|
||||
text_lens,
|
||||
mels,
|
||||
output_lens,
|
||||
tones=tones,
|
||||
global_condition=utterance_embeds)
|
||||
losses = self.compute_losses(batch, outputs)
|
||||
loss = losses["loss"]
|
||||
loss.backward()
|
||||
|
@ -86,12 +87,13 @@ class Experiment(ExperimentBase):
|
|||
valid_losses = defaultdict(list)
|
||||
for i, batch in enumerate(self.valid_loader):
|
||||
texts, tones, mels, utterance_embeds, text_lens, output_lens, stop_tokens = batch
|
||||
outputs = self.model(texts,
|
||||
text_lens,
|
||||
mels,
|
||||
output_lens,
|
||||
tones=tones,
|
||||
global_condition=utterance_embeds)
|
||||
outputs = self.model(
|
||||
texts,
|
||||
text_lens,
|
||||
mels,
|
||||
output_lens,
|
||||
tones=tones,
|
||||
global_condition=utterance_embeds)
|
||||
losses = self.compute_losses(batch, outputs)
|
||||
for key, value in losses.items():
|
||||
valid_losses[key].append(float(value))
|
||||
|
@ -132,9 +134,8 @@ class Experiment(ExperimentBase):
|
|||
mel_dir.mkdir(parents=True, exist_ok=True)
|
||||
for i, batch in enumerate(self.test_loader):
|
||||
texts, tones, mels, utterance_embeds, *_ = batch
|
||||
outputs = self.model.infer(texts,
|
||||
tones=tones,
|
||||
global_condition=utterance_embeds)
|
||||
outputs = self.model.infer(
|
||||
texts, tones=tones, global_condition=utterance_embeds)
|
||||
|
||||
display.plot_alignment(outputs["alignments"][0].numpy().T)
|
||||
plt.savefig(mel_dir / f"sentence_{i}.png")
|
||||
|
@ -168,8 +169,7 @@ class Experiment(ExperimentBase):
|
|||
p_decoder_dropout=config.model.p_decoder_dropout,
|
||||
p_postnet_dropout=config.model.p_postnet_dropout,
|
||||
d_global_condition=config.model.d_global_condition,
|
||||
use_stop_token=config.model.use_stop_token,
|
||||
)
|
||||
use_stop_token=config.model.use_stop_token, )
|
||||
|
||||
if self.parallel:
|
||||
model = paddle.DataParallel(model)
|
||||
|
@ -200,32 +200,34 @@ class Experiment(ExperimentBase):
|
|||
batch_fn = collate_aishell3_examples
|
||||
|
||||
if not self.parallel:
|
||||
self.train_loader = DataLoader(train_set,
|
||||
batch_size=config.data.batch_size,
|
||||
shuffle=True,
|
||||
drop_last=True,
|
||||
collate_fn=batch_fn)
|
||||
self.train_loader = DataLoader(
|
||||
train_set,
|
||||
batch_size=config.data.batch_size,
|
||||
shuffle=True,
|
||||
drop_last=True,
|
||||
collate_fn=batch_fn)
|
||||
else:
|
||||
sampler = DistributedBatchSampler(
|
||||
train_set,
|
||||
batch_size=config.data.batch_size,
|
||||
shuffle=True,
|
||||
drop_last=True)
|
||||
self.train_loader = DataLoader(train_set,
|
||||
batch_sampler=sampler,
|
||||
collate_fn=batch_fn)
|
||||
self.train_loader = DataLoader(
|
||||
train_set, batch_sampler=sampler, collate_fn=batch_fn)
|
||||
|
||||
self.valid_loader = DataLoader(valid_set,
|
||||
batch_size=config.data.batch_size,
|
||||
shuffle=False,
|
||||
drop_last=False,
|
||||
collate_fn=batch_fn)
|
||||
self.valid_loader = DataLoader(
|
||||
valid_set,
|
||||
batch_size=config.data.batch_size,
|
||||
shuffle=False,
|
||||
drop_last=False,
|
||||
collate_fn=batch_fn)
|
||||
|
||||
self.test_loader = DataLoader(valid_set,
|
||||
batch_size=1,
|
||||
shuffle=False,
|
||||
drop_last=False,
|
||||
collate_fn=batch_fn)
|
||||
self.test_loader = DataLoader(
|
||||
valid_set,
|
||||
batch_size=1,
|
||||
shuffle=False,
|
||||
drop_last=False,
|
||||
collate_fn=batch_fn)
|
||||
|
||||
|
||||
def main_sp(config, args):
|
||||
|
|
|
@@ -14,7 +14,7 @@ wget https://data.keithito.com/data/speech/LJSpeech-1.1.tar.bz2
tar xjvf LJSpeech-1.1.tar.bz2
```

### Preprocess the dataset.

Assume the path to save the preprocessed dataset is `ljspeech_transformer_tts`. Run the command below to preprocess the dataset.

@@ -49,4 +49,4 @@ python synthesize.py --input=sentence.txt --output=mels/ --checkpoint_path='step

## Pretrained Model

Pretrained model can be downloaded here. [transformer_tts_ljspeech_ckpt_0.3.zip](https://paddlespeech.bj.bcebos.com/Parakeet/transformer_tts_ljspeech_ckpt_0.3.zip).
@ -23,6 +23,7 @@ from parakeet.data.batch import batch_spec, batch_text_id
|
|||
|
||||
class LJSpeech(Dataset):
|
||||
"""A simple dataset adaptor for the processed ljspeech dataset."""
|
||||
|
||||
def __init__(self, root):
|
||||
self.root = Path(root).expanduser()
|
||||
records = []
|
||||
|
@ -64,6 +65,7 @@ class Transform(object):
|
|||
|
||||
class LJSpeechCollector(object):
|
||||
"""A simple callable to batch LJSpeech examples."""
|
||||
|
||||
def __init__(self, padding_idx=0, padding_value=0.):
|
||||
self.padding_idx = padding_idx
|
||||
self.padding_value = padding_value
|
||||
|
|
|
@ -35,13 +35,14 @@ def create_dataset(config, source_path, target_path, verbose=False):
|
|||
|
||||
meta_data = LJSpeechMetaData(source_path)
|
||||
frontend = English()
|
||||
processor = AudioProcessor(sample_rate=config.data.sample_rate,
|
||||
n_fft=config.data.n_fft,
|
||||
n_mels=config.data.d_mel,
|
||||
win_length=config.data.win_length,
|
||||
hop_length=config.data.hop_length,
|
||||
fmax=config.data.fmax,
|
||||
fmin=config.data.fmin)
|
||||
processor = AudioProcessor(
|
||||
sample_rate=config.data.sample_rate,
|
||||
n_fft=config.data.n_fft,
|
||||
n_mels=config.data.d_mel,
|
||||
win_length=config.data.win_length,
|
||||
hop_length=config.data.hop_length,
|
||||
fmax=config.data.fmax,
|
||||
fmin=config.data.fmin)
|
||||
normalizer = LogMagnitude()
|
||||
|
||||
records = []
|
||||
|
@ -80,26 +81,22 @@ def create_dataset(config, source_path, target_path, verbose=False):
|
|||
|
||||
if __name__ == "__main__":
|
||||
parser = argparse.ArgumentParser(description="create dataset")
|
||||
parser.add_argument("--config",
|
||||
type=str,
|
||||
metavar="FILE",
|
||||
help="extra config to overwrite the default config")
|
||||
parser.add_argument("--input",
|
||||
type=str,
|
||||
help="path of the ljspeech dataset")
|
||||
parser.add_argument("--output",
|
||||
type=str,
|
||||
help="path to save output dataset")
|
||||
parser.add_argument(
|
||||
"--config",
|
||||
type=str,
|
||||
metavar="FILE",
|
||||
help="extra config to overwrite the default config")
|
||||
parser.add_argument(
|
||||
"--input", type=str, help="path of the ljspeech dataset")
|
||||
parser.add_argument(
|
||||
"--output", type=str, help="path to save output dataset")
|
||||
parser.add_argument(
|
||||
"--opts",
|
||||
nargs=argparse.REMAINDER,
|
||||
help=
|
||||
"options to overwrite --config file and the default config, passing in KEY VALUE pairs"
|
||||
help="options to overwrite --config file and the default config, passing in KEY VALUE pairs"
|
||||
)
|
||||
parser.add_argument("-v",
|
||||
"--verbose",
|
||||
action="store_true",
|
||||
help="print msg")
|
||||
parser.add_argument(
|
||||
"-v", "--verbose", action="store_true", help="print msg")
|
||||
|
||||
config = get_cfg_defaults()
|
||||
args = parser.parse_args()
|
||||
|
|
|
@ -73,29 +73,24 @@ if __name__ == "__main__":
|
|||
|
||||
parser = argparse.ArgumentParser(
|
||||
description="generate mel spectrogram with TransformerTTS.")
|
||||
parser.add_argument("--config",
|
||||
type=str,
|
||||
metavar="FILE",
|
||||
help="extra config to overwrite the default config")
|
||||
parser.add_argument("--checkpoint_path",
|
||||
type=str,
|
||||
help="path of the checkpoint to load.")
|
||||
parser.add_argument(
|
||||
"--config",
|
||||
type=str,
|
||||
metavar="FILE",
|
||||
help="extra config to overwrite the default config")
|
||||
parser.add_argument(
|
||||
"--checkpoint_path", type=str, help="path of the checkpoint to load.")
|
||||
parser.add_argument("--input", type=str, help="path of the text sentences")
|
||||
parser.add_argument("--output", type=str, help="path to save outputs")
|
||||
parser.add_argument("--device",
|
||||
type=str,
|
||||
default="cpu",
|
||||
help="device type to use.")
|
||||
parser.add_argument(
|
||||
"--device", type=str, default="cpu", help="device type to use.")
|
||||
parser.add_argument(
|
||||
"--opts",
|
||||
nargs=argparse.REMAINDER,
|
||||
help=
|
||||
"options to overwrite --config file and the default config, passing in KEY VALUE pairs"
|
||||
help="options to overwrite --config file and the default config, passing in KEY VALUE pairs"
|
||||
)
|
||||
parser.add_argument("-v",
|
||||
"--verbose",
|
||||
action="store_true",
|
||||
help="print msg")
|
||||
parser.add_argument(
|
||||
"-v", "--verbose", action="store_true", help="print msg")
|
||||
|
||||
args = parser.parse_args()
|
||||
if args.config:
|
||||
|
|
|
@ -53,11 +53,12 @@ class TransformerTTSExperiment(ExperimentBase):
|
|||
dropout=config.model.dropout)
|
||||
if self.parallel:
|
||||
model = paddle.DataParallel(model)
|
||||
optimizer = paddle.optimizer.Adam(learning_rate=config.training.lr,
|
||||
beta1=0.9,
|
||||
beta2=0.98,
|
||||
epsilon=1e-9,
|
||||
parameters=model.parameters())
|
||||
optimizer = paddle.optimizer.Adam(
|
||||
learning_rate=config.training.lr,
|
||||
beta1=0.9,
|
||||
beta2=0.98,
|
||||
epsilon=1e-9,
|
||||
parameters=model.parameters())
|
||||
criterion = TransformerTTSLoss(config.model.stop_loss_scale)
|
||||
drop_n_heads = scheduler.StepWise(config.training.drop_n_heads)
|
||||
reduction_factor = scheduler.StepWise(config.training.reduction_factor)
|
||||
|
@ -82,11 +83,12 @@ class TransformerTTSExperiment(ExperimentBase):
|
|||
batch_fn = LJSpeechCollector(padding_idx=config.data.padding_idx)
|
||||
|
||||
if not self.parallel:
|
||||
train_loader = DataLoader(train_set,
|
||||
batch_size=config.data.batch_size,
|
||||
shuffle=True,
|
||||
drop_last=True,
|
||||
collate_fn=batch_fn)
|
||||
train_loader = DataLoader(
|
||||
train_set,
|
||||
batch_size=config.data.batch_size,
|
||||
shuffle=True,
|
||||
drop_last=True,
|
||||
collate_fn=batch_fn)
|
||||
else:
|
||||
sampler = DistributedBatchSampler(
|
||||
train_set,
|
||||
|
@ -95,21 +97,20 @@ class TransformerTTSExperiment(ExperimentBase):
|
|||
rank=dist.get_rank(),
|
||||
shuffle=True,
|
||||
drop_last=True)
|
||||
train_loader = DataLoader(train_set,
|
||||
batch_sampler=sampler,
|
||||
collate_fn=batch_fn)
|
||||
train_loader = DataLoader(
|
||||
train_set, batch_sampler=sampler, collate_fn=batch_fn)
|
||||
|
||||
valid_loader = DataLoader(valid_set,
|
||||
batch_size=config.data.batch_size,
|
||||
collate_fn=batch_fn)
|
||||
valid_loader = DataLoader(
|
||||
valid_set, batch_size=config.data.batch_size, collate_fn=batch_fn)
|
||||
|
||||
self.train_loader = train_loader
|
||||
self.valid_loader = valid_loader
|
||||
|
||||
def compute_outputs(self, text, mel):
|
||||
model_core = self.model._layers if self.parallel else self.model
|
||||
model_core.set_constants(self.reduction_factor(self.iteration),
|
||||
self.drop_n_heads(self.iteration))
|
||||
model_core.set_constants(
|
||||
self.reduction_factor(self.iteration),
|
||||
self.drop_n_heads(self.iteration))
|
||||
|
||||
mel_input = mel[:, :-1, :]
|
||||
reduced_mel_input = mel_input[:, ::model_core.r, :]
|
||||
|
@ -126,10 +127,9 @@ class TransformerTTSExperiment(ExperimentBase):
|
|||
stop_logits = outputs["stop_logits"]
|
||||
|
||||
time_steps = mel_target.shape[1]
|
||||
losses = self.criterion(mel_output[:, :time_steps, :],
|
||||
mel_intermediate[:, :time_steps, :],
|
||||
mel_target, stop_logits[:, :time_steps, :],
|
||||
stop_label_target)
|
||||
losses = self.criterion(
|
||||
mel_output[:, :time_steps, :], mel_intermediate[:, :time_steps, :],
|
||||
mel_target, stop_logits[:, :time_steps, :], stop_label_target)
|
||||
return losses
|
||||
|
||||
def train_batch(self):
|
||||
|
|
|
@@ -14,7 +14,7 @@ wget https://data.keithito.com/data/speech/LJSpeech-1.1.tar.bz2
tar xjvf LJSpeech-1.1.tar.bz2
```

### Preprocess the dataset.
### Preprocess the dataset.

Assume the path to save the preprocessed dataset is `ljspeech_waveflow`. Run the command below to preprocess the dataset.

@@ -49,4 +49,4 @@ python synthesize.py --input=mels/ --output=wavs/ --checkpoint_path='step-200000

## Pretrained Model

A pretrained model with 128 residual channels can be downloaded here: [waveflow_ljspeech_ckpt_0.3.zip](https://paddlespeech.bj.bcebos.com/Parakeet/waveflow_ljspeech_ckpt_0.3.zip).
A pretrained model with 128 residual channels can be downloaded here: [waveflow_ljspeech_ckpt_0.3.zip](https://paddlespeech.bj.bcebos.com/Parakeet/waveflow_ljspeech_ckpt_0.3.zip).
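Taken together with the synthesis command quoted in the hunk header above, fetching and using the pretrained checkpoint would look roughly like this; the `step-*` prefix inside the archive is an assumption, so substitute whatever checkpoint files the zip actually contains.

```bash
wget https://paddlespeech.bj.bcebos.com/Parakeet/waveflow_ljspeech_ckpt_0.3.zip
unzip waveflow_ljspeech_ckpt_0.3.zip -d waveflow_ljspeech_ckpt_0.3
# checkpoint prefix below is illustrative; point it at the step-<N> files in the unzipped folder
python synthesize.py --input=mels/ --output=wavs/ \
    --checkpoint_path='waveflow_ljspeech_ckpt_0.3/step-2000000'
```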
@@ -23,12 +23,14 @@ from parakeet.data.batch import batch_spec, batch_wav

class LJSpeech(Dataset):
    """A simple dataset adaptor for the processed ljspeech dataset."""

    def __init__(self, root):
        self.root = Path(root).expanduser()
        meta_data = pandas.read_csv(str(self.root / "metadata.csv"),
                                    sep="\t",
                                    header=None,
                                    names=["fname", "frames", "samples"])
        meta_data = pandas.read_csv(
            str(self.root / "metadata.csv"),
            sep="\t",
            header=None,
            names=["fname", "frames", "samples"])

        records = []
        for row in meta_data.itertuples():

@@ -49,6 +51,7 @@ class LJSpeech(Dataset):

class LJSpeechCollector(object):
    """A simple callable to batch LJSpeech examples."""

    def __init__(self, padding_value=0.):
        self.padding_value = padding_value
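For reference, the `metadata.csv` consumed here is headerless and tab-separated, one utterance per row with the columns `fname`, `frames`, and `samples`; a row would look like `LJ001-0001<TAB>791<TAB>202752` (the numbers are illustrative). This matches the `meta_data.to_csv(..., sep="\t", index=None, header=None)` call that writes the file in the preprocessing script later in this diff.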
@ -70,11 +70,12 @@ class Transform(object):
|
|||
|
||||
# Compute mel-spectrogram.
|
||||
# Turn center to False to prevent internal padding.
|
||||
spectrogram = librosa.core.stft(wav,
|
||||
hop_length=hop_length,
|
||||
win_length=win_length,
|
||||
n_fft=n_fft,
|
||||
center=False)
|
||||
spectrogram = librosa.core.stft(
|
||||
wav,
|
||||
hop_length=hop_length,
|
||||
win_length=win_length,
|
||||
n_fft=n_fft,
|
||||
center=False)
|
||||
spectrogram_magnitude = np.abs(spectrogram)
|
||||
|
||||
# Compute mel-spectrograms.
|
||||
|
@ -123,10 +124,8 @@ def create_dataset(config, input_dir, output_dir):
|
|||
file_names.append((base_name, mel.shape[-1], audio.shape[-1]))
|
||||
|
||||
meta_data = pd.DataFrame.from_records(file_names)
|
||||
meta_data.to_csv(str(output_dir / "metadata.csv"),
|
||||
sep="\t",
|
||||
index=None,
|
||||
header=None)
|
||||
meta_data.to_csv(
|
||||
str(output_dir / "metadata.csv"), sep="\t", index=None, header=None)
|
||||
print("saved meta data in to {}".format(
|
||||
os.path.join(output_dir, "metadata.csv")))
|
||||
|
||||
|
@ -135,26 +134,22 @@ def create_dataset(config, input_dir, output_dir):
|
|||
|
||||
if __name__ == "__main__":
|
||||
parser = argparse.ArgumentParser(description="create dataset")
|
||||
parser.add_argument("--config",
|
||||
type=str,
|
||||
metavar="FILE",
|
||||
help="extra config to overwrite the default config")
|
||||
parser.add_argument("--input",
|
||||
type=str,
|
||||
help="path of the ljspeech dataset")
|
||||
parser.add_argument("--output",
|
||||
type=str,
|
||||
help="path to save output dataset")
|
||||
parser.add_argument(
|
||||
"--config",
|
||||
type=str,
|
||||
metavar="FILE",
|
||||
help="extra config to overwrite the default config")
|
||||
parser.add_argument(
|
||||
"--input", type=str, help="path of the ljspeech dataset")
|
||||
parser.add_argument(
|
||||
"--output", type=str, help="path to save output dataset")
|
||||
parser.add_argument(
|
||||
"--opts",
|
||||
nargs=argparse.REMAINDER,
|
||||
help=
|
||||
"options to overwrite --config file and the default config, passing in KEY VALUE pairs"
|
||||
help="options to overwrite --config file and the default config, passing in KEY VALUE pairs"
|
||||
)
|
||||
parser.add_argument("-v",
|
||||
"--verbose",
|
||||
action="store_true",
|
||||
help="print msg")
|
||||
parser.add_argument(
|
||||
"-v", "--verbose", action="store_true", help="print msg")
|
||||
|
||||
config = get_cfg_defaults()
|
||||
args = parser.parse_args()
|
||||
|
|
|
@ -39,8 +39,8 @@ def main(config, args):
|
|||
mel = np.load(str(file_path))
|
||||
with paddle.amp.auto_cast():
|
||||
audio = model.predict(mel)
|
||||
audio_path = output_dir / (os.path.splitext(file_path.name)[0] +
|
||||
".wav")
|
||||
audio_path = output_dir / (
|
||||
os.path.splitext(file_path.name)[0] + ".wav")
|
||||
sf.write(audio_path, audio, config.data.sample_rate)
|
||||
print("[synthesize] {} -> {}".format(file_path, audio_path))
|
||||
|
||||
|
@ -50,32 +50,27 @@ if __name__ == "__main__":
|
|||
|
||||
parser = argparse.ArgumentParser(
|
||||
description="generate mel spectrogram with TransformerTTS.")
|
||||
parser.add_argument("--config",
|
||||
type=str,
|
||||
metavar="FILE",
|
||||
help="extra config to overwrite the default config")
|
||||
parser.add_argument("--checkpoint_path",
|
||||
type=str,
|
||||
help="path of the checkpoint to load.")
|
||||
parser.add_argument(
|
||||
"--config",
|
||||
type=str,
|
||||
metavar="FILE",
|
||||
help="extra config to overwrite the default config")
|
||||
parser.add_argument(
|
||||
"--checkpoint_path", type=str, help="path of the checkpoint to load.")
|
||||
parser.add_argument(
|
||||
"--input",
|
||||
type=str,
|
||||
help="path of directory containing mel spectrogram (in .npy format)")
|
||||
parser.add_argument("--output", type=str, help="path to save outputs")
|
||||
parser.add_argument("--device",
|
||||
type=str,
|
||||
default="cpu",
|
||||
help="device type to use.")
|
||||
parser.add_argument(
|
||||
"--device", type=str, default="cpu", help="device type to use.")
|
||||
parser.add_argument(
|
||||
"--opts",
|
||||
nargs=argparse.REMAINDER,
|
||||
help=
|
||||
"options to overwrite --config file and the default config, passing in KEY VALUE pairs"
|
||||
help="options to overwrite --config file and the default config, passing in KEY VALUE pairs"
|
||||
)
|
||||
parser.add_argument("-v",
|
||||
"--verbose",
|
||||
action="store_true",
|
||||
help="print msg")
|
||||
parser.add_argument(
|
||||
"-v", "--verbose", action="store_true", help="print msg")
|
||||
|
||||
args = parser.parse_args()
|
||||
if args.config:
|
||||
|
|
|
@ -43,8 +43,8 @@ class Experiment(ExperimentBase):
|
|||
|
||||
if self.parallel:
|
||||
model = paddle.DataParallel(model)
|
||||
optimizer = paddle.optimizer.Adam(config.training.lr,
|
||||
parameters=model.parameters())
|
||||
optimizer = paddle.optimizer.Adam(
|
||||
config.training.lr, parameters=model.parameters())
|
||||
criterion = WaveFlowLoss(sigma=config.model.sigma)
|
||||
|
||||
self.model = model
|
||||
|
@ -63,11 +63,12 @@ class Experiment(ExperimentBase):
|
|||
config.data.hop_length)
|
||||
|
||||
if not self.parallel:
|
||||
train_loader = DataLoader(train_set,
|
||||
batch_size=config.data.batch_size,
|
||||
shuffle=True,
|
||||
drop_last=True,
|
||||
collate_fn=batch_fn)
|
||||
train_loader = DataLoader(
|
||||
train_set,
|
||||
batch_size=config.data.batch_size,
|
||||
shuffle=True,
|
||||
drop_last=True,
|
||||
collate_fn=batch_fn)
|
||||
else:
|
||||
sampler = DistributedBatchSampler(
|
||||
train_set,
|
||||
|
@ -76,14 +77,12 @@ class Experiment(ExperimentBase):
|
|||
rank=dist.get_rank(),
|
||||
shuffle=True,
|
||||
drop_last=True)
|
||||
train_loader = DataLoader(train_set,
|
||||
batch_sampler=sampler,
|
||||
collate_fn=batch_fn)
|
||||
train_loader = DataLoader(
|
||||
train_set, batch_sampler=sampler, collate_fn=batch_fn)
|
||||
|
||||
valid_batch_fn = LJSpeechCollector()
|
||||
valid_loader = DataLoader(valid_set,
|
||||
batch_size=1,
|
||||
collate_fn=valid_batch_fn)
|
||||
valid_loader = DataLoader(
|
||||
valid_set, batch_size=1, collate_fn=valid_batch_fn)
|
||||
|
||||
self.train_loader = train_loader
|
||||
self.valid_loader = valid_loader
|
||||
|
|
|
@ -25,9 +25,9 @@ class AudioProcessor(object):
|
|||
n_fft: int,
|
||||
win_length: int,
|
||||
hop_length: int,
|
||||
n_mels: int = 80,
|
||||
fmin: int = 0,
|
||||
fmax: int = None,
|
||||
n_mels: int=80,
|
||||
fmin: int=0,
|
||||
fmax: int=None,
|
||||
window="hann",
|
||||
center=True,
|
||||
pad_mode="reflect",
|
||||
|
@ -73,21 +73,23 @@ class AudioProcessor(object):
|
|||
sf.write(path, wav, samplerate=self.sample_rate)
|
||||
|
||||
def stft(self, wav):
|
||||
D = librosa.core.stft(wav,
|
||||
n_fft=self.n_fft,
|
||||
hop_length=self.hop_length,
|
||||
win_length=self.win_length,
|
||||
window=self.window,
|
||||
center=self.center,
|
||||
pad_mode=self.pad_mode)
|
||||
D = librosa.core.stft(
|
||||
wav,
|
||||
n_fft=self.n_fft,
|
||||
hop_length=self.hop_length,
|
||||
win_length=self.win_length,
|
||||
window=self.window,
|
||||
center=self.center,
|
||||
pad_mode=self.pad_mode)
|
||||
return D
|
||||
|
||||
def istft(self, D):
|
||||
wav = librosa.core.istft(D,
|
||||
hop_length=self.hop_length,
|
||||
win_length=self.win_length,
|
||||
window=self.window,
|
||||
center=self.center)
|
||||
wav = librosa.core.istft(
|
||||
D,
|
||||
hop_length=self.hop_length,
|
||||
win_length=self.win_length,
|
||||
window=self.window,
|
||||
center=self.center)
|
||||
return wav
|
||||
|
||||
def spectrogram(self, wav):
|
||||
|
|
|
@ -11,7 +11,6 @@
|
|||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
"""
|
||||
This module contains normalizers for spectrogram magnitude.
|
||||
Normalizers are invertible transformations. They can be used to process
|
||||
|
|
|
@ -1,3 +1,16 @@
|
|||
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
"""Parakeet's infrastructure for data processing.
|
||||
"""
|
||||
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
|
||||
|
|
|
@ -26,7 +26,11 @@ class AudioSegmentDataset(Dataset):
|
|||
"""A simple dataset adaptor for audio files to train vocoders.
|
||||
Read -> trim silence -> normalize -> extract a segment
|
||||
"""
|
||||
def __init__(self, file_paths: List[Path], sample_rate: int, length: int,
|
||||
|
||||
def __init__(self,
|
||||
file_paths: List[Path],
|
||||
sample_rate: int,
|
||||
length: int,
|
||||
top_db: float):
|
||||
self.file_paths = file_paths
|
||||
self.sr = sample_rate
|
||||
|
@ -56,10 +60,11 @@ class AudioDataset(Dataset):
|
|||
"""A simple dataset adaptor for the audio files.
|
||||
Read -> trim silence -> normalize
|
||||
"""
|
||||
|
||||
def __init__(self,
|
||||
file_paths: List[Path],
|
||||
sample_rate: int,
|
||||
top_db: float = 60):
|
||||
top_db: float=60):
|
||||
self.file_paths = file_paths
|
||||
self.sr = sample_rate
|
||||
self.top_db = top_db
|
||||
|
@ -78,12 +83,11 @@ class AudioDataset(Dataset):
|
|||
|
||||
class AudioFolderDataset(AudioDataset):
|
||||
def __init__(
|
||||
self,
|
||||
root,
|
||||
sample_rate,
|
||||
top_db=60,
|
||||
extension=".wav",
|
||||
):
|
||||
self,
|
||||
root,
|
||||
sample_rate,
|
||||
top_db=60,
|
||||
extension=".wav", ):
|
||||
root = Path(root).expanduser()
|
||||
file_paths = sorted(list(root.rglob("*{}".format(extension))))
|
||||
super().__init__(file_paths, sample_rate, top_db)
|
||||
|
|
|
@ -1,3 +1,17 @@
|
|||
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from parakeet.frontend.phonectic import Phonetics
|
||||
"""
|
||||
A phonology system with ARPABET symbols and limited punctuations. The G2P
|
||||
|
@ -10,7 +24,6 @@ from parakeet.frontend.vocab import Vocab
|
|||
from g2p_en import G2p
|
||||
|
||||
|
||||
|
||||
class ARPABET(Phonetics):
|
||||
"""A phonology for English that uses ARPABET as the phoneme vocabulary.
|
||||
See http://www.speech.cs.cmu.edu/cgi-bin/cmudict for more details.
|
||||
|
@ -132,7 +145,9 @@ class ARPABET(Phonetics):
|
|||
List[str]
|
||||
The list of pronunciation sequence.
|
||||
"""
|
||||
phonemes = [self._remove_vowels(item) for item in self.backend(sentence)]
|
||||
phonemes = [
|
||||
self._remove_vowels(item) for item in self.backend(sentence)
|
||||
]
|
||||
if add_start_end:
|
||||
start = self.vocab.start_symbol
|
||||
end = self.vocab.end_symbol
|
||||
|
@ -184,7 +199,9 @@ class ARPABET(Phonetics):
|
|||
List[str]
|
||||
The list of pronunciation id sequence.
|
||||
"""
|
||||
return self.numericalize(self.phoneticize(sentence, add_start_end=add_start_end))
|
||||
return self.numericalize(
|
||||
self.phoneticize(
|
||||
sentence, add_start_end=add_start_end))
|
||||
|
||||
@property
|
||||
def vocab_size(self):
|
||||
|
@ -206,7 +223,7 @@ class ARPABETWithStress(Phonetics):
|
|||
]
|
||||
punctuations = [',', '.', '?', '!']
|
||||
symbols = phonemes + punctuations
|
||||
|
||||
|
||||
def __init__(self):
|
||||
self.backend = G2p()
|
||||
self.vocab = Vocab(self.phonemes + self.punctuations)
|
||||
|
@ -276,11 +293,13 @@ class ARPABETWithStress(Phonetics):
|
|||
List[str]
|
||||
The list of pronunciation id sequence.
|
||||
"""
|
||||
return self.numericalize(self.phoneticize(sentence, add_start_end=add_start_end))
|
||||
return self.numericalize(
|
||||
self.phoneticize(
|
||||
sentence, add_start_end=add_start_end))
|
||||
|
||||
@property
|
||||
def vocab_size(self):
|
||||
""" Vocab size.
|
||||
"""
|
||||
# 77 = 69 phones + 4 punctuations + 4 special tokens
|
||||
return len(self.vocab)
|
||||
return len(self.vocab)
|
||||
|
|
|
@ -11,4 +11,3 @@
|
|||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
|
|
|
@ -11,4 +11,3 @@
|
|||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
|
|
|
@ -12,6 +12,7 @@
|
|||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
|
||||
def full2half_width(ustr):
|
||||
half = []
|
||||
for u in ustr:
|
||||
|
|
|
@ -1,3 +1,16 @@
|
|||
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
"""
|
||||
A Simple Chinese Phonology using pinyin symbols.
|
||||
The G2P conversion converts pinyin string to symbols. Also it can handle string
|
||||
|
@ -32,6 +45,7 @@ _tones = ['0', '1', '2', '3', '4', '5']
|
|||
_toned_finals = [final + tone for final, tone in product(_finals, _tones[1:])]
|
||||
_toned_phonems = _initials + _toned_finals + _ernized_symbol + _punctuations
|
||||
|
||||
|
||||
class ParakeetConverter(NeutralToneWith5Mixin, DefaultConverter):
|
||||
pass
|
||||
|
||||
|
@ -41,7 +55,7 @@ class ParakeetPinyin(Phonetics):
|
|||
self.vocab_phonemes = Vocab(_phones)
|
||||
self.vocab_tones = Vocab(_tones)
|
||||
self.pinyin_backend = Pinyin(ParakeetConverter())
|
||||
|
||||
|
||||
def convert_pypinyin_tone3(self, syllables, add_start_end=False):
|
||||
phonemes, tones = _convert_to_parakeet_style_pinyin(syllables)
|
||||
|
||||
|
@ -58,8 +72,7 @@ class ParakeetPinyin(Phonetics):
|
|||
item for item in phonemes if item in self.vocab_phonemes.stoi
|
||||
]
|
||||
tones = [item for item in tones if item in self.vocab_tones.stoi]
|
||||
return phonemes, tones
|
||||
|
||||
return phonemes, tones
|
||||
|
||||
def phoneticize(self, sentence, add_start_end=False):
|
||||
""" Normalize the input text sequence and convert it into pronunciation sequence.
|
||||
|
@ -74,10 +87,10 @@ class ParakeetPinyin(Phonetics):
|
|||
List[str]
|
||||
The list of pronunciation sequence.
|
||||
"""
|
||||
syllables = self.pinyin_backend.lazy_pinyin(sentence,
|
||||
style=Style.TONE3,
|
||||
strict=True)
|
||||
phonemes, tones = self.convert_pypinyin_tone3(syllables, add_start_end=add_start_end)
|
||||
syllables = self.pinyin_backend.lazy_pinyin(
|
||||
sentence, style=Style.TONE3, strict=True)
|
||||
phonemes, tones = self.convert_pypinyin_tone3(
|
||||
syllables, add_start_end=add_start_end)
|
||||
return phonemes, tones
|
||||
|
||||
def numericalize(self, phonemes, tones):
|
||||
|
@ -110,8 +123,8 @@ class ParakeetPinyin(Phonetics):
|
|||
List[str]
|
||||
The list of pronunciation id sequence.
|
||||
"""
|
||||
phonemes, tones = self.phoneticize(sentence,
|
||||
add_start_end=add_start_end)
|
||||
phonemes, tones = self.phoneticize(
|
||||
sentence, add_start_end=add_start_end)
|
||||
phoneme_ids, tone_ids = self.numericalize(phonemes, tones)
|
||||
return phoneme_ids, tone_ids
|
||||
|
||||
|
@ -128,12 +141,11 @@ class ParakeetPinyin(Phonetics):
|
|||
return len(self.vocab_tones)
|
||||
|
||||
|
||||
|
||||
class ParakeetPinyinWithTone(Phonetics):
|
||||
def __init__(self):
|
||||
self.vocab = Vocab(_toned_phonems)
|
||||
self.pinyin_backend = Pinyin(ParakeetConverter())
|
||||
|
||||
|
||||
def convert_pypinyin_tone3(self, syllables, add_start_end=False):
|
||||
phonemes = _convert_to_parakeet_style_pinyin_with_tone(syllables)
|
||||
|
||||
|
@ -142,11 +154,9 @@ class ParakeetPinyinWithTone(Phonetics):
|
|||
end = self.vocab_phonemes.end_symbol
|
||||
phonemes = [start] + phonemes + [end]
|
||||
|
||||
phonemes = [
|
||||
item for item in phonemes if item in self.vocab.stoi
|
||||
]
|
||||
phonemes = [item for item in phonemes if item in self.vocab.stoi]
|
||||
return phonemes
|
||||
|
||||
|
||||
def phoneticize(self, sentence, add_start_end=False):
|
||||
""" Normalize the input text sequence and convert it into pronunciation sequence.
|
||||
|
||||
|
@ -160,10 +170,10 @@ class ParakeetPinyinWithTone(Phonetics):
|
|||
List[str]
|
||||
The list of pronunciation sequence.
|
||||
"""
|
||||
syllables = self.pinyin_backend.lazy_pinyin(sentence,
|
||||
style=Style.TONE3,
|
||||
strict=True)
|
||||
phonemes = self.convert_pypinyin_tone3(syllables, add_start_end=add_start_end)
|
||||
syllables = self.pinyin_backend.lazy_pinyin(
|
||||
sentence, style=Style.TONE3, strict=True)
|
||||
phonemes = self.convert_pypinyin_tone3(
|
||||
syllables, add_start_end=add_start_end)
|
||||
return phonemes
|
||||
|
||||
def numericalize(self, phonemes):
|
||||
|
@ -289,6 +299,7 @@ def _convert_to_parakeet_style_pinyin(syllables):
|
|||
tones.extend(t)
|
||||
return phones, tones
|
||||
|
||||
|
||||
def _split_syllable_with_tone(syllable: str):
|
||||
global _punctuations
|
||||
|
||||
|
@ -311,10 +322,10 @@ def _split_syllable_with_tone(syllable: str):
|
|||
phones.append(syllable)
|
||||
return phones
|
||||
|
||||
|
||||
def _convert_to_parakeet_style_pinyin_with_tone(syllables):
|
||||
phones = []
|
||||
for syllable in syllables:
|
||||
p = _split_syllable_with_tone(syllable)
|
||||
phones.extend(p)
|
||||
return phones
|
||||
|
||||
|
|
|
@ -1,3 +1,17 @@
|
|||
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import numpy as np
|
||||
import paddle
|
||||
from paddle import nn
|
||||
|
@ -23,9 +37,9 @@ class LSTMSpeakerEncoder(nn.Layer):
|
|||
def forward(self, utterances, num_speakers, initial_states=None):
|
||||
normalized_embeds = self.embed_sequences(utterances, initial_states)
|
||||
embeds = normalized_embeds.reshape([num_speakers, -1, num_speakers])
|
||||
loss, eer = self.loss(embeds)
|
||||
loss, eer = self.loss(embeds)
|
||||
return loss, eer
|
||||
|
||||
|
||||
def embed_sequences(self, utterances, initial_states=None, reduce=False):
|
||||
out, (h, c) = self.lstm(utterances, initial_states)
|
||||
embeds = F.relu(self.linear(h[-1]))
|
||||
|
@ -35,7 +49,7 @@ class LSTMSpeakerEncoder(nn.Layer):
|
|||
embed = F.normalize(embed, axis=0)
|
||||
return embed
|
||||
return normalized_embeds
|
||||
|
||||
|
||||
def embed_utterance(self, utterances, initial_states=None):
|
||||
# utterances: [B, T, C] -> embed [C']
|
||||
embed = self.embed_sequences(utterances, initial_states, reduce=True)
|
||||
|
@ -47,37 +61,51 @@ class LSTMSpeakerEncoder(nn.Layer):
|
|||
|
||||
# Inclusive centroids (1 per speaker). Cloning is needed for reverse differentiation
|
||||
centroids_incl = paddle.mean(embeds, axis=1)
|
||||
centroids_incl_norm = paddle.norm(centroids_incl, p=2, axis=1, keepdim=True)
|
||||
centroids_incl_norm = paddle.norm(
|
||||
centroids_incl, p=2, axis=1, keepdim=True)
|
||||
normalized_centroids_incl = centroids_incl / centroids_incl_norm
|
||||
|
||||
# Exclusive centroids (1 per utterance)
|
||||
centroids_excl = paddle.broadcast_to(paddle.sum(embeds, axis=1, keepdim=True), embeds.shape) - embeds
|
||||
centroids_excl = paddle.broadcast_to(
|
||||
paddle.sum(embeds, axis=1, keepdim=True), embeds.shape) - embeds
|
||||
centroids_excl /= (utterances_per_speaker - 1)
|
||||
centroids_excl_norm = paddle.norm(centroids_excl, p=2, axis=2, keepdim=True)
|
||||
centroids_excl_norm = paddle.norm(
|
||||
centroids_excl, p=2, axis=2, keepdim=True)
|
||||
normalized_centroids_excl = centroids_excl / centroids_excl_norm
|
||||
|
||||
p1 = paddle.matmul(embeds.reshape([-1, embed_dim]),
|
||||
normalized_centroids_incl, transpose_y=True) # (NMN)
|
||||
p1 = paddle.matmul(
|
||||
embeds.reshape([-1, embed_dim]),
|
||||
normalized_centroids_incl,
|
||||
transpose_y=True) # (NMN)
|
||||
p1 = p1.reshape([-1])
|
||||
# print("p1: ", p1.shape)
|
||||
p2 = paddle.bmm(embeds.reshape([-1, 1, embed_dim]),
|
||||
normalized_centroids_excl.reshape([-1, embed_dim, 1])) # (NM, 1, 1)
|
||||
p2 = p2.reshape([-1]) # (NM)
|
||||
p2 = paddle.bmm(
|
||||
embeds.reshape([-1, 1, embed_dim]),
|
||||
normalized_centroids_excl.reshape(
|
||||
[-1, embed_dim, 1])) # (NM, 1, 1)
|
||||
p2 = p2.reshape([-1]) # (NM)
|
||||
|
||||
# begin: alternative implementation for scatter
|
||||
with paddle.no_grad():
|
||||
index = paddle.arange(0, speakers_per_batch * utterances_per_speaker, dtype="int64").reshape([speakers_per_batch, utterances_per_speaker])
|
||||
index = index * speakers_per_batch + paddle.arange(0, speakers_per_batch, dtype="int64").unsqueeze(-1)
|
||||
index = paddle.arange(
|
||||
0, speakers_per_batch * utterances_per_speaker,
|
||||
dtype="int64").reshape(
|
||||
[speakers_per_batch, utterances_per_speaker])
|
||||
index = index * speakers_per_batch + paddle.arange(
|
||||
0, speakers_per_batch, dtype="int64").unsqueeze(-1)
|
||||
index = paddle.reshape(index, [-1])
|
||||
ones = paddle.ones([speakers_per_batch * utterances_per_speaker * speakers_per_batch])
|
||||
ones = paddle.ones([
|
||||
speakers_per_batch * utterances_per_speaker * speakers_per_batch
|
||||
])
|
||||
zeros = paddle.zeros_like(index, dtype=ones.dtype)
|
||||
mask_p1 = paddle.scatter(ones, index, zeros)
|
||||
p = p1 * mask_p1 + (1 - mask_p1) * paddle.scatter(ones, index, p2)
|
||||
# end: alternative implementation for scatter
|
||||
# p = paddle.scatter(p1, index, p2)
|
||||
|
||||
p = p * self.similarity_weight + self.similarity_bias # neg
|
||||
p = p.reshape([speakers_per_batch * utterances_per_speaker, speakers_per_batch])
|
||||
p = p * self.similarity_weight + self.similarity_bias # neg
|
||||
p = p.reshape(
|
||||
[speakers_per_batch * utterances_per_speaker, speakers_per_batch])
|
||||
return p, p1, p2
|
||||
|
||||
def do_gradient_ops(self):
|
||||
|
@ -99,8 +127,10 @@ class LSTMSpeakerEncoder(nn.Layer):
|
|||
sim_matrix, *_ = self.similarity_matrix(embeds)
|
||||
sim_matrix = sim_matrix.reshape(
|
||||
[speakers_per_batch * utterances_per_speaker, speakers_per_batch])
|
||||
target = paddle.arange(0, speakers_per_batch, dtype="int64").unsqueeze(-1)
|
||||
target = paddle.expand(target, [speakers_per_batch, utterances_per_speaker])
|
||||
target = paddle.arange(
|
||||
0, speakers_per_batch, dtype="int64").unsqueeze(-1)
|
||||
target = paddle.expand(target,
|
||||
[speakers_per_batch, utterances_per_speaker])
|
||||
target = paddle.reshape(target, [-1])
|
||||
|
||||
loss = nn.CrossEntropyLoss()(sim_matrix, target)
|
||||
|
@ -113,9 +143,7 @@ class LSTMSpeakerEncoder(nn.Layer):
|
|||
preds = sim_matrix.numpy()
|
||||
|
||||
# Snippet from https://yangcha.github.io/EER-ROC/
|
||||
fpr, tpr, thresholds = roc_curve(labels.flatten(), preds.flatten())
|
||||
fpr, tpr, thresholds = roc_curve(labels.flatten(), preds.flatten())
|
||||
eer = brentq(lambda x: 1. - x - interp1d(fpr, tpr)(x), 0., 1.)
|
||||
|
||||
return loss, eer
|
||||
|
||||
|
||||
|
|
|
@ -47,7 +47,11 @@ class DecoderPreNet(nn.Layer):
|
|||
The dropout probability.
|
||||
|
||||
"""
|
||||
def __init__(self, d_input: int, d_hidden: int, d_output: int,
|
||||
|
||||
def __init__(self,
|
||||
d_input: int,
|
||||
d_hidden: int,
|
||||
d_output: int,
|
||||
dropout_rate: float):
|
||||
super().__init__()
|
||||
|
||||
|
@ -70,12 +74,10 @@ class DecoderPreNet(nn.Layer):
|
|||
|
||||
"""
|
||||
|
||||
x = F.dropout(F.relu(self.linear1(x)),
|
||||
self.dropout_rate,
|
||||
training=True)
|
||||
output = F.dropout(F.relu(self.linear2(x)),
|
||||
self.dropout_rate,
|
||||
training=True)
|
||||
x = F.dropout(
|
||||
F.relu(self.linear1(x)), self.dropout_rate, training=True)
|
||||
output = F.dropout(
|
||||
F.relu(self.linear2(x)), self.dropout_rate, training=True)
|
||||
return output
|
||||
|
||||
|
||||
|
@ -100,8 +102,13 @@ class DecoderPostNet(nn.Layer):
|
|||
The dropout probability.
|
||||
|
||||
"""
|
||||
def __init__(self, d_mels: int, d_hidden: int, kernel_size: int,
|
||||
num_layers: int, dropout: float):
|
||||
|
||||
def __init__(self,
|
||||
d_mels: int,
|
||||
d_hidden: int,
|
||||
kernel_size: int,
|
||||
num_layers: int,
|
||||
dropout: float):
|
||||
super().__init__()
|
||||
self.dropout = dropout
|
||||
self.num_layers = num_layers
|
||||
|
@ -111,31 +118,33 @@ class DecoderPostNet(nn.Layer):
|
|||
self.conv_batchnorms = nn.LayerList()
|
||||
k = math.sqrt(1.0 / (d_mels * kernel_size))
|
||||
self.conv_batchnorms.append(
|
||||
Conv1dBatchNorm(d_mels,
|
||||
d_hidden,
|
||||
kernel_size=kernel_size,
|
||||
padding=padding,
|
||||
bias_attr=I.Uniform(-k, k),
|
||||
data_format='NLC'))
|
||||
Conv1dBatchNorm(
|
||||
d_mels,
|
||||
d_hidden,
|
||||
kernel_size=kernel_size,
|
||||
padding=padding,
|
||||
bias_attr=I.Uniform(-k, k),
|
||||
data_format='NLC'))
|
||||
|
||||
k = math.sqrt(1.0 / (d_hidden * kernel_size))
|
||||
self.conv_batchnorms.extend([
|
||||
Conv1dBatchNorm(d_hidden,
|
||||
d_hidden,
|
||||
kernel_size=kernel_size,
|
||||
padding=padding,
|
||||
bias_attr=I.Uniform(-k, k),
|
||||
data_format='NLC')
|
||||
for i in range(1, num_layers - 1)
|
||||
Conv1dBatchNorm(
|
||||
d_hidden,
|
||||
d_hidden,
|
||||
kernel_size=kernel_size,
|
||||
padding=padding,
|
||||
bias_attr=I.Uniform(-k, k),
|
||||
data_format='NLC') for i in range(1, num_layers - 1)
|
||||
])
|
||||
|
||||
self.conv_batchnorms.append(
|
||||
Conv1dBatchNorm(d_hidden,
|
||||
d_mels,
|
||||
kernel_size=kernel_size,
|
||||
padding=padding,
|
||||
bias_attr=I.Uniform(-k, k),
|
||||
data_format='NLC'))
|
||||
Conv1dBatchNorm(
|
||||
d_hidden,
|
||||
d_mels,
|
||||
kernel_size=kernel_size,
|
||||
padding=padding,
|
||||
bias_attr=I.Uniform(-k, k),
|
||||
data_format='NLC'))
|
||||
|
||||
def forward(self, x):
|
||||
"""Calculate forward propagation.
|
||||
|
@ -153,12 +162,14 @@ class DecoderPostNet(nn.Layer):
|
|||
"""
|
||||
|
||||
for i in range(len(self.conv_batchnorms) - 1):
|
||||
x = F.dropout(F.tanh(self.conv_batchnorms[i](x)),
|
||||
self.dropout,
|
||||
training=self.training)
|
||||
output = F.dropout(self.conv_batchnorms[self.num_layers - 1](x),
|
||||
self.dropout,
|
||||
training=self.training)
|
||||
x = F.dropout(
|
||||
F.tanh(self.conv_batchnorms[i](x)),
|
||||
self.dropout,
|
||||
training=self.training)
|
||||
output = F.dropout(
|
||||
self.conv_batchnorms[self.num_layers - 1](x),
|
||||
self.dropout,
|
||||
training=self.training)
|
||||
return output
|
||||
|
||||
|
||||
|
@ -179,26 +190,30 @@ class Tacotron2Encoder(nn.Layer):
|
|||
p_dropout: float
|
||||
The dropout probability.
|
||||
"""
|
||||
def __init__(self, d_hidden: int, conv_layers: int, kernel_size: int,
|
||||
|
||||
def __init__(self,
|
||||
d_hidden: int,
|
||||
conv_layers: int,
|
||||
kernel_size: int,
|
||||
p_dropout: float):
|
||||
super().__init__()
|
||||
|
||||
k = math.sqrt(1.0 / (d_hidden * kernel_size))
|
||||
self.conv_batchnorms = paddle.nn.LayerList([
|
||||
Conv1dBatchNorm(d_hidden,
|
||||
d_hidden,
|
||||
kernel_size,
|
||||
stride=1,
|
||||
padding=int((kernel_size - 1) / 2),
|
||||
bias_attr=I.Uniform(-k, k),
|
||||
data_format='NLC') for i in range(conv_layers)
|
||||
Conv1dBatchNorm(
|
||||
d_hidden,
|
||||
d_hidden,
|
||||
kernel_size,
|
||||
stride=1,
|
||||
padding=int((kernel_size - 1) / 2),
|
||||
bias_attr=I.Uniform(-k, k),
|
||||
data_format='NLC') for i in range(conv_layers)
|
||||
])
|
||||
self.p_dropout = p_dropout
|
||||
|
||||
self.hidden_size = int(d_hidden / 2)
|
||||
self.lstm = nn.LSTM(d_hidden,
|
||||
self.hidden_size,
|
||||
direction="bidirectional")
|
||||
self.lstm = nn.LSTM(
|
||||
d_hidden, self.hidden_size, direction="bidirectional")
|
||||
|
||||
def forward(self, x, input_lens=None):
|
||||
"""Calculate forward propagation of tacotron2 encoder.
|
||||
|
@ -218,9 +233,10 @@ class Tacotron2Encoder(nn.Layer):
|
|||
|
||||
"""
|
||||
for conv_batchnorm in self.conv_batchnorms:
|
||||
x = F.dropout(F.relu(conv_batchnorm(x)),
|
||||
self.p_dropout,
|
||||
training=self.training)
|
||||
x = F.dropout(
|
||||
F.relu(conv_batchnorm(x)),
|
||||
self.p_dropout,
|
||||
training=self.training)
|
||||
|
||||
output, _ = self.lstm(inputs=x, sequence_length=input_lens)
|
||||
return output
|
||||
|
@ -271,6 +287,7 @@ class Tacotron2Decoder(nn.Layer):
|
|||
Whether to use a binary classifier for stop token prediction.
|
||||
Defaults to False
|
||||
"""
|
||||
|
||||
def __init__(self,
|
||||
d_mels: int,
|
||||
reduction_factor: int,
|
||||
|
@ -284,7 +301,7 @@ class Tacotron2Decoder(nn.Layer):
|
|||
p_prenet_dropout: float,
|
||||
p_attention_dropout: float,
|
||||
p_decoder_dropout: float,
|
||||
use_stop_token: bool = False):
|
||||
use_stop_token: bool=False):
|
||||
super().__init__()
|
||||
self.d_mels = d_mels
|
||||
self.reduction_factor = reduction_factor
|
||||
|
@ -294,10 +311,11 @@ class Tacotron2Decoder(nn.Layer):
|
|||
self.p_attention_dropout = p_attention_dropout
|
||||
self.p_decoder_dropout = p_decoder_dropout
|
||||
|
||||
self.prenet = DecoderPreNet(d_mels * reduction_factor,
|
||||
d_prenet,
|
||||
d_prenet,
|
||||
dropout_rate=p_prenet_dropout)
|
||||
self.prenet = DecoderPreNet(
|
||||
d_mels * reduction_factor,
|
||||
d_prenet,
|
||||
d_prenet,
|
||||
dropout_rate=p_prenet_dropout)
|
||||
|
||||
# attention_rnn takes attention's context vector as an
|
||||
# auxiliary input
|
||||
|
@ -367,9 +385,10 @@ class Tacotron2Decoder(nn.Layer):
|
|||
# The first lstm layer (or spec encoder lstm)
|
||||
_, (self.attention_hidden, self.attention_cell) = self.attention_rnn(
|
||||
cell_input, (self.attention_hidden, self.attention_cell))
|
||||
self.attention_hidden = F.dropout(self.attention_hidden,
|
||||
self.p_attention_dropout,
|
||||
training=self.training)
|
||||
self.attention_hidden = F.dropout(
|
||||
self.attention_hidden,
|
||||
self.p_attention_dropout,
|
||||
training=self.training)
|
||||
|
||||
# Location sensitive attention
|
||||
attention_weights_cat = paddle.stack(
|
||||
|
@ -384,9 +403,10 @@ class Tacotron2Decoder(nn.Layer):
|
|||
[self.attention_hidden, self.attention_context], axis=-1)
|
||||
_, (self.decoder_hidden, self.decoder_cell) = self.decoder_rnn(
|
||||
decoder_input, (self.decoder_hidden, self.decoder_cell))
|
||||
self.decoder_hidden = F.dropout(self.decoder_hidden,
|
||||
p=self.p_decoder_dropout,
|
||||
training=self.training)
|
||||
self.decoder_hidden = F.dropout(
|
||||
self.decoder_hidden,
|
||||
p=self.p_decoder_dropout,
|
||||
training=self.training)
|
||||
|
||||
# decode output one step
|
||||
decoder_hidden_attention_context = paddle.concat(
|
||||
|
@ -426,8 +446,8 @@ class Tacotron2Decoder(nn.Layer):
|
|||
querys = paddle.reshape(
|
||||
querys,
|
||||
[querys.shape[0], querys.shape[1] // self.reduction_factor, -1])
|
||||
start_step = paddle.zeros(shape=[querys.shape[0], 1, querys.shape[-1]],
|
||||
dtype=querys.dtype)
|
||||
start_step = paddle.zeros(
|
||||
shape=[querys.shape[0], 1, querys.shape[-1]], dtype=querys.dtype)
|
||||
querys = paddle.concat([start_step, querys], axis=1)
|
||||
|
||||
querys = self.prenet(querys)
|
||||
|
@ -604,43 +624,43 @@ class Tacotron2(nn.Layer):
|
|||
outputs.
|
||||
|
||||
"""
|
||||
|
||||
def __init__(self,
|
||||
vocab_size,
|
||||
n_tones=None,
|
||||
d_mels: int = 80,
|
||||
d_encoder: int = 512,
|
||||
encoder_conv_layers: int = 3,
|
||||
encoder_kernel_size: int = 5,
|
||||
d_prenet: int = 256,
|
||||
d_attention_rnn: int = 1024,
|
||||
d_decoder_rnn: int = 1024,
|
||||
attention_filters: int = 32,
|
||||
attention_kernel_size: int = 31,
|
||||
d_attention: int = 128,
|
||||
d_postnet: int = 512,
|
||||
postnet_kernel_size: int = 5,
|
||||
postnet_conv_layers: int = 5,
|
||||
reduction_factor: int = 1,
|
||||
p_encoder_dropout: float = 0.5,
|
||||
p_prenet_dropout: float = 0.5,
|
||||
p_attention_dropout: float = 0.1,
|
||||
p_decoder_dropout: float = 0.1,
|
||||
p_postnet_dropout: float = 0.5,
|
||||
d_mels: int=80,
|
||||
d_encoder: int=512,
|
||||
encoder_conv_layers: int=3,
|
||||
encoder_kernel_size: int=5,
|
||||
d_prenet: int=256,
|
||||
d_attention_rnn: int=1024,
|
||||
d_decoder_rnn: int=1024,
|
||||
attention_filters: int=32,
|
||||
attention_kernel_size: int=31,
|
||||
d_attention: int=128,
|
||||
d_postnet: int=512,
|
||||
postnet_kernel_size: int=5,
|
||||
postnet_conv_layers: int=5,
|
||||
reduction_factor: int=1,
|
||||
p_encoder_dropout: float=0.5,
|
||||
p_prenet_dropout: float=0.5,
|
||||
p_attention_dropout: float=0.1,
|
||||
p_decoder_dropout: float=0.1,
|
||||
p_postnet_dropout: float=0.5,
|
||||
d_global_condition=None,
|
||||
use_stop_token=False):
|
||||
super().__init__()
|
||||
|
||||
std = math.sqrt(2.0 / (vocab_size + d_encoder))
|
||||
val = math.sqrt(3.0) * std # uniform bounds for std
|
||||
self.embedding = nn.Embedding(vocab_size,
|
||||
d_encoder,
|
||||
weight_attr=I.Uniform(-val, val))
|
||||
self.embedding = nn.Embedding(
|
||||
vocab_size, d_encoder, weight_attr=I.Uniform(-val, val))
|
||||
if n_tones:
|
||||
self.embedding_tones = nn.Embedding(n_tones,
|
||||
d_encoder,
|
||||
padding_idx=0,
|
||||
weight_attr=I.Uniform(
|
||||
-0.1 * val, 0.1 * val))
|
||||
self.embedding_tones = nn.Embedding(
|
||||
n_tones,
|
||||
d_encoder,
|
||||
padding_idx=0,
|
||||
weight_attr=I.Uniform(-0.1 * val, 0.1 * val))
|
||||
self.toned = n_tones is not None
|
||||
|
||||
self.encoder = Tacotron2Encoder(d_encoder, encoder_conv_layers,
|
||||
|
@ -649,24 +669,26 @@ class Tacotron2(nn.Layer):
|
|||
# input augmentation scheme: concat global condition to the encoder output
|
||||
if d_global_condition is not None:
|
||||
d_encoder += d_global_condition
|
||||
self.decoder = Tacotron2Decoder(d_mels,
|
||||
reduction_factor,
|
||||
d_encoder,
|
||||
d_prenet,
|
||||
d_attention_rnn,
|
||||
d_decoder_rnn,
|
||||
d_attention,
|
||||
attention_filters,
|
||||
attention_kernel_size,
|
||||
p_prenet_dropout,
|
||||
p_attention_dropout,
|
||||
p_decoder_dropout,
|
||||
use_stop_token=use_stop_token)
|
||||
self.postnet = DecoderPostNet(d_mels=d_mels * reduction_factor,
|
||||
d_hidden=d_postnet,
|
||||
kernel_size=postnet_kernel_size,
|
||||
num_layers=postnet_conv_layers,
|
||||
dropout=p_postnet_dropout)
|
||||
self.decoder = Tacotron2Decoder(
|
||||
d_mels,
|
||||
reduction_factor,
|
||||
d_encoder,
|
||||
d_prenet,
|
||||
d_attention_rnn,
|
||||
d_decoder_rnn,
|
||||
d_attention,
|
||||
attention_filters,
|
||||
attention_kernel_size,
|
||||
p_prenet_dropout,
|
||||
p_attention_dropout,
|
||||
p_decoder_dropout,
|
||||
use_stop_token=use_stop_token)
|
||||
self.postnet = DecoderPostNet(
|
||||
d_mels=d_mels * reduction_factor,
|
||||
d_hidden=d_postnet,
|
||||
kernel_size=postnet_kernel_size,
|
||||
num_layers=postnet_conv_layers,
|
||||
dropout=p_postnet_dropout)
|
||||
|
||||
def forward(self,
|
||||
text_inputs,
|
||||
|
@ -729,15 +751,14 @@ class Tacotron2(nn.Layer):
|
|||
[encoder_outputs, global_condition], -1)
|
||||
|
||||
# [B, T_enc, 1]
|
||||
mask = sequence_mask(text_lens,
|
||||
dtype=encoder_outputs.dtype).unsqueeze(-1)
|
||||
mask = sequence_mask(
|
||||
text_lens, dtype=encoder_outputs.dtype).unsqueeze(-1)
|
||||
if self.decoder.use_stop_token:
|
||||
mel_outputs, alignments, stop_logits = self.decoder(
|
||||
encoder_outputs, mels, mask=mask)
|
||||
else:
|
||||
mel_outputs, alignments = self.decoder(encoder_outputs,
|
||||
mels,
|
||||
mask=mask)
|
||||
mel_outputs, alignments = self.decoder(
|
||||
encoder_outputs, mels, mask=mask)
|
||||
mel_outputs_postnet = self.postnet(mel_outputs)
|
||||
mel_outputs_postnet = mel_outputs + mel_outputs_postnet
|
||||
|
||||
|
@ -863,6 +884,7 @@ class Tacotron2(nn.Layer):
|
|||
class Tacotron2Loss(nn.Layer):
|
||||
""" Tacotron2 Loss module
|
||||
"""
|
||||
|
||||
def __init__(self,
|
||||
use_stop_token_loss=True,
|
||||
use_guided_attention_loss=False,
|
||||
|
|
|
@ -321,10 +321,8 @@ class MLPPreNet(nn.Layer):
|
|||
self.dropout = dropout
|
||||
|
||||
def forward(self, x, dropout):
|
||||
l1 = F.dropout(
|
||||
F.relu(self.lin1(x)), self.dropout, training=True)
|
||||
l2 = F.dropout(
|
||||
F.relu(self.lin2(l1)), self.dropout, training=True)
|
||||
l1 = F.dropout(F.relu(self.lin1(x)), self.dropout, training=True)
|
||||
l2 = F.dropout(F.relu(self.lin2(l1)), self.dropout, training=True)
|
||||
l3 = self.lin3(l2)
|
||||
return l3
|
||||
|
||||
|
@ -403,7 +401,7 @@ class TransformerTTS(nn.Layer):
|
|||
padding_idx=0,
|
||||
weight_attr=I.Uniform(-0.005, 0.005))
|
||||
else:
|
||||
self.toned = False
|
||||
self.toned = False
|
||||
# position encoding matrix may be extended later
|
||||
self.encoder_pe = pe.sinusoid_positional_encoding(0, 1000, d_encoder)
|
||||
self.encoder_pe_scalar = self.create_parameter(
|
||||
|
@ -449,7 +447,8 @@ class TransformerTTS(nn.Layer):
|
|||
self.drop_n_heads = 0
|
||||
|
||||
def forward(self, text, mel, tones=None):
|
||||
encoded, encoder_attention_weights, encoder_mask = self.encode(text, tones=tones)
|
||||
encoded, encoder_attention_weights, encoder_mask = self.encode(
|
||||
text, tones=tones)
|
||||
mel_output, mel_intermediate, cross_attention_weights, stop_logits = self.decode(
|
||||
encoded, mel, encoder_mask)
|
||||
outputs = {
|
||||
|
@ -489,7 +488,8 @@ class TransformerTTS(nn.Layer):
|
|||
# twice its length if needed
|
||||
if x.shape[1] * self.r > self.decoder_pe.shape[0]:
|
||||
new_T = max(x.shape[1] * self.r, self.decoder_pe.shape[0] * 2)
|
||||
self.decoder_pe = pe.sinusoid_positional_encoding(0, new_T, self.d_decoder)
|
||||
self.decoder_pe = pe.sinusoid_positional_encoding(0, new_T,
|
||||
self.d_decoder)
|
||||
pos_enc = self.decoder_pe[:T_dec * self.r:self.r, :]
|
||||
x = x.scale(math.sqrt(
|
||||
self.d_decoder)) + pos_enc * self.decoder_pe_scalar
|
||||
|
|
|
@ -78,6 +78,7 @@ class UpsampleNet(nn.LayerList):
|
|||
---------
|
||||
``librosa.core.stft``
|
||||
"""
|
||||
|
||||
def __init__(self, upsample_factors):
|
||||
super().__init__()
|
||||
for factor in upsample_factors:
|
||||
|
@ -85,12 +86,13 @@ class UpsampleNet(nn.LayerList):
|
|||
init = I.Uniform(-std, std)
|
||||
self.append(
|
||||
nn.utils.weight_norm(
|
||||
nn.Conv2DTranspose(1,
|
||||
1, (3, 2 * factor),
|
||||
padding=(1, factor // 2),
|
||||
stride=(1, factor),
|
||||
weight_attr=init,
|
||||
bias_attr=init)))
|
||||
nn.Conv2DTranspose(
|
||||
1,
|
||||
1, (3, 2 * factor),
|
||||
padding=(1, factor // 2),
|
||||
stride=(1, factor),
|
||||
weight_attr=init,
|
||||
bias_attr=init)))
|
||||
|
||||
# upsample factors
|
||||
self.upsample_factor = np.prod(upsample_factors)
|
||||
|
@ -149,6 +151,7 @@ class ResidualBlock(nn.Layer):
|
|||
dilations : int
|
||||
Dilations of the Convolution2d applied to the input.
|
||||
"""
|
||||
|
||||
def __init__(self, channels, cond_channels, kernel_size, dilations):
|
||||
super().__init__()
|
||||
# input conv
|
||||
|
@ -159,13 +162,14 @@ class ResidualBlock(nn.Layer):
|
|||
]
|
||||
rh, rw = receptive_field
|
||||
paddings = [rh - 1, 0, rw // 2, (rw - 1) // 2] # causal & same
|
||||
conv = nn.Conv2D(channels,
|
||||
2 * channels,
|
||||
kernel_size,
|
||||
padding=paddings,
|
||||
dilation=dilations,
|
||||
weight_attr=init,
|
||||
bias_attr=init)
|
||||
conv = nn.Conv2D(
|
||||
channels,
|
||||
2 * channels,
|
||||
kernel_size,
|
||||
padding=paddings,
|
||||
dilation=dilations,
|
||||
weight_attr=init,
|
||||
bias_attr=init)
|
||||
self.conv = nn.utils.weight_norm(conv)
|
||||
self.rh = rh
|
||||
self.rw = rw
|
||||
|
@ -174,19 +178,18 @@ class ResidualBlock(nn.Layer):
|
|||
# condition projection
|
||||
std = math.sqrt(1 / cond_channels)
|
||||
init = I.Uniform(-std, std)
|
||||
condition_proj = nn.Conv2D(cond_channels,
|
||||
2 * channels, (1, 1),
|
||||
weight_attr=init,
|
||||
bias_attr=init)
|
||||
condition_proj = nn.Conv2D(
|
||||
cond_channels,
|
||||
2 * channels, (1, 1),
|
||||
weight_attr=init,
|
||||
bias_attr=init)
|
||||
self.condition_proj = nn.utils.weight_norm(condition_proj)
|
||||
|
||||
# parametric residual & skip connection
|
||||
std = math.sqrt(1 / channels)
|
||||
init = I.Uniform(-std, std)
|
||||
out_proj = nn.Conv2D(channels,
|
||||
2 * channels, (1, 1),
|
||||
weight_attr=init,
|
||||
bias_attr=init)
|
||||
out_proj = nn.Conv2D(
|
||||
channels, 2 * channels, (1, 1), weight_attr=init, bias_attr=init)
|
||||
self.out_proj = nn.utils.weight_norm(out_proj)
|
||||
|
||||
def forward(self, x, condition):
|
||||
|
@ -265,11 +268,12 @@ class ResidualBlock(nn.Layer):
|
|||
self._update_buffer(x_row)
|
||||
|
||||
rw = self.rw
|
||||
x_row = F.conv2d(self._conv_buffer,
|
||||
self.conv.weight,
|
||||
self.conv.bias,
|
||||
padding=[0, 0, rw // 2, (rw - 1) // 2],
|
||||
dilation=self.dilations)
|
||||
x_row = F.conv2d(
|
||||
self._conv_buffer,
|
||||
self.conv.weight,
|
||||
self.conv.bias,
|
||||
padding=[0, 0, rw // 2, (rw - 1) // 2],
|
||||
dilation=self.dilations)
|
||||
x_row += self.condition_proj(condition_row)
|
||||
|
||||
content, gate = paddle.chunk(x_row, 2, axis=1)
|
||||
|
@ -315,8 +319,12 @@ class ResidualNet(nn.LayerList):
|
|||
ValueError
|
||||
If the length of dilations_h does not equals n_layers.
|
||||
"""
|
||||
def __init__(self, n_layer: int, residual_channels: int,
|
||||
condition_channels: int, kernel_size: Tuple[int],
|
||||
|
||||
def __init__(self,
|
||||
n_layer: int,
|
||||
residual_channels: int,
|
||||
condition_channels: int,
|
||||
kernel_size: Tuple[int],
|
||||
dilations_h: List[int]):
|
||||
if len(dilations_h) != n_layer:
|
||||
raise ValueError(
|
||||
|
@ -421,20 +429,22 @@ class Flow(nn.Layer):
|
|||
super().__init__()
|
||||
# input projection
|
||||
self.input_proj = nn.utils.weight_norm(
|
||||
nn.Conv2D(1,
|
||||
channels, (1, 1),
|
||||
weight_attr=I.Uniform(-1., 1.),
|
||||
bias_attr=I.Uniform(-1., 1.)))
|
||||
nn.Conv2D(
|
||||
1,
|
||||
channels, (1, 1),
|
||||
weight_attr=I.Uniform(-1., 1.),
|
||||
bias_attr=I.Uniform(-1., 1.)))
|
||||
|
||||
# residual net
|
||||
self.resnet = ResidualNet(n_layers, channels, mel_bands, kernel_size,
|
||||
self.dilations_dict[n_group])
|
||||
|
||||
# output projection
|
||||
self.output_proj = nn.Conv2D(channels,
|
||||
2, (1, 1),
|
||||
weight_attr=I.Constant(0.),
|
||||
bias_attr=I.Constant(0.))
|
||||
self.output_proj = nn.Conv2D(
|
||||
channels,
|
||||
2, (1, 1),
|
||||
weight_attr=I.Constant(0.),
|
||||
bias_attr=I.Constant(0.))
|
||||
|
||||
# specs
|
||||
self.n_group = n_group
|
||||
|
@ -478,8 +488,8 @@ class Flow(nn.Layer):
|
|||
transformation from x to z.
|
||||
"""
|
||||
# (B, C, H-1, W)
|
||||
logs, b = self._predict_parameters(x[:, :, :-1, :], condition[:, :,
|
||||
1:, :])
|
||||
logs, b = self._predict_parameters(x[:, :, :-1, :],
|
||||
condition[:, :, 1:, :])
|
||||
z = self._transform(x, logs, b)
|
||||
return z, (logs, b)
|
||||
|
||||
|
@ -576,6 +586,7 @@ class WaveFlow(nn.LayerList):
|
|||
kernel_size : Union[int, List[int]]
|
||||
Kernel size of the convolution layer in each ResidualBlock.
|
||||
"""
|
||||
|
||||
def __init__(self, n_flows, n_layers, n_group, channels, mel_bands,
|
||||
kernel_size):
|
||||
if n_group % 2 or n_flows % 2:
|
||||
|
@ -645,8 +656,8 @@ class WaveFlow(nn.LayerList):
|
|||
# to (B, C, h, T//h) layout
|
||||
x = paddle.unsqueeze(
|
||||
paddle.transpose(fold(x, self.n_group), [0, 2, 1]), 1)
|
||||
condition = paddle.transpose(fold(condition, self.n_group),
|
||||
[0, 1, 3, 2])
|
||||
condition = paddle.transpose(
|
||||
fold(condition, self.n_group), [0, 1, 3, 2])
|
||||
|
||||
# flows
|
||||
logs_list = []
|
||||
|
@ -689,8 +700,8 @@ class WaveFlow(nn.LayerList):
|
|||
# to (B, C, h, T//h) layout
|
||||
z = paddle.unsqueeze(
|
||||
paddle.transpose(fold(z, self.n_group), [0, 2, 1]), 1)
|
||||
condition = paddle.transpose(fold(condition, self.n_group),
|
||||
[0, 1, 3, 2])
|
||||
condition = paddle.transpose(
|
||||
fold(condition, self.n_group), [0, 1, 3, 2])
|
||||
|
||||
# reverse it flow by flow
|
||||
for i in reversed(range(self.n_flows)):
|
||||
|
@ -730,17 +741,24 @@ class ConditionalWaveFlow(nn.LayerList):
|
|||
kernel_size : Union[int, List[int]]
|
||||
Kernel size of the convolution layer in each ResidualBlock.
|
||||
"""
|
||||
def __init__(self, upsample_factors: List[int], n_flows: int,
|
||||
n_layers: int, n_group: int, channels: int, n_mels: int,
|
||||
|
||||
def __init__(self,
|
||||
upsample_factors: List[int],
|
||||
n_flows: int,
|
||||
n_layers: int,
|
||||
n_group: int,
|
||||
channels: int,
|
||||
n_mels: int,
|
||||
kernel_size: Union[int, List[int]]):
|
||||
super().__init__()
|
||||
self.encoder = UpsampleNet(upsample_factors)
|
||||
self.decoder = WaveFlow(n_flows=n_flows,
|
||||
n_layers=n_layers,
|
||||
n_group=n_group,
|
||||
channels=channels,
|
||||
mel_bands=n_mels,
|
||||
kernel_size=kernel_size)
|
||||
self.decoder = WaveFlow(
|
||||
n_flows=n_flows,
|
||||
n_layers=n_layers,
|
||||
n_group=n_group,
|
||||
channels=channels,
|
||||
mel_bands=n_mels,
|
||||
kernel_size=kernel_size)
|
||||
|
||||
def forward(self, audio, mel):
|
||||
"""Compute the transformed random variable z (x to z) and the log of
|
||||
|
@ -847,6 +865,7 @@ class WaveFlowLoss(nn.Layer):
|
|||
The standard deviation of the gaussian noise used in WaveFlow, by
|
||||
default 1.0.
|
||||
"""
|
||||
|
||||
def __init__(self, sigma=1.0):
|
||||
super().__init__()
|
||||
self.sigma = sigma
|
||||
|
@@ -870,7 +889,7 @@ class WaveFlowLoss(nn.Layer):
        Tensor [shape=(1,)]
            The loss.
        """
        loss = paddle.sum(
            z * z) / (2 * self.sigma * self.sigma) - log_det_jacobian
        loss = paddle.sum(z * z) / (2 * self.sigma * self.sigma
                                    ) - log_det_jacobian
        loss = loss / np.prod(z.shape)
        return loss + self.const
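In equation form, the objective assembled above is the per-element negative log-likelihood of the flow; `self.const` is defined outside this hunk and is assumed here to be the Gaussian normalization term.

```latex
\mathrm{loss} = \frac{1}{D}\left(\frac{\lVert z\rVert^{2}}{2\sigma^{2}} - \log\lvert\det J\rvert\right) + \mathrm{const},
\qquad D = \prod_i \mathrm{shape}(z)_i
```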
@ -143,9 +143,9 @@ class MonoheadAttention(nn.Layer):
|
|||
|
||||
def __init__(self,
|
||||
model_dim: int,
|
||||
dropout: float = 0.0,
|
||||
k_dim: int = None,
|
||||
v_dim: int = None):
|
||||
dropout: float=0.0,
|
||||
k_dim: int=None,
|
||||
v_dim: int=None):
|
||||
super(MonoheadAttention, self).__init__()
|
||||
k_dim = k_dim or model_dim
|
||||
v_dim = v_dim or model_dim
|
||||
|
@ -225,9 +225,9 @@ class MultiheadAttention(nn.Layer):
|
|||
def __init__(self,
|
||||
model_dim: int,
|
||||
num_heads: int,
|
||||
dropout: float = 0.0,
|
||||
k_dim: int = None,
|
||||
v_dim: int = None):
|
||||
dropout: float=0.0,
|
||||
k_dim: int=None,
|
||||
v_dim: int=None):
|
||||
super(MultiheadAttention, self).__init__()
|
||||
if model_dim % num_heads != 0:
|
||||
raise ValueError("model_dim must be divisible by num_heads")
|
||||
|
@ -318,7 +318,8 @@ class LocationSensitiveAttention(nn.Layer):
|
|||
|
||||
# Location Layer
|
||||
self.location_conv = nn.Conv1D(
|
||||
2, location_filters,
|
||||
2,
|
||||
location_filters,
|
||||
kernel_size=location_kernel_size,
|
||||
padding=int((location_kernel_size - 1) / 2),
|
||||
bias_attr=False,
|
||||
|
|
|
@ -116,16 +116,22 @@ class STFT(nn.Layer):
|
|||
|
||||
"""
|
||||
|
||||
def __init__(self, n_fft, hop_length=None, win_length=None, window="hanning", center=True, pad_mode="reflect"):
|
||||
def __init__(self,
|
||||
n_fft,
|
||||
hop_length=None,
|
||||
win_length=None,
|
||||
window="hanning",
|
||||
center=True,
|
||||
pad_mode="reflect"):
|
||||
super().__init__()
|
||||
# By default, use the entire frame
|
||||
if win_length is None:
|
||||
win_length = n_fft
|
||||
|
||||
|
||||
# Set the default hop, if it's not already specified
|
||||
if hop_length is None:
|
||||
hop_length = int(win_length // 4)
|
||||
|
||||
|
||||
self.hop_length = hop_length
|
||||
self.n_bin = 1 + n_fft // 2
|
||||
self.n_fft = n_fft
|
||||
|
@ -134,7 +140,7 @@ class STFT(nn.Layer):
|
|||
|
||||
# calculate window
|
||||
window = signal.get_window(window, win_length, fftbins=True)
|
||||
|
||||
|
||||
# pad window to n_fft size
|
||||
if n_fft != win_length:
|
||||
window = pad_center(window, n_fft, mode="constant")
|
||||
|
@ -146,11 +152,11 @@ class STFT(nn.Layer):
|
|||
#r = np.arange(0, n_fft)
|
||||
#M = np.expand_dims(r, -1) * np.expand_dims(r, 0)
|
||||
#w_real = np.reshape(window *
|
||||
#np.cos(2 * np.pi * M / n_fft)[:self.n_bin],
|
||||
#(self.n_bin, 1, self.n_fft))
|
||||
#np.cos(2 * np.pi * M / n_fft)[:self.n_bin],
|
||||
#(self.n_bin, 1, self.n_fft))
|
||||
#w_imag = np.reshape(window *
|
||||
#np.sin(-2 * np.pi * M / n_fft)[:self.n_bin],
|
||||
#(self.n_bin, 1, self.n_fft))
|
||||
#np.sin(-2 * np.pi * M / n_fft)[:self.n_bin],
|
||||
#(self.n_bin, 1, self.n_fft))
|
||||
weight = np.fft.fft(np.eye(n_fft))[:self.n_bin]
|
||||
w_real = weight.real
|
||||
w_imag = weight.imag
|
||||
|
@ -178,8 +184,9 @@ class STFT(nn.Layer):
|
|||
"""
|
||||
x = paddle.unsqueeze(x, axis=1)
|
||||
if self.center:
|
||||
x = F.pad(x, [self.n_fft // 2, self.n_fft // 2],
|
||||
data_format='NCL', mode=self.pad_mode)
|
||||
x = F.pad(x, [self.n_fft // 2, self.n_fft // 2],
|
||||
data_format='NCL',
|
||||
mode=self.pad_mode)
|
||||
|
||||
# to BCT, C=1
|
||||
out = F.conv1d(x, self.weight, stride=self.hop_length)
|
||||
|
@ -226,8 +233,8 @@ class MelScale(nn.Layer):
|
|||
super().__init__()
|
||||
mel_basis = librosa.filters.mel(sr, n_fft, n_mels, fmin, fmax)
|
||||
self.weight = paddle.to_tensor(mel_basis)
|
||||
|
||||
|
||||
def forward(self, spec):
|
||||
# (n_mels, n_freq) * (batch_size, n_freq, n_frames)
|
||||
mel = paddle.matmul(self.weight, spec)
|
||||
return mel
|
||||
return mel
|
||||
|
|
|
@@ -35,12 +35,12 @@ def attention_guide(dec_lens, enc_lens, N, T, g, dtype=None):
    Tachibana, Hideyuki, Katsuya Uenoyama, and Shunsuke Aihara. 2017. “Efficiently Trainable Text-to-Speech System Based on Deep Convolutional Networks with Guided Attention.” ArXiv:1710.08969 [Cs, Eess], October. http://arxiv.org/abs/1710.08969.
    """
    dtype = dtype or paddle.get_default_dtype()
    dec_pos = paddle.arange(0, N).astype(
        dtype) / dec_lens.unsqueeze(-1)  # n/N # shape(B, T_dec)
    enc_pos = paddle.arange(0, T).astype(
        dtype) / enc_lens.unsqueeze(-1)  # t/T # shape(B, T_enc)
    W = 1 - paddle.exp(-(dec_pos.unsqueeze(-1) -
                         enc_pos.unsqueeze(1))**2 / (2 * g ** 2))
    dec_pos = paddle.arange(0, N).astype(dtype) / dec_lens.unsqueeze(
        -1)  # n/N # shape(B, T_dec)
    enc_pos = paddle.arange(0, T).astype(dtype) / enc_lens.unsqueeze(
        -1)  # t/T # shape(B, T_enc)
    W = 1 - paddle.exp(-(dec_pos.unsqueeze(-1) - enc_pos.unsqueeze(1))**2 /
                       (2 * g**2))

    dec_mask = sequence_mask(dec_lens, maxlen=N)
    enc_mask = sequence_mask(enc_lens, maxlen=T)
@@ -57,8 +57,7 @@ def guided_attention_loss(attention_weight, dec_lens, enc_lens, g):
    W = attention_guide(dec_lens, enc_lens, N, T, g, attention_weight.dtype)

    total_tokens = (dec_lens * enc_lens).astype(W.dtype)
    loss = paddle.mean(paddle.sum(
        W * attention_weight, [1, 2]) / total_tokens)
    loss = paddle.mean(paddle.sum(W * attention_weight, [1, 2]) / total_tokens)
    return loss
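The guide matrix and loss above follow the soft-diagonal penalty from Tachibana et al.; a toy NumPy restatement of just the weight-and-average step (masking omitted, values illustrative) to make the formula concrete:

```python
import numpy as np

# W[b, n, t] = 1 - exp(-((n/N_b - t/T_b)^2) / (2 g^2)) is near zero on the
# n/N ≈ t/T diagonal and grows away from it, so W * attention penalises
# off-diagonal attention weights.
g = 0.2
dec_lens = np.array([4, 3])                    # decoder lengths per item
enc_lens = np.array([5, 4])                    # encoder lengths per item
N, T = int(dec_lens.max()), int(enc_lens.max())

dec_pos = np.arange(N)[None, :] / dec_lens[:, None]          # (B, N)
enc_pos = np.arange(T)[None, :] / enc_lens[:, None]          # (B, T)
W = 1 - np.exp(-(dec_pos[:, :, None] - enc_pos[:, None, :])**2 / (2 * g**2))

attention = np.full((2, N, T), 1.0 / T)                      # uniform attention
total_tokens = (dec_lens * enc_lens).astype("float64")
loss = np.mean(np.sum(W * attention, axis=(1, 2)) / total_tokens)
print(float(loss))
```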
@@ -87,6 +87,7 @@ class ExperimentBase(object):
    >>> else:
    >>>     main_sp(config, args)
    """

    def __init__(self, config, args):
        self.config = config
        self.args = args
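Only the tail of the docstring's launch example survives in this hunk; a hedged reconstruction of the pattern it points at, where the method names (`setup`/`run`) and the `nprocs` attribute are assumptions rather than confirmed by the diff:

```python
import paddle.distributed as dist

def main_sp(config, args):
    exp = ExperimentBase(config, args)   # or a project-specific subclass
    exp.setup()
    exp.run()

def main(config, args):
    if args.nprocs > 1:
        # one training process per device
        dist.spawn(main_sp, args=(config, args), nprocs=args.nprocs)
    else:
        main_sp(config, args)
```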
@@ -1,7 +1,22 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import contextlib

OBSERVATIONS = None


@contextlib.contextmanager
def scope(observations):
    # make `observation` the target to report to.
@@ -13,12 +28,14 @@ def scope(observations):
    try:
        yield
    finally:
        OBSERVATIONS = old


def get_observations():
    global OBSERVATIONS
    return OBSERVATIONS


def report(name, value):
    # a simple function to report a named value
    # you can use it everywhere; it will get the default target and write to it
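A hedged usage sketch of the reporter above; the import path is a placeholder for wherever the module actually lives, and `report()` is assumed to store each value under its name in the dict made current by `scope()`:

```python
from reporter import scope, report   # assumed module path

observations = {}
with scope(observations):
    report("loss", 0.123)
    report("lr", 1e-3)

# expected, under the assumption above: {'loss': 0.123, 'lr': 0.001}
print(observations)
```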
@@ -1,3 +1,17 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from pathlib import Path
import tqdm
from dataclasses import dataclass
@@ -25,7 +39,7 @@ class Trainer(object):
        self.stop_trigger = get_trigger(stop_trigger)
        self.out = Path(out)
        self.observation = {}

    def setup(self):
        pass
@@ -38,8 +52,8 @@ class Trainer(object):
            ordinal += 1
            modified_name = f"{name}_{ordinal}"

        self.extensions[modified_name] = ExtensionEntry(
            extension, trigger, priority)
        self.extensions[modified_name] = ExtensionEntry(extension, trigger,
                                                        priority)

    def run(self):
        # sort extensions by priorities once
@@ -61,7 +75,7 @@ class Trainer(object):
            max_epoch = self.stop_trigger.period
        else:
            max_iteration = self.stop_trigger.period

        while not stop_trigger(self):
            self.observation = {}
            # set observation as the report target

@@ -75,4 +89,3 @@ class Trainer(object):
            for name, entry in extensions:
                if entry.trigger(self):
                    entry.extension(self)
@@ -1,10 +1,25 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


class IntervalTrigger(object):
    def __init__(self, period: int , unit: str):
    def __init__(self, period: int, unit: str):
        if unit not in ("iteration", "epoch"):
            raise ValueError("unit should be 'iteration' or 'epoch'")
        self.period = period
        self.unit = unit

    def __call__(self, trainer):
        state = trainer.updater.state
        if self.unit == "epoch":
@@ -13,7 +28,7 @@ class IntervalTrigger(object):
            fire = not (state.iteration % self.iteration)
        return fire


def never_file_trigger(trainer):
    return False
@@ -25,4 +40,4 @@ def get_trigger(trigger):
        return trigger
    else:
        trigger = IntervalTrigger(*trigger)
    return trigger
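A hedged usage sketch of `get_trigger`, judging only from the branch shown above: a `(period, unit)` tuple is unpacked into an `IntervalTrigger`, while anything already usable as a trigger is returned unchanged.

```python
# values here are illustrative
every_1000_steps = get_trigger((1000, "iteration"))   # tuple -> IntervalTrigger
every_epoch = IntervalTrigger(1, "epoch")             # or construct one directly
already_a_trigger = get_trigger(every_epoch)          # assumed to pass through
```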
@@ -1,3 +1,17 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from dataclasses import dataclass
from typing import Optional
@@ -41,6 +55,7 @@ class UpdaterBase(object):

    So the best practice is to define a model and define an updater for it.
    """

    def update(self):
        pass
@@ -52,13 +67,14 @@ class StandardUpdater(UpdaterBase):
    """An example of over-simplification. Things may not be that simple, but
    you can subclass it to fit your needs.
    """

    def __init__(self,
                 model: Layer,
                 dataloader: DataLoader,
                 optimizer: Optimizer,
                 loss_func=None,
                 auto_new_epoch: bool = True,
                 init_state: Optional[UpdaterState] = None):
                 auto_new_epoch: bool=True,
                 init_state: Optional[UpdaterState]=None):
        self.model = model
        self.dataloader = dataloader
        self.optimizer = optimizer
@@ -31,10 +31,8 @@ __all__ = [
def plot_alignment(alignment, title=None):
    # alignment: [encoder_steps, decoder_steps)
    fig, ax = plt.subplots(figsize=(6, 4))
    im = ax.imshow(alignment,
                   aspect='auto',
                   origin='lower',
                   interpolation='none')
    im = ax.imshow(
        alignment, aspect='auto', origin='lower', interpolation='none')
    fig.colorbar(im, ax=ax)
    xlabel = 'Decoder timestep'
    if title is not None:
@@ -49,15 +47,14 @@ def plot_multihead_alignments(alignments, title=None):
    # alignments: [N, encoder_steps, decoder_steps)
    num_subplots = alignments.shape[0]

    fig, axes = plt.subplots(figsize=(6 * num_subplots, 4),
                             ncols=num_subplots,
                             sharey=True,
                             squeeze=True)
    fig, axes = plt.subplots(
        figsize=(6 * num_subplots, 4),
        ncols=num_subplots,
        sharey=True,
        squeeze=True)
    for i, ax in enumerate(axes):
        im = ax.imshow(alignments[i],
                       aspect='auto',
                       origin='lower',
                       interpolation='none')
        im = ax.imshow(
            alignments[i], aspect='auto', origin='lower', interpolation='none')
        fig.colorbar(im, ax=ax)
    xlabel = 'Decoder timestep'
    if title is not None:
@@ -73,18 +70,20 @@ def plot_multilayer_multihead_alignments(alignments, title=None):
    # alignments: [num_layers, num_heads, encoder_steps, decoder_steps)
    num_layers, num_heads, *_ = alignments.shape

    fig, axes = plt.subplots(figsize=(6 * num_heads, 4 * num_layers),
                             nrows=num_layers,
                             ncols=num_heads,
                             sharex=True,
                             sharey=True,
                             squeeze=True)
    fig, axes = plt.subplots(
        figsize=(6 * num_heads, 4 * num_layers),
        nrows=num_layers,
        ncols=num_heads,
        sharex=True,
        sharey=True,
        squeeze=True)
    for i, row in enumerate(axes):
        for j, ax in enumerate(row):
            im = ax.imshow(alignments[i, j],
                           aspect='auto',
                           origin='lower',
                           interpolation='none')
            im = ax.imshow(
                alignments[i, j],
                aspect='auto',
                origin='lower',
                interpolation='none')
            fig.colorbar(im, ax=ax)
    xlabel = 'Decoder timestep'
    if title is not None:
@@ -20,7 +20,6 @@ __all__ = ["rank_zero_only"]


def rank_zero_only(func):
    @wraps(func)
    def wrapper(*args, **kwargs):
        if dist.get_rank() != 0:
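The hunk above ends inside the wrapper; a hedged reconstruction of the usual rank-zero-only pattern, where the early-return body is an assumption rather than the repository's exact code:

```python
from functools import wraps
import paddle.distributed as dist

def rank_zero_only(func):
    @wraps(func)
    def wrapper(*args, **kwargs):
        if dist.get_rank() != 0:
            return None          # assumed: skip work on non-zero ranks
        return func(*args, **kwargs)

    return wrapper


@rank_zero_only
def log_message(msg):
    # runs once per distributed job instead of once per worker
    print(msg)
```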
setup.py
@@ -20,8 +20,9 @@ from setuptools import setup, find_packages


def read(*names, **kwargs):
    with io.open(os.path.join(os.path.dirname(__file__), *names),
                 encoding=kwargs.get("encoding", "utf8")) as fp:
    with io.open(
            os.path.join(os.path.dirname(__file__), *names),
            encoding=kwargs.get("encoding", "utf8")) as fp:
        return fp.read()
@@ -73,9 +74,7 @@ setup_info = dict(
        'g2pM',
        'praatio',
    ],
    extras_require={
        'doc': ["sphinx", "sphinx-rtd-theme", "numpydoc"],
    },
    extras_require={'doc': ["sphinx", "sphinx-rtd-theme", "numpydoc"], },

    # Package info
    packages=find_packages(exclude=('tests', 'tests.*')),
|
|||
'License :: OSI Approved :: Apache Software License',
|
||||
'Programming Language :: Python :: 3.6',
|
||||
'Programming Language :: Python :: 3.7',
|
||||
],
|
||||
)
|
||||
], )
|
||||
|
||||
setup(**setup_info)
|
||||
|
|