fix pwg
This commit is contained in:
parent
3ac2e01263
commit
796fafbac8
|
@ -21,10 +21,10 @@ from typing import List, Dict, Any
|
|||
import jsonlines
|
||||
import librosa
|
||||
import numpy as np
|
||||
from parakeet.data.get_feats import LogMelFBank, Energy, Pitch
|
||||
import tqdm
|
||||
|
||||
from config import get_cfg_default
|
||||
from get_feats import LogMelFBank, Energy, Pitch
|
||||
|
||||
|
||||
def get_phn_dur(file_name):
|
||||
|
|
|
@ -94,7 +94,7 @@ def main():
|
|||
parser.add_argument(
|
||||
"--fastspeech2-config",
|
||||
type=str,
|
||||
help="config file to overwrite default config")
|
||||
help="config file to overwrite default config.")
|
||||
parser.add_argument(
|
||||
"--fastspeech2-checkpoint",
|
||||
type=str,
|
||||
|
@ -121,13 +121,13 @@ def main():
|
|||
parser.add_argument(
|
||||
"--phones-dict",
|
||||
type=str,
|
||||
default="phone_id_map.txt ",
|
||||
default="phone_id_map.txt",
|
||||
help="phone vocabulary file.")
|
||||
parser.add_argument("--test-metadata", type=str, help="test metadata")
|
||||
parser.add_argument("--output-dir", type=str, help="output dir")
|
||||
parser.add_argument("--test-metadata", type=str, help="test metadata.")
|
||||
parser.add_argument("--output-dir", type=str, help="output dir.")
|
||||
parser.add_argument(
|
||||
"--device", type=str, default="gpu", help="device type to use")
|
||||
parser.add_argument("--verbose", type=int, default=1, help="verbose")
|
||||
"--device", type=str, default="gpu", help="device type to use.")
|
||||
parser.add_argument("--verbose", type=int, default=1, help="verbose.")
|
||||
|
||||
args = parser.parse_args()
|
||||
with open(args.fastspeech2_config) as f:
|
||||
|
|
|
@ -99,7 +99,7 @@ def main():
|
|||
parser.add_argument(
|
||||
"--fastspeech2-config",
|
||||
type=str,
|
||||
help="config file to overwrite default config")
|
||||
help="fastspeech2 config file to overwrite default config.")
|
||||
parser.add_argument(
|
||||
"--fastspeech2-checkpoint",
|
||||
type=str,
|
||||
|
@ -112,8 +112,7 @@ def main():
|
|||
parser.add_argument(
|
||||
"--pwg-config",
|
||||
type=str,
|
||||
help="mean and standard deviation used to normalize spectrogram when training parallel wavegan."
|
||||
)
|
||||
help="parallel wavegan config file to overwrite default config.")
|
||||
parser.add_argument(
|
||||
"--pwg-params",
|
||||
type=str,
|
||||
|
@ -126,16 +125,16 @@ def main():
|
|||
parser.add_argument(
|
||||
"--phones-dict",
|
||||
type=str,
|
||||
default="phone_id_map.txt ",
|
||||
default="phone_id_map.txt",
|
||||
help="phone vocabulary file.")
|
||||
parser.add_argument(
|
||||
"--text",
|
||||
type=str,
|
||||
help="text to synthesize, a 'utt_id sentence' pair per line")
|
||||
parser.add_argument("--output-dir", type=str, help="output dir")
|
||||
help="text to synthesize, a 'utt_id sentence' pair per line.")
|
||||
parser.add_argument("--output-dir", type=str, help="output dir.")
|
||||
parser.add_argument(
|
||||
"--device", type=str, default="gpu", help="device type to use")
|
||||
parser.add_argument("--verbose", type=int, default=1, help="verbose")
|
||||
"--device", type=str, default="gpu", help="device type to use.")
|
||||
parser.add_argument("--verbose", type=int, default=1, help="verbose.")
|
||||
|
||||
args = parser.parse_args()
|
||||
with open(args.fastspeech2_config) as f:
|
||||
|
|
|
@ -169,18 +169,18 @@ def train_sp(args, config):
|
|||
|
||||
def main():
|
||||
# parse args and config and redirect to train_sp
|
||||
parser = argparse.ArgumentParser(description="Train a ParallelWaveGAN "
|
||||
parser = argparse.ArgumentParser(description="Train a FastSpeech2 "
|
||||
"model with Baker Mandrin TTS dataset.")
|
||||
parser.add_argument(
|
||||
"--config", type=str, help="config file to overwrite default config")
|
||||
parser.add_argument("--train-metadata", type=str, help="training data")
|
||||
parser.add_argument("--dev-metadata", type=str, help="dev data")
|
||||
parser.add_argument("--output-dir", type=str, help="output dir")
|
||||
"--config", type=str, help="config file to overwrite default config.")
|
||||
parser.add_argument("--train-metadata", type=str, help="training data.")
|
||||
parser.add_argument("--dev-metadata", type=str, help="dev data.")
|
||||
parser.add_argument("--output-dir", type=str, help="output dir.")
|
||||
parser.add_argument(
|
||||
"--device", type=str, default="gpu", help="device type to use")
|
||||
"--device", type=str, default="gpu", help="device type to use.")
|
||||
parser.add_argument(
|
||||
"--nprocs", type=int, default=1, help="number of processes")
|
||||
parser.add_argument("--verbose", type=int, default=1, help="verbose")
|
||||
"--nprocs", type=int, default=1, help="number of processes.")
|
||||
parser.add_argument("--verbose", type=int, default=1, help="verbose.")
|
||||
parser.add_argument(
|
||||
"--phones-dict",
|
||||
type=str,
|
||||
|
|
|
@ -27,10 +27,14 @@ class Clip(object):
|
|||
aux_context_window=0, ):
|
||||
"""Initialize customized collater for DataLoader.
|
||||
|
||||
Args:
|
||||
batch_max_steps (int): The maximum length of input signal in batch.
|
||||
hop_size (int): Hop size of auxiliary features.
|
||||
aux_context_window (int): Context window size for auxiliary feature conv.
|
||||
Parameters
|
||||
----------
|
||||
batch_max_steps : int
|
||||
The maximum length of input signal in batch.
|
||||
hop_size : int
|
||||
Hop size of auxiliary features.
|
||||
aux_context_window : int
|
||||
Context window size for auxiliary feature conv.
|
||||
|
||||
"""
|
||||
if batch_max_steps % hop_size != 0:
|
||||
|
@ -49,14 +53,18 @@ class Clip(object):
|
|||
def __call__(self, examples):
|
||||
"""Convert into batch tensors.
|
||||
|
||||
Args:
|
||||
batch (list): list of tuple of the pair of audio and features. Audio shape
|
||||
(T, ), features shape(T', C).
|
||||
Parameters
|
||||
----------
|
||||
batch : list
|
||||
list of tuple of the pair of audio and features. Audio shape (T, ), features shape(T', C).
|
||||
|
||||
Returns:
|
||||
Tensor: Auxiliary feature batch (B, C, T'), where
|
||||
T = (T' - 2 * aux_context_window) * hop_size.
|
||||
Tensor: Target signal batch (B, 1, T).
|
||||
Returns
|
||||
----------
|
||||
Tensor
|
||||
Auxiliary feature batch (B, C, T'), where
|
||||
T = (T' - 2 * aux_context_window) * hop_size.
|
||||
Tensor
|
||||
Target signal batch (B, 1, T).
|
||||
|
||||
"""
|
||||
# check length
|
||||
|
@ -93,10 +101,11 @@ class Clip(object):
|
|||
def _adjust_length(self, x, c):
|
||||
"""Adjust the audio and feature lengths.
|
||||
|
||||
Note:
|
||||
Basically we assume that the length of x and c are adjusted
|
||||
through preprocessing stage, but if we use other library processed
|
||||
features, this process will be needed.
|
||||
Note
|
||||
-------
|
||||
Basically we assume that the length of x and c are adjusted
|
||||
through preprocessing stage, but if we use other library processed
|
||||
features, this process will be needed.
|
||||
|
||||
"""
|
||||
if len(x) < c.shape[1] * self.hop_size:
|
||||
|
|
|
@ -82,7 +82,7 @@ lambda_adv: 4.0 # Loss balancing coefficient.
|
|||
###########################################################
|
||||
# DATA LOADER SETTING #
|
||||
###########################################################
|
||||
batch_size: 6 # Batch size.
|
||||
batch_size: 8 # Batch size.
|
||||
batch_max_steps: 25500 # Length of each audio in batch. Make sure it is divisible by hop_size.
|
||||
pin_memory: true # Whether to pin memory in Pytorch DataLoader.
|
||||
num_workers: 4 # Number of workers in Pytorch DataLoader.
|
||||
|
|
|
@ -12,88 +12,28 @@
|
|||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from typing import List, Dict, Any
|
||||
import soundfile as sf
|
||||
import librosa
|
||||
import numpy as np
|
||||
import argparse
|
||||
import yaml
|
||||
import json
|
||||
import jsonlines
|
||||
import concurrent.futures
|
||||
from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor
|
||||
from pathlib import Path
|
||||
import tqdm
|
||||
from operator import itemgetter
|
||||
from praatio import tgio
|
||||
from typing import List, Dict, Any
|
||||
|
||||
import argparse
|
||||
import jsonlines
|
||||
import librosa
|
||||
import logging
|
||||
import numpy as np
|
||||
import tqdm
|
||||
from concurrent.futures import ThreadPoolExecutor
|
||||
from parakeet.data.get_feats import LogMelFBank
|
||||
from pathlib import Path
|
||||
from praatio import tgio
|
||||
|
||||
from config import get_cfg_default
|
||||
|
||||
|
||||
def logmelfilterbank(audio,
                     sr,
                     n_fft=1024,
                     hop_length=256,
                     win_length=None,
                     window="hann",
                     n_mels=80,
                     fmin=None,
                     fmax=None,
                     eps=1e-10):
    """Compute log-Mel filterbank feature.

    Parameters
    ----------
    audio : ndarray
        Audio signal (T,).
    sr : int
        Sampling rate.
    n_fft : int
        FFT size. (Default value = 1024)
    hop_length : int
        Hop size. (Default value = 256)
    win_length : int
        Window length. If set to None, it will be the same as n_fft. (Default value = None)
    window : str
        Window function type. (Default value = "hann")
    n_mels : int
        Number of mel basis. (Default value = 80)
    fmin : int
        Minimum frequency in mel basis calculation. None means 0. (Default value = None)
    fmax : int
        Maximum frequency in mel basis calculation. None means sr / 2. (Default value = None)
    eps : float
        Epsilon value to avoid inf in log calculation. (Default value = 1e-10)

    Returns
    -------
    np.ndarray
        Log Mel filterbank feature (num_mels, #frames). Transpose the result
        to obtain a (#frames, num_mels) layout.

    """
    # get amplitude spectrogram
    x_stft = librosa.stft(
        audio,
        n_fft=n_fft,
        hop_length=hop_length,
        win_length=win_length,
        window=window,
        pad_mode="reflect")
    spc = np.abs(x_stft)  # (#bins, #frames,)

    # get mel basis
    fmin = 0 if fmin is None else fmin
    fmax = sr / 2 if fmax is None else fmax
    # Keyword arguments are accepted by every librosa release, whereas the
    # positional form is rejected once these parameters became keyword-only.
    mel_basis = librosa.filters.mel(
        sr=sr, n_fft=n_fft, n_mels=n_mels, fmin=fmin, fmax=fmax)

    # (num_mels, #bins) x (#bins, #frames) -> (num_mels, #frames)
    return np.log10(np.maximum(eps, np.dot(mel_basis, spc)))
|
||||
|
||||
|
||||
def process_sentence(config: Dict[str, Any],
|
||||
fp: Path,
|
||||
alignment_fp: Path,
|
||||
output_dir: Path):
|
||||
output_dir: Path,
|
||||
mel_extractor=None):
|
||||
utt_id = fp.stem
|
||||
|
||||
# reading
|
||||
|
@ -134,19 +74,11 @@ def process_sentence(config: Dict[str, Any],
|
|||
frame_length=config.trim_frame_length,
|
||||
hop_length=config.trim_hop_length)
|
||||
|
||||
logmel = logmelfilterbank(
|
||||
y,
|
||||
sr=sr,
|
||||
n_fft=config.n_fft,
|
||||
window=config.window,
|
||||
win_length=config.win_length,
|
||||
hop_length=config.hop_length,
|
||||
n_mels=config.n_mels,
|
||||
fmin=config.fmin,
|
||||
fmax=config.fmax)
|
||||
# extract mel feats
|
||||
logmel = mel_extractor.get_log_mel_fbank(y)
|
||||
|
||||
# adjust time to make num_samples == num_frames * hop_length
|
||||
num_frames = logmel.shape[1]
|
||||
num_frames = logmel.shape[0]
|
||||
if y.size < num_frames * config.hop_length:
|
||||
y = np.pad(y, (0, num_frames * config.hop_length - y.size),
|
||||
mode="reflect")
|
||||
|
@ -157,7 +89,7 @@ def process_sentence(config: Dict[str, Any],
|
|||
mel_path = output_dir / (utt_id + "_feats.npy")
|
||||
wav_path = output_dir / (utt_id + "_wave.npy")
|
||||
np.save(wav_path, y) # (num_samples, )
|
||||
np.save(mel_path, logmel.T) # (num_frames, n_mels)
|
||||
np.save(mel_path, logmel) # (num_frames, n_mels)
|
||||
record = {
|
||||
"utt_id": utt_id,
|
||||
"num_samples": num_sample,
|
||||
|
@ -172,19 +104,22 @@ def process_sentences(config,
|
|||
fps: List[Path],
|
||||
alignment_fps: List[Path],
|
||||
output_dir: Path,
|
||||
mel_extractor=None,
|
||||
nprocs: int=1):
|
||||
if nprocs == 1:
|
||||
results = []
|
||||
for fp, alignment_fp in tqdm.tqdm(zip(fps, alignment_fps)):
|
||||
results.append(
|
||||
process_sentence(config, fp, alignment_fp, output_dir))
|
||||
process_sentence(config, fp, alignment_fp, output_dir,
|
||||
mel_extractor))
|
||||
else:
|
||||
with ThreadPoolExecutor(nprocs) as pool:
|
||||
futures = []
|
||||
with tqdm.tqdm(total=len(fps)) as progress:
|
||||
for fp, alignment_fp in zip(fps, alignment_fps):
|
||||
future = pool.submit(process_sentence, config, fp,
|
||||
alignment_fp, output_dir)
|
||||
alignment_fp, output_dir,
|
||||
mel_extractor)
|
||||
future.add_done_callback(lambda p: progress.update())
|
||||
futures.append(future)
|
||||
|
||||
|
@ -260,24 +195,37 @@ def main():
|
|||
test_dump_dir = dumpdir / "test" / "raw"
|
||||
test_dump_dir.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
mel_extractor = LogMelFBank(
|
||||
sr=C.sr,
|
||||
n_fft=C.n_fft,
|
||||
hop_length=C.hop_length,
|
||||
win_length=C.win_length,
|
||||
window=C.window,
|
||||
n_mels=C.n_mels,
|
||||
fmin=C.fmin,
|
||||
fmax=C.fmax)
|
||||
|
||||
# process for the 3 sections
|
||||
process_sentences(
|
||||
C,
|
||||
train_wav_files,
|
||||
train_alignment_files,
|
||||
train_dump_dir,
|
||||
mel_extractor=mel_extractor,
|
||||
nprocs=args.num_cpu)
|
||||
process_sentences(
|
||||
C,
|
||||
dev_wav_files,
|
||||
dev_alignment_files,
|
||||
dev_dump_dir,
|
||||
mel_extractor=mel_extractor,
|
||||
nprocs=args.num_cpu)
|
||||
process_sentences(
|
||||
C,
|
||||
test_wav_files,
|
||||
test_alignment_files,
|
||||
test_dump_dir,
|
||||
mel_extractor=mel_extractor,
|
||||
nprocs=args.num_cpu)
|
||||
|
||||
|
||||
|
|
|
@ -78,16 +78,17 @@ class PWGUpdater(StandardUpdater):
|
|||
wav_ = self.generator(noise, mel)
|
||||
logging.debug(f"Generator takes {t.elapse}s.")
|
||||
|
||||
## Multi-resolution stft loss
|
||||
# initialize
|
||||
gen_loss = 0.0
|
||||
|
||||
## Multi-resolution stft loss
|
||||
with timer() as t:
|
||||
sc_loss, mag_loss = self.criterion_stft(
|
||||
wav_.squeeze(1), wav.squeeze(1))
|
||||
sc_loss, mag_loss = self.criterion_stft(wav_, wav)
|
||||
logging.debug(f"Multi-resolution STFT loss takes {t.elapse}s.")
|
||||
|
||||
report("train/spectral_convergence_loss", float(sc_loss))
|
||||
report("train/log_stft_magnitude_loss", float(mag_loss))
|
||||
gen_loss = sc_loss + mag_loss
|
||||
gen_loss += sc_loss + mag_loss
|
||||
|
||||
## Adversarial loss
|
||||
if self.state.iteration > self.discriminator_train_start_steps:
|
||||
|
@ -119,9 +120,9 @@ class PWGUpdater(StandardUpdater):
|
|||
p_ = self.discriminator(wav_.detach())
|
||||
real_loss = self.criterion_mse(p, paddle.ones_like(p))
|
||||
fake_loss = self.criterion_mse(p_, paddle.zeros_like(p_))
|
||||
dis_loss = real_loss + fake_loss
|
||||
report("train/real_loss", float(real_loss))
|
||||
report("train/fake_loss", float(fake_loss))
|
||||
dis_loss = real_loss + fake_loss
|
||||
report("train/discriminator_loss", float(dis_loss))
|
||||
|
||||
self.optimizer_d.clear_grad()
|
||||
|
@ -164,8 +165,7 @@ class PWGEvaluator(StandardEvaluator):
|
|||
|
||||
# stft loss
|
||||
with timer() as t:
|
||||
sc_loss, mag_loss = self.criterion_stft(
|
||||
wav_.squeeze(1), wav.squeeze(1))
|
||||
sc_loss, mag_loss = self.criterion_stft(wav_, wav)
|
||||
logging.debug(f"Multi-resolution STFT loss takes {t.elapse}s")
|
||||
|
||||
report("eval/spectral_convergence_loss", float(sc_loss))
|
||||
|
@ -178,7 +178,7 @@ class PWGEvaluator(StandardEvaluator):
|
|||
p = self.discriminator(wav)
|
||||
real_loss = self.criterion_mse(p, paddle.ones_like(p))
|
||||
fake_loss = self.criterion_mse(p_, paddle.zeros_like(p_))
|
||||
dis_loss = real_loss + fake_loss
|
||||
report("eval/real_loss", float(real_loss))
|
||||
report("eval/fake_loss", float(fake_loss))
|
||||
dis_loss = real_loss + fake_loss
|
||||
report("eval/discriminator_loss", float(dis_loss))
|
||||
|
|
|
@ -32,14 +32,14 @@ from parakeet.models.parallel_wavegan import PWGGenerator
|
|||
from config import get_cfg_default
|
||||
|
||||
parser = argparse.ArgumentParser(
|
||||
description="synthesize with parallel wavegan.")
|
||||
description="Synthesize with parallel wavegan.")
|
||||
parser.add_argument(
|
||||
"--config", type=str, help="config file to overwrite default config")
|
||||
parser.add_argument("--checkpoint", type=str, help="snapshot to load")
|
||||
parser.add_argument("--test-metadata", type=str, help="dev data")
|
||||
parser.add_argument("--output-dir", type=str, help="output dir")
|
||||
parser.add_argument("--device", type=str, default="gpu", help="device to run")
|
||||
parser.add_argument("--verbose", type=int, default=1, help="verbose")
|
||||
"--config", type=str, help="config file to overwrite default config.")
|
||||
parser.add_argument("--checkpoint", type=str, help="snapshot to load.")
|
||||
parser.add_argument("--test-metadata", type=str, help="dev data.")
|
||||
parser.add_argument("--output-dir", type=str, help="output dir.")
|
||||
parser.add_argument("--device", type=str, default="gpu", help="device to run.")
|
||||
parser.add_argument("--verbose", type=int, default=1, help="verbose.")
|
||||
|
||||
args = parser.parse_args()
|
||||
config = get_cfg_default()
|
||||
|
@ -89,5 +89,5 @@ for example in test_dataset:
|
|||
print(
|
||||
f"{utt_id}, mel: {mel.shape}, wave: {wav.shape}, time: {t.elapse}s, Hz: {speed}, RTF: {config.sr / speed}."
|
||||
)
|
||||
sf.write(output_dir / (utt_id + ".wav"), wav, samplerate=config.sr)
|
||||
sf.write(str(output_dir / (utt_id + ".wav")), wav, samplerate=config.sr)
|
||||
print(f"generation speed: {N / T}Hz, RTF: {config.sr / (N / T) }")
|
||||
|
|
|
@ -0,0 +1,5 @@
|
|||
# Run synthesize.py with the default config and the iteration-220000 snapshot
# on the normalized test metadata, writing results under exp/debug/test.
config=conf/default.yaml
checkpoint=exp/default/checkpoints/snapshot_iter_220000.pdz
test_metadata=dump/test/norm/metadata.jsonl
output_dir=exp/debug/test

python3 synthesize.py \
    --config="${config}" \
    --checkpoint="${checkpoint}" \
    --test-metadata="${test_metadata}" \
    --output-dir="${output_dir}"
|
|
@ -0,0 +1,111 @@
|
|||
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import argparse
|
||||
import os
|
||||
import logging
|
||||
from pathlib import Path
|
||||
|
||||
import librosa
|
||||
import numpy as np
|
||||
import paddle
|
||||
import soundfile as sf
|
||||
import yaml
|
||||
from parakeet.data.get_feats import LogMelFBank
|
||||
from parakeet.models.parallel_wavegan import PWGGenerator, PWGInference
|
||||
from parakeet.modules.normalizer import ZScore
|
||||
|
||||
from config import get_cfg_default
|
||||
|
||||
|
||||
def evaluate(args, config):
    """Regenerate every audio file of a directory with Parallel WaveGAN.

    Each entry of ``args.input_dir`` is loaded at ``config.sr``, converted to
    a log-mel feature, passed through the normalizer + generator pipeline and
    written to ``args.output_dir`` as ``gen_<original file name>``.

    Parameters
    ----------
    args : argparse.Namespace
        Must provide ``checkpoint``, ``stat``, ``input_dir`` and ``output_dir``.
    config
        Configuration object; ``generator_params`` is read by key, the audio
        settings (``sr``, ``n_fft``, ``hop_length``, ``win_length``,
        ``window``, ``n_mels``, ``fmin``, ``fmax``) by attribute.

    """
    # dataloader has been too verbose
    logging.getLogger("DataLoader").disabled = True

    # Restore the generator weights and switch it to inference mode.
    generator = PWGGenerator(**config["generator_params"])
    checkpoint = paddle.load(args.checkpoint)
    generator.set_state_dict(checkpoint["generator_params"])
    generator.remove_weight_norm()
    generator.eval()
    print("model done!")

    # The statistics file unpacks into exactly two arrays: mean, then std.
    mean, scale = np.load(args.stat)
    normalizer = ZScore(paddle.to_tensor(mean), paddle.to_tensor(scale))
    pwg_inference = PWGInference(normalizer, generator)

    input_dir = Path(args.input_dir)
    output_dir = Path(args.output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    mel_extractor = LogMelFBank(
        sr=config.sr,
        n_fft=config.n_fft,
        hop_length=config.hop_length,
        win_length=config.win_length,
        window=config.window,
        n_mels=config.n_mels,
        fmin=config.fmin,
        fmax=config.fmax)

    for utt_name in os.listdir(input_dir):
        source_path = input_dir / utt_name
        target_path = output_dir / ("gen_" + utt_name)

        audio, _ = librosa.load(str(source_path), sr=config.sr)
        # extract mel feats
        feats = paddle.to_tensor(mel_extractor.get_log_mel_fbank(audio))
        generated = pwg_inference(feats)
        sf.write(str(target_path), generated.numpy(), samplerate=config.sr)
        print(f"{utt_name} done!")
|
||||
|
||||
|
||||
def main():
    """Parse the command line, build the config and run ``evaluate``.

    The default config is loaded first and then overridden by the file given
    with ``--config`` (if any). Both the parsed arguments and the resulting
    config are printed before synthesis starts.
    """
    cli = argparse.ArgumentParser(
        description="Synthesize with parallel wavegan.")

    cli.add_argument(
        "--config", type=str, help="config file to overwrite default config.")
    cli.add_argument("--checkpoint", type=str, help="snapshot to load.")
    cli.add_argument(
        "--stat",
        type=str,
        help="mean and standard deviation used to normalize spectrogram "
        "when training parallel wavegan.")
    cli.add_argument("--input-dir", type=str, help="input dir of wavs.")
    cli.add_argument("--output-dir", type=str, help="output dir.")
    # NOTE(review): --device is accepted here, but neither this function nor
    # evaluate() reads args.device -- confirm whether it should be applied.
    cli.add_argument(
        "--device", type=str, default="gpu", help="device to run.")
    cli.add_argument("--verbose", type=int, default=1, help="verbose.")

    args = cli.parse_args()

    # Start from the defaults; a user-supplied file only overrides them.
    config = get_cfg_default()
    if args.config:
        config.merge_from_file(args.config)

    print("========Args========")
    print(yaml.safe_dump(vars(args)))
    print("========Config========")
    print(config)

    evaluate(args, config)


if __name__ == "__main__":
    main()
|
|
@ -12,36 +12,29 @@
|
|||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import os
|
||||
import sys
|
||||
import logging
|
||||
import argparse
|
||||
import dataclasses
|
||||
from pathlib import Path
|
||||
import os
|
||||
import logging
|
||||
|
||||
import yaml
|
||||
import jsonlines
|
||||
import paddle
|
||||
import numpy as np
|
||||
from paddle import nn
|
||||
from paddle.nn import functional as F
|
||||
import paddle
|
||||
import yaml
|
||||
from paddle import DataParallel
|
||||
from paddle import distributed as dist
|
||||
from paddle import nn
|
||||
from paddle.io import DataLoader, DistributedBatchSampler
|
||||
from paddle.optimizer import Adam # No RAdam
|
||||
from paddle.optimizer.lr import StepDecay
|
||||
from paddle import DataParallel
|
||||
from visualdl import LogWriter
|
||||
|
||||
from parakeet.datasets.data_table import DataTable
|
||||
from parakeet.training.updater import UpdaterBase
|
||||
from parakeet.training.trainer import Trainer
|
||||
from parakeet.training.reporter import report
|
||||
from parakeet.training import extension
|
||||
from parakeet.training.extensions.snapshot import Snapshot
|
||||
from parakeet.training.extensions.visualizer import VisualDL
|
||||
from parakeet.models.parallel_wavegan import PWGGenerator, PWGDiscriminator
|
||||
from parakeet.modules.stft_loss import MultiResolutionSTFTLoss
|
||||
from parakeet.training.extensions.snapshot import Snapshot
|
||||
from parakeet.training.extensions.visualizer import VisualDL
|
||||
from parakeet.training.seeding import seed_everything
|
||||
from parakeet.training.trainer import Trainer
|
||||
from pathlib import Path
|
||||
from visualdl import LogWriter
|
||||
|
||||
from batch_fn import Clip
|
||||
from config import get_cfg_default
|
||||
|
@ -210,15 +203,15 @@ def main():
|
|||
parser = argparse.ArgumentParser(description="Train a ParallelWaveGAN "
|
||||
"model with Baker Mandrin TTS dataset.")
|
||||
parser.add_argument(
|
||||
"--config", type=str, help="config file to overwrite default config")
|
||||
parser.add_argument("--train-metadata", type=str, help="training data")
|
||||
parser.add_argument("--dev-metadata", type=str, help="dev data")
|
||||
parser.add_argument("--output-dir", type=str, help="output dir")
|
||||
"--config", type=str, help="config file to overwrite default config.")
|
||||
parser.add_argument("--train-metadata", type=str, help="training data.")
|
||||
parser.add_argument("--dev-metadata", type=str, help="dev data.")
|
||||
parser.add_argument("--output-dir", type=str, help="output dir.")
|
||||
parser.add_argument(
|
||||
"--device", type=str, default="gpu", help="device type to use")
|
||||
"--device", type=str, default="gpu", help="device type to use.")
|
||||
parser.add_argument(
|
||||
"--nprocs", type=int, default=1, help="number of processes")
|
||||
parser.add_argument("--verbose", type=int, default=1, help="verbose")
|
||||
"--nprocs", type=int, default=1, help="number of processes.")
|
||||
parser.add_argument("--verbose", type=int, default=1, help="verbose.")
|
||||
|
||||
args = parser.parse_args()
|
||||
if args.device == "cpu" and args.nprocs > 1:
|
||||
|
|
|
@ -12,94 +12,34 @@
|
|||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from typing import List, Dict, Any
|
||||
import soundfile as sf
|
||||
import librosa
|
||||
import numpy as np
|
||||
import argparse
|
||||
import yaml
|
||||
import json
|
||||
import re
|
||||
import jsonlines
|
||||
import concurrent.futures
|
||||
from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor
|
||||
from pathlib import Path
|
||||
import tqdm
|
||||
from operator import itemgetter
|
||||
from praatio import tgio
|
||||
from typing import List, Dict, Any
|
||||
|
||||
import argparse
|
||||
import jsonlines
|
||||
import librosa
|
||||
import logging
|
||||
import numpy as np
|
||||
import re
|
||||
import tqdm
|
||||
from concurrent.futures import ThreadPoolExecutor
|
||||
from parakeet.data.get_feats import LogMelFBank
|
||||
from pathlib import Path
|
||||
from praatio import tgio
|
||||
|
||||
from config import get_cfg_default
|
||||
from tg_utils import validate_textgrid
|
||||
|
||||
|
||||
def logmelfilterbank(audio,
                     sr,
                     n_fft=1024,
                     hop_length=256,
                     win_length=None,
                     window="hann",
                     n_mels=80,
                     fmin=None,
                     fmax=None,
                     eps=1e-10):
    """Compute log-Mel filterbank feature.

    Parameters
    ----------
    audio : ndarray
        Audio signal (T,).
    sr : int
        Sampling rate.
    n_fft : int
        FFT size. (Default value = 1024)
    hop_length : int
        Hop size. (Default value = 256)
    win_length : int
        Window length. If set to None, it will be the same as n_fft. (Default value = None)
    window : str
        Window function type. (Default value = "hann")
    n_mels : int
        Number of mel basis. (Default value = 80)
    fmin : int
        Minimum frequency in mel basis calculation. None means 0. (Default value = None)
    fmax : int
        Maximum frequency in mel basis calculation. None means sr / 2. (Default value = None)
    eps : float
        Epsilon value to avoid inf in log calculation. (Default value = 1e-10)

    Returns
    -------
    np.ndarray
        Log Mel filterbank feature (num_mels, #frames). Transpose the result
        to obtain a (#frames, num_mels) layout.

    """
    # get amplitude spectrogram
    x_stft = librosa.stft(
        audio,
        n_fft=n_fft,
        hop_length=hop_length,
        win_length=win_length,
        window=window,
        pad_mode="reflect")
    spc = np.abs(x_stft)  # (#bins, #frames,)

    # get mel basis
    fmin = 0 if fmin is None else fmin
    fmax = sr / 2 if fmax is None else fmax
    # Keyword arguments are accepted by every librosa release, whereas the
    # positional form is rejected once these parameters became keyword-only.
    mel_basis = librosa.filters.mel(
        sr=sr, n_fft=n_fft, n_mels=n_mels, fmin=fmin, fmax=fmax)

    # (num_mels, #bins) x (#bins, #frames) -> (num_mels, #frames)
    return np.log10(np.maximum(eps, np.dot(mel_basis, spc)))
|
||||
|
||||
|
||||
def process_sentence(config: Dict[str, Any],
|
||||
fp: Path,
|
||||
alignment_fp: Path,
|
||||
output_dir: Path):
|
||||
output_dir: Path,
|
||||
mel_extractor=None):
|
||||
utt_id = fp.stem
|
||||
|
||||
# reading
|
||||
y, sr = librosa.load(fp, sr=config.sr) # resampling may occur
|
||||
y, sr = librosa.load(str(fp), sr=config.sr) # resampling may occur
|
||||
assert len(y.shape) == 1, f"{utt_id} is not a mono-channel audio."
|
||||
assert np.abs(y).max(
|
||||
) <= 1.0, f"{utt_id} is seems to be different that 16 bit PCM."
|
||||
|
@ -125,16 +65,8 @@ def process_sentence(config: Dict[str, Any],
|
|||
f" There is something wrong with the last interval {last} in utterance: {utt_id}"
|
||||
)
|
||||
|
||||
logmel = logmelfilterbank(
|
||||
y,
|
||||
sr=sr,
|
||||
n_fft=config.n_fft,
|
||||
window=config.window,
|
||||
win_length=config.win_length,
|
||||
hop_length=config.hop_length,
|
||||
n_mels=config.n_mels,
|
||||
fmin=config.fmin,
|
||||
fmax=config.fmax)
|
||||
# extract mel feats
|
||||
logmel = mel_extractor.get_log_mel_fbank(y)
|
||||
|
||||
# extract phone and duration
|
||||
phones = []
|
||||
|
@ -162,7 +94,7 @@ def process_sentence(config: Dict[str, Any],
|
|||
ends, sr=sr, hop_length=config.hop_length)
|
||||
durations_frame = np.diff(frame_pos, prepend=0)
|
||||
|
||||
num_frames = logmel.shape[-1] # number of frames of the spectrogram
|
||||
num_frames = logmel.shape[0] # number of frames of the spectrogram
|
||||
extra = np.sum(durations_frame) - num_frames
|
||||
assert extra <= 0, (
|
||||
f"Number of frames inferred from alignemnt is "
|
||||
|
@ -173,7 +105,7 @@ def process_sentence(config: Dict[str, Any],
|
|||
durations_frame = durations_frame.tolist()
|
||||
|
||||
mel_path = output_dir / (utt_id + "_feats.npy")
|
||||
np.save(mel_path, logmel.T) # (num_frames, n_mels)
|
||||
np.save(mel_path, logmel) # (num_frames, n_mels)
|
||||
record = {
|
||||
"utt_id": utt_id,
|
||||
"phones": phones,
|
||||
|
@ -190,20 +122,23 @@ def process_sentences(config,
|
|||
fps: List[Path],
|
||||
alignment_fps: List[Path],
|
||||
output_dir: Path,
|
||||
mel_extractor=None,
|
||||
nprocs: int=1):
|
||||
if nprocs == 1:
|
||||
results = []
|
||||
for fp, alignment_fp in tqdm.tqdm(
|
||||
zip(fps, alignment_fps), total=len(fps)):
|
||||
results.append(
|
||||
process_sentence(config, fp, alignment_fp, output_dir))
|
||||
process_sentence(config, fp, alignment_fp, output_dir,
|
||||
mel_extractor))
|
||||
else:
|
||||
with ThreadPoolExecutor(nprocs) as pool:
|
||||
futures = []
|
||||
with tqdm.tqdm(total=len(fps)) as progress:
|
||||
for fp, alignment_fp in zip(fps, alignment_fps):
|
||||
future = pool.submit(process_sentence, config, fp,
|
||||
alignment_fp, output_dir)
|
||||
alignment_fp, output_dir,
|
||||
mel_extractor)
|
||||
future.add_done_callback(lambda p: progress.update())
|
||||
futures.append(future)
|
||||
|
||||
|
@ -284,24 +219,37 @@ def main():
|
|||
test_dump_dir = dumpdir / "test" / "raw"
|
||||
test_dump_dir.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
mel_extractor = LogMelFBank(
|
||||
sr=C.sr,
|
||||
n_fft=C.n_fft,
|
||||
hop_length=C.hop_length,
|
||||
win_length=C.win_length,
|
||||
window=C.window,
|
||||
n_mels=C.n_mels,
|
||||
fmin=C.fmin,
|
||||
fmax=C.fmax)
|
||||
|
||||
# process for the 3 sections
|
||||
process_sentences(
|
||||
C,
|
||||
train_wav_files,
|
||||
train_alignment_files,
|
||||
train_dump_dir,
|
||||
mel_extractor=mel_extractor,
|
||||
nprocs=args.num_cpu)
|
||||
process_sentences(
|
||||
C,
|
||||
dev_wav_files,
|
||||
dev_alignment_files,
|
||||
dev_dump_dir,
|
||||
mel_extractor=mel_extractor,
|
||||
nprocs=args.num_cpu)
|
||||
process_sentences(
|
||||
C,
|
||||
test_wav_files,
|
||||
test_alignment_files,
|
||||
test_dump_dir,
|
||||
mel_extractor=mel_extractor,
|
||||
nprocs=args.num_cpu)
|
||||
|
||||
|
||||
|
|
|
@ -12,40 +12,31 @@
|
|||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import os
|
||||
import sys
|
||||
import logging
|
||||
import argparse
|
||||
import dataclasses
|
||||
from pathlib import Path
|
||||
import logging
|
||||
import os
|
||||
|
||||
import yaml
|
||||
import jsonlines
|
||||
import paddle
|
||||
import numpy as np
|
||||
from paddle import nn
|
||||
from paddle.nn import functional as F
|
||||
import paddle
|
||||
import yaml
|
||||
from paddle import distributed as dist
|
||||
from paddle import DataParallel
|
||||
from paddle import nn
|
||||
from paddle.io import DataLoader, DistributedBatchSampler
|
||||
from paddle.optimizer import Adam # No RAdam
|
||||
from paddle.optimizer.lr import StepDecay
|
||||
from paddle import DataParallel
|
||||
from visualdl import LogWriter
|
||||
|
||||
from parakeet.datasets.data_table import DataTable
|
||||
from parakeet.models.speedyspeech import SpeedySpeech
|
||||
|
||||
from parakeet.training.updater import UpdaterBase
|
||||
from parakeet.training.trainer import Trainer
|
||||
from parakeet.training.reporter import report
|
||||
from parakeet.training import extension
|
||||
from parakeet.training.extensions.snapshot import Snapshot
|
||||
from parakeet.training.extensions.visualizer import VisualDL
|
||||
from parakeet.training.seeding import seed_everything
|
||||
from parakeet.training.trainer import Trainer
|
||||
from pathlib import Path
|
||||
from visualdl import LogWriter
|
||||
|
||||
from batch_fn import collate_baker_examples
|
||||
from speedyspeech_updater import SpeedySpeechUpdater, SpeedySpeechEvaluator
|
||||
from config import get_cfg_default
|
||||
from speedyspeech_updater import SpeedySpeechUpdater, SpeedySpeechEvaluator
|
||||
|
||||
|
||||
def train_sp(args, config):
|
||||
|
@ -93,10 +84,6 @@ def train_sp(args, config):
|
|||
batch_size=config.batch_size,
|
||||
shuffle=False,
|
||||
drop_last=True)
|
||||
# dev_sampler = DistributedBatchSampler(dev_dataset,
|
||||
# batch_size=config.batch_size,
|
||||
# shuffle=False,
|
||||
# drop_last=False)
|
||||
print("samplers done!")
|
||||
|
||||
train_dataloader = DataLoader(
|
||||
|
@ -113,13 +100,9 @@ def train_sp(args, config):
|
|||
num_workers=config.num_workers)
|
||||
print("dataloaders done!")
|
||||
|
||||
# batch = collate_baker_examples([train_dataset[i] for i in range(10)])
|
||||
# # batch = collate_baker_examples([dev_dataset[i] for i in range(10)])
|
||||
# import pdb; pdb.set_trace()
|
||||
model = SpeedySpeech(**config["model"])
|
||||
if world_size > 1:
|
||||
model = DataParallel(model) # TODO, do not use vocab size from config
|
||||
# print(model)
|
||||
print("model done!")
|
||||
optimizer = Adam(
|
||||
0.001,
|
||||
|
@ -147,18 +130,18 @@ def train_sp(args, config):
|
|||
|
||||
def main():
|
||||
# parse args and config and redirect to train_sp
|
||||
parser = argparse.ArgumentParser(description="Train a ParallelWaveGAN "
|
||||
parser = argparse.ArgumentParser(description="Train a SpeedySpeech "
|
||||
"model with Baker Mandrin TTS dataset.")
|
||||
parser.add_argument(
|
||||
"--config", type=str, help="config file to overwrite default config")
|
||||
parser.add_argument("--train-metadata", type=str, help="training data")
|
||||
parser.add_argument("--dev-metadata", type=str, help="dev data")
|
||||
parser.add_argument("--output-dir", type=str, help="output dir")
|
||||
"--config", type=str, help="config file to overwrite default config.")
|
||||
parser.add_argument("--train-metadata", type=str, help="training data.")
|
||||
parser.add_argument("--dev-metadata", type=str, help="dev data.")
|
||||
parser.add_argument("--output-dir", type=str, help="output dir.")
|
||||
parser.add_argument(
|
||||
"--device", type=str, default="gpu", help="device type to use")
|
||||
"--device", type=str, default="gpu", help="device type to use.")
|
||||
parser.add_argument(
|
||||
"--nprocs", type=int, default=1, help="number of processes")
|
||||
parser.add_argument("--verbose", type=int, default=1, help="verbose")
|
||||
"--nprocs", type=int, default=1, help="number of processes.")
|
||||
parser.add_argument("--verbose", type=int, default=1, help="verbose.")
|
||||
|
||||
args = parser.parse_args()
|
||||
if args.device == "cpu" and args.nprocs > 1:
|
||||
|
|
|
@ -27,5 +27,6 @@
|
|||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from parakeet.data.dataset import *
|
||||
from parakeet.data.batch import *
|
||||
from parakeet.data.dataset import *
|
||||
from parakeet.data.get_feats import *
|
||||
|
|
|
@ -17,8 +17,6 @@ import numpy as np
|
|||
import pyworld
|
||||
from scipy.interpolate import interp1d
|
||||
|
||||
from config import get_cfg_default
|
||||
|
||||
|
||||
class LogMelFBank():
|
||||
def __init__(self,
|
||||
|
@ -42,8 +40,8 @@ class LogMelFBank():
|
|||
|
||||
# mel
|
||||
self.n_mels = n_mels
|
||||
self.fmin = fmin
|
||||
self.fmax = fmax
|
||||
self.fmin = 0 if fmin is None else fmin
|
||||
self.fmax = sr / 2 if fmax is None else fmax
|
||||
|
||||
self.mel_filter = self._create_mel_filter()
|
||||
|
||||
|
@ -217,41 +215,3 @@ class Energy():
|
|||
if use_token_averaged_energy and duration is not None:
|
||||
energy = self._average_by_duration(energy, duration)
|
||||
return energy
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
C = get_cfg_default()
|
||||
filename = "../raw_data/data/format.1/000001.flac"
|
||||
wav, _ = librosa.load(filename, sr=C.fs)
|
||||
mel_extractor = LogMelFBank(
|
||||
sr=C.fs,
|
||||
n_fft=C.n_fft,
|
||||
hop_length=C.n_shift,
|
||||
win_length=C.win_length,
|
||||
window=C.window,
|
||||
n_mels=C.n_mels,
|
||||
fmin=C.fmin,
|
||||
fmax=C.fmax, )
|
||||
mel = mel_extractor.get_log_mel_fbank(wav)
|
||||
print(mel)
|
||||
print(mel.shape)
|
||||
|
||||
pitch_extractor = Pitch(
|
||||
sr=C.fs, hop_length=C.n_shift, f0min=C.f0min, f0max=C.f0max)
|
||||
duration = "2 8 8 8 12 11 10 13 11 10 18 9 12 10 12 11 5"
|
||||
duration = np.array([int(x) for x in duration.split(" ")])
|
||||
avg_f0 = pitch_extractor.get_pitch(wav, duration=duration)
|
||||
print(avg_f0)
|
||||
print(avg_f0.shape)
|
||||
|
||||
energy_extractor = Energy(
|
||||
sr=C.fs,
|
||||
n_fft=C.n_fft,
|
||||
hop_length=C.n_shift,
|
||||
win_length=C.win_length,
|
||||
window=C.window)
|
||||
duration = "2 8 8 8 12 11 10 13 11 10 18 9 12 10 12 11 5"
|
||||
duration = np.array([int(x) for x in duration.split(" ")])
|
||||
avg_energy = energy_extractor.get_energy(wav, duration=duration)
|
||||
print(avg_energy)
|
||||
print(avg_energy.sum())
|
|
@ -109,4 +109,5 @@ class Frontend():
|
|||
def get_phonemes(self, sentence):
|
||||
sentences = self.text_normalizer.normalize(sentence)
|
||||
phonemes = self._g2p(sentences)
|
||||
print("phonemes:", phonemes)
|
||||
return phonemes
|
||||
|
|
|
@ -15,7 +15,6 @@
|
|||
|
||||
from typing import Dict, Sequence, Tuple
|
||||
|
||||
import numpy as np
|
||||
import paddle
|
||||
from paddle import nn
|
||||
from parakeet.modules.fastspeech2_predictor.duration_predictor import DurationPredictor, DurationPredictorLoss
|
||||
|
@ -252,36 +251,36 @@ class FastSpeech2(nn.Layer):
|
|||
|
||||
Parameters
|
||||
----------
|
||||
text : Tensor
|
||||
Batch of padded token ids (B, Tmax).
|
||||
text_lengths : Tensor
|
||||
Batch of lengths of each input (B,).
|
||||
speech : Tensor
|
||||
Batch of padded target features (B, Lmax, odim).
|
||||
speech_lengths : Tensor
|
||||
Batch of the lengths of each target (B,).
|
||||
durations : Tensor
|
||||
Batch of padded durations (B, Tmax).
|
||||
pitch : Tensor
|
||||
Batch of padded token-averaged pitch (B, Tmax, 1).
|
||||
energy : Tensor
|
||||
Batch of padded token-averaged energy (B, Tmax, 1).
|
||||
text : Tensor
|
||||
Batch of padded token ids (B, Tmax).
|
||||
text_lengths : Tensor
|
||||
Batch of lengths of each input (B,).
|
||||
speech : Tensor
|
||||
Batch of padded target features (B, Lmax, odim).
|
||||
speech_lengths : Tensor
|
||||
Batch of the lengths of each target (B,).
|
||||
durations : Tensor
|
||||
Batch of padded durations (B, Tmax).
|
||||
pitch : Tensor
|
||||
Batch of padded token-averaged pitch (B, Tmax, 1).
|
||||
energy : Tensor
|
||||
Batch of padded token-averaged energy (B, Tmax, 1).
|
||||
Returns
|
||||
----------
|
||||
Tensor
|
||||
mel outs before postnet
|
||||
Tensor
|
||||
mel outs after postnet
|
||||
Tensor
|
||||
duration predictor's output
|
||||
Tensor
|
||||
pitch predictor's output
|
||||
Tensor
|
||||
energy predictor's output
|
||||
Tensor
|
||||
speech
|
||||
Tensor
|
||||
speech_lengths, modified if reduction_factor >1
|
||||
Tensor
|
||||
mel outs before postnet
|
||||
Tensor
|
||||
mel outs after postnet
|
||||
Tensor
|
||||
duration predictor's output
|
||||
Tensor
|
||||
pitch predictor's output
|
||||
Tensor
|
||||
energy predictor's output
|
||||
Tensor
|
||||
speech
|
||||
Tensor
|
||||
speech_lengths, modified if reduction_factor > 1
|
||||
"""
|
||||
|
||||
xs = text
|
||||
|
@ -389,26 +388,26 @@ class FastSpeech2(nn.Layer):
|
|||
|
||||
Parameters
|
||||
----------
|
||||
text : Tensor
|
||||
Input sequence of characters (T,).
|
||||
speech : Tensor, optional
|
||||
Feature sequence to extract style (N, idim).
|
||||
durations : Tensor, optional
|
||||
Groundtruth of duration (T,).
|
||||
pitch : Tensor, optional
|
||||
Groundtruth of token-averaged pitch (T, 1).
|
||||
energy : Tensor, optional
|
||||
Groundtruth of token-averaged energy (T, 1).
|
||||
alpha : float, optional
|
||||
Alpha to control the speed.
|
||||
use_teacher_forcing : bool, optional
|
||||
Whether to use teacher forcing.
|
||||
If true, groundtruth of duration, pitch and energy will be used.
|
||||
text : Tensor
|
||||
Input sequence of characters (T,).
|
||||
speech : Tensor, optional
|
||||
Feature sequence to extract style (N, idim).
|
||||
durations : Tensor, optional
|
||||
Groundtruth of duration (T,).
|
||||
pitch : Tensor, optional
|
||||
Groundtruth of token-averaged pitch (T, 1).
|
||||
energy : Tensor, optional
|
||||
Groundtruth of token-averaged energy (T, 1).
|
||||
alpha : float, optional
|
||||
Alpha to control the speed.
|
||||
use_teacher_forcing : bool, optional
|
||||
Whether to use teacher forcing.
|
||||
If true, groundtruth of duration, pitch and energy will be used.
|
||||
|
||||
Returns
|
||||
----------
|
||||
Tensor
|
||||
Output sequence of features (L, odim).
|
||||
Tensor
|
||||
Output sequence of features (L, odim).
|
||||
"""
|
||||
x, y = text, speech
|
||||
d, p, e = durations, pitch, energy
|
||||
|
@ -448,21 +447,21 @@ class FastSpeech2(nn.Layer):
|
|||
|
||||
Parameters
|
||||
----------
|
||||
ilens : Tensor
|
||||
Batch of lengths (B,).
|
||||
ilens : Tensor
|
||||
Batch of lengths (B,).
|
||||
|
||||
Returns
|
||||
-------
|
||||
Tensor
|
||||
Mask tensor for self-attention.
|
||||
dtype=paddle.bool
|
||||
Tensor
|
||||
Mask tensor for self-attention.
|
||||
dtype=paddle.bool
|
||||
|
||||
Examples
|
||||
-------
|
||||
>>> ilens = [5, 3]
|
||||
>>> self._source_mask(ilens)
|
||||
tensor([[[1, 1, 1, 1, 1],
|
||||
[1, 1, 1, 0, 0]]]) bool
|
||||
>>> ilens = [5, 3]
|
||||
>>> self._source_mask(ilens)
|
||||
tensor([[[1, 1, 1, 1, 1],
|
||||
[1, 1, 1, 0, 0]]]) bool
|
||||
|
||||
"""
|
||||
x_masks = make_non_pad_mask(ilens)
|
||||
|
@ -509,10 +508,10 @@ class FastSpeech2Loss(nn.Layer):
|
|||
|
||||
Parameters
|
||||
----------
|
||||
use_masking : bool
|
||||
Whether to apply masking for padded part in loss calculation.
|
||||
use_weighted_masking : bool
|
||||
Whether to apply weighted masking in loss calculation.
|
||||
use_masking : bool
|
||||
Whether to apply masking for padded part in loss calculation.
|
||||
use_weighted_masking : bool
|
||||
Whether to apply weighted masking in loss calculation.
|
||||
"""
|
||||
assert check_argument_types()
|
||||
super().__init__()
|
||||
|
@ -545,39 +544,39 @@ class FastSpeech2Loss(nn.Layer):
|
|||
|
||||
Parameters
|
||||
----------
|
||||
after_outs : Tensor
|
||||
Batch of outputs after postnets (B, Lmax, odim).
|
||||
before_outs : Tensor
|
||||
Batch of outputs before postnets (B, Lmax, odim).
|
||||
d_outs : Tensor
|
||||
Batch of outputs of duration predictor (B, Tmax).
|
||||
p_outs : Tensor
|
||||
Batch of outputs of pitch predictor (B, Tmax, 1).
|
||||
e_outs : Tensor
|
||||
Batch of outputs of energy predictor (B, Tmax, 1).
|
||||
ys : Tensor
|
||||
Batch of target features (B, Lmax, odim).
|
||||
ds : Tensor
|
||||
Batch of durations (B, Tmax).
|
||||
ps : Tensor
|
||||
Batch of target token-averaged pitch (B, Tmax, 1).
|
||||
es : Tensor
|
||||
Batch of target token-averaged energy (B, Tmax, 1).
|
||||
ilens : Tensor
|
||||
Batch of the lengths of each input (B,).
|
||||
olens : Tensor
|
||||
Batch of the lengths of each target (B,).
|
||||
after_outs : Tensor
|
||||
Batch of outputs after postnets (B, Lmax, odim).
|
||||
before_outs : Tensor
|
||||
Batch of outputs before postnets (B, Lmax, odim).
|
||||
d_outs : Tensor
|
||||
Batch of outputs of duration predictor (B, Tmax).
|
||||
p_outs : Tensor
|
||||
Batch of outputs of pitch predictor (B, Tmax, 1).
|
||||
e_outs : Tensor
|
||||
Batch of outputs of energy predictor (B, Tmax, 1).
|
||||
ys : Tensor
|
||||
Batch of target features (B, Lmax, odim).
|
||||
ds : Tensor
|
||||
Batch of durations (B, Tmax).
|
||||
ps : Tensor
|
||||
Batch of target token-averaged pitch (B, Tmax, 1).
|
||||
es : Tensor
|
||||
Batch of target token-averaged energy (B, Tmax, 1).
|
||||
ilens : Tensor
|
||||
Batch of the lengths of each input (B,).
|
||||
olens : Tensor
|
||||
Batch of the lengths of each target (B,).
|
||||
|
||||
Returns
|
||||
----------
|
||||
Tensor
|
||||
L1 loss value.
|
||||
Tensor
|
||||
Duration predictor loss value.
|
||||
Tensor
|
||||
Pitch predictor loss value.
|
||||
Tensor
|
||||
Energy predictor loss value.
|
||||
Tensor
|
||||
L1 loss value.
|
||||
Tensor
|
||||
Duration predictor loss value.
|
||||
Tensor
|
||||
Pitch predictor loss value.
|
||||
Tensor
|
||||
Energy predictor loss value.
|
||||
|
||||
"""
|
||||
# apply mask to remove padded part
|
||||
|
|
|
@ -32,10 +32,10 @@ class DurationPredictor(nn.Layer):
|
|||
|
||||
Note
|
||||
----------
|
||||
The calculation domain of outputs is different
|
||||
between in `forward` and in `inference`. In `forward`,
|
||||
the outputs are calculated in log domain but in `inference`,
|
||||
those are calculated in linear domain.
|
||||
The calculation domain of outputs is different
|
||||
between in `forward` and in `inference`. In `forward`,
|
||||
the outputs are calculated in log domain but in `inference`,
|
||||
those are calculated in linear domain.
|
||||
|
||||
"""
|
||||
|
||||
|
@ -50,18 +50,18 @@ class DurationPredictor(nn.Layer):
|
|||
|
||||
Parameters
|
||||
----------
|
||||
idim : int
|
||||
Input dimension.
|
||||
n_layers : int, optional
|
||||
Number of convolutional layers.
|
||||
n_chans : int, optional
|
||||
Number of channels of convolutional layers.
|
||||
kernel_size : int, optional
|
||||
Kernel size of convolutional layers.
|
||||
dropout_rate : float, optional
|
||||
Dropout rate.
|
||||
offset : float, optional
|
||||
Offset value to avoid nan in log domain.
|
||||
idim : int
|
||||
Input dimension.
|
||||
n_layers : int, optional
|
||||
Number of convolutional layers.
|
||||
n_chans : int, optional
|
||||
Number of channels of convolutional layers.
|
||||
kernel_size : int, optional
|
||||
Kernel size of convolutional layers.
|
||||
dropout_rate : float, optional
|
||||
Dropout rate.
|
||||
offset : float, optional
|
||||
Offset value to avoid nan in log domain.
|
||||
|
||||
"""
|
||||
super(DurationPredictor, self).__init__()
|
||||
|
@ -108,10 +108,10 @@ class DurationPredictor(nn.Layer):
|
|||
|
||||
Parameters
|
||||
----------
|
||||
xs : Tensor
|
||||
Batch of input sequences (B, Tmax, idim).
|
||||
x_masks : ByteTensor, optional
|
||||
Batch of masks indicating padded part (B, Tmax).
|
||||
xs : Tensor
|
||||
Batch of input sequences (B, Tmax, idim).
|
||||
x_masks : ByteTensor, optional
|
||||
Batch of masks indicating padded part (B, Tmax).
|
||||
|
||||
Returns
|
||||
----------
|
||||
|
@ -125,15 +125,15 @@ class DurationPredictor(nn.Layer):
|
|||
|
||||
Parameters
|
||||
----------
|
||||
xs : Tensor
|
||||
Batch of input sequences (B, Tmax, idim).
|
||||
x_masks : Tensor(bool), optional
|
||||
Batch of masks indicating padded part (B, Tmax).
|
||||
xs : Tensor
|
||||
Batch of input sequences (B, Tmax, idim).
|
||||
x_masks : Tensor(bool), optional
|
||||
Batch of masks indicating padded part (B, Tmax).
|
||||
|
||||
Returns
|
||||
----------
|
||||
LongTensor
|
||||
Batch of predicted durations in linear domain int64 (B, Tmax).
|
||||
Tensor
|
||||
Batch of predicted durations in linear domain int64 (B, Tmax).
|
||||
"""
|
||||
return self._forward(xs, x_masks, True)
|
||||
|
||||
|
@ -150,10 +150,10 @@ class DurationPredictorLoss(nn.Layer):
|
|||
|
||||
Parameters
|
||||
----------
|
||||
offset : float, optional
|
||||
Offset value to avoid nan in log domain.
|
||||
reduction : str
|
||||
Reduction type in loss calculation.
|
||||
offset : float, optional
|
||||
Offset value to avoid nan in log domain.
|
||||
reduction : str
|
||||
Reduction type in loss calculation.
|
||||
"""
|
||||
super(DurationPredictorLoss, self).__init__()
|
||||
self.criterion = nn.MSELoss(reduction=reduction)
|
||||
|
@ -164,19 +164,19 @@ class DurationPredictorLoss(nn.Layer):
|
|||
|
||||
Parameters
|
||||
----------
|
||||
outputs : Tensor
|
||||
Batch of prediction durations in log domain (B, T)
|
||||
targets : LongTensor
|
||||
Batch of groundtruth durations in linear domain (B, T)
|
||||
outputs : Tensor
|
||||
Batch of prediction durations in log domain (B, T)
|
||||
targets : Tensor
|
||||
Batch of groundtruth durations in linear domain (B, T)
|
||||
|
||||
Returns
|
||||
----------
|
||||
Tensor
|
||||
Mean squared error loss value.
|
||||
Tensor
|
||||
Mean squared error loss value.
|
||||
|
||||
Note
|
||||
----------
|
||||
`outputs` is in log domain but `targets` is in linear domain.
|
||||
`outputs` is in log domain but `targets` is in linear domain.
|
||||
"""
|
||||
# NOTE: outputs is in log domain while targets in linear
|
||||
targets = paddle.log(targets.cast(dtype='float32') + self.offset)
|
||||
|
|
|
@ -37,8 +37,8 @@ class LengthRegulator(nn.Layer):
|
|||
|
||||
Parameters
|
||||
----------
|
||||
pad_value : float, optional
|
||||
Value used for padding.
|
||||
pad_value : float, optional
|
||||
Value used for padding.
|
||||
|
||||
"""
|
||||
super().__init__()
|
||||
|
@ -70,17 +70,17 @@ class LengthRegulator(nn.Layer):
|
|||
|
||||
Parameters
|
||||
----------
|
||||
xs : Tensor
|
||||
Batch of sequences of char or phoneme embeddings (B, Tmax, D).
|
||||
ds : LongTensor
|
||||
Batch of durations of each frame (B, T).
|
||||
alpha : float, optional
|
||||
Alpha value to control speed of speech.
|
||||
xs : Tensor
|
||||
Batch of sequences of char or phoneme embeddings (B, Tmax, D).
|
||||
ds : LongTensor
|
||||
Batch of durations of each frame (B, T).
|
||||
alpha : float, optional
|
||||
Alpha value to control speed of speech.
|
||||
|
||||
Returns
|
||||
----------
|
||||
Tensor
|
||||
replicated input tensor based on durations (B, T*, D).
|
||||
Tensor
|
||||
replicated input tensor based on durations (B, T*, D).
|
||||
"""
|
||||
if alpha != 1.0:
|
||||
assert alpha > 0
|
||||
|
|
|
@ -45,20 +45,20 @@ class Postnet(nn.Layer):
|
|||
|
||||
Parameters
|
||||
----------
|
||||
idim : int
|
||||
Dimension of the inputs.
|
||||
odim : int
|
||||
Dimension of the outputs.
|
||||
n_layers : int, optional
|
||||
The number of layers.
|
||||
n_filts : int, optional
|
||||
The number of filter size.
|
||||
n_units : int, optional
|
||||
The number of filter channels.
|
||||
use_batch_norm : bool, optional
|
||||
Whether to use batch normalization.
|
||||
dropout_rate : float, optional
|
||||
Dropout rate.
|
||||
idim : int
|
||||
Dimension of the inputs.
|
||||
odim : int
|
||||
Dimension of the outputs.
|
||||
n_layers : int, optional
|
||||
The number of layers.
|
||||
n_filts : int, optional
|
||||
The number of filter size.
|
||||
n_units : int, optional
|
||||
The number of filter channels.
|
||||
use_batch_norm : bool, optional
|
||||
Whether to use batch normalization.
|
||||
dropout_rate : float, optional
|
||||
Dropout rate.
|
||||
"""
|
||||
super(Postnet, self).__init__()
|
||||
self.postnet = nn.LayerList()
|
||||
|
@ -120,13 +120,13 @@ class Postnet(nn.Layer):
|
|||
|
||||
Parameters
|
||||
----------
|
||||
xs : Tensor
|
||||
Batch of the sequences of padded input tensors (B, idim, Tmax).
|
||||
xs : Tensor
|
||||
Batch of the sequences of padded input tensors (B, idim, Tmax).
|
||||
|
||||
Returns
|
||||
----------
|
||||
Tensor
|
||||
Batch of padded output tensor. (B, odim, Tmax).
|
||||
Tensor
|
||||
Batch of padded output tensor. (B, odim, Tmax).
|
||||
|
||||
"""
|
||||
for i in six.moves.range(len(self.postnet)):
|
||||
|
|
|
@ -43,16 +43,16 @@ class VariancePredictor(nn.Layer):
|
|||
|
||||
Parameters
|
||||
----------
|
||||
idim : int
|
||||
Input dimension.
|
||||
n_layers : int, optional
|
||||
Number of convolutional layers.
|
||||
n_chans : int, optional
|
||||
Number of channels of convolutional layers.
|
||||
kernel_size : int, optional
|
||||
Kernel size of convolutional layers.
|
||||
dropout_rate : float, optional
|
||||
Dropout rate.
|
||||
idim : int
|
||||
Input dimension.
|
||||
n_layers : int, optional
|
||||
Number of convolutional layers.
|
||||
n_chans : int, optional
|
||||
Number of channels of convolutional layers.
|
||||
kernel_size : int, optional
|
||||
Kernel size of convolutional layers.
|
||||
dropout_rate : float, optional
|
||||
Dropout rate.
|
||||
"""
|
||||
assert check_argument_types()
|
||||
super().__init__()
|
||||
|
|
|
@ -26,12 +26,12 @@ class MultiHeadedAttention(nn.Layer):
|
|||
|
||||
Parameters
|
||||
----------
|
||||
n_head : int
|
||||
The number of heads.
|
||||
n_feat : int
|
||||
The number of features.
|
||||
dropout_rate : float
|
||||
Dropout rate.
|
||||
n_head : int
|
||||
The number of heads.
|
||||
n_feat : int
|
||||
The number of features.
|
||||
dropout_rate : float
|
||||
Dropout rate.
|
||||
"""
|
||||
|
||||
def __init__(self, n_head, n_feat, dropout_rate):
|
||||
|
@ -53,21 +53,21 @@ class MultiHeadedAttention(nn.Layer):
|
|||
|
||||
Parameters
|
||||
----------
|
||||
query : paddle.Tensor
|
||||
query tensor (#batch, time1, size).
|
||||
key : paddle.Tensor
|
||||
Key tensor (#batch, time2, size).
|
||||
value : paddle.Tensor
|
||||
Value tensor (#batch, time2, size).
|
||||
query : paddle.Tensor
|
||||
query tensor (#batch, time1, size).
|
||||
key : paddle.Tensor
|
||||
Key tensor (#batch, time2, size).
|
||||
value : paddle.Tensor
|
||||
Value tensor (#batch, time2, size).
|
||||
|
||||
Returns
|
||||
----------
|
||||
paddle.Tensor
|
||||
Transformed query tensor (#batch, n_head, time1, d_k).
|
||||
paddle.Tensor
|
||||
Transformed key tensor (#batch, n_head, time2, d_k).
|
||||
paddle.Tensor
|
||||
Transformed value tensor (#batch, n_head, time2, d_k).
|
||||
paddle.Tensor
|
||||
Transformed query tensor (#batch, n_head, time1, d_k).
|
||||
paddle.Tensor
|
||||
Transformed key tensor (#batch, n_head, time2, d_k).
|
||||
paddle.Tensor
|
||||
Transformed value tensor (#batch, n_head, time2, d_k).
|
||||
"""
|
||||
n_batch = query.shape[0]
|
||||
|
||||
|
@ -90,18 +90,18 @@ class MultiHeadedAttention(nn.Layer):
|
|||
|
||||
Parameters
|
||||
----------
|
||||
value : paddle.Tensor
|
||||
Transformed value (#batch, n_head, time2, d_k).
|
||||
scores : paddle.Tensor
|
||||
Attention score (#batch, n_head, time1, time2).
|
||||
mask : paddle.Tensor
|
||||
Mask (#batch, 1, time2) or (#batch, time1, time2).
|
||||
value : paddle.Tensor
|
||||
Transformed value (#batch, n_head, time2, d_k).
|
||||
scores : paddle.Tensor
|
||||
Attention score (#batch, n_head, time1, time2).
|
||||
mask : paddle.Tensor
|
||||
Mask (#batch, 1, time2) or (#batch, time1, time2).
|
||||
|
||||
Returns
|
||||
----------
|
||||
paddle.Tensor:
|
||||
Transformed value (#batch, time1, d_model)
|
||||
weighted by the attention score (#batch, time1, time2).
|
||||
paddle.Tensor:
|
||||
Transformed value (#batch, time1, d_model)
|
||||
weighted by the attention score (#batch, time1, time2).
|
||||
"""
|
||||
n_batch = value.shape[0]
|
||||
softmax = paddle.nn.Softmax(axis=-1)
|
||||
|
@ -136,19 +136,19 @@ class MultiHeadedAttention(nn.Layer):
|
|||
|
||||
Parameters
|
||||
----------
|
||||
query : paddle.Tensor
|
||||
Query tensor (#batch, time1, size).
|
||||
key : paddle.Tensor
|
||||
Key tensor (#batch, time2, size).
|
||||
value : paddle.Tensor
|
||||
Value tensor (#batch, time2, size).
|
||||
mask : paddle.Tensor
|
||||
Mask tensor (#batch, 1, time2) or (#batch, time1, time2).
|
||||
query : paddle.Tensor
|
||||
Query tensor (#batch, time1, size).
|
||||
key : paddle.Tensor
|
||||
Key tensor (#batch, time2, size).
|
||||
value : paddle.Tensor
|
||||
Value tensor (#batch, time2, size).
|
||||
mask : paddle.Tensor
|
||||
Mask tensor (#batch, 1, time2) or (#batch, time1, time2).
|
||||
|
||||
Returns
|
||||
----------
|
||||
paddle.Tensor
|
||||
Output tensor (#batch, time1, d_model).
|
||||
paddle.Tensor
|
||||
Output tensor (#batch, time1, d_model).
|
||||
"""
|
||||
q, k, v = self.forward_qkv(query, key, value)
|
||||
scores = paddle.matmul(q, k.transpose(
|
||||
|
|
|
@ -24,14 +24,14 @@ class PositionalEncoding(nn.Layer):
|
|||
|
||||
Parameters
|
||||
----------
|
||||
d_model : int
|
||||
Embedding dimension.
|
||||
dropout_rate : float
|
||||
Dropout rate.
|
||||
max_len : int
|
||||
Maximum input length.
|
||||
reverse : bool
|
||||
Whether to reverse the input position. Only for
|
||||
d_model : int
|
||||
Embedding dimension.
|
||||
dropout_rate : float
|
||||
Dropout rate.
|
||||
max_len : int
|
||||
Maximum input length.
|
||||
reverse : bool
|
||||
Whether to reverse the input position.
|
||||
"""
|
||||
|
||||
def __init__(self, d_model, dropout_rate, max_len=5000, reverse=False):
|
||||
|
@ -68,13 +68,13 @@ class PositionalEncoding(nn.Layer):
|
|||
|
||||
Parameters
|
||||
----------
|
||||
x : paddle.Tensor
|
||||
Input tensor (batch, time, `*`).
|
||||
x : paddle.Tensor
|
||||
Input tensor (batch, time, `*`).
|
||||
|
||||
Returns
|
||||
----------
|
||||
paddle.Tensor
|
||||
Encoded tensor (batch, time, `*`).
|
||||
paddle.Tensor
|
||||
Encoded tensor (batch, time, `*`).
|
||||
"""
|
||||
self.extend_pe(x)
|
||||
x = x * self.xscale + self.pe[:, :x.shape[1]]
|
||||
|
|
|
@ -29,42 +29,42 @@ class Encoder(nn.Layer):
|
|||
|
||||
Parameters
|
||||
----------
|
||||
idim : int
|
||||
Input dimension.
|
||||
attention_dim : int
|
||||
Dimension of attention.
|
||||
attention_heads : int
|
||||
The number of heads of multi head attention.
|
||||
linear_units : int
|
||||
The number of units of position-wise feed forward.
|
||||
num_blocks : int
|
||||
The number of decoder blocks.
|
||||
dropout_rate : float
|
||||
Dropout rate.
|
||||
positional_dropout_rate : float
|
||||
Dropout rate after adding positional encoding.
|
||||
attention_dropout_rate : float
|
||||
Dropout rate in attention.
|
||||
input_layer : Union[str, paddle.nn.Layer]
|
||||
Input layer type.
|
||||
pos_enc_class : paddle.nn.Layer
|
||||
Positional encoding module class.
|
||||
`PositionalEncoding` or `ScaledPositionalEncoding`
|
||||
normalize_before : bool
|
||||
Whether to use layer_norm before the first block.
|
||||
concat_after : bool
|
||||
Whether to concat attention layer's input and output.
|
||||
if True, additional linear will be applied.
|
||||
i.e. x -> x + linear(concat(x, att(x)))
|
||||
if False, no additional linear will be applied. i.e. x -> x + att(x)
|
||||
positionwise_layer_type : str
|
||||
"linear", "conv1d", or "conv1d-linear".
|
||||
positionwise_conv_kernel_size : int
|
||||
Kernel size of positionwise conv1d layer.
|
||||
selfattention_layer_type : str
|
||||
Encoder attention layer type.
|
||||
padding_idx : int
|
||||
Padding idx for input_layer=embed.
|
||||
idim : int
|
||||
Input dimension.
|
||||
attention_dim : int
|
||||
Dimension of attention.
|
||||
attention_heads : int
|
||||
The number of heads of multi head attention.
|
||||
linear_units : int
|
||||
The number of units of position-wise feed forward.
|
||||
num_blocks : int
|
||||
The number of decoder blocks.
|
||||
dropout_rate : float
|
||||
Dropout rate.
|
||||
positional_dropout_rate : float
|
||||
Dropout rate after adding positional encoding.
|
||||
attention_dropout_rate : float
|
||||
Dropout rate in attention.
|
||||
input_layer : Union[str, paddle.nn.Layer]
|
||||
Input layer type.
|
||||
pos_enc_class : paddle.nn.Layer
|
||||
Positional encoding module class.
|
||||
`PositionalEncoding` or `ScaledPositionalEncoding`
|
||||
normalize_before : bool
|
||||
Whether to use layer_norm before the first block.
|
||||
concat_after : bool
|
||||
Whether to concat attention layer's input and output.
|
||||
if True, additional linear will be applied.
|
||||
i.e. x -> x + linear(concat(x, att(x)))
|
||||
if False, no additional linear will be applied. i.e. x -> x + att(x)
|
||||
positionwise_layer_type : str
|
||||
"linear", "conv1d", or "conv1d-linear".
|
||||
positionwise_conv_kernel_size : int
|
||||
Kernel size of positionwise conv1d layer.
|
||||
selfattention_layer_type : str
|
||||
Encoder attention layer type.
|
||||
padding_idx : int
|
||||
Padding idx for input_layer=embed.
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
|
@ -180,17 +180,17 @@ class Encoder(nn.Layer):
|
|||
|
||||
Parameters
|
||||
----------
|
||||
xs : paddle.Tensor
|
||||
Input tensor (#batch, time, idim).
|
||||
masks : paddle.Tensor
|
||||
Mask tensor (#batch, time).
|
||||
xs : paddle.Tensor
|
||||
Input tensor (#batch, time, idim).
|
||||
masks : paddle.Tensor
|
||||
Mask tensor (#batch, time).
|
||||
|
||||
Returns
|
||||
----------
|
||||
paddle.Tensor
|
||||
Output tensor (#batch, time, attention_dim).
|
||||
paddle.Tensor
|
||||
Mask tensor (#batch, time).
|
||||
paddle.Tensor
|
||||
Output tensor (#batch, time, attention_dim).
|
||||
paddle.Tensor
|
||||
Mask tensor (#batch, time).
|
||||
"""
|
||||
xs = self.embed(xs)
|
||||
xs, masks = self.encoders(xs, masks)
|
||||
|
@ -203,21 +203,21 @@ class Encoder(nn.Layer):
|
|||
|
||||
Parameters
|
||||
----------
|
||||
xs : paddle.Tensor
|
||||
Input tensor.
|
||||
masks : paddle.Tensor
|
||||
Mask tensor.
|
||||
cache : List[paddle.Tensor]
|
||||
List of cache tensors.
|
||||
xs : paddle.Tensor
|
||||
Input tensor.
|
||||
masks : paddle.Tensor
|
||||
Mask tensor.
|
||||
cache : List[paddle.Tensor]
|
||||
List of cache tensors.
|
||||
|
||||
Returns
|
||||
----------
|
||||
paddle.Tensor
|
||||
Output tensor.
|
||||
paddle.Tensor
|
||||
Mask tensor.
|
||||
List[paddle.Tensor]
|
||||
List of new cache tensors.
|
||||
paddle.Tensor
|
||||
Output tensor.
|
||||
paddle.Tensor
|
||||
Mask tensor.
|
||||
List[paddle.Tensor]
|
||||
List of new cache tensors.
|
||||
"""
|
||||
|
||||
xs = self.embed(xs)
|
||||
|
@ -229,4 +229,4 @@ class Encoder(nn.Layer):
|
|||
new_cache.append(xs)
|
||||
if self.normalize_before:
|
||||
xs = self.after_norm(xs)
|
||||
return xs, masks, new_cache
|
||||
return xs, masks, new_cache
|
|
@ -22,23 +22,23 @@ class EncoderLayer(nn.Layer):
|
|||
|
||||
Parameters
|
||||
----------
|
||||
size : int
|
||||
Input dimension.
|
||||
self_attn : paddle.nn.Layer
|
||||
Self-attention module instance.
|
||||
`MultiHeadedAttention` instance can be used as the argument.
|
||||
feed_forward : paddle.nn.Layer
|
||||
Feed-forward module instance.
|
||||
`PositionwiseFeedForward`, `MultiLayeredConv1d`, or `Conv1dLinear` instance can be used as the argument.
|
||||
dropout_rate : float
|
||||
Dropout rate.
|
||||
normalize_before : bool
|
||||
Whether to use layer_norm before the first block.
|
||||
concat_after : bool
|
||||
Whether to concat attention layer's input and output.
|
||||
if True, additional linear will be applied.
|
||||
i.e. x -> x + linear(concat(x, att(x)))
|
||||
if False, no additional linear will be applied. i.e. x -> x + att(x)
|
||||
size : int
|
||||
Input dimension.
|
||||
self_attn : paddle.nn.Layer
|
||||
Self-attention module instance.
|
||||
`MultiHeadedAttention` instance can be used as the argument.
|
||||
feed_forward : paddle.nn.Layer
|
||||
Feed-forward module instance.
|
||||
`PositionwiseFeedForward`, `MultiLayeredConv1d`, or `Conv1dLinear` instance can be used as the argument.
|
||||
dropout_rate : float
|
||||
Dropout rate.
|
||||
normalize_before : bool
|
||||
Whether to use layer_norm before the first block.
|
||||
concat_after : bool
|
||||
Whether to concat attention layer's input and output.
|
||||
if True, additional linear will be applied.
|
||||
i.e. x -> x + linear(concat(x, att(x)))
|
||||
if False, no additional linear will be applied. i.e. x -> x + att(x)
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
|
@ -67,19 +67,19 @@ class EncoderLayer(nn.Layer):
|
|||
|
||||
Parameters
|
||||
----------
|
||||
x_input : paddle.Tensor
|
||||
Input tensor (#batch, time, size).
|
||||
mask : paddle.Tensor
|
||||
Mask tensor for the input (#batch, time).
|
||||
cache : paddle.Tensor
|
||||
Cache tensor of the input (#batch, time - 1, size).
|
||||
x_input : paddle.Tensor
|
||||
Input tensor (#batch, time, size).
|
||||
mask : paddle.Tensor
|
||||
Mask tensor for the input (#batch, time).
|
||||
cache : paddle.Tensor
|
||||
Cache tensor of the input (#batch, time - 1, size).
|
||||
|
||||
Returns
|
||||
----------
|
||||
paddle.Tensor
|
||||
Output tensor (#batch, time, size).
|
||||
paddle.Tensor
|
||||
Mask tensor (#batch, time).
|
||||
paddle.Tensor
|
||||
Output tensor (#batch, time, size).
|
||||
paddle.Tensor
|
||||
Mask tensor (#batch, time).
|
||||
"""
|
||||
residual = x
|
||||
if self.normalize_before:
|
||||
|
|
|
@ -34,14 +34,14 @@ class MultiLayeredConv1d(paddle.nn.Layer):
|
|||
|
||||
Parameters
|
||||
----------
|
||||
in_chans : int
|
||||
Number of input channels.
|
||||
hidden_chans : int
|
||||
Number of hidden channels.
|
||||
kernel_size : int
|
||||
Kernel size of conv1d.
|
||||
dropout_rate : float
|
||||
Dropout rate.
|
||||
in_chans : int
|
||||
Number of input channels.
|
||||
hidden_chans : int
|
||||
Number of hidden channels.
|
||||
kernel_size : int
|
||||
Kernel size of conv1d.
|
||||
dropout_rate : float
|
||||
Dropout rate.
|
||||
|
||||
"""
|
||||
super(MultiLayeredConv1d, self).__init__()
|
||||
|
@ -65,13 +65,13 @@ class MultiLayeredConv1d(paddle.nn.Layer):
|
|||
|
||||
Parameters
|
||||
----------
|
||||
x : paddle.Tensor
|
||||
Batch of input tensors (B, T, in_chans).
|
||||
x : paddle.Tensor
|
||||
Batch of input tensors (B, T, in_chans).
|
||||
|
||||
Returns
|
||||
----------
|
||||
paddle.Tensor
|
||||
Batch of output tensors (B, T, in_chans).
|
||||
paddle.Tensor
|
||||
Batch of output tensors (B, T, in_chans).
|
||||
"""
|
||||
x = self.relu(self.w_1(x.transpose([0, 2, 1]))).transpose([0, 2, 1])
|
||||
return self.w_2(self.dropout(x).transpose([0, 2, 1])).transpose(
|
||||
|
@ -90,14 +90,14 @@ class Conv1dLinear(paddle.nn.Layer):
|
|||
|
||||
Parameters
|
||||
----------
|
||||
in_chans : int
|
||||
Number of input channels.
|
||||
hidden_chans : int
|
||||
Number of hidden channels.
|
||||
kernel_size : int
|
||||
Kernel size of conv1d.
|
||||
dropout_rate : float
|
||||
Dropout rate.
|
||||
in_chans : int
|
||||
Number of input channels.
|
||||
hidden_chans : int
|
||||
Number of hidden channels.
|
||||
kernel_size : int
|
||||
Kernel size of conv1d.
|
||||
dropout_rate : float
|
||||
Dropout rate.
|
||||
"""
|
||||
super(Conv1dLinear, self).__init__()
|
||||
self.w_1 = paddle.nn.Conv1D(
|
||||
|
@ -115,13 +115,13 @@ class Conv1dLinear(paddle.nn.Layer):
|
|||
|
||||
Parameters
|
||||
----------
|
||||
x : paddle.Tensor
|
||||
Batch of input tensors (B, T, in_chans).
|
||||
x : paddle.Tensor
|
||||
Batch of input tensors (B, T, in_chans).
|
||||
|
||||
Returns
|
||||
----------
|
||||
paddle.Tensor
|
||||
Batch of output tensors (B, T, in_chans).
|
||||
paddle.Tensor
|
||||
Batch of output tensors (B, T, in_chans).
|
||||
|
||||
"""
|
||||
x = self.relu(self.w_1(x.transpose([0, 2, 1]))).transpose([0, 2, 1])
|
||||
|
|
|
@ -21,12 +21,12 @@ class PositionwiseFeedForward(paddle.nn.Layer):
|
|||
|
||||
Parameters
|
||||
----------
|
||||
idim : int
|
||||
Input dimension.
|
||||
hidden_units : int
|
||||
The number of hidden units.
|
||||
dropout_rate : float
|
||||
Dropout rate.
|
||||
idim : int
|
||||
Input dimension.
|
||||
hidden_units : int
|
||||
The number of hidden units.
|
||||
dropout_rate : float
|
||||
Dropout rate.
|
||||
"""
|
||||
|
||||
def __init__(self,
|
||||
|
|
|
@ -31,14 +31,14 @@ def repeat(N, fn):
|
|||
|
||||
Parameters
|
||||
----------
|
||||
N : int
|
||||
Number of times to repeat.
|
||||
fn : Callable
|
||||
Function to generate module.
|
||||
N : int
|
||||
Number of times to repeat.
|
||||
fn : Callable
|
||||
Function to generate module.
|
||||
|
||||
Returns
|
||||
----------
|
||||
MultiSequential
|
||||
Repeated model instance.
|
||||
MultiSequential
|
||||
Repeated model instance.
|
||||
"""
|
||||
return MultiSequential(* [fn(n) for n in range(N)])
|
||||
|
|
|
@ -21,10 +21,10 @@ class LayerNorm(paddle.nn.LayerNorm):
|
|||
|
||||
Parameters
|
||||
----------
|
||||
nout : int
|
||||
Output dim size.
|
||||
dim : int
|
||||
Dimension to be normalized.
|
||||
nout : int
|
||||
Output dim size.
|
||||
dim : int
|
||||
Dimension to be normalized.
|
||||
"""
|
||||
|
||||
def __init__(self, nout, dim=-1):
|
||||
|
@ -37,13 +37,13 @@ class LayerNorm(paddle.nn.LayerNorm):
|
|||
|
||||
Parameters
|
||||
----------
|
||||
x : paddle.Tensor
|
||||
Input tensor.
|
||||
x : paddle.Tensor
|
||||
Input tensor.
|
||||
|
||||
Returns
|
||||
----------
|
||||
paddle.Tensor
|
||||
Normalized tensor.
|
||||
paddle.Tensor
|
||||
Normalized tensor.
|
||||
"""
|
||||
if self.dim == -1:
|
||||
return super(LayerNorm, self).forward(x)
|
||||
|
|
|
@ -22,25 +22,25 @@ def pad_list(xs, pad_value):
|
|||
|
||||
Parameters
|
||||
----------
|
||||
xs : List[Tensor]
|
||||
List of Tensors [(T_1, `*`), (T_2, `*`), ..., (T_B, `*`)].
|
||||
pad_value : float
|
||||
Value for padding.
|
||||
xs : List[Tensor]
|
||||
List of Tensors [(T_1, `*`), (T_2, `*`), ..., (T_B, `*`)].
|
||||
pad_value : float
|
||||
Value for padding.
|
||||
|
||||
Returns
|
||||
----------
|
||||
Tensor
|
||||
Padded tensor (B, Tmax, `*`).
|
||||
Tensor
|
||||
Padded tensor (B, Tmax, `*`).
|
||||
|
||||
Examples
|
||||
----------
|
||||
>>> x = [paddle.ones([4]), paddle.ones([2]), paddle.ones([1])]
|
||||
>>> x
|
||||
[tensor([1., 1., 1., 1.]), tensor([1., 1.]), tensor([1.])]
|
||||
>>> pad_list(x, 0)
|
||||
tensor([[1., 1., 1., 1.],
|
||||
[1., 1., 0., 0.],
|
||||
[1., 0., 0., 0.]])
|
||||
>>> x = [paddle.ones([4]), paddle.ones([2]), paddle.ones([1])]
|
||||
>>> x
|
||||
[tensor([1., 1., 1., 1.]), tensor([1., 1.]), tensor([1.])]
|
||||
>>> pad_list(x, 0)
|
||||
tensor([[1., 1., 1., 1.],
|
||||
[1., 1., 0., 0.],
|
||||
[1., 0., 0., 0.]])
|
||||
"""
|
||||
n_batch = len(xs)
|
||||
max_len = max(x.shape[0] for x in xs)
|
||||
|
@ -57,23 +57,23 @@ def make_pad_mask(lengths, length_dim=-1):
|
|||
|
||||
Parameters
|
||||
----------
|
||||
lengths : LongTensor or List
|
||||
Batch of lengths (B,).
|
||||
lengths : LongTensor or List
|
||||
Batch of lengths (B,).
|
||||
|
||||
Returns
|
||||
----------
|
||||
Tensor(bool)
|
||||
Mask tensor containing indices of padded part bool.
|
||||
Tensor(bool)
|
||||
Mask tensor containing indices of padded part bool.
|
||||
|
||||
Examples
|
||||
----------
|
||||
With only lengths.
|
||||
With only lengths.
|
||||
|
||||
>>> lengths = [5, 3, 2]
|
||||
>>> make_pad_mask(lengths)
|
||||
masks = [[0, 0, 0, 0 ,0],
|
||||
[0, 0, 0, 1, 1],
|
||||
[0, 0, 1, 1, 1]]
|
||||
>>> lengths = [5, 3, 2]
|
||||
>>> make_pad_mask(lengths)
|
||||
masks = [[0, 0, 0, 0 ,0],
|
||||
[0, 0, 0, 1, 1],
|
||||
[0, 0, 1, 1, 1]]
|
||||
"""
|
||||
if length_dim == 0:
|
||||
raise ValueError("length_dim cannot be 0: {}".format(length_dim))
|
||||
|
@ -99,29 +99,29 @@ def make_non_pad_mask(lengths, length_dim=-1):
|
|||
|
||||
Parameters
|
||||
----------
|
||||
lengths : LongTensor or List
|
||||
Batch of lengths (B,).
|
||||
xs : Tensor, optional
|
||||
The reference tensor.
|
||||
If set, masks will be the same shape as this tensor.
|
||||
length_dim : int, optional
|
||||
Dimension indicator of the above tensor.
|
||||
See the example.
|
||||
lengths : LongTensor or List
|
||||
Batch of lengths (B,).
|
||||
xs : Tensor, optional
|
||||
The reference tensor.
|
||||
If set, masks will be the same shape as this tensor.
|
||||
length_dim : int, optional
|
||||
Dimension indicator of the above tensor.
|
||||
See the example.
|
||||
|
||||
Returns
|
||||
----------
|
||||
Tensor(bool)
|
||||
Mask tensor containing indices of the non-padded part (bool).
|
||||
Tensor(bool)
|
||||
Mask tensor containing indices of the non-padded part (bool).
|
||||
|
||||
Examples
|
||||
----------
|
||||
With only lengths.
|
||||
With only lengths.
|
||||
|
||||
>>> lengths = [5, 3, 2]
|
||||
>>> make_non_pad_mask(lengths)
|
||||
masks = [[1, 1, 1, 1 ,1],
|
||||
[1, 1, 1, 0, 0],
|
||||
[1, 1, 0, 0, 0]]
|
||||
>>> lengths = [5, 3, 2]
|
||||
>>> make_non_pad_mask(lengths)
|
||||
masks = [[1, 1, 1, 1 ,1],
|
||||
[1, 1, 1, 0, 0],
|
||||
[1, 1, 0, 0, 0]]
|
||||
"""
|
||||
return paddle.logical_not(make_pad_mask(lengths, length_dim))
|
||||
|
||||
|
@ -135,10 +135,10 @@ def initialize(model: nn.Layer, init: str):
|
|||
|
||||
Parameters
|
||||
----------
|
||||
model : paddle.nn.Layer
|
||||
Target.
|
||||
init : str
|
||||
Method of initialization.
|
||||
model : paddle.nn.Layer
|
||||
Target.
|
||||
init : str
|
||||
Method of initialization.
|
||||
"""
|
||||
assert check_argument_types()
|
||||
|
||||
|
|
|
@ -29,8 +29,8 @@ class SpectralConvergenceLoss(nn.Layer):
|
|||
def forward(self, x_mag, y_mag):
|
||||
"""Calculate forward propagation.
|
||||
Args:
|
||||
x_mag (Tensor): Magnitude spectrogram of predicted signal (B, C, T).
|
||||
y_mag (Tensor): Magnitude spectrogram of groundtruth signal (B, C, T).
|
||||
x_mag (Tensor): Magnitude spectrogram of predicted signal (B, #frames, #freq_bins).
|
||||
y_mag (Tensor): Magnitude spectrogram of groundtruth signal (B, #frames, #freq_bins).
|
||||
Returns:
|
||||
Tensor: Spectral convergence loss value.
|
||||
"""
|
||||
|
@ -50,11 +50,16 @@ class LogSTFTMagnitudeLoss(nn.Layer):
|
|||
|
||||
def forward(self, x_mag, y_mag):
|
||||
"""Calculate forward propagation.
|
||||
Args:
|
||||
x_mag (Tensor): Magnitude spectrogram of predicted signal (B, #frames, #freq_bins).
|
||||
y_mag (Tensor): Magnitude spectrogram of groundtruth signal (B, #frames, #freq_bins).
|
||||
Returns:
|
||||
Tensor: Log STFT magnitude loss value.
|
||||
Parameters
|
||||
----------
|
||||
x_mag : Tensor
|
||||
Magnitude spectrogram of predicted signal (B, #frames, #freq_bins).
|
||||
y_mag : Tensor
|
||||
Magnitude spectrogram of groundtruth signal (B, #frames, #freq_bins).
|
||||
Returns
|
||||
----------
|
||||
Tensor
|
||||
Log STFT magnitude loss value.
|
||||
"""
|
||||
return F.l1_loss(
|
||||
paddle.log(paddle.clip(
|
||||
|
@ -86,15 +91,23 @@ class STFTLoss(nn.Layer):
|
|||
|
||||
def forward(self, x, y):
|
||||
"""Calculate forward propagation.
|
||||
Args:
|
||||
x (Tensor): Predicted signal (B, T).
|
||||
y (Tensor): Groundtruth signal (B, T).
|
||||
Returns:
|
||||
Tensor: Spectral convergence loss value.
|
||||
Tensor: Log STFT magnitude loss value.
|
||||
Parameters
|
||||
----------
|
||||
x : Tensor
|
||||
Predicted signal (B, T).
|
||||
y : Tensor
|
||||
Groundtruth signal (B, T).
|
||||
Returns
|
||||
----------
|
||||
Tensor
|
||||
Spectral convergence loss value.
|
||||
Tensor
|
||||
Log STFT magnitude loss value.
|
||||
"""
|
||||
x_mag = self.stft.magnitude(x)
|
||||
y_mag = self.stft.magnitude(y)
|
||||
x_mag = x_mag.transpose([0, 2, 1])
|
||||
y_mag = y_mag.transpose([0, 2, 1])
|
||||
sc_loss = self.spectral_convergence_loss(x_mag, y_mag)
|
||||
mag_loss = self.log_stft_magnitude_loss(x_mag, y_mag)
|
||||
|
||||
|
@ -111,11 +124,16 @@ class MultiResolutionSTFTLoss(nn.Layer):
|
|||
win_lengths=[600, 1200, 240],
|
||||
window="hann", ):
|
||||
"""Initialize Multi resolution STFT loss module.
|
||||
Args:
|
||||
fft_sizes (list): List of FFT sizes.
|
||||
hop_sizes (list): List of hop sizes.
|
||||
win_lengths (list): List of window lengths.
|
||||
window (str): Window function type.
|
||||
Parameters
|
||||
----------
|
||||
fft_sizes : list
|
||||
List of FFT sizes.
|
||||
hop_sizes : list
|
||||
List of hop sizes.
|
||||
win_lengths : list
|
||||
List of window lengths.
|
||||
window : str
|
||||
Window function type.
|
||||
"""
|
||||
super().__init__()
|
||||
assert len(fft_sizes) == len(hop_sizes) == len(win_lengths)
|
||||
|
@ -125,13 +143,24 @@ class MultiResolutionSTFTLoss(nn.Layer):
|
|||
|
||||
def forward(self, x, y):
|
||||
"""Calculate forward propagation.
|
||||
Args:
|
||||
x (Tensor): Predicted signal (B, T).
|
||||
y (Tensor): Groundtruth signal (B, T).
|
||||
Returns:
|
||||
Tensor: Multi resolution spectral convergence loss value.
|
||||
Tensor: Multi resolution log STFT magnitude loss value.
|
||||
Parameters
|
||||
----------
|
||||
x : Tensor
|
||||
Predicted signal (B, T) or (B, #subband, T).
|
||||
y : Tensor
|
||||
Groundtruth signal (B, T) or (B, #subband, T).
|
||||
Returns
|
||||
----------
|
||||
Tensor
|
||||
Multi resolution spectral convergence loss value.
|
||||
Tensor
|
||||
Multi resolution log STFT magnitude loss value.
|
||||
"""
|
||||
if len(x.shape) == 3:
|
||||
# (B, C, T) -> (B x C, T)
|
||||
x = x.reshape([-1, x.shape[2]])
|
||||
# (B, C, T) -> (B x C, T)
|
||||
y = y.reshape([-1, y.shape[2]])
|
||||
sc_loss = 0.0
|
||||
mag_loss = 0.0
|
||||
for f in self.stft_losses:
|
||||
|
|
Loading…
Reference in New Issue