TianYuan 2021-08-05 12:29:20 +00:00
parent 3ac2e01263
commit 796fafbac8
36 changed files with 728 additions and 742 deletions

View File

@ -21,10 +21,10 @@ from typing import List, Dict, Any
import jsonlines
import librosa
import numpy as np
from parakeet.data.get_feats import LogMelFBank, Energy, Pitch
import tqdm
from config import get_cfg_default
from get_feats import LogMelFBank, Energy, Pitch
def get_phn_dur(file_name):
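For reference, the `LogMelFBank`, `Energy`, and `Pitch` extractors imported above can also be driven standalone, as the `__main__` demo removed later in this commit did. A minimal sketch, assuming Baker-style settings (the wav path, 24 kHz sample rate, FFT parameters, and f0 range are illustrative assumptions, and the durations must match the utterance's alignment):

```python
import librosa
import numpy as np
from parakeet.data.get_feats import LogMelFBank, Energy, Pitch

wav, _ = librosa.load("000001.wav", sr=24000)  # illustrative path and sample rate

mel_extractor = LogMelFBank(sr=24000, n_fft=2048, hop_length=300,
                            win_length=1200, window="hann", n_mels=80)
mel = mel_extractor.get_log_mel_fbank(wav)  # (num_frames, n_mels)

# frames per phone, taken from the removed demo; must match this utterance
duration = np.array([2, 8, 8, 8, 12, 11, 10, 13, 11, 10, 18, 9, 12, 10, 12, 11, 5])

pitch_extractor = Pitch(sr=24000, hop_length=300, f0min=80, f0max=400)
avg_f0 = pitch_extractor.get_pitch(wav, duration=duration)  # token-averaged f0

energy_extractor = Energy(sr=24000, n_fft=2048, hop_length=300,
                          win_length=1200, window="hann")
avg_energy = energy_extractor.get_energy(wav, duration=duration)  # token-averaged energy
```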

View File

@ -94,7 +94,7 @@ def main():
parser.add_argument(
"--fastspeech2-config",
type=str,
help="config file to overwrite default config")
help="config file to overwrite default config.")
parser.add_argument(
"--fastspeech2-checkpoint",
type=str,
@ -121,13 +121,13 @@ def main():
parser.add_argument(
"--phones-dict",
type=str,
default="phone_id_map.txt ",
default="phone_id_map.txt",
help="phone vocabulary file.")
parser.add_argument("--test-metadata", type=str, help="test metadata")
parser.add_argument("--output-dir", type=str, help="output dir")
parser.add_argument("--test-metadata", type=str, help="test metadata.")
parser.add_argument("--output-dir", type=str, help="output dir.")
parser.add_argument(
"--device", type=str, default="gpu", help="device type to use")
parser.add_argument("--verbose", type=int, default=1, help="verbose")
"--device", type=str, default="gpu", help="device type to use.")
parser.add_argument("--verbose", type=int, default=1, help="verbose.")
args = parser.parse_args()
with open(args.fastspeech2_config) as f:

View File

@ -99,7 +99,7 @@ def main():
parser.add_argument(
"--fastspeech2-config",
type=str,
help="config file to overwrite default config")
help="fastspeech2 config file to overwrite default config.")
parser.add_argument(
"--fastspeech2-checkpoint",
type=str,
@ -112,8 +112,7 @@ def main():
parser.add_argument(
"--pwg-config",
type=str,
help="mean and standard deviation used to normalize spectrogram when training parallel wavegan."
)
help="parallel wavegan config file to overwrite default config.")
parser.add_argument(
"--pwg-params",
type=str,
@ -126,16 +125,16 @@ def main():
parser.add_argument(
"--phones-dict",
type=str,
default="phone_id_map.txt ",
default="phone_id_map.txt",
help="phone vocabulary file.")
parser.add_argument(
"--text",
type=str,
help="text to synthesize, a 'utt_id sentence' pair per line")
parser.add_argument("--output-dir", type=str, help="output dir")
help="text to synthesize, a 'utt_id sentence' pair per line.")
parser.add_argument("--output-dir", type=str, help="output dir.")
parser.add_argument(
"--device", type=str, default="gpu", help="device type to use")
parser.add_argument("--verbose", type=int, default=1, help="verbose")
"--device", type=str, default="gpu", help="device type to use.")
parser.add_argument("--verbose", type=int, default=1, help="verbose.")
args = parser.parse_args()
with open(args.fastspeech2_config) as f:

View File

@ -169,18 +169,18 @@ def train_sp(args, config):
def main():
# parse args and config and redirect to train_sp
parser = argparse.ArgumentParser(description="Train a ParallelWaveGAN "
parser = argparse.ArgumentParser(description="Train a FastSpeech2 "
"model with Baker Mandrin TTS dataset.")
parser.add_argument(
"--config", type=str, help="config file to overwrite default config")
parser.add_argument("--train-metadata", type=str, help="training data")
parser.add_argument("--dev-metadata", type=str, help="dev data")
parser.add_argument("--output-dir", type=str, help="output dir")
"--config", type=str, help="config file to overwrite default config.")
parser.add_argument("--train-metadata", type=str, help="training data.")
parser.add_argument("--dev-metadata", type=str, help="dev data.")
parser.add_argument("--output-dir", type=str, help="output dir.")
parser.add_argument(
"--device", type=str, default="gpu", help="device type to use")
"--device", type=str, default="gpu", help="device type to use.")
parser.add_argument(
"--nprocs", type=int, default=1, help="number of processes")
parser.add_argument("--verbose", type=int, default=1, help="verbose")
"--nprocs", type=int, default=1, help="number of processes.")
parser.add_argument("--verbose", type=int, default=1, help="verbose.")
parser.add_argument(
"--phones-dict",
type=str,

View File

@ -27,10 +27,14 @@ class Clip(object):
aux_context_window=0, ):
"""Initialize customized collater for DataLoader.
Args:
batch_max_steps (int): The maximum length of input signal in batch.
hop_size (int): Hop size of auxiliary features.
aux_context_window (int): Context window size for auxiliary feature conv.
Parameters
----------
batch_max_steps : int
The maximum length of input signal in batch.
hop_size : int
Hop size of auxiliary features.
aux_context_window : int
Context window size for auxiliary feature conv.
"""
if batch_max_steps % hop_size != 0:
@ -49,14 +53,18 @@ class Clip(object):
def __call__(self, examples):
"""Convert into batch tensors.
Args:
batch (list): list of tuple of the pair of audio and features. Audio shape
(T, ), features shape(T', C).
Parameters
----------
batch : list
list of tuple of the pair of audio and features. Audio shape (T, ), features shape(T', C).
Returns:
Tensor: Auxiliary feature batch (B, C, T'), where
T = (T' - 2 * aux_context_window) * hop_size.
Tensor: Target signal batch (B, 1, T).
Returns
----------
Tensor
Auxiliary feature batch (B, C, T'), where
T = (T' - 2 * aux_context_window) * hop_size.
Tensor
Target signal batch (B, 1, T).
"""
# check length
@ -93,10 +101,11 @@ class Clip(object):
def _adjust_length(self, x, c):
"""Adjust the audio and feature lengths.
Note:
Basically we assume that the length of x and c are adjusted
through preprocessing stage, but if we use other library processed
features, this process will be needed.
Note
-------
Basically we assume that the length of x and c are adjusted
through preprocessing stage, but if we use other library processed
features, this process will be needed.
"""
if len(x) < c.shape[1] * self.hop_size:
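For context on how this collater is consumed: a minimal sketch with a toy dataset, assuming the shapes and return order in the `__call__` docstring above (`hop_size=300` and `aux_context_window=2` are assumptions in line with typical 24 kHz ParallelWaveGAN configs, not values read from this diff):

```python
import numpy as np
from paddle.io import DataLoader, Dataset
from batch_fn import Clip  # the collater shown above

class ToyPairs(Dataset):
    # yields (wave (T,), mel (T', C)) pairs, mimicking the dumped features
    def __init__(self, n=16, frames=120, hop=300, n_mels=80):
        self.data = [(np.random.randn(frames * hop).astype("float32"),
                      np.random.randn(frames, n_mels).astype("float32"))
                     for _ in range(n)]
    def __len__(self):
        return len(self.data)
    def __getitem__(self, i):
        return self.data[i]

# batch_max_steps must be divisible by hop_size: 25500 / 300 = 85 frames per clip
collate_fn = Clip(batch_max_steps=25500, hop_size=300, aux_context_window=2)
loader = DataLoader(ToyPairs(), batch_size=8, collate_fn=collate_fn)
c, y = next(iter(loader))  # per the docstring: aux feats (B, C, T'), target (B, 1, T)
```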

View File

@ -82,7 +82,7 @@ lambda_adv: 4.0 # Loss balancing coefficient.
###########################################################
# DATA LOADER SETTING #
###########################################################
batch_size: 6 # Batch size.
batch_size: 8 # Batch size.
batch_max_steps: 25500 # Length of each audio in batch. Make sure it is divisible by hop_size.
pin_memory: true # Whether to pin memory in Pytorch DataLoader.
num_workers: 4 # Number of workers in Pytorch DataLoader.

View File

@ -12,88 +12,28 @@
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import List, Dict, Any
import soundfile as sf
import librosa
import numpy as np
import argparse
import yaml
import json
import jsonlines
import concurrent.futures
from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor
from pathlib import Path
import tqdm
from operator import itemgetter
from praatio import tgio
from typing import List, Dict, Any
import argparse
import jsonlines
import librosa
import logging
import numpy as np
import tqdm
from concurrent.futures import ThreadPoolExecutor
from parakeet.data.get_feats import LogMelFBank
from pathlib import Path
from praatio import tgio
from config import get_cfg_default
def logmelfilterbank(audio,
sr,
n_fft=1024,
hop_length=256,
win_length=None,
window="hann",
n_mels=80,
fmin=None,
fmax=None,
eps=1e-10):
"""Compute log-Mel filterbank feature.
Parameters
----------
audio : ndarray
Audio signal (T,).
sr : int
Sampling rate.
n_fft : int
FFT size. (Default value = 1024)
hop_length : int
Hop size. (Default value = 256)
win_length : int
Window length. If set to None, it will be the same as fft_size. (Default value = None)
window : str
Window function type. (Default value = "hann")
n_mels : int
Number of mel basis. (Default value = 80)
fmin : int
Minimum frequency in mel basis calculation. (Default value = None)
fmax : int
Maximum frequency in mel basis calculation. (Default value = None)
eps : float
Epsilon value to avoid inf in log calculation. (Default value = 1e-10)
Returns
-------
np.ndarray
Log Mel filterbank feature (#frames, num_mels).
"""
# get amplitude spectrogram
x_stft = librosa.stft(
audio,
n_fft=n_fft,
hop_length=hop_length,
win_length=win_length,
window=window,
pad_mode="reflect")
spc = np.abs(x_stft) # (#bins, #frames,)
# get mel basis
fmin = 0 if fmin is None else fmin
fmax = sr / 2 if fmax is None else fmax
mel_basis = librosa.filters.mel(sr, n_fft, n_mels, fmin, fmax)
return np.log10(np.maximum(eps, np.dot(mel_basis, spc)))
def process_sentence(config: Dict[str, Any],
fp: Path,
alignment_fp: Path,
output_dir: Path):
output_dir: Path,
mel_extractor=None):
utt_id = fp.stem
# reading
@ -134,19 +74,11 @@ def process_sentence(config: Dict[str, Any],
frame_length=config.trim_frame_length,
hop_length=config.trim_hop_length)
logmel = logmelfilterbank(
y,
sr=sr,
n_fft=config.n_fft,
window=config.window,
win_length=config.win_length,
hop_length=config.hop_length,
n_mels=config.n_mels,
fmin=config.fmin,
fmax=config.fmax)
# extract mel feats
logmel = mel_extractor.get_log_mel_fbank(y)
# adjust time to make num_samples == num_frames * hop_length
num_frames = logmel.shape[1]
num_frames = logmel.shape[0]
if y.size < num_frames * config.hop_length:
y = np.pad(y, (0, num_frames * config.hop_length - y.size),
mode="reflect")
@ -157,7 +89,7 @@ def process_sentence(config: Dict[str, Any],
mel_path = output_dir / (utt_id + "_feats.npy")
wav_path = output_dir / (utt_id + "_wave.npy")
np.save(wav_path, y) # (num_samples, )
np.save(mel_path, logmel.T) # (num_frames, n_mels)
np.save(mel_path, logmel) # (num_frames, n_mels)
record = {
"utt_id": utt_id,
"num_samples": num_sample,
@ -172,19 +104,22 @@ def process_sentences(config,
fps: List[Path],
alignment_fps: List[Path],
output_dir: Path,
mel_extractor=None,
nprocs: int=1):
if nprocs == 1:
results = []
for fp, alignment_fp in tqdm.tqdm(zip(fps, alignment_fps)):
results.append(
process_sentence(config, fp, alignment_fp, output_dir))
process_sentence(config, fp, alignment_fp, output_dir,
mel_extractor))
else:
with ThreadPoolExecutor(nprocs) as pool:
futures = []
with tqdm.tqdm(total=len(fps)) as progress:
for fp, alignment_fp in zip(fps, alignment_fps):
future = pool.submit(process_sentence, config, fp,
alignment_fp, output_dir)
alignment_fp, output_dir,
mel_extractor)
future.add_done_callback(lambda p: progress.update())
futures.append(future)
@ -260,24 +195,37 @@ def main():
test_dump_dir = dumpdir / "test" / "raw"
test_dump_dir.mkdir(parents=True, exist_ok=True)
mel_extractor = LogMelFBank(
sr=C.sr,
n_fft=C.n_fft,
hop_length=C.hop_length,
win_length=C.win_length,
window=C.window,
n_mels=C.n_mels,
fmin=C.fmin,
fmax=C.fmax)
# process for the 3 sections
process_sentences(
C,
train_wav_files,
train_alignment_files,
train_dump_dir,
mel_extractor=mel_extractor,
nprocs=args.num_cpu)
process_sentences(
C,
dev_wav_files,
dev_alignment_files,
dev_dump_dir,
mel_extractor=mel_extractor,
nprocs=args.num_cpu)
process_sentences(
C,
test_wav_files,
test_alignment_files,
test_dump_dir,
mel_extractor=mel_extractor,
nprocs=args.num_cpu)

examples/parallelwave_gan/baker/preprocess.sh Normal file → Executable file
View File

View File

@ -78,16 +78,17 @@ class PWGUpdater(StandardUpdater):
wav_ = self.generator(noise, mel)
logging.debug(f"Generator takes {t.elapse}s.")
## Multi-resolution stft loss
# initialize
gen_loss = 0.0
## Multi-resolution stft loss
with timer() as t:
sc_loss, mag_loss = self.criterion_stft(
wav_.squeeze(1), wav.squeeze(1))
sc_loss, mag_loss = self.criterion_stft(wav_, wav)
logging.debug(f"Multi-resolution STFT loss takes {t.elapse}s.")
report("train/spectral_convergence_loss", float(sc_loss))
report("train/log_stft_magnitude_loss", float(mag_loss))
gen_loss = sc_loss + mag_loss
gen_loss += sc_loss + mag_loss
## Adversarial loss
if self.state.iteration > self.discriminator_train_start_steps:
@ -119,9 +120,9 @@ class PWGUpdater(StandardUpdater):
p_ = self.discriminator(wav_.detach())
real_loss = self.criterion_mse(p, paddle.ones_like(p))
fake_loss = self.criterion_mse(p_, paddle.zeros_like(p_))
dis_loss = real_loss + fake_loss
report("train/real_loss", float(real_loss))
report("train/fake_loss", float(fake_loss))
dis_loss = real_loss + fake_loss
report("train/discriminator_loss", float(dis_loss))
self.optimizer_d.clear_grad()
@ -164,8 +165,7 @@ class PWGEvaluator(StandardEvaluator):
# stft loss
with timer() as t:
sc_loss, mag_loss = self.criterion_stft(
wav_.squeeze(1), wav.squeeze(1))
sc_loss, mag_loss = self.criterion_stft(wav_, wav)
logging.debug(f"Multi-resolution STFT loss takes {t.elapse}s")
report("eval/spectral_convergence_loss", float(sc_loss))
@ -178,7 +178,7 @@ class PWGEvaluator(StandardEvaluator):
p = self.discriminator(wav)
real_loss = self.criterion_mse(p, paddle.ones_like(p))
fake_loss = self.criterion_mse(p_, paddle.zeros_like(p_))
dis_loss = real_loss + fake_loss
report("eval/real_loss", float(real_loss))
report("eval/fake_loss", float(fake_loss))
dis_loss = real_loss + fake_loss
report("eval/discriminator_loss", float(dis_loss))

examples/parallelwave_gan/baker/run.sh Normal file → Executable file
View File

View File

@ -32,14 +32,14 @@ from parakeet.models.parallel_wavegan import PWGGenerator
from config import get_cfg_default
parser = argparse.ArgumentParser(
description="synthesize with parallel wavegan.")
description="Synthesize with parallel wavegan.")
parser.add_argument(
"--config", type=str, help="config file to overwrite default config")
parser.add_argument("--checkpoint", type=str, help="snapshot to load")
parser.add_argument("--test-metadata", type=str, help="dev data")
parser.add_argument("--output-dir", type=str, help="output dir")
parser.add_argument("--device", type=str, default="gpu", help="device to run")
parser.add_argument("--verbose", type=int, default=1, help="verbose")
"--config", type=str, help="config file to overwrite default config.")
parser.add_argument("--checkpoint", type=str, help="snapshot to load.")
parser.add_argument("--test-metadata", type=str, help="dev data.")
parser.add_argument("--output-dir", type=str, help="output dir.")
parser.add_argument("--device", type=str, default="gpu", help="device to run.")
parser.add_argument("--verbose", type=int, default=1, help="verbose.")
args = parser.parse_args()
config = get_cfg_default()
@ -89,5 +89,5 @@ for example in test_dataset:
print(
f"{utt_id}, mel: {mel.shape}, wave: {wav.shape}, time: {t.elapse}s, Hz: {speed}, RTF: {config.sr / speed}."
)
sf.write(output_dir / (utt_id + ".wav"), wav, samplerate=config.sr)
sf.write(str(output_dir / (utt_id + ".wav")), wav, samplerate=config.sr)
print(f"generation speed: {N / T}Hz, RTF: {config.sr / (N / T) }")

View File

@ -0,0 +1,5 @@
python3 synthesize.py \
--config=conf/default.yaml \
--checkpoint=exp/default/checkpoints/snapshot_iter_220000.pdz \
--test-metadata=dump/test/norm/metadata.jsonl \
--output-dir=exp/debug/test

View File

@ -0,0 +1,111 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import os
import logging
from pathlib import Path
import librosa
import numpy as np
import paddle
import soundfile as sf
import yaml
from parakeet.data.get_feats import LogMelFBank
from parakeet.models.parallel_wavegan import PWGGenerator, PWGInference
from parakeet.modules.normalizer import ZScore
from config import get_cfg_default
def evaluate(args, config):
# dataloader has been too verbose
logging.getLogger("DataLoader").disabled = True
vocoder = PWGGenerator(**config["generator_params"])
state_dict = paddle.load(args.checkpoint)
vocoder.set_state_dict(state_dict["generator_params"])
vocoder.remove_weight_norm()
vocoder.eval()
print("model done!")
stat = np.load(args.stat)
mu, std = stat
mu = paddle.to_tensor(mu)
std = paddle.to_tensor(std)
normalizer = ZScore(mu, std)
pwg_inference = PWGInference(normalizer, vocoder)
input_dir = Path(args.input_dir)
output_dir = Path(args.output_dir)
output_dir.mkdir(parents=True, exist_ok=True)
mel_extractor = LogMelFBank(
sr=config.sr,
n_fft=config.n_fft,
hop_length=config.hop_length,
win_length=config.win_length,
window=config.window,
n_mels=config.n_mels,
fmin=config.fmin,
fmax=config.fmax)
for utt_name in os.listdir(input_dir):
wav, _ = librosa.load(str(input_dir / utt_name), sr=config.sr)
# extract mel feats
mel = mel_extractor.get_log_mel_fbank(wav)
mel = paddle.to_tensor(mel)
gen_wav = pwg_inference(mel)
sf.write(
str(output_dir / ("gen_" + utt_name)),
gen_wav.numpy(),
samplerate=config.sr)
print(f"{utt_name} done!")
def main():
# parse args and config and redirect to train_sp
parser = argparse.ArgumentParser(
description="Synthesize with parallel wavegan.")
parser.add_argument(
"--config", type=str, help="config file to overwrite default config.")
parser.add_argument("--checkpoint", type=str, help="snapshot to load.")
parser.add_argument(
"--stat",
type=str,
help="mean and standard deviation used to normalize spectrogram when training parallel wavegan."
)
parser.add_argument("--input-dir", type=str, help="input dir of wavs.")
parser.add_argument("--output-dir", type=str, help="output dir.")
parser.add_argument(
"--device", type=str, default="gpu", help="device to run.")
parser.add_argument("--verbose", type=int, default=1, help="verbose.")
args = parser.parse_args()
config = get_cfg_default()
if args.config:
config.merge_from_file(args.config)
print("========Args========")
print(yaml.safe_dump(vars(args)))
print("========Config========")
print(config)
evaluate(args, config)
if __name__ == "__main__":
main()

View File

@ -12,36 +12,29 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import sys
import logging
import argparse
import dataclasses
from pathlib import Path
import os
import logging
import yaml
import jsonlines
import paddle
import numpy as np
from paddle import nn
from paddle.nn import functional as F
import paddle
import yaml
from paddle import DataParallel
from paddle import distributed as dist
from paddle import nn
from paddle.io import DataLoader, DistributedBatchSampler
from paddle.optimizer import Adam  # No RAdam
from paddle.optimizer.lr import StepDecay
from paddle import DataParallel
from visualdl import LogWriter
from parakeet.datasets.data_table import DataTable
from parakeet.training.updater import UpdaterBase
from parakeet.training.trainer import Trainer
from parakeet.training.reporter import report
from parakeet.training import extension
from parakeet.training.extensions.snapshot import Snapshot
from parakeet.training.extensions.visualizer import VisualDL
from parakeet.models.parallel_wavegan import PWGGenerator, PWGDiscriminator
from parakeet.modules.stft_loss import MultiResolutionSTFTLoss
from parakeet.training.extensions.snapshot import Snapshot
from parakeet.training.extensions.visualizer import VisualDL
from parakeet.training.seeding import seed_everything
from parakeet.training.trainer import Trainer
from pathlib import Path
from visualdl import LogWriter
from batch_fn import Clip
from config import get_cfg_default
@ -210,15 +203,15 @@ def main():
parser = argparse.ArgumentParser(description="Train a ParallelWaveGAN "
"model with Baker Mandrin TTS dataset.")
parser.add_argument(
"--config", type=str, help="config file to overwrite default config")
parser.add_argument("--train-metadata", type=str, help="training data")
parser.add_argument("--dev-metadata", type=str, help="dev data")
parser.add_argument("--output-dir", type=str, help="output dir")
"--config", type=str, help="config file to overwrite default config.")
parser.add_argument("--train-metadata", type=str, help="training data.")
parser.add_argument("--dev-metadata", type=str, help="dev data.")
parser.add_argument("--output-dir", type=str, help="output dir.")
parser.add_argument(
"--device", type=str, default="gpu", help="device type to use")
"--device", type=str, default="gpu", help="device type to use.")
parser.add_argument(
"--nprocs", type=int, default=1, help="number of processes")
parser.add_argument("--verbose", type=int, default=1, help="verbose")
"--nprocs", type=int, default=1, help="number of processes.")
parser.add_argument("--verbose", type=int, default=1, help="verbose.")
args = parser.parse_args()
if args.device == "cpu" and args.nprocs > 1:

View File

@ -12,94 +12,34 @@
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import List, Dict, Any
import soundfile as sf
import librosa
import numpy as np
import argparse
import yaml
import json
import re
import jsonlines
import concurrent.futures
from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor
from pathlib import Path
import tqdm
from operator import itemgetter
from praatio import tgio
from typing import List, Dict, Any
import argparse
import jsonlines
import librosa
import logging
import numpy as np
import re
import tqdm
from concurrent.futures import ThreadPoolExecutor
from parakeet.data.get_feats import LogMelFBank
from pathlib import Path
from praatio import tgio
from config import get_cfg_default
from tg_utils import validate_textgrid
def logmelfilterbank(audio,
sr,
n_fft=1024,
hop_length=256,
win_length=None,
window="hann",
n_mels=80,
fmin=None,
fmax=None,
eps=1e-10):
"""Compute log-Mel filterbank feature.
Parameters
----------
audio : ndarray
Audio signal (T,).
sr : int
Sampling rate.
n_fft : int
FFT size. (Default value = 1024)
hop_length : int
Hop size. (Default value = 256)
win_length : int
Window length. If set to None, it will be the same as fft_size. (Default value = None)
window : str
Window function type. (Default value = "hann")
n_mels : int
Number of mel basis. (Default value = 80)
fmin : int
Minimum frequency in mel basis calculation. (Default value = None)
fmax : int
Maximum frequency in mel basis calculation. (Default value = None)
eps : float
Epsilon value to avoid inf in log calculation. (Default value = 1e-10)
Returns
-------
np.ndarray
Log Mel filterbank feature (#frames, num_mels).
"""
# get amplitude spectrogram
x_stft = librosa.stft(
audio,
n_fft=n_fft,
hop_length=hop_length,
win_length=win_length,
window=window,
pad_mode="reflect")
spc = np.abs(x_stft) # (#bins, #frames,)
# get mel basis
fmin = 0 if fmin is None else fmin
fmax = sr / 2 if fmax is None else fmax
mel_basis = librosa.filters.mel(sr, n_fft, n_mels, fmin, fmax)
return np.log10(np.maximum(eps, np.dot(mel_basis, spc)))
def process_sentence(config: Dict[str, Any],
fp: Path,
alignment_fp: Path,
output_dir: Path):
output_dir: Path,
mel_extractor=None):
utt_id = fp.stem
# reading
y, sr = librosa.load(fp, sr=config.sr) # resampling may occur
y, sr = librosa.load(str(fp), sr=config.sr) # resampling may occur
assert len(y.shape) == 1, f"{utt_id} is not a mono-channel audio."
assert np.abs(y).max(
) <= 1.0, f"{utt_id} is seems to be different that 16 bit PCM."
@ -125,16 +65,8 @@ def process_sentence(config: Dict[str, Any],
f" There is something wrong with the last interval {last} in utterance: {utt_id}"
)
logmel = logmelfilterbank(
y,
sr=sr,
n_fft=config.n_fft,
window=config.window,
win_length=config.win_length,
hop_length=config.hop_length,
n_mels=config.n_mels,
fmin=config.fmin,
fmax=config.fmax)
# extract mel feats
logmel = mel_extractor.get_log_mel_fbank(y)
# extract phone and duration
phones = []
@ -162,7 +94,7 @@ def process_sentence(config: Dict[str, Any],
ends, sr=sr, hop_length=config.hop_length)
durations_frame = np.diff(frame_pos, prepend=0)
num_frames = logmel.shape[-1] # number of frames of the spectrogram
num_frames = logmel.shape[0] # number of frames of the spectrogram
extra = np.sum(durations_frame) - num_frames
assert extra <= 0, (
f"Number of frames inferred from alignemnt is "
@ -173,7 +105,7 @@ def process_sentence(config: Dict[str, Any],
durations_frame = durations_frame.tolist()
mel_path = output_dir / (utt_id + "_feats.npy")
np.save(mel_path, logmel.T) # (num_frames, n_mels)
np.save(mel_path, logmel) # (num_frames, n_mels)
record = {
"utt_id": utt_id,
"phones": phones,
@ -190,20 +122,23 @@ def process_sentences(config,
fps: List[Path],
alignment_fps: List[Path],
output_dir: Path,
mel_extractor=None,
nprocs: int=1):
if nprocs == 1:
results = []
for fp, alignment_fp in tqdm.tqdm(
zip(fps, alignment_fps), total=len(fps)):
results.append(
process_sentence(config, fp, alignment_fp, output_dir))
process_sentence(config, fp, alignment_fp, output_dir,
mel_extractor))
else:
with ThreadPoolExecutor(nprocs) as pool:
futures = []
with tqdm.tqdm(total=len(fps)) as progress:
for fp, alignment_fp in zip(fps, alignment_fps):
future = pool.submit(process_sentence, config, fp,
alignment_fp, output_dir)
alignment_fp, output_dir,
mel_extractor)
future.add_done_callback(lambda p: progress.update())
futures.append(future)
@ -284,24 +219,37 @@ def main():
test_dump_dir = dumpdir / "test" / "raw"
test_dump_dir.mkdir(parents=True, exist_ok=True)
mel_extractor = LogMelFBank(
sr=C.sr,
n_fft=C.n_fft,
hop_length=C.hop_length,
win_length=C.win_length,
window=C.window,
n_mels=C.n_mels,
fmin=C.fmin,
fmax=C.fmax)
# process for the 3 sections
process_sentences(
C,
train_wav_files,
train_alignment_files,
train_dump_dir,
mel_extractor=mel_extractor,
nprocs=args.num_cpu)
process_sentences(
C,
dev_wav_files,
dev_alignment_files,
dev_dump_dir,
mel_extractor=mel_extractor,
nprocs=args.num_cpu)
process_sentences(
C,
test_wav_files,
test_alignment_files,
test_dump_dir,
mel_extractor=mel_extractor,
nprocs=args.num_cpu)

examples/speedyspeech/baker/preprocess.sh Normal file → Executable file
View File

examples/speedyspeech/baker/run.sh Normal file → Executable file
View File

View File

@ -12,40 +12,31 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import sys
import logging
import argparse
import dataclasses
from pathlib import Path
import logging
import os
import yaml
import jsonlines
import paddle
import numpy as np
from paddle import nn
from paddle.nn import functional as F
import paddle
import yaml
from paddle import distributed as dist
from paddle import DataParallel
from paddle import nn
from paddle.io import DataLoader, DistributedBatchSampler
from paddle.optimizer import Adam  # No RAdam
from paddle.optimizer.lr import StepDecay
from paddle import DataParallel
from visualdl import LogWriter
from parakeet.datasets.data_table import DataTable
from parakeet.models.speedyspeech import SpeedySpeech
from parakeet.training.updater import UpdaterBase
from parakeet.training.trainer import Trainer
from parakeet.training.reporter import report
from parakeet.training import extension
from parakeet.training.extensions.snapshot import Snapshot
from parakeet.training.extensions.visualizer import VisualDL
from parakeet.training.seeding import seed_everything
from parakeet.training.trainer import Trainer
from pathlib import Path
from visualdl import LogWriter
from batch_fn import collate_baker_examples
from speedyspeech_updater import SpeedySpeechUpdater, SpeedySpeechEvaluator
from config import get_cfg_default
from speedyspeech_updater import SpeedySpeechUpdater, SpeedySpeechEvaluator
def train_sp(args, config):
@ -93,10 +84,6 @@ def train_sp(args, config):
batch_size=config.batch_size,
shuffle=False,
drop_last=True)
# dev_sampler = DistributedBatchSampler(dev_dataset,
# batch_size=config.batch_size,
# shuffle=False,
# drop_last=False)
print("samplers done!")
train_dataloader = DataLoader(
@ -113,13 +100,9 @@ def train_sp(args, config):
num_workers=config.num_workers)
print("dataloaders done!")
# batch = collate_baker_examples([train_dataset[i] for i in range(10)])
# # batch = collate_baker_examples([dev_dataset[i] for i in range(10)])
# import pdb; pdb.set_trace()
model = SpeedySpeech(**config["model"])
if world_size > 1:
model = DataParallel(model) # TODO, do not use vocab size from config
# print(model)
print("model done!")
optimizer = Adam(
0.001,
@ -147,18 +130,18 @@ def train_sp(args, config):
def main():
# parse args and config and redirect to train_sp
parser = argparse.ArgumentParser(description="Train a ParallelWaveGAN "
parser = argparse.ArgumentParser(description="Train a SpeedySpeech "
"model with Baker Mandrin TTS dataset.")
parser.add_argument(
"--config", type=str, help="config file to overwrite default config")
parser.add_argument("--train-metadata", type=str, help="training data")
parser.add_argument("--dev-metadata", type=str, help="dev data")
parser.add_argument("--output-dir", type=str, help="output dir")
"--config", type=str, help="config file to overwrite default config.")
parser.add_argument("--train-metadata", type=str, help="training data.")
parser.add_argument("--dev-metadata", type=str, help="dev data.")
parser.add_argument("--output-dir", type=str, help="output dir.")
parser.add_argument(
"--device", type=str, default="gpu", help="device type to use")
"--device", type=str, default="gpu", help="device type to use.")
parser.add_argument(
"--nprocs", type=int, default=1, help="number of processes")
parser.add_argument("--verbose", type=int, default=1, help="verbose")
"--nprocs", type=int, default=1, help="number of processes.")
parser.add_argument("--verbose", type=int, default=1, help="verbose.")
args = parser.parse_args()
if args.device == "cpu" and args.nprocs > 1:

View File

@ -27,5 +27,6 @@
# See the License for the specific language governing permissions and
# limitations under the License.
from parakeet.data.dataset import *
from parakeet.data.batch import *
from parakeet.data.dataset import *
from parakeet.data.get_feats import *

View File

@ -17,8 +17,6 @@ import numpy as np
import pyworld
from scipy.interpolate import interp1d
from config import get_cfg_default
class LogMelFBank():
def __init__(self,
@ -42,8 +40,8 @@ class LogMelFBank():
# mel
self.n_mels = n_mels
self.fmin = fmin
self.fmax = fmax
self.fmin = 0 if fmin is None else fmin
self.fmax = sr / 2 if fmax is None else fmax
self.mel_filter = self._create_mel_filter()
@ -217,41 +215,3 @@ class Energy():
if use_token_averaged_energy and duration is not None:
energy = self._average_by_duration(energy, duration)
return energy
if __name__ == "__main__":
C = get_cfg_default()
filename = "../raw_data/data/format.1/000001.flac"
wav, _ = librosa.load(filename, sr=C.fs)
mel_extractor = LogMelFBank(
sr=C.fs,
n_fft=C.n_fft,
hop_length=C.n_shift,
win_length=C.win_length,
window=C.window,
n_mels=C.n_mels,
fmin=C.fmin,
fmax=C.fmax, )
mel = mel_extractor.get_log_mel_fbank(wav)
print(mel)
print(mel.shape)
pitch_extractor = Pitch(
sr=C.fs, hop_length=C.n_shift, f0min=C.f0min, f0max=C.f0max)
duration = "2 8 8 8 12 11 10 13 11 10 18 9 12 10 12 11 5"
duration = np.array([int(x) for x in duration.split(" ")])
avg_f0 = pitch_extractor.get_pitch(wav, duration=duration)
print(avg_f0)
print(avg_f0.shape)
energy_extractor = Energy(
sr=C.fs,
n_fft=C.n_fft,
hop_length=C.n_shift,
win_length=C.win_length,
window=C.window)
duration = "2 8 8 8 12 11 10 13 11 10 18 9 12 10 12 11 5"
duration = np.array([int(x) for x in duration.split(" ")])
avg_energy = energy_extractor.get_energy(wav, duration=duration)
print(avg_energy)
print(avg_energy.sum())

View File

@ -109,4 +109,5 @@ class Frontend():
def get_phonemes(self, sentence):
sentences = self.text_normalizer.normalize(sentence)
phonemes = self._g2p(sentences)
print("phonemes:", phonemes)
return phonemes

View File

@ -15,7 +15,6 @@
from typing import Dict, Sequence, Tuple
import numpy as np
import paddle
from paddle import nn
from parakeet.modules.fastspeech2_predictor.duration_predictor import DurationPredictor, DurationPredictorLoss
@ -252,36 +251,36 @@ class FastSpeech2(nn.Layer):
Parameters
----------
text : Tensor
Batch of padded token ids (B, Tmax).
text_lengths : Tensor)
Batch of lengths of each input (B,).
speech : Tensor
Batch of padded target features (B, Lmax, odim).
speech_lengths : Tensor
Batch of the lengths of each target (B,).
durations : Tensor
Batch of padded durations (B, Tmax).
pitch : Tensor
Batch of padded token-averaged pitch (B, Tmax, 1).
energy : Tensor
Batch of padded token-averaged energy (B, Tmax, 1).
text : Tensor
Batch of padded token ids (B, Tmax).
text_lengths : Tensor
Batch of lengths of each input (B,).
speech : Tensor
Batch of padded target features (B, Lmax, odim).
speech_lengths : Tensor
Batch of the lengths of each target (B,).
durations : Tensor
Batch of padded durations (B, Tmax).
pitch : Tensor
Batch of padded token-averaged pitch (B, Tmax, 1).
energy : Tensor
Batch of padded token-averaged energy (B, Tmax, 1).
Returns
----------
Tensor
mel outs before postnet
Tensor
mel outs after postnet
Tensor
duration predictor's output
Tensor
pitch predictor's output
Tensor
energy predictor's output
Tensor
speech
Tensor
speech_lengths, modified if reduction_factor >1
Tensor
mel outs before postnet
Tensor
mel outs after postnet
Tensor
duration predictor's output
Tensor
pitch predictor's output
Tensor
energy predictor's output
Tensor
speech
Tensor
speech_lengths, modified if reduction_factor > 1
"""
xs = text
@ -389,26 +388,26 @@ class FastSpeech2(nn.Layer):
Parameters
----------
text : Tensor
Input sequence of characters (T,).
speech : Tensor, optional
Feature sequence to extract style (N, idim).
durations : Tensor, optional
Groundtruth of duration (T,).
pitch : Tensor, optional
Groundtruth of token-averaged pitch (T, 1).
energy : Tensor, optional
Groundtruth of token-averaged energy (T, 1).
alpha : float, optional
Alpha to control the speed.
use_teacher_forcing : bool, optional
Whether to use teacher forcing.
If true, groundtruth of duration, pitch and energy will be used.
text : Tensor
Input sequence of characters (T,).
speech : Tensor, optional
Feature sequence to extract style (N, idim).
durations : Tensor, optional
Groundtruth of duration (T,).
pitch : Tensor, optional
Groundtruth of token-averaged pitch (T, 1).
energy : Tensor, optional
Groundtruth of token-averaged energy (T, 1).
alpha : float, optional
Alpha to control the speed.
use_teacher_forcing : bool, optional
Whether to use teacher forcing.
If true, groundtruth of duration, pitch and energy will be used.
Returns
----------
Tensor
Output sequence of features (L, odim).
Tensor
Output sequence of features (L, odim).
"""
x, y = text, speech
d, p, e = durations, pitch, energy
@ -448,21 +447,21 @@ class FastSpeech2(nn.Layer):
Parameters
----------
ilens : Tensor
Batch of lengths (B,).
ilens : Tensor
Batch of lengths (B,).
Returns
-------
Tensor
Mask tensor for self-attention.
dtype=paddle.bool
Tensor
Mask tensor for self-attention.
dtype=paddle.bool
Examples
-------
>>> ilens = [5, 3]
>>> self._source_mask(ilens)
tensor([[[1, 1, 1, 1, 1],
[1, 1, 1, 0, 0]]]) bool
>>> ilens = [5, 3]
>>> self._source_mask(ilens)
tensor([[[1, 1, 1, 1, 1],
[1, 1, 1, 0, 0]]]) bool
"""
x_masks = make_non_pad_mask(ilens)
@ -509,10 +508,10 @@ class FastSpeech2Loss(nn.Layer):
Parameters
----------
use_masking : bool
Whether to apply masking for padded part in loss calculation.
use_weighted_masking : bool
Whether to weighted masking in loss calculation.
use_masking : bool
Whether to apply masking for padded part in loss calculation.
use_weighted_masking : bool
Whether to weighted masking in loss calculation.
"""
assert check_argument_types()
super().__init__()
@ -545,39 +544,39 @@ class FastSpeech2Loss(nn.Layer):
Parameters
----------
after_outs : Tensor
Batch of outputs after postnets (B, Lmax, odim).
before_outs : Tensor
Batch of outputs before postnets (B, Lmax, odim).
d_outs : Tensor
Batch of outputs of duration predictor (B, Tmax).
p_outs : Tensor
Batch of outputs of pitch predictor (B, Tmax, 1).
e_outs : Tensor
Batch of outputs of energy predictor (B, Tmax, 1).
ys : Tensor
Batch of target features (B, Lmax, odim).
ds : Tensor
Batch of durations (B, Tmax).
ps : Tensor
Batch of target token-averaged pitch (B, Tmax, 1).
es : Tensor
Batch of target token-averaged energy (B, Tmax, 1).
ilens : Tensor
Batch of the lengths of each input (B,).
olens : Tensor
Batch of the lengths of each target (B,).
after_outs : Tensor
Batch of outputs after postnets (B, Lmax, odim).
before_outs : Tensor
Batch of outputs before postnets (B, Lmax, odim).
d_outs : Tensor
Batch of outputs of duration predictor (B, Tmax).
p_outs : Tensor
Batch of outputs of pitch predictor (B, Tmax, 1).
e_outs : Tensor
Batch of outputs of energy predictor (B, Tmax, 1).
ys : Tensor
Batch of target features (B, Lmax, odim).
ds : Tensor
Batch of durations (B, Tmax).
ps : Tensor
Batch of target token-averaged pitch (B, Tmax, 1).
es : Tensor
Batch of target token-averaged energy (B, Tmax, 1).
ilens : Tensor
Batch of the lengths of each input (B,).
olens : Tensor
Batch of the lengths of each target (B,).
Returns
----------
Tensor
L1 loss value.
Tensor
Duration predictor loss value.
Tensor
Pitch predictor loss value.
Tensor
Energy predictor loss value.
Tensor
L1 loss value.
Tensor
Duration predictor loss value.
Tensor
Pitch predictor loss value.
Tensor
Energy predictor loss value.
"""
# apply mask to remove padded part

View File

@ -32,10 +32,10 @@ class DurationPredictor(nn.Layer):
Note
----------
The calculation domain of outputs is different
between in `forward` and in `inference`. In `forward`,
the outputs are calculated in log domain but in `inference`,
those are calculated in linear domain.
The calculation domain of outputs is different
between in `forward` and in `inference`. In `forward`,
the outputs are calculated in log domain but in `inference`,
those are calculated in linear domain.
"""
@ -50,18 +50,18 @@ class DurationPredictor(nn.Layer):
Parameters
----------
idim : int
Input dimension.
n_layers : int, optional
Number of convolutional layers.
n_chans : int, optional
Number of channels of convolutional layers.
kernel_size : int, optional
Kernel size of convolutional layers.
dropout_rate : float, optional
Dropout rate.
offset : float, optional
Offset value to avoid nan in log domain.
idim : int
Input dimension.
n_layers : int, optional
Number of convolutional layers.
n_chans : int, optional
Number of channels of convolutional layers.
kernel_size : int, optional
Kernel size of convolutional layers.
dropout_rate : float, optional
Dropout rate.
offset : float, optional
Offset value to avoid nan in log domain.
"""
super(DurationPredictor, self).__init__()
@ -108,10 +108,10 @@ class DurationPredictor(nn.Layer):
Parameters
----------
xs : Tensor
Batch of input sequences (B, Tmax, idim).
x_masks : ByteTensor, optional
Batch of masks indicating padded part (B, Tmax).
xs : Tensor
Batch of input sequences (B, Tmax, idim).
x_masks : ByteTensor, optional
Batch of masks indicating padded part (B, Tmax).
Returns
----------
@ -125,15 +125,15 @@ class DurationPredictor(nn.Layer):
Parameters
----------
xs : Tensor
Batch of input sequences (B, Tmax, idim).
x_masks : Tensor(bool), optional
Batch of masks indicating padded part (B, Tmax).
xs : Tensor
Batch of input sequences (B, Tmax, idim).
x_masks : Tensor(bool), optional
Batch of masks indicating padded part (B, Tmax).
Returns
----------
LongTensor
Batch of predicted durations in linear domain int64 (B, Tmax).
Tensor
Batch of predicted durations in linear domain int64 (B, Tmax).
"""
return self._forward(xs, x_masks, True)
@ -150,10 +150,10 @@ class DurationPredictorLoss(nn.Layer):
Parameters
----------
offset : float, optional
Offset value to avoid nan in log domain.
reduction : str
Reduction type in loss calculation.
offset : float, optional
Offset value to avoid nan in log domain.
reduction : str
Reduction type in loss calculation.
"""
super(DurationPredictorLoss, self).__init__()
self.criterion = nn.MSELoss(reduction=reduction)
@ -164,19 +164,19 @@ class DurationPredictorLoss(nn.Layer):
Parameters
----------
outputs : Tensor
Batch of prediction durations in log domain (B, T)
targets : LongTensor
Batch of groundtruth durations in linear domain (B, T)
outputs : Tensor
Batch of prediction durations in log domain (B, T)
targets : Tensor
Batch of groundtruth durations in linear domain (B, T)
Returns
----------
Tensor
Mean squared error loss value.
Tensor
Mean squared error loss value.
Note
----------
`outputs` is in log domain but `targets` is in linear domain.
`outputs` is in log domain but `targets` is in linear domain.
"""
# NOTE: outputs are in log domain while targets are in linear domain
targets = paddle.log(targets.cast(dtype='float32') + self.offset)
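To make the log/linear domain convention concrete, a small worked sketch of the computation above (values are illustrative):

```python
import paddle
import paddle.nn as nn

offset = 1.0
# groundtruth durations, linear domain (frames per token)
targets = paddle.to_tensor([[2., 8., 8., 12.]])
# predictor outputs already live in the log domain
outputs = paddle.to_tensor([[1.0, 2.1, 2.2, 2.6]])

# mirror of the forward above: move targets into the log domain, then MSE
loss = nn.MSELoss()(outputs, paddle.log(targets + offset))
```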

View File

@ -37,8 +37,8 @@ class LengthRegulator(nn.Layer):
Parameters
----------
pad_value : float, optional
Value used for padding.
pad_value : float, optional
Value used for padding.
"""
super().__init__()
@ -70,17 +70,17 @@ class LengthRegulator(nn.Layer):
Parameters
----------
xs : Tensor
Batch of sequences of char or phoneme embeddings (B, Tmax, D).
ds : LongTensor
Batch of durations of each frame (B, T).
alpha : float, optional
Alpha value to control speed of speech.
xs : Tensor
Batch of sequences of char or phoneme embeddings (B, Tmax, D).
ds : LongTensor
Batch of durations of each frame (B, T).
alpha : float, optional
Alpha value to control speed of speech.
Returns
----------
Tensor
replicated input tensor based on durations (B, T*, D).
Tensor
replicated input tensor based on durations (B, T*, D).
"""
if alpha != 1.0:
assert alpha > 0

View File

@ -45,20 +45,20 @@ class Postnet(nn.Layer):
Parameters
----------
idim : int
Dimension of the inputs.
odim : int
Dimension of the outputs.
n_layers : int, optional
The number of layers.
n_filts : int, optional
The number of filter size.
n_units : int, optional
The number of filter channels.
use_batch_norm : bool, optional
Whether to use batch normalization..
dropout_rate : float, optional
Dropout rate..
idim : int
Dimension of the inputs.
odim : int
Dimension of the outputs.
n_layers : int, optional
The number of layers.
n_filts : int, optional
The number of filter size.
n_units : int, optional
The number of filter channels.
use_batch_norm : bool, optional
Whether to use batch normalization.
dropout_rate : float, optional
Dropout rate.
"""
super(Postnet, self).__init__()
self.postnet = nn.LayerList()
@ -120,13 +120,13 @@ class Postnet(nn.Layer):
Parameters
----------
xs : Tensor
Batch of the sequences of padded input tensors (B, idim, Tmax).
xs : Tensor
Batch of the sequences of padded input tensors (B, idim, Tmax).
Returns
----------
Tensor
Batch of padded output tensor. (B, odim, Tmax).
Tensor
Batch of padded output tensor. (B, odim, Tmax).
"""
for i in six.moves.range(len(self.postnet)):

View File

@ -43,16 +43,16 @@ class VariancePredictor(nn.Layer):
Parameters
----------
idim : int
Input dimension.
n_layers : int, optional
Number of convolutional layers.
n_chans : int, optional
Number of channels of convolutional layers.
kernel_size : int, optional
Kernel size of convolutional layers.
dropout_rate : float, optional
Dropout rate.
idim : int
Input dimension.
n_layers : int, optional
Number of convolutional layers.
n_chans : int, optional
Number of channels of convolutional layers.
kernel_size : int, optional
Kernel size of convolutional layers.
dropout_rate : float, optional
Dropout rate.
"""
assert check_argument_types()
super().__init__()

View File

@ -26,12 +26,12 @@ class MultiHeadedAttention(nn.Layer):
Parameters
----------
n_head : int
The number of heads.
n_feat : int
The number of features.
dropout_rate : float
Dropout rate.
n_head : int
The number of heads.
n_feat : int
The number of features.
dropout_rate : float
Dropout rate.
"""
def __init__(self, n_head, n_feat, dropout_rate):
@ -53,21 +53,21 @@ class MultiHeadedAttention(nn.Layer):
Parameters
----------
query : paddle.Tensor
query tensor (#batch, time1, size).
key : paddle.Tensor
Key tensor (#batch, time2, size).
value : paddle.Tensor
Value tensor (#batch, time2, size).
query : paddle.Tensor
query tensor (#batch, time1, size).
key : paddle.Tensor
Key tensor (#batch, time2, size).
value : paddle.Tensor
Value tensor (#batch, time2, size).
Returns
----------
paddle.Tensor
Transformed query tensor (#batch, n_head, time1, d_k).
paddle.Tensor
Transformed key tensor (#batch, n_head, time2, d_k).
paddle.Tensor
Transformed value tensor (#batch, n_head, time2, d_k).
paddle.Tensor
Transformed query tensor (#batch, n_head, time1, d_k).
paddle.Tensor
Transformed key tensor (#batch, n_head, time2, d_k).
paddle.Tensor
Transformed value tensor (#batch, n_head, time2, d_k).
"""
n_batch = query.shape[0]
@ -90,18 +90,18 @@ class MultiHeadedAttention(nn.Layer):
Parameters
----------
value : paddle.Tensor
Transformed value (#batch, n_head, time2, d_k).
scores : paddle.Tensor
Attention score (#batch, n_head, time1, time2).
mask : paddle.Tensor
Mask (#batch, 1, time2) or (#batch, time1, time2).
value : paddle.Tensor
Transformed value (#batch, n_head, time2, d_k).
scores : paddle.Tensor
Attention score (#batch, n_head, time1, time2).
mask : paddle.Tensor
Mask (#batch, 1, time2) or (#batch, time1, time2).
Returns
----------
paddle.Tensor:
Transformed value (#batch, time1, d_model)
weighted by the attention score (#batch, time1, time2).
paddle.Tensor:
Transformed value (#batch, time1, d_model)
weighted by the attention score (#batch, time1, time2).
"""
n_batch = value.shape[0]
softmax = paddle.nn.Softmax(axis=-1)
@ -136,19 +136,19 @@ class MultiHeadedAttention(nn.Layer):
Parameters
----------
query : paddle.Tensor
Query tensor (#batch, time1, size).
key : paddle.Tensor
Key tensor (#batch, time2, size).
value : paddle.Tensor
Value tensor (#batch, time2, size).
mask : paddle.Tensor
Mask tensor (#batch, 1, time2) or (#batch, time1, time2).
query : paddle.Tensor
Query tensor (#batch, time1, size).
key : paddle.Tensor
Key tensor (#batch, time2, size).
value : paddle.Tensor
Value tensor (#batch, time2, size).
mask : paddle.Tensor
Mask tensor (#batch, 1, time2) or (#batch, time1, time2).
Returns
----------
paddle.Tensor
Output tensor (#batch, time1, d_model).
paddle.Tensor
Output tensor (#batch, time1, d_model).
"""
q, k, v = self.forward_qkv(query, key, value)
scores = paddle.matmul(q, k.transpose(

View File

@ -24,14 +24,14 @@ class PositionalEncoding(nn.Layer):
Parameters
----------
d_model : int
Embedding dimension.
dropout_rate : float
Dropout rate.
max_len : int
Maximum input length.
reverse : bool
Whether to reverse the input position. Only for
d_model : int
Embedding dimension.
dropout_rate : float
Dropout rate.
max_len : int
Maximum input length.
reverse : bool
Whether to reverse the input position.
"""
def __init__(self, d_model, dropout_rate, max_len=5000, reverse=False):
@ -68,13 +68,13 @@ class PositionalEncoding(nn.Layer):
Parameters
----------
x : paddle.Tensor
Input tensor (batch, time, `*`).
x : paddle.Tensor
Input tensor (batch, time, `*`).
Returns
----------
paddle.Tensor
Encoded tensor (batch, time, `*`).
paddle.Tensor
Encoded tensor (batch, time, `*`).
"""
self.extend_pe(x)
x = x * self.xscale + self.pe[:, :x.shape[1]]

View File

@ -29,42 +29,42 @@ class Encoder(nn.Layer):
Parameters
----------
idim : int
Input dimension.
attention_dim : int
Dimention of attention.
attention_heads : int
The number of heads of multi head attention.
linear_units : int
The number of units of position-wise feed forward.
num_blocks : int
The number of decoder blocks.
dropout_rate : float
Dropout rate.
positional_dropout_rate : float
Dropout rate after adding positional encoding.
attention_dropout_rate : float
Dropout rate in attention.
input_layer : Union[str, paddle.nn.Layer]
Input layer type.
pos_enc_class : paddle.nn.Layer
Positional encoding module class.
`PositionalEncoding `or `ScaledPositionalEncoding`
normalize_before : bool
Whether to use layer_norm before the first block.
concat_after : bool
Whether to concat attention layer's input and output.
if True, additional linear will be applied.
i.e. x -> x + linear(concat(x, att(x)))
if False, no additional linear will be applied. i.e. x -> x + att(x)
positionwise_layer_type : str
"linear", "conv1d", or "conv1d-linear".
positionwise_conv_kernel_size : int
Kernel size of positionwise conv1d layer.
selfattention_layer_type : str
Encoder attention layer type.
padding_idx : int
Padding idx for input_layer=embed.
idim : int
Input dimension.
attention_dim : int
Dimension of attention.
attention_heads : int
The number of heads of multi head attention.
linear_units : int
The number of units of position-wise feed forward.
num_blocks : int
The number of decoder blocks.
dropout_rate : float
Dropout rate.
positional_dropout_rate : float
Dropout rate after adding positional encoding.
attention_dropout_rate : float
Dropout rate in attention.
input_layer : Union[str, paddle.nn.Layer]
Input layer type.
pos_enc_class : paddle.nn.Layer
Positional encoding module class.
`PositionalEncoding` or `ScaledPositionalEncoding`
normalize_before : bool
Whether to use layer_norm before the first block.
concat_after : bool
Whether to concat attention layer's input and output.
if True, additional linear will be applied.
i.e. x -> x + linear(concat(x, att(x)))
if False, no additional linear will be applied. i.e. x -> x + att(x)
positionwise_layer_type : str
"linear", "conv1d", or "conv1d-linear".
positionwise_conv_kernel_size : int
Kernel size of positionwise conv1d layer.
selfattention_layer_type : str
Encoder attention layer type.
padding_idx : int
Padding idx for input_layer=embed.
"""
def __init__(
@ -180,17 +180,17 @@ class Encoder(nn.Layer):
Parameters
----------
xs : paddle.Tensor
Input tensor (#batch, time, idim).
masks : paddle.Tensor
Mask tensor (#batch, time).
xs : paddle.Tensor
Input tensor (#batch, time, idim).
masks : paddle.Tensor
Mask tensor (#batch, time).
Returns
----------
paddle.Tensor
Output tensor (#batch, time, attention_dim).
paddle.Tensor
Mask tensor (#batch, time).
paddle.Tensor
Output tensor (#batch, time, attention_dim).
paddle.Tensor
Mask tensor (#batch, time).
"""
xs = self.embed(xs)
xs, masks = self.encoders(xs, masks)
@ -203,21 +203,21 @@ class Encoder(nn.Layer):
Parameters
----------
xs : paddle.Tensor
Input tensor.
masks : paddle.Tensor
Mask tensor.
cache : List[paddle.Tensor]
List of cache tensors.
xs : paddle.Tensor
Input tensor.
masks : paddle.Tensor
Mask tensor.
cache : List[paddle.Tensor]
List of cache tensors.
Returns
----------
paddle.Tensor
Output tensor.
paddle.Tensor
Mask tensor.
List[paddle.Tensor]
List of new cache tensors.
paddle.Tensor
Output tensor.
paddle.Tensor
Mask tensor.
List[paddle.Tensor]
List of new cache tensors.
"""
xs = self.embed(xs)
@ -229,4 +229,4 @@ class Encoder(nn.Layer):
new_cache.append(xs)
if self.normalize_before:
xs = self.after_norm(xs)
return xs, masks, new_cache
return xs, masks, new_cache

View File

@ -22,23 +22,23 @@ class EncoderLayer(nn.Layer):
Parameters
----------
size : int
Input dimension.
self_attn : paddle.nn.Layer
Self-attention module instance.
`MultiHeadedAttention` instance can be used as the argument.
feed_forward : paddle.nn.Layer
Feed-forward module instance.
`PositionwiseFeedForward`, `MultiLayeredConv1d`, or `Conv1dLinear` instance can be used as the argument.
dropout_rate : float
Dropout rate.
normalize_before : bool
Whether to use layer_norm before the first block.
concat_after : bool
Whether to concat attention layer's input and output.
if True, additional linear will be applied.
i.e. x -> x + linear(concat(x, att(x)))
if False, no additional linear will be applied. i.e. x -> x + att(x)
size : int
Input dimension.
self_attn : paddle.nn.Layer
Self-attention module instance.
`MultiHeadedAttention` instance can be used as the argument.
feed_forward : paddle.nn.Layer
Feed-forward module instance.
`PositionwiseFeedForward`, `MultiLayeredConv1d`, or `Conv1dLinear` instance can be used as the argument.
dropout_rate : float
Dropout rate.
normalize_before : bool
Whether to use layer_norm before the first block.
concat_after : bool
Whether to concat attention layer's input and output.
if True, additional linear will be applied.
i.e. x -> x + linear(concat(x, att(x)))
if False, no additional linear will be applied. i.e. x -> x + att(x)
"""
def __init__(
@ -67,19 +67,19 @@ class EncoderLayer(nn.Layer):
Parameters
----------
x_input : paddle.Tensor
Input tensor (#batch, time, size).
mask : paddle.Tensor
Mask tensor for the input (#batch, time).
cache : paddle.Tensor
Cache tensor of the input (#batch, time - 1, size).
x_input : paddle.Tensor
Input tensor (#batch, time, size).
mask : paddle.Tensor
Mask tensor for the input (#batch, time).
cache : paddle.Tensor
Cache tensor of the input (#batch, time - 1, size).
Returns
----------
paddle.Tensor
Output tensor (#batch, time, size).
paddle.Tensor
Mask tensor (#batch, time).
paddle.Tensor
Output tensor (#batch, time, size).
paddle.Tensor
Mask tensor (#batch, time).
"""
residual = x
if self.normalize_before:

View File

@ -34,14 +34,14 @@ class MultiLayeredConv1d(paddle.nn.Layer):
Parameters
----------
in_chans : int
Number of input channels.
hidden_chans : int
Number of hidden channels.
kernel_size : int
Kernel size of conv1d.
dropout_rate : float
Dropout rate.
in_chans : int
Number of input channels.
hidden_chans : int
Number of hidden channels.
kernel_size : int
Kernel size of conv1d.
dropout_rate : float
Dropout rate.
"""
super(MultiLayeredConv1d, self).__init__()
@ -65,13 +65,13 @@ class MultiLayeredConv1d(paddle.nn.Layer):
Parameters
----------
x : paddle.Tensor
Batch of input tensors (B, T, in_chans).
x : paddle.Tensor
Batch of input tensors (B, T, in_chans).
Returns
----------
paddle.Tensor
Batch of output tensors (B, T, in_chans).
paddle.Tensor
Batch of output tensors (B, T, in_chans).
"""
x = self.relu(self.w_1(x.transpose([0, 2, 1]))).transpose([0, 2, 1])
return self.w_2(self.dropout(x).transpose([0, 2, 1])).transpose(
@ -90,14 +90,14 @@ class Conv1dLinear(paddle.nn.Layer):
Parameters
----------
in_chans : int
Number of input channels.
hidden_chans : int
Number of hidden channels.
kernel_size : int
Kernel size of conv1d.
dropout_rate : float
Dropout rate.
in_chans : int
Number of input channels.
hidden_chans : int
Number of hidden channels.
kernel_size : int
Kernel size of conv1d.
dropout_rate : float
Dropout rate.
"""
super(Conv1dLinear, self).__init__()
self.w_1 = paddle.nn.Conv1D(
@ -115,13 +115,13 @@ class Conv1dLinear(paddle.nn.Layer):
Parameters
----------
x : paddle.Tensor
Batch of input tensors (B, T, in_chans).
x : paddle.Tensor
Batch of input tensors (B, T, in_chans).
Returns
----------
paddle.Tensor
Batch of output tensors (B, T, in_chans).
paddle.Tensor
Batch of output tensors (B, T, in_chans).
"""
x = self.relu(self.w_1(x.transpose([0, 2, 1]))).transpose([0, 2, 1])

View File

@ -21,12 +21,12 @@ class PositionwiseFeedForward(paddle.nn.Layer):
Parameters
----------
idim : int
Input dimenstion.
hidden_units : int
The number of hidden units.
dropout_rate : float
Dropout rate.
idim : int
Input dimenstion.
hidden_units : int
The number of hidden units.
dropout_rate : float
Dropout rate.
"""
def __init__(self,

View File

@ -31,14 +31,14 @@ def repeat(N, fn):
Parameters
----------
N : int
Number of repeat time.
fn : Callable
Function to generate module.
N : int
Number of times to repeat.
fn : Callable
Function to generate module.
Returns
----------
MultiSequential
Repeated model instance.
MultiSequential
Repeated model instance.
"""
return MultiSequential(* [fn(n) for n in range(N)])

View File

@ -21,10 +21,10 @@ class LayerNorm(paddle.nn.LayerNorm):
Parameters
----------
nout : int
Output dim size.
dim : int
Dimension to be normalized.
nout : int
Output dim size.
dim : int
Dimension to be normalized.
"""
def __init__(self, nout, dim=-1):
@ -37,13 +37,13 @@ class LayerNorm(paddle.nn.LayerNorm):
Parameters
----------
x : paddle.Tensor
Input tensor.
x : paddle.Tensor
Input tensor.
Returns
----------
paddle.Tensor
Normalized tensor.
paddle.Tensor
Normalized tensor.
"""
if self.dim == -1:
return super(LayerNorm, self).forward(x)

View File

@ -22,25 +22,25 @@ def pad_list(xs, pad_value):
Parameters
----------
xs : List[Tensor]
List of Tensors [(T_1, `*`), (T_2, `*`), ..., (T_B, `*`)].
pad_value : float)
Value for padding.
xs : List[Tensor]
List of Tensors [(T_1, `*`), (T_2, `*`), ..., (T_B, `*`)].
pad_value : float
Value for padding.
Returns
----------
Tensor
Padded tensor (B, Tmax, `*`).
Tensor
Padded tensor (B, Tmax, `*`).
Examples
----------
>>> x = [paddle.ones([4]), paddle.ones([2]), paddle.ones([1])]
>>> x
[tensor([1., 1., 1., 1.]), tensor([1., 1.]), tensor([1.])]
>>> pad_list(x, 0)
tensor([[1., 1., 1., 1.],
[1., 1., 0., 0.],
[1., 0., 0., 0.]])
>>> x = [paddle.ones([4]), paddle.ones([2]), paddle.ones([1])]
>>> x
[tensor([1., 1., 1., 1.]), tensor([1., 1.]), tensor([1.])]
>>> pad_list(x, 0)
tensor([[1., 1., 1., 1.],
[1., 1., 0., 0.],
[1., 0., 0., 0.]])
"""
n_batch = len(xs)
max_len = max(x.shape[0] for x in xs)
@ -57,23 +57,23 @@ def make_pad_mask(lengths, length_dim=-1):
Parameters
----------
lengths : LongTensor or List
Batch of lengths (B,).
lengths : LongTensor or List
Batch of lengths (B,).
Returns
----------
Tensor(bool)
Mask tensor containing indices of padded part bool.
Tensor(bool)
Mask tensor containing indices of padded part (bool).
Examples
----------
With only lengths.
With only lengths.
>>> lengths = [5, 3, 2]
>>> make_non_pad_mask(lengths)
masks = [[0, 0, 0, 0 ,0],
[0, 0, 0, 1, 1],
[0, 0, 1, 1, 1]]
>>> lengths = [5, 3, 2]
>>> make_non_pad_mask(lengths)
masks = [[0, 0, 0, 0, 0],
[0, 0, 0, 1, 1],
[0, 0, 1, 1, 1]]
"""
if length_dim == 0:
raise ValueError("length_dim cannot be 0: {}".format(length_dim))
@ -99,29 +99,29 @@ def make_non_pad_mask(lengths, length_dim=-1):
Parameters
----------
lengths : LongTensor or List
Batch of lengths (B,).
xs : Tensor, optional
The reference tensor.
If set, masks will be the same shape as this tensor.
length_dim : int, optional
Dimension indicator of the above tensor.
See the example.
lengths : LongTensor or List
Batch of lengths (B,).
xs : Tensor, optional
The reference tensor.
If set, masks will be the same shape as this tensor.
length_dim : int, optional
Dimension indicator of the above tensor.
See the example.
Returns
----------
Tensor(bool)
mask tensor containing indices of padded part bool.
Tensor(bool)
Mask tensor containing indices of non-padded part (bool).
Examples
----------
With only lengths.
With only lengths.
>>> lengths = [5, 3, 2]
>>> make_non_pad_mask(lengths)
masks = [[1, 1, 1, 1 ,1],
[1, 1, 1, 0, 0],
[1, 1, 0, 0, 0]]
>>> lengths = [5, 3, 2]
>>> make_non_pad_mask(lengths)
masks = [[1, 1, 1, 1, 1],
[1, 1, 1, 0, 0],
[1, 1, 0, 0, 0]]
"""
return paddle.logical_not(make_pad_mask(lengths, length_dim))
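The two mask docstrings above boil down to one broadcasting idiom; a self-contained sketch (not the library code itself, which additionally supports an `xs` reference tensor and `length_dim`):

```python
import paddle

def make_pad_mask_sketch(lengths):
    # True at padded positions, matching the make_pad_mask example above
    lengths = paddle.to_tensor(lengths, dtype="int64")
    steps = paddle.arange(int(lengths.max()), dtype="int64")  # (Tmax,)
    return steps.unsqueeze(0) >= lengths.unsqueeze(1)         # (B, Tmax), bool

print(make_pad_mask_sketch([5, 3, 2]).astype("int64").numpy())
# [[0 0 0 0 0]
#  [0 0 0 1 1]
#  [0 0 1 1 1]]
print(paddle.logical_not(make_pad_mask_sketch([5, 3, 2])).astype("int64").numpy())
# [[1 1 1 1 1]
#  [1 1 1 0 0]
#  [1 1 0 0 0]]
```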
@ -135,10 +135,10 @@ def initialize(model: nn.Layer, init: str):
Parameters
----------
model : paddle.nn.Layer
Target.
init : str
Method of initialization.
model : paddle.nn.Layer
Target.
init : str
Method of initialization.
"""
assert check_argument_types()

View File

@ -29,8 +29,8 @@ class SpectralConvergenceLoss(nn.Layer):
def forward(self, x_mag, y_mag):
"""Calculate forward propagation.
Args:
x_mag (Tensor): Magnitude spectrogram of predicted signal (B, C, T).
y_mag (Tensor): Magnitude spectrogram of groundtruth signal (B, C, T).
x_mag (Tensor): Magnitude spectrogram of predicted signal (B, #frames, #freq_bins).
y_mag (Tensor): Magnitude spectrogram of groundtruth signal (B, #frames, #freq_bins).
Returns:
Tensor: Spectral convergence loss value.
"""
@ -50,11 +50,16 @@ class LogSTFTMagnitudeLoss(nn.Layer):
def forward(self, x_mag, y_mag):
"""Calculate forward propagation.
Args:
x_mag (Tensor): Magnitude spectrogram of predicted signal (B, #frames, #freq_bins).
y_mag (Tensor): Magnitude spectrogram of groundtruth signal (B, #frames, #freq_bins).
Returns:
Tensor: Log STFT magnitude loss value.
Parameters
----------
x_mag : Tensor
Magnitude spectrogram of predicted signal (B, #frames, #freq_bins).
y_mag : Tensor
Magnitude spectrogram of groundtruth signal (B, #frames, #freq_bins).
Returns
----------
Tensor
Log STFT magnitude loss value.
"""
return F.l1_loss(
paddle.log(paddle.clip(
@ -86,15 +91,23 @@ class STFTLoss(nn.Layer):
def forward(self, x, y):
"""Calculate forward propagation.
Args:
x (Tensor): Predicted signal (B, T).
y (Tensor): Groundtruth signal (B, T).
Returns:
Tensor: Spectral convergence loss value.
Tensor: Log STFT magnitude loss value.
Parameters
----------
x : Tensor
Predicted signal (B, T).
y : Tensor
Groundtruth signal (B, T).
Returns
----------
Tensor
Spectral convergence loss value.
Tensor
Log STFT magnitude loss value.
"""
x_mag = self.stft.magnitude(x)
y_mag = self.stft.magnitude(y)
x_mag = x_mag.transpose([0, 2, 1])
y_mag = y_mag.transpose([0, 2, 1])
sc_loss = self.spectral_convergence_loss(x_mag, y_mag)
mag_loss = self.log_stft_magnitude_loss(x_mag, y_mag)
@ -111,11 +124,16 @@ class MultiResolutionSTFTLoss(nn.Layer):
win_lengths=[600, 1200, 240],
window="hann", ):
"""Initialize Multi resolution STFT loss module.
Args:
fft_sizes (list): List of FFT sizes.
hop_sizes (list): List of hop sizes.
win_lengths (list): List of window lengths.
window (str): Window function type.
Parameters
----------
fft_sizes : list
List of FFT sizes.
hop_sizes : list
List of hop sizes.
win_lengths : list
List of window lengths.
window : str
Window function type.
"""
super().__init__()
assert len(fft_sizes) == len(hop_sizes) == len(win_lengths)
@ -125,13 +143,24 @@ class MultiResolutionSTFTLoss(nn.Layer):
def forward(self, x, y):
"""Calculate forward propagation.
Args:
x (Tensor): Predicted signal (B, T).
y (Tensor): Groundtruth signal (B, T).
Returns:
Tensor: Multi resolution spectral convergence loss value.
Tensor: Multi resolution log STFT magnitude loss value.
Parameters
----------
x : Tensor
Predicted signal (B, T) or (B, #subband, T).
y : Tensor
Groundtruth signal (B, T) or (B, #subband, T).
Returns
----------
Tensor
Multi resolution spectral convergence loss value.
Tensor
Multi resolution log STFT magnitude loss value.
"""
if len(x.shape) == 3:
# (B, C, T) -> (B x C, T)
x = x.reshape([-1, x.shape[2]])
# (B, C, T) -> (B x C, T)
y = y.reshape([-1, y.shape[2]])
sc_loss = 0.0
mag_loss = 0.0
for f in self.stft_losses:
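This new 3-D branch is why the PWG updater diff earlier in this commit drops its `squeeze(1)` calls: the loss now accepts `(B, T)`, `(B, 1, T)`, or multi-band `(B, #subband, T)` waveforms directly. A minimal usage sketch under that assumption:

```python
import paddle
from parakeet.modules.stft_loss import MultiResolutionSTFTLoss

criterion = MultiResolutionSTFTLoss()  # three STFT resolutions by default
wav_hat = paddle.randn([4, 1, 24000])  # generator output (B, 1, T)
wav = paddle.randn([4, 1, 24000])      # target waveform

# 3-D inputs are flattened to (B x C, T) internally before the per-resolution losses
sc_loss, mag_loss = criterion(wav_hat, wav)
gen_loss = sc_loss + mag_loss
```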