diff --git a/examples/fastspeech2/baker/preprocess.py b/examples/fastspeech2/baker/preprocess.py index c079715..7dc1645 100644 --- a/examples/fastspeech2/baker/preprocess.py +++ b/examples/fastspeech2/baker/preprocess.py @@ -21,10 +21,10 @@ from typing import List, Dict, Any import jsonlines import librosa import numpy as np +from parakeet.data.get_feats import LogMelFBank, Energy, Pitch import tqdm from config import get_cfg_default -from get_feats import LogMelFBank, Energy, Pitch def get_phn_dur(file_name): diff --git a/examples/fastspeech2/baker/synthesize.py b/examples/fastspeech2/baker/synthesize.py index ec65b06..6770189 100644 --- a/examples/fastspeech2/baker/synthesize.py +++ b/examples/fastspeech2/baker/synthesize.py @@ -94,7 +94,7 @@ def main(): parser.add_argument( "--fastspeech2-config", type=str, - help="config file to overwrite default config") + help="config file to overwrite default config.") parser.add_argument( "--fastspeech2-checkpoint", type=str, @@ -121,13 +121,13 @@ def main(): parser.add_argument( "--phones-dict", type=str, - default="phone_id_map.txt ", + default="phone_id_map.txt", help="phone vocabulary file.") - parser.add_argument("--test-metadata", type=str, help="test metadata") - parser.add_argument("--output-dir", type=str, help="output dir") + parser.add_argument("--test-metadata", type=str, help="test metadata.") + parser.add_argument("--output-dir", type=str, help="output dir.") parser.add_argument( - "--device", type=str, default="gpu", help="device type to use") - parser.add_argument("--verbose", type=int, default=1, help="verbose") + "--device", type=str, default="gpu", help="device type to use.") + parser.add_argument("--verbose", type=int, default=1, help="verbose.") args = parser.parse_args() with open(args.fastspeech2_config) as f: diff --git a/examples/fastspeech2/baker/synthesize_e2e.py b/examples/fastspeech2/baker/synthesize_e2e.py index 8d57e2a..a34c826 100644 --- a/examples/fastspeech2/baker/synthesize_e2e.py +++ 
b/examples/fastspeech2/baker/synthesize_e2e.py @@ -99,7 +99,7 @@ def main(): parser.add_argument( "--fastspeech2-config", type=str, - help="config file to overwrite default config") + help="fastspeech2 config file to overwrite default config.") parser.add_argument( "--fastspeech2-checkpoint", type=str, @@ -112,8 +112,7 @@ def main(): parser.add_argument( "--pwg-config", type=str, - help="mean and standard deviation used to normalize spectrogram when training parallel wavegan." - ) + help="parallel wavegan config file to overwrite default config.") parser.add_argument( "--pwg-params", type=str, @@ -126,16 +125,16 @@ def main(): parser.add_argument( "--phones-dict", type=str, - default="phone_id_map.txt ", + default="phone_id_map.txt", help="phone vocabulary file.") parser.add_argument( "--text", type=str, - help="text to synthesize, a 'utt_id sentence' pair per line") - parser.add_argument("--output-dir", type=str, help="output dir") + help="text to synthesize, a 'utt_id sentence' pair per line.") + parser.add_argument("--output-dir", type=str, help="output dir.") parser.add_argument( - "--device", type=str, default="gpu", help="device type to use") - parser.add_argument("--verbose", type=int, default=1, help="verbose") + "--device", type=str, default="gpu", help="device type to use.") + parser.add_argument("--verbose", type=int, default=1, help="verbose.") args = parser.parse_args() with open(args.fastspeech2_config) as f: diff --git a/examples/fastspeech2/baker/train.py b/examples/fastspeech2/baker/train.py index 805dc1c..741678b 100644 --- a/examples/fastspeech2/baker/train.py +++ b/examples/fastspeech2/baker/train.py @@ -169,18 +169,18 @@ def train_sp(args, config): def main(): # parse args and config and redirect to train_sp - parser = argparse.ArgumentParser(description="Train a ParallelWaveGAN " + parser = argparse.ArgumentParser(description="Train a FastSpeech2 " "model with Baker Mandrin TTS dataset.") parser.add_argument( - "--config", type=str, 
help="config file to overwrite default config") - parser.add_argument("--train-metadata", type=str, help="training data") - parser.add_argument("--dev-metadata", type=str, help="dev data") - parser.add_argument("--output-dir", type=str, help="output dir") + "--config", type=str, help="config file to overwrite default config.") + parser.add_argument("--train-metadata", type=str, help="training data.") + parser.add_argument("--dev-metadata", type=str, help="dev data.") + parser.add_argument("--output-dir", type=str, help="output dir.") parser.add_argument( - "--device", type=str, default="gpu", help="device type to use") + "--device", type=str, default="gpu", help="device type to use.") parser.add_argument( - "--nprocs", type=int, default=1, help="number of processes") - parser.add_argument("--verbose", type=int, default=1, help="verbose") + "--nprocs", type=int, default=1, help="number of processes.") + parser.add_argument("--verbose", type=int, default=1, help="verbose.") parser.add_argument( "--phones-dict", type=str, diff --git a/examples/parallelwave_gan/baker/batch_fn.py b/examples/parallelwave_gan/baker/batch_fn.py index 22af5af..11a45c5 100644 --- a/examples/parallelwave_gan/baker/batch_fn.py +++ b/examples/parallelwave_gan/baker/batch_fn.py @@ -27,10 +27,14 @@ class Clip(object): aux_context_window=0, ): """Initialize customized collater for DataLoader. - Args: - batch_max_steps (int): The maximum length of input signal in batch. - hop_size (int): Hop size of auxiliary features. - aux_context_window (int): Context window size for auxiliary feature conv. + Parameters + ---------- + batch_max_steps : int + The maximum length of input signal in batch. + hop_size : int + Hop size of auxiliary features. + aux_context_window : int + Context window size for auxiliary feature conv. """ if batch_max_steps % hop_size != 0: @@ -49,14 +53,18 @@ class Clip(object): def __call__(self, examples): """Convert into batch tensors. 
- Args: - batch (list): list of tuple of the pair of audio and features. Audio shape - (T, ), features shape(T', C). + Parameters + ---------- + batch : list + list of tuple of the pair of audio and features. Audio shape (T, ), features shape(T', C). - Returns: - Tensor: Auxiliary feature batch (B, C, T'), where - T = (T' - 2 * aux_context_window) * hop_size. - Tensor: Target signal batch (B, 1, T). + Returns + ---------- + Tensor + Auxiliary feature batch (B, C, T'), where + T = (T' - 2 * aux_context_window) * hop_size. + Tensor + Target signal batch (B, 1, T). """ # check length @@ -93,10 +101,11 @@ class Clip(object): def _adjust_length(self, x, c): """Adjust the audio and feature lengths. - Note: - Basically we assume that the length of x and c are adjusted - through preprocessing stage, but if we use other library processed - features, this process will be needed. + Note + ------- + Basically we assume that the length of x and c are adjusted + through preprocessing stage, but if we use other library processed + features, this process will be needed. """ if len(x) < c.shape[1] * self.hop_size: diff --git a/examples/parallelwave_gan/baker/conf/default.yaml b/examples/parallelwave_gan/baker/conf/default.yaml index cbc2842..9036171 100644 --- a/examples/parallelwave_gan/baker/conf/default.yaml +++ b/examples/parallelwave_gan/baker/conf/default.yaml @@ -82,7 +82,7 @@ lambda_adv: 4.0 # Loss balancing coefficient. ########################################################### # DATA LOADER SETTING # ########################################################### -batch_size: 6 # Batch size. +batch_size: 8 # Batch size. batch_max_steps: 25500 # Length of each audio in batch. Make sure dividable by hop_size. pin_memory: true # Whether to pin memory in Pytorch DataLoader. num_workers: 4 # Number of workers in Pytorch DataLoader. 
diff --git a/examples/parallelwave_gan/baker/preprocess.py b/examples/parallelwave_gan/baker/preprocess.py index ff1540b..92021eb 100644 --- a/examples/parallelwave_gan/baker/preprocess.py +++ b/examples/parallelwave_gan/baker/preprocess.py @@ -12,88 +12,28 @@ # See the License for the specific language governing permissions and # limitations under the License. -from typing import List, Dict, Any -import soundfile as sf -import librosa -import numpy as np -import argparse -import yaml -import json -import jsonlines -import concurrent.futures -from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor -from pathlib import Path -import tqdm from operator import itemgetter -from praatio import tgio +from typing import List, Dict, Any + +import argparse +import jsonlines +import librosa import logging +import numpy as np +import tqdm +from concurrent.futures import ThreadPoolExecutor +from parakeet.data.get_feats import LogMelFBank +from pathlib import Path +from praatio import tgio from config import get_cfg_default -def logmelfilterbank(audio, - sr, - n_fft=1024, - hop_length=256, - win_length=None, - window="hann", - n_mels=80, - fmin=None, - fmax=None, - eps=1e-10): - """Compute log-Mel filterbank feature. - - Parameters - ---------- - audio : ndarray - Audio signal (T,). - sr : int - Sampling rate. - n_fft : int - FFT size. (Default value = 1024) - hop_length : int - Hop size. (Default value = 256) - win_length : int - Window length. If set to None, it will be the same as fft_size. (Default value = None) - window : str - Window function type. (Default value = "hann") - n_mels : int - Number of mel basis. (Default value = 80) - fmin : int - Minimum frequency in mel basis calculation. (Default value = None) - fmax : int - Maximum frequency in mel basis calculation. (Default value = None) - eps : float - Epsilon value to avoid inf in log calculation. 
(Default value = 1e-10) - - Returns - ------- - np.ndarray - Log Mel filterbank feature (#frames, num_mels). - - """ - # get amplitude spectrogram - x_stft = librosa.stft( - audio, - n_fft=n_fft, - hop_length=hop_length, - win_length=win_length, - window=window, - pad_mode="reflect") - spc = np.abs(x_stft) # (#bins, #frames,) - - # get mel basis - fmin = 0 if fmin is None else fmin - fmax = sr / 2 if fmax is None else fmax - mel_basis = librosa.filters.mel(sr, n_fft, n_mels, fmin, fmax) - - return np.log10(np.maximum(eps, np.dot(mel_basis, spc))) - - def process_sentence(config: Dict[str, Any], fp: Path, alignment_fp: Path, - output_dir: Path): + output_dir: Path, + mel_extractor=None): utt_id = fp.stem # reading @@ -134,19 +74,11 @@ def process_sentence(config: Dict[str, Any], frame_length=config.trim_frame_length, hop_length=config.trim_hop_length) - logmel = logmelfilterbank( - y, - sr=sr, - n_fft=config.n_fft, - window=config.window, - win_length=config.win_length, - hop_length=config.hop_length, - n_mels=config.n_mels, - fmin=config.fmin, - fmax=config.fmax) + # extract mel feats + logmel = mel_extractor.get_log_mel_fbank(y) # adjust time to make num_samples == num_frames * hop_length - num_frames = logmel.shape[1] + num_frames = logmel.shape[0] if y.size < num_frames * config.hop_length: y = np.pad(y, (0, num_frames * config.hop_length - y.size), mode="reflect") @@ -157,7 +89,7 @@ def process_sentence(config: Dict[str, Any], mel_path = output_dir / (utt_id + "_feats.npy") wav_path = output_dir / (utt_id + "_wave.npy") np.save(wav_path, y) # (num_samples, ) - np.save(mel_path, logmel.T) # (num_frames, n_mels) + np.save(mel_path, logmel) # (num_frames, n_mels) record = { "utt_id": utt_id, "num_samples": num_sample, @@ -172,19 +104,22 @@ def process_sentences(config, fps: List[Path], alignment_fps: List[Path], output_dir: Path, + mel_extractor=None, nprocs: int=1): if nprocs == 1: results = [] for fp, alignment_fp in tqdm.tqdm(zip(fps, alignment_fps)): 
results.append( - process_sentence(config, fp, alignment_fp, output_dir)) + process_sentence(config, fp, alignment_fp, output_dir, + mel_extractor)) else: with ThreadPoolExecutor(nprocs) as pool: futures = [] with tqdm.tqdm(total=len(fps)) as progress: for fp, alignment_fp in zip(fps, alignment_fps): future = pool.submit(process_sentence, config, fp, - alignment_fp, output_dir) + alignment_fp, output_dir, + mel_extractor) future.add_done_callback(lambda p: progress.update()) futures.append(future) @@ -260,24 +195,37 @@ def main(): test_dump_dir = dumpdir / "test" / "raw" test_dump_dir.mkdir(parents=True, exist_ok=True) + mel_extractor = LogMelFBank( + sr=C.sr, + n_fft=C.n_fft, + hop_length=C.hop_length, + win_length=C.win_length, + window=C.window, + n_mels=C.n_mels, + fmin=C.fmin, + fmax=C.fmax) + # process for the 3 sections process_sentences( C, train_wav_files, train_alignment_files, train_dump_dir, + mel_extractor=mel_extractor, nprocs=args.num_cpu) process_sentences( C, dev_wav_files, dev_alignment_files, dev_dump_dir, + mel_extractor=mel_extractor, nprocs=args.num_cpu) process_sentences( C, test_wav_files, test_alignment_files, test_dump_dir, + mel_extractor=mel_extractor, nprocs=args.num_cpu) diff --git a/examples/parallelwave_gan/baker/preprocess.sh b/examples/parallelwave_gan/baker/preprocess.sh old mode 100644 new mode 100755 diff --git a/examples/parallelwave_gan/baker/pwg_updater.py b/examples/parallelwave_gan/baker/pwg_updater.py index dde7773..90cf655 100644 --- a/examples/parallelwave_gan/baker/pwg_updater.py +++ b/examples/parallelwave_gan/baker/pwg_updater.py @@ -78,16 +78,17 @@ class PWGUpdater(StandardUpdater): wav_ = self.generator(noise, mel) logging.debug(f"Generator takes {t.elapse}s.") - ## Multi-resolution stft loss + # initialize + gen_loss = 0.0 + ## Multi-resolution stft loss with timer() as t: - sc_loss, mag_loss = self.criterion_stft( - wav_.squeeze(1), wav.squeeze(1)) + sc_loss, mag_loss = self.criterion_stft(wav_, wav) 
logging.debug(f"Multi-resolution STFT loss takes {t.elapse}s.") report("train/spectral_convergence_loss", float(sc_loss)) report("train/log_stft_magnitude_loss", float(mag_loss)) - gen_loss = sc_loss + mag_loss + gen_loss += sc_loss + mag_loss ## Adversarial loss if self.state.iteration > self.discriminator_train_start_steps: @@ -119,9 +120,9 @@ class PWGUpdater(StandardUpdater): p_ = self.discriminator(wav_.detach()) real_loss = self.criterion_mse(p, paddle.ones_like(p)) fake_loss = self.criterion_mse(p_, paddle.zeros_like(p_)) + dis_loss = real_loss + fake_loss report("train/real_loss", float(real_loss)) report("train/fake_loss", float(fake_loss)) - dis_loss = real_loss + fake_loss report("train/discriminator_loss", float(dis_loss)) self.optimizer_d.clear_grad() @@ -164,8 +165,7 @@ class PWGEvaluator(StandardEvaluator): # stft loss with timer() as t: - sc_loss, mag_loss = self.criterion_stft( - wav_.squeeze(1), wav.squeeze(1)) + sc_loss, mag_loss = self.criterion_stft(wav_, wav) logging.debug(f"Multi-resolution STFT loss takes {t.elapse}s") report("eval/spectral_convergence_loss", float(sc_loss)) @@ -178,7 +178,7 @@ class PWGEvaluator(StandardEvaluator): p = self.discriminator(wav) real_loss = self.criterion_mse(p, paddle.ones_like(p)) fake_loss = self.criterion_mse(p_, paddle.zeros_like(p_)) + dis_loss = real_loss + fake_loss report("eval/real_loss", float(real_loss)) report("eval/fake_loss", float(fake_loss)) - dis_loss = real_loss + fake_loss report("eval/discriminator_loss", float(dis_loss)) diff --git a/examples/parallelwave_gan/baker/run.sh b/examples/parallelwave_gan/baker/run.sh old mode 100644 new mode 100755 diff --git a/examples/parallelwave_gan/baker/synthesize.py b/examples/parallelwave_gan/baker/synthesize.py index 01cfbbf..1216220 100644 --- a/examples/parallelwave_gan/baker/synthesize.py +++ b/examples/parallelwave_gan/baker/synthesize.py @@ -32,14 +32,14 @@ from parakeet.models.parallel_wavegan import PWGGenerator from config import 
get_cfg_default parser = argparse.ArgumentParser( - description="synthesize with parallel wavegan.") + description="Synthesize with parallel wavegan.") parser.add_argument( - "--config", type=str, help="config file to overwrite default config") -parser.add_argument("--checkpoint", type=str, help="snapshot to load") -parser.add_argument("--test-metadata", type=str, help="dev data") -parser.add_argument("--output-dir", type=str, help="output dir") -parser.add_argument("--device", type=str, default="gpu", help="device to run") -parser.add_argument("--verbose", type=int, default=1, help="verbose") + "--config", type=str, help="config file to overwrite default config.") +parser.add_argument("--checkpoint", type=str, help="snapshot to load.") +parser.add_argument("--test-metadata", type=str, help="dev data.") +parser.add_argument("--output-dir", type=str, help="output dir.") +parser.add_argument("--device", type=str, default="gpu", help="device to run.") +parser.add_argument("--verbose", type=int, default=1, help="verbose.") args = parser.parse_args() config = get_cfg_default() @@ -89,5 +89,5 @@ for example in test_dataset: print( f"{utt_id}, mel: {mel.shape}, wave: {wav.shape}, time: {t.elapse}s, Hz: {speed}, RTF: {config.sr / speed}." 
) - sf.write(output_dir / (utt_id + ".wav"), wav, samplerate=config.sr) + sf.write(str(output_dir / (utt_id + ".wav")), wav, samplerate=config.sr) print(f"generation speed: {N / T}Hz, RTF: {config.sr / (N / T) }") diff --git a/examples/parallelwave_gan/baker/synthesize.sh b/examples/parallelwave_gan/baker/synthesize.sh new file mode 100755 index 0000000..e349c35 --- /dev/null +++ b/examples/parallelwave_gan/baker/synthesize.sh @@ -0,0 +1,5 @@ +python3 synthesize.py \ + --config=conf/default.yaml \ + --checkpoint=exp/default/checkpoints/snapshot_iter_220000.pdz \ + --test-metadata=dump/test/norm/metadata.jsonl \ + --output-dir=exp/debug/test diff --git a/examples/parallelwave_gan/baker/synthesize_from_wav.py b/examples/parallelwave_gan/baker/synthesize_from_wav.py new file mode 100644 index 0000000..87eb35d --- /dev/null +++ b/examples/parallelwave_gan/baker/synthesize_from_wav.py @@ -0,0 +1,111 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import argparse +import os +import logging +from pathlib import Path + +import librosa +import numpy as np +import paddle +import soundfile as sf +import yaml +from parakeet.data.get_feats import LogMelFBank +from parakeet.models.parallel_wavegan import PWGGenerator, PWGInference +from parakeet.modules.normalizer import ZScore + +from config import get_cfg_default + + +def evaluate(args, config): + # dataloader has been too verbose + logging.getLogger("DataLoader").disabled = True + + vocoder = PWGGenerator(**config["generator_params"]) + state_dict = paddle.load(args.checkpoint) + vocoder.set_state_dict(state_dict["generator_params"]) + vocoder.remove_weight_norm() + vocoder.eval() + print("model done!") + + stat = np.load(args.stat) + mu, std = stat + mu = paddle.to_tensor(mu) + std = paddle.to_tensor(std) + normalizer = ZScore(mu, std) + + pwg_inference = PWGInference(normalizer, vocoder) + + input_dir = Path(args.input_dir) + output_dir = Path(args.output_dir) + output_dir.mkdir(parents=True, exist_ok=True) + + mel_extractor = LogMelFBank( + sr=config.sr, + n_fft=config.n_fft, + hop_length=config.hop_length, + win_length=config.win_length, + window=config.window, + n_mels=config.n_mels, + fmin=config.fmin, + fmax=config.fmax) + + for utt_name in os.listdir(input_dir): + wav, _ = librosa.load(str(input_dir / utt_name), sr=config.sr) + # extract mel feats + mel = mel_extractor.get_log_mel_fbank(wav) + mel = paddle.to_tensor(mel) + gen_wav = pwg_inference(mel) + sf.write( + str(output_dir / ("gen_" + utt_name)), + gen_wav.numpy(), + samplerate=config.sr) + print(f"{utt_name} done!") + + +def main(): + # parse args and config and redirect to train_sp + parser = argparse.ArgumentParser( + description="Synthesize with parallel wavegan.") + + parser.add_argument( + "--config", type=str, help="config file to overwrite default config.") + parser.add_argument("--checkpoint", type=str, help="snapshot to load.") + parser.add_argument( + "--stat", + type=str, + 
help="mean and standard deviation used to normalize spectrogram when training parallel wavegan." + ) + parser.add_argument("--input-dir", type=str, help="input dir of wavs.") + parser.add_argument("--output-dir", type=str, help="output dir.") + parser.add_argument( + "--device", type=str, default="gpu", help="device to run.") + parser.add_argument("--verbose", type=int, default=1, help="verbose.") + + args = parser.parse_args() + config = get_cfg_default() + if args.config: + config.merge_from_file(args.config) + + print("========Args========") + print(yaml.safe_dump(vars(args))) + print("========Config========") + print(config) + + evaluate(args, config) + + +if __name__ == "__main__": + main() diff --git a/examples/parallelwave_gan/baker/train.py b/examples/parallelwave_gan/baker/train.py index 3699e6f..1bf0a90 100644 --- a/examples/parallelwave_gan/baker/train.py +++ b/examples/parallelwave_gan/baker/train.py @@ -12,36 +12,29 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-import os -import sys -import logging import argparse -import dataclasses -from pathlib import Path +import os +import logging -import yaml import jsonlines -import paddle import numpy as np -from paddle import nn -from paddle.nn import functional as F +import paddle +import yaml +from paddle import DataParallel from paddle import distributed as dist +from paddle import nn from paddle.io import DataLoader, DistributedBatchSampler from paddle.optimizer import Adam # No RAdaom from paddle.optimizer.lr import StepDecay -from paddle import DataParallel -from visualdl import LogWriter - from parakeet.datasets.data_table import DataTable -from parakeet.training.updater import UpdaterBase -from parakeet.training.trainer import Trainer -from parakeet.training.reporter import report -from parakeet.training import extension -from parakeet.training.extensions.snapshot import Snapshot -from parakeet.training.extensions.visualizer import VisualDL from parakeet.models.parallel_wavegan import PWGGenerator, PWGDiscriminator from parakeet.modules.stft_loss import MultiResolutionSTFTLoss +from parakeet.training.extensions.snapshot import Snapshot +from parakeet.training.extensions.visualizer import VisualDL from parakeet.training.seeding import seed_everything +from parakeet.training.trainer import Trainer +from pathlib import Path +from visualdl import LogWriter from batch_fn import Clip from config import get_cfg_default @@ -210,15 +203,15 @@ def main(): parser = argparse.ArgumentParser(description="Train a ParallelWaveGAN " "model with Baker Mandrin TTS dataset.") parser.add_argument( - "--config", type=str, help="config file to overwrite default config") - parser.add_argument("--train-metadata", type=str, help="training data") - parser.add_argument("--dev-metadata", type=str, help="dev data") - parser.add_argument("--output-dir", type=str, help="output dir") + "--config", type=str, help="config file to overwrite default config.") + parser.add_argument("--train-metadata", 
type=str, help="training data.") + parser.add_argument("--dev-metadata", type=str, help="dev data.") + parser.add_argument("--output-dir", type=str, help="output dir.") parser.add_argument( - "--device", type=str, default="gpu", help="device type to use") + "--device", type=str, default="gpu", help="device type to use.") parser.add_argument( - "--nprocs", type=int, default=1, help="number of processes") - parser.add_argument("--verbose", type=int, default=1, help="verbose") + "--nprocs", type=int, default=1, help="number of processes.") + parser.add_argument("--verbose", type=int, default=1, help="verbose.") args = parser.parse_args() if args.device == "cpu" and args.nprocs > 1: diff --git a/examples/speedyspeech/baker/preprocess.py b/examples/speedyspeech/baker/preprocess.py index 1fda34f..a5a2dfd 100644 --- a/examples/speedyspeech/baker/preprocess.py +++ b/examples/speedyspeech/baker/preprocess.py @@ -12,94 +12,34 @@ # See the License for the specific language governing permissions and # limitations under the License. -from typing import List, Dict, Any -import soundfile as sf -import librosa -import numpy as np -import argparse -import yaml -import json -import re -import jsonlines -import concurrent.futures -from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor -from pathlib import Path -import tqdm from operator import itemgetter -from praatio import tgio +from typing import List, Dict, Any + +import argparse +import jsonlines +import librosa import logging +import numpy as np +import re +import tqdm +from concurrent.futures import ThreadPoolExecutor +from parakeet.data.get_feats import LogMelFBank +from pathlib import Path +from praatio import tgio from config import get_cfg_default from tg_utils import validate_textgrid -def logmelfilterbank(audio, - sr, - n_fft=1024, - hop_length=256, - win_length=None, - window="hann", - n_mels=80, - fmin=None, - fmax=None, - eps=1e-10): - """Compute log-Mel filterbank feature. 
- - Parameters - ---------- - audio : ndarray - Audio signal (T,). - sr : int - Sampling rate. - n_fft : int - FFT size. (Default value = 1024) - hop_length : int - Hop size. (Default value = 256) - win_length : int - Window length. If set to None, it will be the same as fft_size. (Default value = None) - window : str - Window function type. (Default value = "hann") - n_mels : int - Number of mel basis. (Default value = 80) - fmin : int - Minimum frequency in mel basis calculation. (Default value = None) - fmax : int - Maximum frequency in mel basis calculation. (Default value = None) - eps : float - Epsilon value to avoid inf in log calculation. (Default value = 1e-10) - - Returns - ------- - np.ndarray - Log Mel filterbank feature (#frames, num_mels). - - """ - # get amplitude spectrogram - x_stft = librosa.stft( - audio, - n_fft=n_fft, - hop_length=hop_length, - win_length=win_length, - window=window, - pad_mode="reflect") - spc = np.abs(x_stft) # (#bins, #frames,) - - # get mel basis - fmin = 0 if fmin is None else fmin - fmax = sr / 2 if fmax is None else fmax - mel_basis = librosa.filters.mel(sr, n_fft, n_mels, fmin, fmax) - - return np.log10(np.maximum(eps, np.dot(mel_basis, spc))) - - def process_sentence(config: Dict[str, Any], fp: Path, alignment_fp: Path, - output_dir: Path): + output_dir: Path, + mel_extractor=None): utt_id = fp.stem # reading - y, sr = librosa.load(fp, sr=config.sr) # resampling may occur + y, sr = librosa.load(str(fp), sr=config.sr) # resampling may occur assert len(y.shape) == 1, f"{utt_id} is not a mono-channel audio." assert np.abs(y).max( ) <= 1.0, f"{utt_id} is seems to be different that 16 bit PCM." 
@@ -125,16 +65,8 @@ def process_sentence(config: Dict[str, Any], f" There is something wrong with the last interval {last} in utterance: {utt_id}" ) - logmel = logmelfilterbank( - y, - sr=sr, - n_fft=config.n_fft, - window=config.window, - win_length=config.win_length, - hop_length=config.hop_length, - n_mels=config.n_mels, - fmin=config.fmin, - fmax=config.fmax) + # extract mel feats + logmel = mel_extractor.get_log_mel_fbank(y) # extract phone and duration phones = [] @@ -162,7 +94,7 @@ def process_sentence(config: Dict[str, Any], ends, sr=sr, hop_length=config.hop_length) durations_frame = np.diff(frame_pos, prepend=0) - num_frames = logmel.shape[-1] # number of frames of the spectrogram + num_frames = logmel.shape[0] # number of frames of the spectrogram extra = np.sum(durations_frame) - num_frames assert extra <= 0, ( f"Number of frames inferred from alignemnt is " @@ -173,7 +105,7 @@ def process_sentence(config: Dict[str, Any], durations_frame = durations_frame.tolist() mel_path = output_dir / (utt_id + "_feats.npy") - np.save(mel_path, logmel.T) # (num_frames, n_mels) + np.save(mel_path, logmel) # (num_frames, n_mels) record = { "utt_id": utt_id, "phones": phones, @@ -190,20 +122,23 @@ def process_sentences(config, fps: List[Path], alignment_fps: List[Path], output_dir: Path, + mel_extractor=None, nprocs: int=1): if nprocs == 1: results = [] for fp, alignment_fp in tqdm.tqdm( zip(fps, alignment_fps), total=len(fps)): results.append( - process_sentence(config, fp, alignment_fp, output_dir)) + process_sentence(config, fp, alignment_fp, output_dir, + mel_extractor)) else: with ThreadPoolExecutor(nprocs) as pool: futures = [] with tqdm.tqdm(total=len(fps)) as progress: for fp, alignment_fp in zip(fps, alignment_fps): future = pool.submit(process_sentence, config, fp, - alignment_fp, output_dir) + alignment_fp, output_dir, + mel_extractor) future.add_done_callback(lambda p: progress.update()) futures.append(future) @@ -284,24 +219,37 @@ def main(): test_dump_dir 
= dumpdir / "test" / "raw" test_dump_dir.mkdir(parents=True, exist_ok=True) + mel_extractor = LogMelFBank( + sr=C.sr, + n_fft=C.n_fft, + hop_length=C.hop_length, + win_length=C.win_length, + window=C.window, + n_mels=C.n_mels, + fmin=C.fmin, + fmax=C.fmax) + # process for the 3 sections process_sentences( C, train_wav_files, train_alignment_files, train_dump_dir, + mel_extractor=mel_extractor, nprocs=args.num_cpu) process_sentences( C, dev_wav_files, dev_alignment_files, dev_dump_dir, + mel_extractor=mel_extractor, nprocs=args.num_cpu) process_sentences( C, test_wav_files, test_alignment_files, test_dump_dir, + mel_extractor=mel_extractor, nprocs=args.num_cpu) diff --git a/examples/speedyspeech/baker/preprocess.sh b/examples/speedyspeech/baker/preprocess.sh old mode 100644 new mode 100755 diff --git a/examples/speedyspeech/baker/run.sh b/examples/speedyspeech/baker/run.sh old mode 100644 new mode 100755 diff --git a/examples/speedyspeech/baker/train.py b/examples/speedyspeech/baker/train.py index d51afec..e18f44f 100644 --- a/examples/speedyspeech/baker/train.py +++ b/examples/speedyspeech/baker/train.py @@ -12,40 +12,31 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-import os -import sys -import logging import argparse -import dataclasses -from pathlib import Path +import logging +import os -import yaml import jsonlines -import paddle import numpy as np -from paddle import nn -from paddle.nn import functional as F +import paddle +import yaml from paddle import distributed as dist +from paddle import DataParallel +from paddle import nn from paddle.io import DataLoader, DistributedBatchSampler from paddle.optimizer import Adam # No RAdaom -from paddle.optimizer.lr import StepDecay -from paddle import DataParallel -from visualdl import LogWriter - from parakeet.datasets.data_table import DataTable from parakeet.models.speedyspeech import SpeedySpeech - -from parakeet.training.updater import UpdaterBase -from parakeet.training.trainer import Trainer -from parakeet.training.reporter import report -from parakeet.training import extension from parakeet.training.extensions.snapshot import Snapshot from parakeet.training.extensions.visualizer import VisualDL from parakeet.training.seeding import seed_everything +from parakeet.training.trainer import Trainer +from pathlib import Path +from visualdl import LogWriter from batch_fn import collate_baker_examples -from speedyspeech_updater import SpeedySpeechUpdater, SpeedySpeechEvaluator from config import get_cfg_default +from speedyspeech_updater import SpeedySpeechUpdater, SpeedySpeechEvaluator def train_sp(args, config): @@ -93,10 +84,6 @@ def train_sp(args, config): batch_size=config.batch_size, shuffle=False, drop_last=True) - # dev_sampler = DistributedBatchSampler(dev_dataset, - # batch_size=config.batch_size, - # shuffle=False, - # drop_last=False) print("samplers done!") train_dataloader = DataLoader( @@ -113,13 +100,9 @@ def train_sp(args, config): num_workers=config.num_workers) print("dataloaders done!") - # batch = collate_baker_examples([train_dataset[i] for i in range(10)]) - # # batch = collate_baker_examples([dev_dataset[i] for i in range(10)]) - # import pdb; 
pdb.set_trace() model = SpeedySpeech(**config["model"]) if world_size > 1: model = DataParallel(model) # TODO, do not use vocab size from config - # print(model) print("model done!") optimizer = Adam( 0.001, @@ -147,18 +130,18 @@ def train_sp(args, config): def main(): # parse args and config and redirect to train_sp - parser = argparse.ArgumentParser(description="Train a ParallelWaveGAN " + parser = argparse.ArgumentParser(description="Train a SpeedySpeech " "model with Baker Mandrin TTS dataset.") parser.add_argument( - "--config", type=str, help="config file to overwrite default config") - parser.add_argument("--train-metadata", type=str, help="training data") - parser.add_argument("--dev-metadata", type=str, help="dev data") - parser.add_argument("--output-dir", type=str, help="output dir") + "--config", type=str, help="config file to overwrite default config.") + parser.add_argument("--train-metadata", type=str, help="training data.") + parser.add_argument("--dev-metadata", type=str, help="dev data.") + parser.add_argument("--output-dir", type=str, help="output dir.") parser.add_argument( - "--device", type=str, default="gpu", help="device type to use") + "--device", type=str, default="gpu", help="device type to use.") parser.add_argument( - "--nprocs", type=int, default=1, help="number of processes") - parser.add_argument("--verbose", type=int, default=1, help="verbose") + "--nprocs", type=int, default=1, help="number of processes.") + parser.add_argument("--verbose", type=int, default=1, help="verbose.") args = parser.parse_args() if args.device == "cpu" and args.nprocs > 1: diff --git a/parakeet/data/__init__.py b/parakeet/data/__init__.py index a7846af..23476bc 100644 --- a/parakeet/data/__init__.py +++ b/parakeet/data/__init__.py @@ -27,5 +27,6 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-from parakeet.data.dataset import * from parakeet.data.batch import * +from parakeet.data.dataset import * +from parakeet.data.get_feats import * diff --git a/examples/fastspeech2/baker/get_feats.py b/parakeet/data/get_feats.py similarity index 84% rename from examples/fastspeech2/baker/get_feats.py rename to parakeet/data/get_feats.py index 4e500e5..4027e9b 100644 --- a/examples/fastspeech2/baker/get_feats.py +++ b/parakeet/data/get_feats.py @@ -17,8 +17,6 @@ import numpy as np import pyworld from scipy.interpolate import interp1d -from config import get_cfg_default - class LogMelFBank(): def __init__(self, @@ -42,8 +40,8 @@ class LogMelFBank(): # mel self.n_mels = n_mels - self.fmin = fmin - self.fmax = fmax + self.fmin = 0 if fmin is None else fmin + self.fmax = sr / 2 if fmax is None else fmax self.mel_filter = self._create_mel_filter() @@ -217,41 +215,3 @@ class Energy(): if use_token_averaged_energy and duration is not None: energy = self._average_by_duration(energy, duration) return energy - - -if __name__ == "__main__": - C = get_cfg_default() - filename = "../raw_data/data/format.1/000001.flac" - wav, _ = librosa.load(filename, sr=C.fs) - mel_extractor = LogMelFBank( - sr=C.fs, - n_fft=C.n_fft, - hop_length=C.n_shift, - win_length=C.win_length, - window=C.window, - n_mels=C.n_mels, - fmin=C.fmin, - fmax=C.fmax, ) - mel = mel_extractor.get_log_mel_fbank(wav) - print(mel) - print(mel.shape) - - pitch_extractor = Pitch( - sr=C.fs, hop_length=C.n_shift, f0min=C.f0min, f0max=C.f0max) - duration = "2 8 8 8 12 11 10 13 11 10 18 9 12 10 12 11 5" - duration = np.array([int(x) for x in duration.split(" ")]) - avg_f0 = pitch_extractor.get_pitch(wav, duration=duration) - print(avg_f0) - print(avg_f0.shape) - - energy_extractor = Energy( - sr=C.fs, - n_fft=C.n_fft, - hop_length=C.n_shift, - win_length=C.win_length, - window=C.window) - duration = "2 8 8 8 12 11 10 13 11 10 18 9 12 10 12 11 5" - duration = np.array([int(x) for x in duration.split(" ")]) - avg_energy = 
energy_extractor.get_energy(wav, duration=duration) - print(avg_energy) - print(avg_energy.sum()) diff --git a/parakeet/frontend/cn_frontend.py b/parakeet/frontend/cn_frontend.py index 52624e0..50e21a8 100644 --- a/parakeet/frontend/cn_frontend.py +++ b/parakeet/frontend/cn_frontend.py @@ -109,4 +109,5 @@ class Frontend(): def get_phonemes(self, sentence): sentences = self.text_normalizer.normalize(sentence) phonemes = self._g2p(sentences) + print("phonemes:", phonemes) return phonemes diff --git a/parakeet/models/fastspeech2.py b/parakeet/models/fastspeech2.py index bff0b39..c351e92 100644 --- a/parakeet/models/fastspeech2.py +++ b/parakeet/models/fastspeech2.py @@ -15,7 +15,6 @@ from typing import Dict, Sequence, Tuple -import numpy as np import paddle from paddle import nn from parakeet.modules.fastspeech2_predictor.duration_predictor import DurationPredictor, DurationPredictorLoss @@ -252,36 +251,36 @@ class FastSpeech2(nn.Layer): Parameters ---------- - text : Tensor - Batch of padded token ids (B, Tmax). - text_lengths : Tensor) - Batch of lengths of each input (B,). - speech : Tensor - Batch of padded target features (B, Lmax, odim). - speech_lengths : Tensor - Batch of the lengths of each target (B,). - durations : Tensor - Batch of padded durations (B, Tmax). - pitch : Tensor - Batch of padded token-averaged pitch (B, Tmax, 1). - energy : Tensor - Batch of padded token-averaged energy (B, Tmax, 1). + text : Tensor + Batch of padded token ids (B, Tmax). + text_lengths : Tensor) + Batch of lengths of each input (B,). + speech : Tensor + Batch of padded target features (B, Lmax, odim). + speech_lengths : Tensor + Batch of the lengths of each target (B,). + durations : Tensor + Batch of padded durations (B, Tmax). + pitch : Tensor + Batch of padded token-averaged pitch (B, Tmax, 1). + energy : Tensor + Batch of padded token-averaged energy (B, Tmax, 1). 
Returns ---------- - Tensor - mel outs before postnet - Tensor - mel outs after postnet - Tensor - duration predictor's output - Tensor - pitch predictor's output - Tensor - energy predictor's output - Tensor - speech - Tensor - speech_lengths, modified if reduction_factor >1 + Tensor + mel outs before postnet + Tensor + mel outs after postnet + Tensor + duration predictor's output + Tensor + pitch predictor's output + Tensor + energy predictor's output + Tensor + speech + Tensor + speech_lengths, modified if reduction_factor > 1 """ xs = text @@ -389,26 +388,26 @@ class FastSpeech2(nn.Layer): Parameters ---------- - text : Tensor - Input sequence of characters (T,). - speech : Tensor, optional - Feature sequence to extract style (N, idim). - durations : Tensor, optional - Groundtruth of duration (T,). - pitch : Tensor, optional - Groundtruth of token-averaged pitch (T, 1). - energy : Tensor, optional - Groundtruth of token-averaged energy (T, 1). - alpha : float, optional - Alpha to control the speed. - use_teacher_forcing : bool, optional - Whether to use teacher forcing. - If true, groundtruth of duration, pitch and energy will be used. + text : Tensor + Input sequence of characters (T,). + speech : Tensor, optional + Feature sequence to extract style (N, idim). + durations : Tensor, optional + Groundtruth of duration (T,). + pitch : Tensor, optional + Groundtruth of token-averaged pitch (T, 1). + energy : Tensor, optional + Groundtruth of token-averaged energy (T, 1). + alpha : float, optional + Alpha to control the speed. + use_teacher_forcing : bool, optional + Whether to use teacher forcing. + If true, groundtruth of duration, pitch and energy will be used. Returns ---------- - Tensor - Output sequence of features (L, odim). + Tensor + Output sequence of features (L, odim). """ x, y = text, speech d, p, e = durations, pitch, energy @@ -448,21 +447,21 @@ class FastSpeech2(nn.Layer): Parameters ---------- - ilens : Tensor - Batch of lengths (B,). 
+ ilens : Tensor + Batch of lengths (B,). Returns ------- - Tensor - Mask tensor for self-attention. - dtype=paddle.bool + Tensor + Mask tensor for self-attention. + dtype=paddle.bool Examples ------- - >>> ilens = [5, 3] - >>> self._source_mask(ilens) - tensor([[[1, 1, 1, 1, 1], - [1, 1, 1, 0, 0]]]) bool + >>> ilens = [5, 3] + >>> self._source_mask(ilens) + tensor([[[1, 1, 1, 1, 1], + [1, 1, 1, 0, 0]]]) bool """ x_masks = make_non_pad_mask(ilens) @@ -509,10 +508,10 @@ class FastSpeech2Loss(nn.Layer): Parameters ---------- - use_masking : bool - Whether to apply masking for padded part in loss calculation. - use_weighted_masking : bool - Whether to weighted masking in loss calculation. + use_masking : bool + Whether to apply masking for padded part in loss calculation. + use_weighted_masking : bool + Whether to weighted masking in loss calculation. """ assert check_argument_types() super().__init__() @@ -545,39 +544,39 @@ class FastSpeech2Loss(nn.Layer): Parameters ---------- - after_outs : Tensor - Batch of outputs after postnets (B, Lmax, odim). - before_outs : Tensor - Batch of outputs before postnets (B, Lmax, odim). - d_outs : Tensor - Batch of outputs of duration predictor (B, Tmax). - p_outs : Tensor - Batch of outputs of pitch predictor (B, Tmax, 1). - e_outs : Tensor - Batch of outputs of energy predictor (B, Tmax, 1). - ys : Tensor - Batch of target features (B, Lmax, odim). - ds : Tensor - Batch of durations (B, Tmax). - ps : Tensor - Batch of target token-averaged pitch (B, Tmax, 1). - es : Tensor - Batch of target token-averaged energy (B, Tmax, 1). - ilens : Tensor - Batch of the lengths of each input (B,). - olens : Tensor - Batch of the lengths of each target (B,). + after_outs : Tensor + Batch of outputs after postnets (B, Lmax, odim). + before_outs : Tensor + Batch of outputs before postnets (B, Lmax, odim). + d_outs : Tensor + Batch of outputs of duration predictor (B, Tmax). + p_outs : Tensor + Batch of outputs of pitch predictor (B, Tmax, 1). 
+ e_outs : Tensor + Batch of outputs of energy predictor (B, Tmax, 1). + ys : Tensor + Batch of target features (B, Lmax, odim). + ds : Tensor + Batch of durations (B, Tmax). + ps : Tensor + Batch of target token-averaged pitch (B, Tmax, 1). + es : Tensor + Batch of target token-averaged energy (B, Tmax, 1). + ilens : Tensor + Batch of the lengths of each input (B,). + olens : Tensor + Batch of the lengths of each target (B,). Returns ---------- - Tensor - L1 loss value. - Tensor - Duration predictor loss value. - Tensor - Pitch predictor loss value. - Tensor - Energy predictor loss value. + Tensor + L1 loss value. + Tensor + Duration predictor loss value. + Tensor + Pitch predictor loss value. + Tensor + Energy predictor loss value. """ # apply mask to remove padded part diff --git a/parakeet/modules/fastspeech2_predictor/duration_predictor.py b/parakeet/modules/fastspeech2_predictor/duration_predictor.py index 288df2f..10e3f38 100644 --- a/parakeet/modules/fastspeech2_predictor/duration_predictor.py +++ b/parakeet/modules/fastspeech2_predictor/duration_predictor.py @@ -32,10 +32,10 @@ class DurationPredictor(nn.Layer): Note ---------- - The calculation domain of outputs is different - between in `forward` and in `inference`. In `forward`, - the outputs are calculated in log domain but in `inference`, - those are calculated in linear domain. + The calculation domain of outputs is different + between in `forward` and in `inference`. In `forward`, + the outputs are calculated in log domain but in `inference`, + those are calculated in linear domain. """ @@ -50,18 +50,18 @@ class DurationPredictor(nn.Layer): Parameters ---------- - idim : int - Input dimension. - n_layers : int, optional - Number of convolutional layers. - n_chans : int, optional - Number of channels of convolutional layers. - kernel_size : int, optional - Kernel size of convolutional layers. - dropout_rate : float, optional - Dropout rate. 
- offset : float, optional - Offset value to avoid nan in log domain. + idim : int + Input dimension. + n_layers : int, optional + Number of convolutional layers. + n_chans : int, optional + Number of channels of convolutional layers. + kernel_size : int, optional + Kernel size of convolutional layers. + dropout_rate : float, optional + Dropout rate. + offset : float, optional + Offset value to avoid nan in log domain. """ super(DurationPredictor, self).__init__() @@ -108,10 +108,10 @@ class DurationPredictor(nn.Layer): Parameters ---------- - xs : Tensor - Batch of input sequences (B, Tmax, idim). - x_masks : ByteTensor, optional - Batch of masks indicating padded part (B, Tmax). + xs : Tensor + Batch of input sequences (B, Tmax, idim). + x_masks : ByteTensor, optional + Batch of masks indicating padded part (B, Tmax). Returns ---------- @@ -125,15 +125,15 @@ class DurationPredictor(nn.Layer): Parameters ---------- - xs : Tensor - Batch of input sequences (B, Tmax, idim). - x_masks : Tensor(bool), optional - Batch of masks indicating padded part (B, Tmax). + xs : Tensor + Batch of input sequences (B, Tmax, idim). + x_masks : Tensor(bool), optional + Batch of masks indicating padded part (B, Tmax). Returns ---------- - LongTensor - Batch of predicted durations in linear domain int64 (B, Tmax). + Tensor + Batch of predicted durations in linear domain int64 (B, Tmax). """ return self._forward(xs, x_masks, True) @@ -150,10 +150,10 @@ class DurationPredictorLoss(nn.Layer): Parameters ---------- - offset : float, optional - Offset value to avoid nan in log domain. - reduction : str - Reduction type in loss calculation. + offset : float, optional + Offset value to avoid nan in log domain. + reduction : str + Reduction type in loss calculation. 
""" super(DurationPredictorLoss, self).__init__() self.criterion = nn.MSELoss(reduction=reduction) @@ -164,19 +164,19 @@ class DurationPredictorLoss(nn.Layer): Parameters ---------- - outputs : Tensor - Batch of prediction durations in log domain (B, T) - targets : LongTensor - Batch of groundtruth durations in linear domain (B, T) + outputs : Tensor + Batch of prediction durations in log domain (B, T) + targets : Tensor + Batch of groundtruth durations in linear domain (B, T) Returns ---------- - Tensor - Mean squared error loss value. + Tensor + Mean squared error loss value. Note ---------- - `outputs` is in log domain but `targets` is in linear domain. + `outputs` is in log domain but `targets` is in linear domain. """ # NOTE: outputs is in log domain while targets in linear targets = paddle.log(targets.cast(dtype='float32') + self.offset) diff --git a/parakeet/modules/fastspeech2_predictor/length_regulator.py b/parakeet/modules/fastspeech2_predictor/length_regulator.py index 62b29a3..0e6233c 100644 --- a/parakeet/modules/fastspeech2_predictor/length_regulator.py +++ b/parakeet/modules/fastspeech2_predictor/length_regulator.py @@ -37,8 +37,8 @@ class LengthRegulator(nn.Layer): Parameters ---------- - pad_value : float, optional - Value used for padding. + pad_value : float, optional + Value used for padding. """ super().__init__() @@ -70,17 +70,17 @@ class LengthRegulator(nn.Layer): Parameters ---------- - xs : Tensor - Batch of sequences of char or phoneme embeddings (B, Tmax, D). - ds : LongTensor - Batch of durations of each frame (B, T). - alpha : float, optional - Alpha value to control speed of speech. + xs : Tensor + Batch of sequences of char or phoneme embeddings (B, Tmax, D). + ds : LongTensor + Batch of durations of each frame (B, T). + alpha : float, optional + Alpha value to control speed of speech. Returns ---------- - Tensor - replicated input tensor based on durations (B, T*, D). + Tensor + replicated input tensor based on durations (B, T*, D). 
""" if alpha != 1.0: assert alpha > 0 diff --git a/parakeet/modules/fastspeech2_predictor/postnet.py b/parakeet/modules/fastspeech2_predictor/postnet.py index fe9fd21..50b849e 100644 --- a/parakeet/modules/fastspeech2_predictor/postnet.py +++ b/parakeet/modules/fastspeech2_predictor/postnet.py @@ -45,20 +45,20 @@ class Postnet(nn.Layer): Parameters ---------- - idim : int - Dimension of the inputs. - odim : int - Dimension of the outputs. - n_layers : int, optional - The number of layers. - n_filts : int, optional - The number of filter size. - n_units : int, optional - The number of filter channels. - use_batch_norm : bool, optional - Whether to use batch normalization.. - dropout_rate : float, optional - Dropout rate.. + idim : int + Dimension of the inputs. + odim : int + Dimension of the outputs. + n_layers : int, optional + The number of layers. + n_filts : int, optional + The number of filter size. + n_units : int, optional + The number of filter channels. + use_batch_norm : bool, optional + Whether to use batch normalization.. + dropout_rate : float, optional + Dropout rate.. """ super(Postnet, self).__init__() self.postnet = nn.LayerList() @@ -120,13 +120,13 @@ class Postnet(nn.Layer): Parameters ---------- - xs : Tensor - Batch of the sequences of padded input tensors (B, idim, Tmax). + xs : Tensor + Batch of the sequences of padded input tensors (B, idim, Tmax). Returns ---------- - Tensor - Batch of padded output tensor. (B, odim, Tmax). + Tensor + Batch of padded output tensor. (B, odim, Tmax). """ for i in six.moves.range(len(self.postnet)): diff --git a/parakeet/modules/fastspeech2_predictor/variance_predictor.py b/parakeet/modules/fastspeech2_predictor/variance_predictor.py index 5cbc091..92136a2 100644 --- a/parakeet/modules/fastspeech2_predictor/variance_predictor.py +++ b/parakeet/modules/fastspeech2_predictor/variance_predictor.py @@ -43,16 +43,16 @@ class VariancePredictor(nn.Layer): Parameters ---------- - idim : int - Input dimension. 
- n_layers : int, optional - Number of convolutional layers. - n_chans : int, optional - Number of channels of convolutional layers. - kernel_size : int, optional - Kernel size of convolutional layers. - dropout_rate : float, optional - Dropout rate. + idim : int + Input dimension. + n_layers : int, optional + Number of convolutional layers. + n_chans : int, optional + Number of channels of convolutional layers. + kernel_size : int, optional + Kernel size of convolutional layers. + dropout_rate : float, optional + Dropout rate. """ assert check_argument_types() super().__init__() diff --git a/parakeet/modules/fastspeech2_transformer/attention.py b/parakeet/modules/fastspeech2_transformer/attention.py index 3c04c6c..9cb6001 100644 --- a/parakeet/modules/fastspeech2_transformer/attention.py +++ b/parakeet/modules/fastspeech2_transformer/attention.py @@ -26,12 +26,12 @@ class MultiHeadedAttention(nn.Layer): Parameters ---------- - n_head : int - The number of heads. - n_feat : int - The number of features. - dropout_rate : float - Dropout rate. + n_head : int + The number of heads. + n_feat : int + The number of features. + dropout_rate : float + Dropout rate. """ def __init__(self, n_head, n_feat, dropout_rate): @@ -53,21 +53,21 @@ class MultiHeadedAttention(nn.Layer): Parameters ---------- - query : paddle.Tensor - query tensor (#batch, time1, size). - key : paddle.Tensor - Key tensor (#batch, time2, size). - value : paddle.Tensor - Value tensor (#batch, time2, size). + query : paddle.Tensor + query tensor (#batch, time1, size). + key : paddle.Tensor + Key tensor (#batch, time2, size). + value : paddle.Tensor + Value tensor (#batch, time2, size). Returns ---------- - paddle.Tensor - Transformed query tensor (#batch, n_head, time1, d_k). - paddle.Tensor - Transformed key tensor (#batch, n_head, time2, d_k). - paddle.Tensor - Transformed value tensor (#batch, n_head, time2, d_k). + paddle.Tensor + Transformed query tensor (#batch, n_head, time1, d_k). 
+ paddle.Tensor + Transformed key tensor (#batch, n_head, time2, d_k). + paddle.Tensor + Transformed value tensor (#batch, n_head, time2, d_k). """ n_batch = query.shape[0] @@ -90,18 +90,18 @@ class MultiHeadedAttention(nn.Layer): Parameters ---------- - value : paddle.Tensor - Transformed value (#batch, n_head, time2, d_k). - scores : paddle.Tensor - Attention score (#batch, n_head, time1, time2). - mask : paddle.Tensor - Mask (#batch, 1, time2) or (#batch, time1, time2). + value : paddle.Tensor + Transformed value (#batch, n_head, time2, d_k). + scores : paddle.Tensor + Attention score (#batch, n_head, time1, time2). + mask : paddle.Tensor + Mask (#batch, 1, time2) or (#batch, time1, time2). Returns ---------- - paddle.Tensor: - Transformed value (#batch, time1, d_model) - weighted by the attention score (#batch, time1, time2). + paddle.Tensor: + Transformed value (#batch, time1, d_model) + weighted by the attention score (#batch, time1, time2). """ n_batch = value.shape[0] softmax = paddle.nn.Softmax(axis=-1) @@ -136,19 +136,19 @@ class MultiHeadedAttention(nn.Layer): Parameters ---------- - query : paddle.Tensor - Query tensor (#batch, time1, size). - key : paddle.Tensor - Key tensor (#batch, time2, size). - value : paddle.Tensor - Value tensor (#batch, time2, size). - mask : paddle.Tensor - Mask tensor (#batch, 1, time2) or (#batch, time1, time2). + query : paddle.Tensor + Query tensor (#batch, time1, size). + key : paddle.Tensor + Key tensor (#batch, time2, size). + value : paddle.Tensor + Value tensor (#batch, time2, size). + mask : paddle.Tensor + Mask tensor (#batch, 1, time2) or (#batch, time1, time2). Returns ---------- - paddle.Tensor - Output tensor (#batch, time1, d_model). + paddle.Tensor + Output tensor (#batch, time1, d_model). 
""" q, k, v = self.forward_qkv(query, key, value) scores = paddle.matmul(q, k.transpose( diff --git a/parakeet/modules/fastspeech2_transformer/embedding.py b/parakeet/modules/fastspeech2_transformer/embedding.py index 51a4c1b..9767193 100644 --- a/parakeet/modules/fastspeech2_transformer/embedding.py +++ b/parakeet/modules/fastspeech2_transformer/embedding.py @@ -24,14 +24,14 @@ class PositionalEncoding(nn.Layer): Parameters ---------- - d_model : int - Embedding dimension. - dropout_rate : float - Dropout rate. - max_len : int - Maximum input length. - reverse : bool - Whether to reverse the input position. Only for + d_model : int + Embedding dimension. + dropout_rate : float + Dropout rate. + max_len : int + Maximum input length. + reverse : bool + Whether to reverse the input position. """ def __init__(self, d_model, dropout_rate, max_len=5000, reverse=False): @@ -68,13 +68,13 @@ class PositionalEncoding(nn.Layer): Parameters ---------- - x : paddle.Tensor - Input tensor (batch, time, `*`). + x : paddle.Tensor + Input tensor (batch, time, `*`). Returns ---------- - paddle.Tensor - Encoded tensor (batch, time, `*`). + paddle.Tensor + Encoded tensor (batch, time, `*`). """ self.extend_pe(x) x = x * self.xscale + self.pe[:, :x.shape[1]] diff --git a/parakeet/modules/fastspeech2_transformer/encoder.py b/parakeet/modules/fastspeech2_transformer/encoder.py index 0288ab4..84a6142 100644 --- a/parakeet/modules/fastspeech2_transformer/encoder.py +++ b/parakeet/modules/fastspeech2_transformer/encoder.py @@ -29,42 +29,42 @@ class Encoder(nn.Layer): Parameters ---------- - idim : int - Input dimension. - attention_dim : int - Dimention of attention. - attention_heads : int - The number of heads of multi head attention. - linear_units : int - The number of units of position-wise feed forward. - num_blocks : int - The number of decoder blocks. - dropout_rate : float - Dropout rate. - positional_dropout_rate : float - Dropout rate after adding positional encoding. 
- attention_dropout_rate : float - Dropout rate in attention. - input_layer : Union[str, paddle.nn.Layer] - Input layer type. - pos_enc_class : paddle.nn.Layer - Positional encoding module class. - `PositionalEncoding `or `ScaledPositionalEncoding` - normalize_before : bool - Whether to use layer_norm before the first block. - concat_after : bool - Whether to concat attention layer's input and output. - if True, additional linear will be applied. - i.e. x -> x + linear(concat(x, att(x))) - if False, no additional linear will be applied. i.e. x -> x + att(x) - positionwise_layer_type : str - "linear", "conv1d", or "conv1d-linear". - positionwise_conv_kernel_size : int - Kernel size of positionwise conv1d layer. - selfattention_layer_type : str - Encoder attention layer type. - padding_idx : int - Padding idx for input_layer=embed. + idim : int + Input dimension. + attention_dim : int + Dimension of attention. + attention_heads : int + The number of heads of multi head attention. + linear_units : int + The number of units of position-wise feed forward. + num_blocks : int + The number of decoder blocks. + dropout_rate : float + Dropout rate. + positional_dropout_rate : float + Dropout rate after adding positional encoding. + attention_dropout_rate : float + Dropout rate in attention. + input_layer : Union[str, paddle.nn.Layer] + Input layer type. + pos_enc_class : paddle.nn.Layer + Positional encoding module class. + `PositionalEncoding` or `ScaledPositionalEncoding` + normalize_before : bool + Whether to use layer_norm before the first block. + concat_after : bool + Whether to concat attention layer's input and output. + if True, additional linear will be applied. + i.e. x -> x + linear(concat(x, att(x))) + if False, no additional linear will be applied. i.e. x -> x + att(x) + positionwise_layer_type : str + "linear", "conv1d", or "conv1d-linear". + positionwise_conv_kernel_size : int + Kernel size of positionwise conv1d layer.
+ selfattention_layer_type : str + Encoder attention layer type. + padding_idx : int + Padding idx for input_layer=embed. """ def __init__( @@ -180,17 +180,17 @@ class Encoder(nn.Layer): Parameters ---------- - xs : paddle.Tensor - Input tensor (#batch, time, idim). - masks : paddle.Tensor - Mask tensor (#batch, time). + xs : paddle.Tensor + Input tensor (#batch, time, idim). + masks : paddle.Tensor + Mask tensor (#batch, time). Returns ---------- - paddle.Tensor - Output tensor (#batch, time, attention_dim). - paddle.Tensor - Mask tensor (#batch, time). + paddle.Tensor + Output tensor (#batch, time, attention_dim). + paddle.Tensor + Mask tensor (#batch, time). """ xs = self.embed(xs) xs, masks = self.encoders(xs, masks) @@ -203,21 +203,21 @@ class Encoder(nn.Layer): Parameters ---------- - xs : paddle.Tensor - Input tensor. - masks : paddle.Tensor - Mask tensor. - cache : List[paddle.Tensor] - List of cache tensors. + xs : paddle.Tensor + Input tensor. + masks : paddle.Tensor + Mask tensor. + cache : List[paddle.Tensor] + List of cache tensors. Returns ---------- - paddle.Tensor - Output tensor. - paddle.Tensor - Mask tensor. - List[paddle.Tensor] - List of new cache tensors. + paddle.Tensor + Output tensor. + paddle.Tensor + Mask tensor. + List[paddle.Tensor] + List of new cache tensors. """ xs = self.embed(xs) @@ -229,4 +229,4 @@ new_cache.append(xs) if self.normalize_before: xs = self.after_norm(xs) - return xs, masks, new_cache + return xs, masks, new_cache diff --git a/parakeet/modules/fastspeech2_transformer/encoder_layer.py b/parakeet/modules/fastspeech2_transformer/encoder_layer.py index e416348..00d551e 100644 --- a/parakeet/modules/fastspeech2_transformer/encoder_layer.py +++ b/parakeet/modules/fastspeech2_transformer/encoder_layer.py @@ -22,23 +22,23 @@ class EncoderLayer(nn.Layer): Parameters ---------- - size : int - Input dimension. - self_attn : paddle.nn.Layer - Self-attention module instance.
- `MultiHeadedAttention` instance can be used as the argument. - feed_forward : paddle.nn.Layer - Feed-forward module instance. - `PositionwiseFeedForward`, `MultiLayeredConv1d`, or `Conv1dLinear` instance can be used as the argument. - dropout_rate : float - Dropout rate. - normalize_before : bool - Whether to use layer_norm before the first block. - concat_after : bool - Whether to concat attention layer's input and output. - if True, additional linear will be applied. - i.e. x -> x + linear(concat(x, att(x))) - if False, no additional linear will be applied. i.e. x -> x + att(x) + size : int + Input dimension. + self_attn : paddle.nn.Layer + Self-attention module instance. + `MultiHeadedAttention` instance can be used as the argument. + feed_forward : paddle.nn.Layer + Feed-forward module instance. + `PositionwiseFeedForward`, `MultiLayeredConv1d`, or `Conv1dLinear` instance can be used as the argument. + dropout_rate : float + Dropout rate. + normalize_before : bool + Whether to use layer_norm before the first block. + concat_after : bool + Whether to concat attention layer's input and output. + if True, additional linear will be applied. + i.e. x -> x + linear(concat(x, att(x))) + if False, no additional linear will be applied. i.e. x -> x + att(x) """ def __init__( @@ -67,19 +67,19 @@ class EncoderLayer(nn.Layer): Parameters ---------- - x_input : paddle.Tensor - Input tensor (#batch, time, size). - mask : paddle.Tensor - Mask tensor for the input (#batch, time). - cache : paddle.Tensor - Cache tensor of the input (#batch, time - 1, size). + x_input : paddle.Tensor + Input tensor (#batch, time, size). + mask : paddle.Tensor + Mask tensor for the input (#batch, time). + cache : paddle.Tensor + Cache tensor of the input (#batch, time - 1, size). Returns ---------- - paddle.Tensor - Output tensor (#batch, time, size). - paddle.Tensor - Mask tensor (#batch, time). + paddle.Tensor + Output tensor (#batch, time, size). + paddle.Tensor + Mask tensor (#batch, time). 
""" residual = x if self.normalize_before: diff --git a/parakeet/modules/fastspeech2_transformer/multi_layer_conv.py b/parakeet/modules/fastspeech2_transformer/multi_layer_conv.py index cde2168..273d8d0 100644 --- a/parakeet/modules/fastspeech2_transformer/multi_layer_conv.py +++ b/parakeet/modules/fastspeech2_transformer/multi_layer_conv.py @@ -34,14 +34,14 @@ class MultiLayeredConv1d(paddle.nn.Layer): Parameters ---------- - in_chans : int - Number of input channels. - hidden_chans : int - Number of hidden channels. - kernel_size : int - Kernel size of conv1d. - dropout_rate : float - Dropout rate. + in_chans : int + Number of input channels. + hidden_chans : int + Number of hidden channels. + kernel_size : int + Kernel size of conv1d. + dropout_rate : float + Dropout rate. """ super(MultiLayeredConv1d, self).__init__() @@ -65,13 +65,13 @@ class MultiLayeredConv1d(paddle.nn.Layer): Parameters ---------- - x : paddle.Tensor - Batch of input tensors (B, T, in_chans). + x : paddle.Tensor + Batch of input tensors (B, T, in_chans). Returns ---------- - paddle.Tensor - Batch of output tensors (B, T, in_chans). + paddle.Tensor + Batch of output tensors (B, T, in_chans). """ x = self.relu(self.w_1(x.transpose([0, 2, 1]))).transpose([0, 2, 1]) return self.w_2(self.dropout(x).transpose([0, 2, 1])).transpose( @@ -90,14 +90,14 @@ class Conv1dLinear(paddle.nn.Layer): Parameters ---------- - in_chans : int - Number of input channels. - hidden_chans : int - Number of hidden channels. - kernel_size : int - Kernel size of conv1d. - dropout_rate : float - Dropout rate. + in_chans : int + Number of input channels. + hidden_chans : int + Number of hidden channels. + kernel_size : int + Kernel size of conv1d. + dropout_rate : float + Dropout rate. """ super(Conv1dLinear, self).__init__() self.w_1 = paddle.nn.Conv1D( @@ -115,13 +115,13 @@ class Conv1dLinear(paddle.nn.Layer): Parameters ---------- - x : paddle.Tensor - Batch of input tensors (B, T, in_chans). 
+ x : paddle.Tensor + Batch of input tensors (B, T, in_chans). Returns ---------- - paddle.Tensor - Batch of output tensors (B, T, in_chans). + paddle.Tensor + Batch of output tensors (B, T, in_chans). """ x = self.relu(self.w_1(x.transpose([0, 2, 1]))).transpose([0, 2, 1]) diff --git a/parakeet/modules/fastspeech2_transformer/positionwise_feed_forward.py b/parakeet/modules/fastspeech2_transformer/positionwise_feed_forward.py index 930d496..c57fba6 100644 --- a/parakeet/modules/fastspeech2_transformer/positionwise_feed_forward.py +++ b/parakeet/modules/fastspeech2_transformer/positionwise_feed_forward.py @@ -21,12 +21,12 @@ class PositionwiseFeedForward(paddle.nn.Layer): Parameters ---------- - idim : int - Input dimenstion. - hidden_units : int - The number of hidden units. - dropout_rate : float - Dropout rate. + idim : int + Input dimension. + hidden_units : int + The number of hidden units. + dropout_rate : float + Dropout rate. """ def __init__(self, diff --git a/parakeet/modules/fastspeech2_transformer/repeat.py b/parakeet/modules/fastspeech2_transformer/repeat.py index 62b21de..250a3a4 100644 --- a/parakeet/modules/fastspeech2_transformer/repeat.py +++ b/parakeet/modules/fastspeech2_transformer/repeat.py @@ -31,14 +31,14 @@ def repeat(N, fn): Parameters ---------- - N : int - Number of repeat time. - fn : Callable - Function to generate module. + N : int + Number of repeat times. + fn : Callable + Function to generate module. Returns ---------- - MultiSequential - Repeated model instance. + MultiSequential + Repeated model instance. """ return MultiSequential(* [fn(n) for n in range(N)]) diff --git a/parakeet/modules/layer_norm.py b/parakeet/modules/layer_norm.py index 5a9fe4e..2ff91b8 100644 --- a/parakeet/modules/layer_norm.py +++ b/parakeet/modules/layer_norm.py @@ -21,10 +21,10 @@ class LayerNorm(paddle.nn.LayerNorm): Parameters ---------- - nout : int - Output dim size. - dim : int - Dimension to be normalized. + nout : int + Output dim size.
+ dim : int + Dimension to be normalized. """ def __init__(self, nout, dim=-1): @@ -37,13 +37,13 @@ class LayerNorm(paddle.nn.LayerNorm): Parameters ---------- - x : paddle.Tensor - Input tensor. + x : paddle.Tensor + Input tensor. Returns ---------- - paddle.Tensor - Normalized tensor. + paddle.Tensor + Normalized tensor. """ if self.dim == -1: return super(LayerNorm, self).forward(x) diff --git a/parakeet/modules/nets_utils.py b/parakeet/modules/nets_utils.py index d218f62..5997873 100644 --- a/parakeet/modules/nets_utils.py +++ b/parakeet/modules/nets_utils.py @@ -22,25 +22,25 @@ def pad_list(xs, pad_value): Parameters ---------- - xs : List[Tensor] - List of Tensors [(T_1, `*`), (T_2, `*`), ..., (T_B, `*`)]. - pad_value : float) - Value for padding. + xs : List[Tensor] + List of Tensors [(T_1, `*`), (T_2, `*`), ..., (T_B, `*`)]. + pad_value : float + Value for padding. Returns ---------- - Tensor - Padded tensor (B, Tmax, `*`). + Tensor + Padded tensor (B, Tmax, `*`). Examples ---------- - >>> x = [paddle.ones([4]), paddle.ones([2]), paddle.ones([1])] - >>> x - [tensor([1., 1., 1., 1.]), tensor([1., 1.]), tensor([1.])] - >>> pad_list(x, 0) - tensor([[1., 1., 1., 1.], - [1., 1., 0., 0.], - [1., 0., 0., 0.]]) + >>> x = [paddle.ones([4]), paddle.ones([2]), paddle.ones([1])] + >>> x + [tensor([1., 1., 1., 1.]), tensor([1., 1.]), tensor([1.])] + >>> pad_list(x, 0) + tensor([[1., 1., 1., 1.], + [1., 1., 0., 0.], + [1., 0., 0., 0.]]) """ n_batch = len(xs) max_len = max(x.shape[0] for x in xs) @@ -57,23 +57,23 @@ def make_pad_mask(lengths, length_dim=-1): Parameters ---------- - lengths : LongTensor or List - Batch of lengths (B,). + lengths : LongTensor or List + Batch of lengths (B,). Returns ---------- - Tensor(bool) - Mask tensor containing indices of padded part bool. + Tensor(bool) + Mask tensor containing indices of padded part bool. Examples ---------- - With only lengths. + With only lengths.
- >>> lengths = [5, 3, 2] - >>> make_non_pad_mask(lengths) - masks = [[0, 0, 0, 0 ,0], - [0, 0, 0, 1, 1], - [0, 0, 1, 1, 1]] + >>> lengths = [5, 3, 2] + >>> make_pad_mask(lengths) + masks = [[0, 0, 0, 0, 0], + [0, 0, 0, 1, 1], + [0, 0, 1, 1, 1]] """ if length_dim == 0: raise ValueError("length_dim cannot be 0: {}".format(length_dim)) @@ -99,29 +99,29 @@ def make_non_pad_mask(lengths, length_dim=-1): Parameters ---------- - lengths : LongTensor or List - Batch of lengths (B,). - xs : Tensor, optional - The reference tensor. - If set, masks will be the same shape as this tensor. - length_dim : int, optional - Dimension indicator of the above tensor. - See the example. + lengths : LongTensor or List + Batch of lengths (B,). + xs : Tensor, optional + The reference tensor. + If set, masks will be the same shape as this tensor. + length_dim : int, optional + Dimension indicator of the above tensor. + See the example. Returns ---------- - Tensor(bool) - mask tensor containing indices of padded part bool. + Tensor(bool) + mask tensor containing indices of padded part bool. Examples ---------- - With only lengths. + With only lengths. - >>> lengths = [5, 3, 2] - >>> make_non_pad_mask(lengths) - masks = [[1, 1, 1, 1 ,1], - [1, 1, 1, 0, 0], - [1, 1, 0, 0, 0]] + >>> lengths = [5, 3, 2] + >>> make_non_pad_mask(lengths) + masks = [[1, 1, 1, 1, 1], + [1, 1, 1, 0, 0], + [1, 1, 0, 0, 0]] """ return paddle.logical_not(make_pad_mask(lengths, length_dim)) @@ -135,10 +135,10 @@ def initialize(model: nn.Layer, init: str): Parameters ---------- - model : paddle.nn.Layer - Target. - init : str - Method of initialization. + model : paddle.nn.Layer + Target. + init : str + Method of initialization.
""" assert check_argument_types() diff --git a/parakeet/modules/stft_loss.py b/parakeet/modules/stft_loss.py index cdc066f..7c3779c 100644 --- a/parakeet/modules/stft_loss.py +++ b/parakeet/modules/stft_loss.py @@ -29,8 +29,8 @@ class SpectralConvergenceLoss(nn.Layer): def forward(self, x_mag, y_mag): """Calculate forward propagation. Args: - x_mag (Tensor): Magnitude spectrogram of predicted signal (B, C, T). - y_mag (Tensor): Magnitude spectrogram of groundtruth signal (B, C, T). + x_mag (Tensor): Magnitude spectrogram of predicted signal (B, #frames, #freq_bins). + y_mag (Tensor): Magnitude spectrogram of groundtruth signal (B, #frames, #freq_bins). Returns: Tensor: Spectral convergence loss value. """ @@ -50,11 +50,16 @@ class LogSTFTMagnitudeLoss(nn.Layer): def forward(self, x_mag, y_mag): """Calculate forward propagation. - Args: - x_mag (Tensor): Magnitude spectrogram of predicted signal (B, #frames, #freq_bins). - y_mag (Tensor): Magnitude spectrogram of groundtruth signal (B, #frames, #freq_bins). - Returns: - Tensor: Log STFT magnitude loss value. + Parameters + ---------- + x_mag : Tensor + Magnitude spectrogram of predicted signal (B, #frames, #freq_bins). + y_mag : Tensor + Magnitude spectrogram of groundtruth signal (B, #frames, #freq_bins). + Returns + ---------- + Tensor + Log STFT magnitude loss value. """ return F.l1_loss( paddle.log(paddle.clip( @@ -86,15 +91,23 @@ class STFTLoss(nn.Layer): def forward(self, x, y): """Calculate forward propagation. - Args: - x (Tensor): Predicted signal (B, T). - y (Tensor): Groundtruth signal (B, T). - Returns: - Tensor: Spectral convergence loss value. - Tensor: Log STFT magnitude loss value. + Parameters + ---------- + x : Tensor + Predicted signal (B, T). + y : Tensor + Groundtruth signal (B, T). + Returns + ---------- + Tensor + Spectral convergence loss value. + Tensor + Log STFT magnitude loss value. 
""" x_mag = self.stft.magnitude(x) y_mag = self.stft.magnitude(y) + x_mag = x_mag.transpose([0, 2, 1]) + y_mag = y_mag.transpose([0, 2, 1]) sc_loss = self.spectral_convergence_loss(x_mag, y_mag) mag_loss = self.log_stft_magnitude_loss(x_mag, y_mag) @@ -111,11 +124,16 @@ class MultiResolutionSTFTLoss(nn.Layer): win_lengths=[600, 1200, 240], window="hann", ): """Initialize Multi resolution STFT loss module. - Args: - fft_sizes (list): List of FFT sizes. - hop_sizes (list): List of hop sizes. - win_lengths (list): List of window lengths. - window (str): Window function type. + Parameters + ---------- + fft_sizes : list + List of FFT sizes. + hop_sizes : list + List of hop sizes. + win_lengths : list + List of window lengths. + window : str + Window function type. """ super().__init__() assert len(fft_sizes) == len(hop_sizes) == len(win_lengths) @@ -125,13 +143,24 @@ class MultiResolutionSTFTLoss(nn.Layer): def forward(self, x, y): """Calculate forward propagation. - Args: - x (Tensor): Predicted signal (B, T). - y (Tensor): Groundtruth signal (B, T). - Returns: - Tensor: Multi resolution spectral convergence loss value. - Tensor: Multi resolution log STFT magnitude loss value. + Parameters + ---------- + x : Tensor + Predicted signal (B, T) or (B, #subband, T). + y : Tensor + Groundtruth signal (B, T) or (B, #subband, T). + Returns + ---------- + Tensor + Multi resolution spectral convergence loss value. + Tensor + Multi resolution log STFT magnitude loss value. """ + if len(x.shape) == 3: + # (B, C, T) -> (B x C, T) + x = x.reshape([-1, x.shape[2]]) + # (B, C, T) -> (B x C, T) + y = y.reshape([-1, y.shape[2]]) sc_loss = 0.0 mag_loss = 0.0 for f in self.stft_losses: