From 83d6a85b8663a0cd700e88ed03f7bde27f8ceacc Mon Sep 17 00:00:00 2001 From: Kexin Zhao Date: Thu, 12 Dec 2019 17:58:10 -0800 Subject: [PATCH 01/10] add waveflow model valid for training only --- parakeet/models/waveflow/README.md | 97 +++++++ .../waveflow_ljspeech_sqz16_r64_layer8x8.yaml | 24 ++ ...flow_ljspeech_sqz16_r64_layer8x8_s123.yaml | 24 ++ parakeet/models/waveflow/data.py | 139 ++++++++++ parakeet/models/waveflow/slurm.py | 113 ++++++++ parakeet/models/waveflow/synthesis.py | 85 ++++++ parakeet/models/waveflow/train.py | 139 ++++++++++ parakeet/models/waveflow/utils.py | 135 +++++++++ parakeet/models/waveflow/waveflow.py | 174 ++++++++++++ parakeet/models/waveflow/waveflow_modules.py | 256 ++++++++++++++++++ 10 files changed, 1186 insertions(+) create mode 100644 parakeet/models/waveflow/README.md create mode 100644 parakeet/models/waveflow/configs/waveflow_ljspeech_sqz16_r64_layer8x8.yaml create mode 100644 parakeet/models/waveflow/configs/waveflow_ljspeech_sqz16_r64_layer8x8_s123.yaml create mode 100644 parakeet/models/waveflow/data.py create mode 100644 parakeet/models/waveflow/slurm.py create mode 100644 parakeet/models/waveflow/synthesis.py create mode 100644 parakeet/models/waveflow/train.py create mode 100644 parakeet/models/waveflow/utils.py create mode 100644 parakeet/models/waveflow/waveflow.py create mode 100644 parakeet/models/waveflow/waveflow_modules.py diff --git a/parakeet/models/waveflow/README.md b/parakeet/models/waveflow/README.md new file mode 100644 index 0000000..18efd0b --- /dev/null +++ b/parakeet/models/waveflow/README.md @@ -0,0 +1,97 @@ +# WaveNet with Paddle Fluid + +Paddle fluid implementation of WaveNet, a deep generative model of raw audio waveforms. +WaveNet model is originally proposed in [WaveNet: A Generative Model for Raw Audio](https://arxiv.org/abs/1609.03499). +Our implementation is based on the WaveNet architecture described in [ClariNet: Parallel Wave Generation in End-to-End Text-to-Speech](https://arxiv.org/abs/1807.07281) and can provide various output distributions, including single Gaussian, mixture of Gaussian, and softmax with linearly quantized channels. + +We implement WaveNet model in paddle fluid with dynamic graph, which is convenient for flexible network architectures. + +## Project Structure +```text +├── configs # yaml configuration files of preset model hyperparameters +├── data.py # dataset and dataloader settings for LJSpeech +├── slurm.py # optional slurm helper functions if you use slurm to train model +├── synthesis.py # script for speech synthesis +├── train.py # script for model training +├── utils.py # helper functions for e.g., model checkpointing +├── wavenet.py # WaveNet model high level APIs +└── wavenet_modules.py # WaveNet model implementation +``` + +## Usage + +There are many hyperparameters to be tuned depending on the specification of model and dataset you are working on. Hyperparameters that are known to work good for the LJSpeech dataset are provided as yaml files in `./configs/` folder. Specifically, we provide `wavenet_ljspeech_single_gaussian.yaml`, `wavenet_ljspeech_mix_gaussian.yaml`, and `wavenet_ljspeech_softmax.yaml` config files for WaveNet with single Gaussian, 10-component mixture of Gaussians, and softmax (with 2048 linearly quantized channels) output distributions, respectively. + +Note that `train.py` and `synthesis.py` all accept a `--config` parameter. To ensure consistency, you should use the same config yaml file for both training and synthesizing. 
You can also overwrite these preset hyperparameters with command line by updating parameters after `--config`. For example `--config=${yaml} --batch_size=8 --layers=20` can overwrite the corresponding hyperparameters in the `${yaml}` config file. For more details about these hyperparameters, check `utils.add_config_options_to_parser`. + +Note that you also need to specify some additional parameters for `train.py` and `synthesis.py`, and the details can be found in `train.add_options_to_parser` and `synthesis.add_options_to_parser`, respectively. + +### Dataset + +Download and unzip [LJSpeech](https://keithito.com/LJ-Speech-Dataset/). + +```bash +wget https://data.keithito.com/data/speech/LJSpeech-1.1.tar.bz2 +tar xjvf LJSpeech-1.1.tar.bz2 +``` + +In this example, assume that the path of unzipped LJSpeech dataset is `./data/LJSpeech-1.1`. + +### Train on single GPU + +```bash +export PYTHONPATH="${PYTHONPATH}:${PWD}/../../.." +export CUDA_VISIBLE_DEVICES=0 +python -u train.py --config=${yaml} \ + --root=./data/LJSpeech-1.1 \ + --name=${ModelName} --batch_size=4 \ + --parallel=false --use_gpu=true +``` + +#### Save and Load checkpoints + +Our model will save model parameters as checkpoints in `./runs/wavenet/${ModelName}/checkpoint/` every 10000 iterations by default. +The saved checkpoint will have the format of `step-${iteration_number}.pdparams` for model parameters and `step-${iteration_number}.pdopt` for optimizer parameters. + +There are three ways to load a checkpoint and resume training (take an example that you want to load a 500000-iteration checkpoint): +1. Use `--checkpoint=./runs/wavenet/${ModelName}/checkpoint/step-500000` to provide a specific path to load. Note that you only need to provide the base name of the parameter file, which is `step-500000`, no extension name `.pdparams` or `.pdopt` is needed. +2. Use `--iteration=500000`. +3. If you don't specify either `--checkpoint` or `--iteration`, the model will automatically load the latest checkpoint in `./runs/wavenet/${ModelName}/checkpoint`. + +### Train on multiple GPUs + +```bash +export PYTHONPATH="${PYTHONPATH}:${PWD}/../../.." +export CUDA_VISIBLE_DEVICES=0,1,2,3 +python -u -m paddle.distributed.launch train.py \ + --config=${yaml} \ + --root=./data/LJSpeech-1.1 \ + --name=${ModelName} --parallel=true --use_gpu=true +``` + +Use `export CUDA_VISIBLE_DEVICES=0,1,2,3` to set the GPUs that you want to use to be visible. Then the `paddle.distributed.launch` module will use these visible GPUs to do data parallel training in multiprocessing mode. + +### Monitor with Tensorboard + +By default, the logs are saved in `./runs/wavenet/${ModelName}/logs/`. You can monitor logs by tensorboard. + +```bash +tensorboard --logdir=${log_dir} --port=8888 +``` + +### Synthesize from a checkpoint + +Check the [Save and load checkpoint](#save-and-load-checkpoints) section on how to load a specific checkpoint. +The following example will automatically load the latest checkpoint: + +```bash +export PYTHONPATH="${PYTHONPATH}:${PWD}/../../.." 
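+# The synthesis command below automatically loads the latest checkpoint; to use a
+# specific one, append e.g. --checkpoint=./runs/wavenet/${ModelName}/checkpoint/step-500000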
+export CUDA_VISIBLE_DEVICES=0 +python -u synthesis.py --config=${yaml} \ + --root=./data/LJSpeech-1.1 \ + --name=${ModelName} --use_gpu=true \ + --output=./syn_audios \ + --sample=${SAMPLE} +``` + +In this example, `--output` specifies where to save the synthesized audios and `--sample` specifies which sample in the valid dataset (a split from the whole LJSpeech dataset, by default contains the first 16 audio samples) to synthesize based on the mel-spectrograms computed from the ground truth sample audio, e.g., `--sample=0` means to synthesize the first audio in the valid dataset. diff --git a/parakeet/models/waveflow/configs/waveflow_ljspeech_sqz16_r64_layer8x8.yaml b/parakeet/models/waveflow/configs/waveflow_ljspeech_sqz16_r64_layer8x8.yaml new file mode 100644 index 0000000..f9bbc83 --- /dev/null +++ b/parakeet/models/waveflow/configs/waveflow_ljspeech_sqz16_r64_layer8x8.yaml @@ -0,0 +1,24 @@ +valid_size: 16 +segment_length: 16000 +sample_rate: 22050 +fft_window_shift: 256 +fft_window_size: 1024 +fft_size: 1024 +mel_bands: 80 +mel_fmin: 0.0 +mel_fmax: 8000.0 + +seed: 1234 +learning_rate: 0.0002 +batch_size: 8 +test_every: 2000 +save_every: 5000 +max_iterations: 2000000 + +sigma: 1.0 +n_flows: 8 +n_group: 16 +n_layers: 8 +n_channels: 64 +kernel_h: 3 +kernel_w: 3 diff --git a/parakeet/models/waveflow/configs/waveflow_ljspeech_sqz16_r64_layer8x8_s123.yaml b/parakeet/models/waveflow/configs/waveflow_ljspeech_sqz16_r64_layer8x8_s123.yaml new file mode 100644 index 0000000..7d45212 --- /dev/null +++ b/parakeet/models/waveflow/configs/waveflow_ljspeech_sqz16_r64_layer8x8_s123.yaml @@ -0,0 +1,24 @@ +valid_size: 16 +segment_length: 16000 +sample_rate: 22050 +fft_window_shift: 256 +fft_window_size: 1024 +fft_size: 1024 +mel_bands: 80 +mel_fmin: 0.0 +mel_fmax: 8000.0 + +seed: 123 +learning_rate: 0.0002 +batch_size: 8 +test_every: 2000 +save_every: 5000 +max_iterations: 2000000 + +sigma: 1.0 +n_flows: 8 +n_group: 16 +n_layers: 8 +n_channels: 64 +kernel_h: 3 +kernel_w: 3 diff --git a/parakeet/models/waveflow/data.py b/parakeet/models/waveflow/data.py new file mode 100644 index 0000000..3c70ce0 --- /dev/null +++ b/parakeet/models/waveflow/data.py @@ -0,0 +1,139 @@ +import random + +import librosa +import numpy as np +from paddle import fluid + +import utils +from parakeet.datasets import ljspeech +from parakeet.data import dataset +from parakeet.data.batch import SpecBatcher, WavBatcher +from parakeet.data.datacargo import DataCargo +from parakeet.data.sampler import DistributedSampler, BatchSampler +from scipy.io.wavfile import read + +MAX_WAV_VALUE = 32768.0 + + +class Dataset(ljspeech.LJSpeech): + def __init__(self, config): + super(Dataset, self).__init__(config.root) + self.config = config + + def _get_example(self, metadatum): + fname, _, _ = metadatum + wav_path = self.root.joinpath("wavs", fname + ".wav") + + loaded_sr, audio = read(wav_path) + assert loaded_sr == self.config.sample_rate + + return audio + + +class Subset(dataset.Dataset): + def __init__(self, dataset, indices, valid): + self.dataset = dataset + self.indices = indices + self.valid = valid + self.config = dataset.config + + def get_mel(self, audio): + spectrogram = librosa.core.stft( + audio, n_fft=self.config.fft_size, + hop_length=self.config.fft_window_shift, + win_length=self.config.fft_window_size) + spectrogram_magnitude = np.abs(spectrogram) + + # mel_filter_bank shape: [n_mels, 1 + n_fft/2] + mel_filter_bank = librosa.filters.mel( + sr=self.config.sample_rate, + n_fft=self.config.fft_size, + 
n_mels=self.config.mel_bands, + fmin=self.config.mel_fmin, + fmax=self.config.mel_fmax) + # mel shape: [n_mels, num_frames] + mel = np.dot(mel_filter_bank, spectrogram_magnitude) + + # Normalize mel. + clip_val = 1e-5 + ref_constant = 1 + mel = np.log(np.clip(mel, a_min=clip_val, a_max=None) * ref_constant) + + return mel + + def __getitem__(self, idx): + audio = self.dataset[self.indices[idx]] + segment_length = self.config.segment_length + + if self.valid: + # whole audio for valid set + pass + else: + # audio shape: [len] + if audio.shape[0] >= segment_length: + max_audio_start = audio.shape[0] - segment_length + audio_start = random.randint(0, max_audio_start) + audio = audio[audio_start : (audio_start + segment_length)] + else: + audio = np.pad(audio, (0, segment_length - audio.shape[0]), + mode='constant', constant_values=0) + + # Normalize audio. + audio = audio / MAX_WAV_VALUE + mel = self.get_mel(audio) + + return audio, mel + + def _batch_examples(self, batch): + audio_batch = [] + mel_batch = [] + for audio, mel in batch: + audio_batch + + audios = [sample[0] for sample in batch] + mels = [sample[1] for sample in batch] + + audios = WavBatcher(pad_value=0.0)(audios) + mels = SpecBatcher(pad_value=0.0)(mels) + + return audios, mels + + def __len__(self): + return len(self.indices) + + +class LJSpeech: + def __init__(self, config, nranks, rank): + place = fluid.CUDAPlace(rank) if config.use_gpu else fluid.CPUPlace() + + # Whole LJSpeech dataset. + ds = Dataset(config) + + # Split into train and valid dataset. + indices = list(range(len(ds))) + train_indices = indices[config.valid_size:] + valid_indices = indices[:config.valid_size] + random.shuffle(train_indices) + + # Train dataset. + trainset = Subset(ds, train_indices, valid=False) + sampler = DistributedSampler(len(trainset), nranks, rank) + total_bs = config.batch_size + assert total_bs % nranks == 0 + train_sampler = BatchSampler(sampler, total_bs // nranks, + drop_last=True) + trainloader = DataCargo(trainset, batch_sampler=train_sampler) + + trainreader = fluid.io.PyReader(capacity=50, return_list=True) + trainreader.decorate_batch_generator(trainloader, place) + self.trainloader = (data for _ in iter(int, 1) + for data in trainreader()) + + # Valid dataset. + validset = Subset(ds, valid_indices, valid=True) + # Currently only support batch_size = 1 for valid loader. + validloader = DataCargo(validset, batch_size=1, shuffle=False) + + validreader = fluid.io.PyReader(capacity=20, return_list=True) + validreader.decorate_batch_generator(validloader, place) + self.validloader = validreader diff --git a/parakeet/models/waveflow/slurm.py b/parakeet/models/waveflow/slurm.py new file mode 100644 index 0000000..de1818c --- /dev/null +++ b/parakeet/models/waveflow/slurm.py @@ -0,0 +1,113 @@ +""" +Utility module for restarting training when using SLURM. +""" +import subprocess +import os +import sys +import shlex +import re +import time + + +def job_info(): + """Get information about the current job using `scontrol show job`. + Returns a dict mapping parameter names (e.g. "UserId", "RunTime", etc) to + their values, both as strings. 
+ """ + job_id = int(os.environ["SLURM_JOB_ID"]) + + command = ["scontrol", "show", "job", str(job_id)] + output = subprocess.check_output(command).decode("utf-8") + + # Use a regex to extract the parameter names and values + pattern = "([A-Za-z/]*)=([^ \t\n]*)" + return dict(re.findall(pattern, output)) + + +def parse_hours(text): + """Parse a time format HH or DD-HH into a number of hours.""" + hour_chunks = text.split("-") + if len(hour_chunks) == 1: + return int(hour_chunks[0]) + elif len(hour_chunks) == 2: + return 24 * int(hour_chunks[0]) + int(hour_chunks[1]) + else: + raise ValueError("Unexpected hour format (expected HH or " + "DD-HH, but got {}).".format(text)) + + +def parse_time(text): + """Convert slurm time to an integer. + Expects time to be of the form: + "hours:minutes:seconds" or "day-hours:minutes:seconds". + """ + hours, minutes, seconds = text.split(":") + try: + return parse_hours(hours) * 3600 + int(minutes) * 60 + int(seconds) + except ValueError as e: + raise ValueError("Error parsing time {}. Got error {}.".format( + text, str(e))) + + +def restart_command(): + """Using the environment and SLURM command, create a command that, when, + run, will enqueue a repeat of the current job using `sbatch`. + Return the command as a list of strings, suitable for passing to + `subprocess.check_call` or similar functions. + Returns: + resume_command: list, command to run to restart job. + end_time: int or None; the time the job will end or None + if the job has unlimited runtime. + """ + # Make sure `RunTime` could be parsed correctly. + while job_info()["RunTime"] == "INVALID": + time.sleep(1) + + # Get all the necessary information by querying SLURM with this job id + info = job_info() + + try: + num_cpus = int(info["CPUs/Task"]) + except KeyError: + num_cpus = int(os.environ["SLURM_CPUS_PER_TASK"]) + + num_tasks = int(os.environ["SLURM_NTASKS"]) + nodes = info["NumNodes"] + gres, partition = info.get("Gres"), info.get("Partition") + stderr, stdout = info.get("StdErr"), info.get("StdOut") + job_name = info.get("JobName") + command = ["sbatch", "--job-name={}".format(job_name), + "--ntasks={}".format(num_tasks), + "--exclude=asimov-186"] + + if partition: + command.extend(["--partition", partition]) + + if gres and gres != "(null)": + command.extend(["--gres", gres]) + num_gpu = int(gres.split(':')[-1]) + print("number of gpu assigned by slurm is {}".format(num_gpu)) + + if stderr: + command.extend(["--error", stderr]) + + if stdout: + command.extend(["--output", stdout]) + + python = subprocess.check_output( + ["/usr/bin/which", "python3"]).decode("utf-8").strip() + dist_setting = ['-m', 'paddle.distributed.launch'] + wrap_cmd = ["srun", python, '-u'] + dist_setting + sys.argv + + command.append( + "--wrap={}".format(" ".join(shlex.quote(arg) for arg in wrap_cmd))) + time_limit_string = info["TimeLimit"] + if time_limit_string.lower() == "unlimited": + print("UNLIMITED detected: restart OFF, infinite learning ON.", + flush=True) + return command, None + time_limit = parse_time(time_limit_string) + runtime = parse_time(info["RunTime"]) + end_time = time.time() + time_limit - runtime + + return command, end_time diff --git a/parakeet/models/waveflow/synthesis.py b/parakeet/models/waveflow/synthesis.py new file mode 100644 index 0000000..d87a188 --- /dev/null +++ b/parakeet/models/waveflow/synthesis.py @@ -0,0 +1,85 @@ +import os +import random +from pprint import pprint + +import jsonargparse +import numpy as np +import paddle.fluid.dygraph as dg +from paddle import fluid + 
+import utils +from wavenet import WaveNet + + +def add_options_to_parser(parser): + parser.add_argument('--model', type=str, default='wavenet', + help="general name of the model") + parser.add_argument('--name', type=str, + help="specific name of the training model") + parser.add_argument('--root', type=str, + help="root path of the LJSpeech dataset") + + parser.add_argument('--use_gpu', type=bool, default=True, + help="option to use gpu training") + + parser.add_argument('--iteration', type=int, default=None, + help=("which iteration of checkpoint to load, " + "default to load the latest checkpoint")) + parser.add_argument('--checkpoint', type=str, default=None, + help="path of the checkpoint to load") + + parser.add_argument('--output', type=str, default="./syn_audios", + help="path to write synthesized audio files") + parser.add_argument('--sample', type=int, + help="which of the valid samples to synthesize audio") + + +def synthesize(config): + pprint(jsonargparse.namespace_to_dict(config)) + + # Get checkpoint directory path. + run_dir = os.path.join("runs", config.model, config.name) + checkpoint_dir = os.path.join(run_dir, "checkpoint") + + # Configurate device. + place = fluid.CUDAPlace(0) if config.use_gpu else fluid.CPUPlace() + + with dg.guard(place): + # Fix random seed. + seed = config.seed + random.seed(seed) + np.random.seed(seed) + fluid.default_startup_program().random_seed = seed + fluid.default_main_program().random_seed = seed + print("Random Seed: ", seed) + + # Build model. + model = WaveNet(config, checkpoint_dir) + model.build(training=False) + + # Obtain the current iteration. + if config.checkpoint is None: + if config.iteration is None: + iteration = utils.load_latest_checkpoint(checkpoint_dir) + else: + iteration = config.iteration + else: + iteration = int(config.checkpoint.split('/')[-1].split('-')[-1]) + + # Run model inference. + model.infer(iteration) + + +if __name__ == "__main__": + # Create parser. + parser = jsonargparse.ArgumentParser( + description="Synthesize audio using WaveNet model", + formatter_class='default_argparse') + add_options_to_parser(parser) + utils.add_config_options_to_parser(parser) + + # Parse argument from both command line and yaml config file. + # For conflicting updates to the same field, + # the preceding update will be overwritten by the following one. 
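+    # e.g. with `--config=${yaml} --batch_size=8`, the command-line batch_size takes
+    # precedence over the value in the yaml config file.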
+ config = parser.parse_args() + synthesize(config) diff --git a/parakeet/models/waveflow/train.py b/parakeet/models/waveflow/train.py new file mode 100644 index 0000000..a125d97 --- /dev/null +++ b/parakeet/models/waveflow/train.py @@ -0,0 +1,139 @@ +import os +import random +import subprocess +import time +from pprint import pprint + +import jsonargparse +import numpy as np +import paddle.fluid.dygraph as dg +from paddle import fluid +from tensorboardX import SummaryWriter + +import slurm +import utils +from waveflow import WaveFlow + +MAXIMUM_SAVE_TIME = 10 * 60 + + +def add_options_to_parser(parser): + parser.add_argument('--model', type=str, default='waveflow', + help="general name of the model") + parser.add_argument('--name', type=str, + help="specific name of the training model") + parser.add_argument('--root', type=str, + help="root path of the LJSpeech dataset") + + parser.add_argument('--parallel', type=bool, default=True, + help="option to use data parallel training") + parser.add_argument('--use_gpu', type=bool, default=True, + help="option to use gpu training") + + parser.add_argument('--iteration', type=int, default=None, + help=("which iteration of checkpoint to load, " + "default to load the latest checkpoint")) + parser.add_argument('--checkpoint', type=str, default=None, + help="path of the checkpoint to load") + parser.add_argument('--slurm', type=bool, default=False, + help="whether you are using slurm to submit training jobs") + + +def train(config): + use_gpu = config.use_gpu + parallel = config.parallel if use_gpu else False + + # Get the rank of the current training process. + rank = dg.parallel.Env().local_rank if parallel else 0 + nranks = dg.parallel.Env().nranks if parallel else 1 + + if rank == 0: + # Print the whole config setting. + pprint(jsonargparse.namespace_to_dict(config)) + + # Make checkpoint directory. + run_dir = os.path.join("runs", config.model, config.name) + checkpoint_dir = os.path.join(run_dir, "checkpoint") + os.makedirs(checkpoint_dir, exist_ok=True) + + # Create tensorboard logger. + tb = SummaryWriter(os.path.join(run_dir, "logs")) \ + if rank == 0 else None + + # Configurate device + place = fluid.CUDAPlace(rank) if use_gpu else fluid.CPUPlace() + + with dg.guard(place): + # Fix random seed. + seed = config.seed + random.seed(seed) + np.random.seed(seed) + fluid.default_startup_program().random_seed = seed + fluid.default_main_program().random_seed = seed + print("Random Seed: ", seed) + + # Build model. + model = WaveFlow(config, checkpoint_dir, parallel, rank, nranks, tb) + model.build() + + # Obtain the current iteration. + if config.checkpoint is None: + if config.iteration is None: + iteration = utils.load_latest_checkpoint(checkpoint_dir, rank) + else: + iteration = config.iteration + else: + iteration = int(config.checkpoint.split('/')[-1].split('-')[-1]) + + # Get restart command if using slurm. + if config.slurm: + resume_command, death_time = slurm.restart_command() + if rank == 0: + print("Restart command:", " ".join(resume_command)) + done = False + + while iteration < config.max_iterations: + # Run one single training step. + model.train_step(iteration) + + iteration += 1 + + if iteration % config.test_every == 0: + # Run validation step. + model.valid_step(iteration) + + # Check whether reaching the time limit. 
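+            # If so (remaining slurm walltime below MAXIMUM_SAVE_TIME), rank 0 saves a
+            # checkpoint, resubmits the job with the restart command, and training exits.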
+ if config.slurm: + done = (death_time is not None and death_time - time.time() < + MAXIMUM_SAVE_TIME) + + if rank == 0 and done: + print("Saving progress before exiting.") + model.save(iteration) + + print("Running restart command:", " ".join(resume_command)) + # Submit restart command. + subprocess.check_call(resume_command) + break + + if rank == 0 and iteration % config.save_every == 0: + # Save parameters. + model.save(iteration) + + # Close TensorBoard. + if rank == 0: + tb.close() + + +if __name__ == "__main__": + # Create parser. + parser = jsonargparse.ArgumentParser(description="Train WaveFlow model", + formatter_class='default_argparse') + add_options_to_parser(parser) + utils.add_config_options_to_parser(parser) + + # Parse argument from both command line and yaml config file. + # For conflicting updates to the same field, + # the preceding update will be overwritten by the following one. + config = parser.parse_args() + train(config) diff --git a/parakeet/models/waveflow/utils.py b/parakeet/models/waveflow/utils.py new file mode 100644 index 0000000..494a409 --- /dev/null +++ b/parakeet/models/waveflow/utils.py @@ -0,0 +1,135 @@ +import itertools +import os +import time + +import jsonargparse +import numpy as np +import paddle.fluid.dygraph as dg + + +def add_config_options_to_parser(parser): + parser.add_argument('--valid_size', type=int, + help="size of the valid dataset") + parser.add_argument('--segment_length', type=int, + help="the length of audio clip for training") + parser.add_argument('--sample_rate', type=int, + help="sampling rate of audio data file") + parser.add_argument('--fft_window_shift', type=int, + help="the shift of fft window for each frame") + parser.add_argument('--fft_window_size', type=int, + help="the size of fft window for each frame") + parser.add_argument('--fft_size', type=int, + help="the size of fft filter on each frame") + parser.add_argument('--mel_bands', type=int, + help="the number of mel bands when calculating mel spectrograms") + parser.add_argument('--mel_fmin', type=float, + help="lowest frequency in calculating mel spectrograms") + parser.add_argument('--mel_fmax', type=float, + help="highest frequency in calculating mel spectrograms") + + parser.add_argument('--seed', type=int, + help="seed of random initialization for the model") + parser.add_argument('--learning_rate', type=float) + parser.add_argument('--batch_size', type=int, + help="batch size for training") + parser.add_argument('--test_every', type=int, + help="test interval during training") + parser.add_argument('--save_every', type=int, + help="checkpointing interval during training") + parser.add_argument('--max_iterations', type=int, + help="maximum training iterations") + + parser.add_argument('--sigma', type=float, + help="standard deviation of the latent Gaussian variable") + parser.add_argument('--n_flows', type=int, + help="number of flows") + parser.add_argument('--n_group', type=int, + help="number of adjacent audio samples to squeeze into one column") + parser.add_argument('--n_layers', type=int, + help="number of conv2d layer in one wavenet-like flow architecture") + parser.add_argument('--n_channels', type=int, + help="number of residual channels in flow") + parser.add_argument('--kernel_h', type=int, + help="height of the kernel in the conv2d layer") + parser.add_argument('--kernel_w', type=int, + help="width of the kernel in the conv2d layer") + + parser.add_argument('--config', action=jsonargparse.ActionConfigFile) + + +def pad_to_size(array, length, 
pad_with=0.0): + """ + Pad an array on the first (length) axis to a given length. + """ + padding = length - array.shape[0] + assert padding >= 0, "Padding required was less than zero" + + paddings = [(0, 0)] * len(array.shape) + paddings[0] = (0, padding) + + return np.pad(array, paddings, mode='constant', constant_values=pad_with) + + +def calculate_context_size(config): + dilations = list( + itertools.islice( + itertools.cycle(config.dilation_block), config.layers)) + config.context_size = sum(dilations) + 1 + print("Context size is", config.context_size) + + +def load_latest_checkpoint(checkpoint_dir, rank=0): + checkpoint_path = os.path.join(checkpoint_dir, "checkpoint") + # Create checkpoint index file if not exist. + if (not os.path.isfile(checkpoint_path)) and rank == 0: + with open(checkpoint_path, "w") as handle: + handle.write("model_checkpoint_path: step-0") + + # Make sure that other process waits until checkpoint file is created + # by process 0. + while not os.path.isfile(checkpoint_path): + time.sleep(1) + + # Fetch the latest checkpoint index. + with open(checkpoint_path, "r") as handle: + latest_checkpoint = handle.readline().split()[-1] + iteration = int(latest_checkpoint.split("-")[-1]) + + return iteration + + +def save_latest_checkpoint(checkpoint_dir, iteration): + checkpoint_path = os.path.join(checkpoint_dir, "checkpoint") + # Update the latest checkpoint index. + with open(checkpoint_path, "w") as handle: + handle.write("model_checkpoint_path: step-{}".format(iteration)) + + +def load_parameters(checkpoint_dir, rank, model, optimizer=None, + iteration=None, file_path=None): + if file_path is None: + if iteration is None: + iteration = load_latest_checkpoint(checkpoint_dir, rank) + if iteration == 0: + return + file_path = "{}/step-{}".format(checkpoint_dir, iteration) + + model_dict, optimizer_dict = dg.load_dygraph(file_path) + model.set_dict(model_dict) + print("[checkpoint] Rank {}: loaded model from {}".format(rank, file_path)) + if optimizer and optimizer_dict: + optimizer.set_dict(optimizer_dict) + print("[checkpoint] Rank {}: loaded optimizer state from {}".format( + rank, file_path)) + + +def save_latest_parameters(checkpoint_dir, iteration, model, optimizer=None): + file_path = "{}/step-{}".format(checkpoint_dir, iteration) + model_dict = model.state_dict() + dg.save_dygraph(model_dict, file_path) + print("[checkpoint] Saved model to {}".format(file_path)) + + if optimizer: + opt_dict = optimizer.state_dict() + dg.save_dygraph(opt_dict, file_path) + print("[checkpoint] Saved optimzier state to {}".format(file_path)) diff --git a/parakeet/models/waveflow/waveflow.py b/parakeet/models/waveflow/waveflow.py new file mode 100644 index 0000000..b778497 --- /dev/null +++ b/parakeet/models/waveflow/waveflow.py @@ -0,0 +1,174 @@ +import itertools +import os +import time + +import librosa +import numpy as np +import paddle.fluid.dygraph as dg +from paddle import fluid + +import utils +from data import LJSpeech +from waveflow_modules import WaveFlowLoss, WaveFlowModule + + +class WaveFlow(): + def __init__(self, config, checkpoint_dir, parallel=False, rank=0, + nranks=1, tb_logger=None): + self.config = config + self.checkpoint_dir = checkpoint_dir + self.parallel = parallel + self.rank = rank + self.nranks = nranks + self.tb_logger = tb_logger + + def build(self, training=True): + config = self.config + dataset = LJSpeech(config, self.nranks, self.rank) + self.trainloader = dataset.trainloader + self.validloader = dataset.validloader + +# if self.rank == 0: +# for 
i, (audios, mels) in enumerate(self.validloader()): +# print("audios {}, mels {}".format(audios.dtype, mels.dtype)) +# print("{}: rank {}, audios {}, mels {}".format( +# i, self.rank, audios.shape, mels.shape)) +# +# for i, (audios, mels) in enumerate(self.trainloader): +# print("{}: rank {}, audios {}, mels {}".format( +# i, self.rank, audios.shape, mels.shape)) +# +# exit() + + waveflow = WaveFlowModule("waveflow", config) + + # Dry run once to create and initalize all necessary parameters. + audio = dg.to_variable(np.random.randn(1, 16000).astype(np.float32)) + mel = dg.to_variable( + np.random.randn(1, config.mel_bands, 63).astype(np.float32)) + waveflow(audio, mel) + + if training: + optimizer = fluid.optimizer.AdamOptimizer( + learning_rate=config.learning_rate) + + # Load parameters. + utils.load_parameters(self.checkpoint_dir, self.rank, + waveflow, optimizer, + iteration=config.iteration, + file_path=config.checkpoint) + print("Rank {}: checkpoint loaded.".format(self.rank)) + + # Data parallelism. + if self.parallel: + strategy = dg.parallel.prepare_context() + waveflow = dg.parallel.DataParallel(waveflow, strategy) + + self.waveflow = waveflow + self.optimizer = optimizer + self.criterion = WaveFlowLoss(config.sigma) + + else: + # Load parameters. + utils.load_parameters(self.checkpoint_dir, self.rank, waveflow, + iteration=config.iteration, + file_path=config.checkpoint) + print("Rank {}: checkpoint loaded.".format(self.rank)) + + self.waveflow = waveflow + + def train_step(self, iteration): + self.waveflow.train() + + start_time = time.time() + audios, mels = next(self.trainloader) + load_time = time.time() + + outputs = self.waveflow(audios, mels) + loss = self.criterion(outputs) + + if self.parallel: + # loss = loss / num_trainers + loss = self.waveflow.scale_loss(loss) + loss.backward() + self.waveflow.apply_collective_grads() + else: + loss.backward() + + current_lr = self.optimizer._learning_rate + + self.optimizer.minimize(loss, parameter_list=self.waveflow.parameters()) + self.waveflow.clear_gradients() + + graph_time = time.time() + + if self.rank == 0: + loss_val = float(loss.numpy()) * self.nranks + log = "Rank: {} Step: {:^8d} Loss: {:<8.3f} " \ + "Time: {:.3f}/{:.3f}".format( + self.rank, iteration, loss_val, + load_time - start_time, graph_time - load_time) + print(log) + + tb = self.tb_logger + tb.add_scalar("Train-Loss-Rank-0", loss_val, iteration) + tb.add_scalar("Learning-Rate", current_lr, iteration) + + @dg.no_grad + def valid_step(self, iteration): + self.waveflow.eval() + tb = self.tb_logger + + total_loss = [] + sample_audios = [] + start_time = time.time() + + for i, batch in enumerate(self.validloader()): + audios, mels = batch + valid_outputs = self.waveflow(audios, mels) + valid_z, valid_log_s_list = valid_outputs + + # Visualize latent z and scale log_s. 
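+            # Histograms are logged only from rank 0 and only for the first valid batch.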
+ if self.rank == 0 and i == 0: + tb.add_histogram("Valid-Latent_z", valid_z.numpy(), iteration) + for j, valid_log_s in enumerate(valid_log_s_list): + hist_name = "Valid-{}th-Flow-Log_s".format(j) + tb.add_histogram(hist_name, valid_log_s.numpy(), iteration) + + valid_loss = self.criterion(valid_outputs) + total_loss.append(float(valid_loss.numpy())) + + total_time = time.time() - start_time + if self.rank == 0: + loss_val = np.mean(total_loss) + log = "Test | Rank: {} AvgLoss: {:<8.3f} Time {:<8.3f}".format( + self.rank, loss_val, total_time) + print(log) + tb.add_scalar("Valid-Avg-Loss", loss_val, iteration) + + @dg.no_grad + def infer(self, iteration): + self.waveflow.eval() + + config = self.config + sample = config.sample + + output = "{}/{}/iter-{}".format(config.output, config.name, iteration) + os.makedirs(output, exist_ok=True) + + filename = "{}/valid_{}.wav".format(output, sample) + print("Synthesize sample {}, save as {}".format(sample, filename)) + + mels_list = [mels for _, mels, _ in self.validloader()] + start_time = time.time() + syn_audio = self.waveflow.synthesize(mels_list[sample]) + syn_time = time.time() - start_time + print("audio shape {}, synthesis time {}".format( + syn_audio.shape, syn_time)) + librosa.output.write_wav(filename, syn_audio, + sr=config.sample_rate) + + def save(self, iteration): + utils.save_latest_parameters(self.checkpoint_dir, iteration, + self.waveflow, self.optimizer) + utils.save_latest_checkpoint(self.checkpoint_dir, iteration) diff --git a/parakeet/models/waveflow/waveflow_modules.py b/parakeet/models/waveflow/waveflow_modules.py new file mode 100644 index 0000000..a4b9c4f --- /dev/null +++ b/parakeet/models/waveflow/waveflow_modules.py @@ -0,0 +1,256 @@ +import itertools + +import numpy as np +import paddle.fluid.dygraph as dg +from paddle import fluid +from parakeet.modules import conv, modules, weight_norm + + +def set_param_attr(layer, c_in=1): + if isinstance(layer, (weight_norm.Conv2DTranspose, weight_norm.Conv2D)): + k = np.sqrt(1.0 / (c_in * np.prod(layer._filter_size))) + weight_init = fluid.initializer.UniformInitializer(low=-k, high=k) + bias_init = fluid.initializer.UniformInitializer(low=-k, high=k) + elif isinstance(layer, dg.Conv2D): + weight_init = fluid.initializer.ConstantInitializer(0.0) + bias_init = fluid.initializer.ConstantInitializer(0.0) + else: + raise TypeError("Unsupported layer type.") + + layer._param_attr = fluid.ParamAttr(initializer=weight_init) + layer._bias_attr = fluid.ParamAttr(initializer=bias_init) + + +def unfold(x, n_group): + length = x.shape[-1] + #assert length % n_group == 0 + new_shape = x.shape[:-1] + [length // n_group, n_group] + return fluid.layers.reshape(x, new_shape) + + +class WaveFlowLoss: + def __init__(self, sigma=1.0): + self.sigma = sigma + + def __call__(self, model_output): + z, log_s_list = model_output + for i, log_s in enumerate(log_s_list): + if i == 0: + log_s_total = fluid.layers.reduce_sum(log_s) + else: + log_s_total = log_s_total + fluid.layers.reduce_sum(log_s) + + loss = fluid.layers.reduce_sum(z * z) / (2 * self.sigma * self.sigma) \ + - log_s_total + loss = loss / np.prod(z.shape) + const = 0.5 * np.log(2 * np.pi) + np.log(self.sigma) + + return loss + const + + +class Conditioner(dg.Layer): + def __init__(self, name_scope): + super(Conditioner, self).__init__(name_scope) + upsample_factors = [16, 16] + + self.upsample_conv2d = [] + for s in upsample_factors: + in_channel = 1 + conv_trans2d = modules.Conv2DTranspose( + self.full_name(), + num_filters=1, + 
filter_size=(3, 2 * s), + padding=(1, s // 2), + stride=(1, s)) + set_param_attr(conv_trans2d, c_in=in_channel) + self.upsample_conv2d.append(conv_trans2d) + + for i, layer in enumerate(self.upsample_conv2d): + self.add_sublayer("conv2d_transpose_{}".format(i), layer) + + def forward(self, x): + x = fluid.layers.unsqueeze(x, 1) + for layer in self.upsample_conv2d: + x = fluid.layers.leaky_relu(layer(x), alpha=0.4) + + return fluid.layers.squeeze(x, [1]) + + +class Flow(dg.Layer): + def __init__(self, name_scope, config): + super(Flow, self).__init__(name_scope) + self.n_layers = config.n_layers + self.n_channels = config.n_channels + self.kernel_h = config.kernel_h + self.kernel_w = config.kernel_w + + # Transform audio: [batch, 1, n_group, time/n_group] + # => [batch, n_channels, n_group, time/n_group] + self.start = weight_norm.Conv2D( + self.full_name(), + num_filters=self.n_channels, + filter_size=(1, 1)) + set_param_attr(self.start, c_in=1) + + # Initializing last layer to 0 makes the affine coupling layers + # do nothing at first. This helps with training stability + # output shape: [batch, 2, n_group, time/n_group] + self.end = dg.Conv2D( + self.full_name(), + num_filters=2, + filter_size=(1, 1)) + set_param_attr(self.end) + + # receiptive fileds: (kernel - 1) * sum(dilations) + 1 >= squeeze + dilation_dict = {8: [1, 1, 1, 1, 1, 1, 1, 1], + 16: [1, 1, 1, 1, 1, 1, 1, 1], + 32: [1, 2, 4, 1, 2, 4, 1, 2], + 64: [1, 2, 4, 8, 16, 1, 2, 4], + 128: [1, 2, 4, 8, 16, 32, 64, 1]} + self.dilation_h_list = dilation_dict[config.n_group] + + self.in_layers = [] + self.cond_layers = [] + self.res_skip_layers = [] + for i in range(self.n_layers): + dilation_h = self.dilation_h_list[i] + dilation_w = 2 ** i + + in_layer = weight_norm.Conv2D( + self.full_name(), + num_filters=2 * self.n_channels, + filter_size=(self.kernel_h, self.kernel_w), + dilation=(dilation_h, dilation_w)) + set_param_attr(in_layer, c_in=self.n_channels) + self.in_layers.append(in_layer) + + cond_layer = weight_norm.Conv2D( + self.full_name(), + num_filters=2 * self.n_channels, + filter_size=(1, 1)) + set_param_attr(cond_layer, c_in=config.mel_bands) + self.cond_layers.append(cond_layer) + + if i < self.n_layers - 1: + res_skip_channels = 2 * self.n_channels + else: + res_skip_channels = self.n_channels + res_skip_layer = weight_norm.Conv2D( + self.full_name(), + num_filters=res_skip_channels, + filter_size=(1, 1)) + set_param_attr(res_skip_layer, c_in=self.n_channels) + self.res_skip_layers.append(res_skip_layer) + + self.add_sublayer("in_layer_{}".format(i), in_layer) + self.add_sublayer("cond_layer_{}".format(i), cond_layer) + self.add_sublayer("res_skip_layer_{}".format(i), res_skip_layer) + + def forward(self, audio, mel): + # audio: [bs, 1, n_group, time/group] + # mel: [bs, mel_bands, n_group, time/n_group] + audio = self.start(audio) + + for i in range(self.n_layers): + dilation_h = self.dilation_h_list[i] + dilation_w = 2 ** i + + # Pad height dim (n_group): causal convolution + # Pad width dim (time): dialated non-causal convolution + pad_top, pad_bottom = (self.kernel_h - 1) * dilation_h, 0 + pad_left = pad_right = int((self.kernel_w-1) * dilation_w / 2) + audio_pad = fluid.layers.pad2d(audio, + paddings=[pad_top, pad_bottom, pad_left, pad_right]) + + hidden = self.in_layers[i](audio_pad) + cond_hidden = self.cond_layers[i](mel) + in_acts = hidden + cond_hidden + out_acts = fluid.layers.tanh(in_acts[:, :self.n_channels, :]) * \ + fluid.layers.sigmoid(in_acts[:, self.n_channels:, :]) + res_skip_acts = 
self.res_skip_layers[i](out_acts) + + if i < self.n_layers - 1: + audio += res_skip_acts[:, :self.n_channels, :, :] + skip_acts = res_skip_acts[:, self.n_channels:, :, :] + else: + skip_acts = res_skip_acts + + if i == 0: + output = skip_acts + else: + output += skip_acts + + return self.end(output) + + +class WaveFlowModule(dg.Layer): + def __init__(self, name_scope, config): + super(WaveFlowModule, self).__init__(name_scope) + self.n_flows = config.n_flows + self.n_group = config.n_group + assert self.n_group % 2 == 0 + + self.conditioner = Conditioner(self.full_name()) + self.flows = [] + for i in range(self.n_flows): + flow = Flow(self.full_name(), config) + self.flows.append(flow) + self.add_sublayer("flow_{}".format(i), flow) + + self.perms = [[15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0], + [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0], + [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0], + [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0], + [7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8], + [7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8], + [7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8], + [7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8]] + + def forward(self, audio, mel): + mel = self.conditioner(mel) + assert mel.shape[2] >= audio.shape[1] + # Prune out the tail of audio/mel so that time/n_group == 0. + pruned_len = audio.shape[1] // self.n_group * self.n_group + + if audio.shape[1] > pruned_len: + audio = audio[:, :pruned_len] + if mel.shape[2] > pruned_len: + mel = mel[:, :, :pruned_len] + + # From [bs, mel_bands, time] to [bs, mel_bands, n_group, time/n_group] + mel = fluid.layers.transpose(unfold(mel, self.n_group), [0, 1, 3, 2]) + # From [bs, time] to [bs, n_group, time/n_group] + audio = fluid.layers.transpose(unfold(audio, self.n_group), [0, 2, 1]) + # [bs, 1, n_group, time/n_group] + audio = fluid.layers.unsqueeze(audio, 1) + + log_s_list = [] + for i in range(self.n_flows): + inputs = audio[:, :, :-1, :] + conds = mel[:, :, 1:, :] + outputs = self.flows[i](inputs, conds) + log_s = outputs[:, :1, :, :] + b = outputs[:, 1:, :, :] + log_s_list.append(log_s) + + audio_0 = audio[:, :, :1, :] + audio_out = audio[:, :, 1:, :] * fluid.layers.exp(log_s) + b + audio = fluid.layers.concat([audio_0, audio_out], axis=2) + + # Permute over the height dim. + audio_slices = [audio[:, :, j, :] for j in self.perms[i]] + audio = fluid.layers.stack(audio_slices, axis=2) + mel_slices = [mel[:, :, j, :] for j in self.perms[i]] + mel = fluid.layers.stack(mel_slices, axis=2) + + z = fluid.layers.squeeze(audio, [1]) + + return z, log_s_list + + def synthesize(self, mels): + pass + + def start_new_sequence(self): + for layer in self.sublayers(): + if isinstance(layer, conv.Conv1D): + layer.start_new_sequence() From f6f0a2ca2129b69b64844d62a046e62e3e3bb677 Mon Sep 17 00:00:00 2001 From: Kexin Zhao Date: Thu, 12 Dec 2019 18:11:32 -0800 Subject: [PATCH 02/10] add documentation --- parakeet/models/waveflow/README.md | 67 ++--------------------- parakeet/models/waveflow/requirements.txt | 3 + 2 files changed, 7 insertions(+), 63 deletions(-) create mode 100644 parakeet/models/waveflow/requirements.txt diff --git a/parakeet/models/waveflow/README.md b/parakeet/models/waveflow/README.md index 18efd0b..355ca31 100644 --- a/parakeet/models/waveflow/README.md +++ b/parakeet/models/waveflow/README.md @@ -1,30 +1,6 @@ -# WaveNet with Paddle Fluid +### Install -Paddle fluid implementation of WaveNet, a deep generative model of raw audio waveforms. 
-WaveNet model is originally proposed in [WaveNet: A Generative Model for Raw Audio](https://arxiv.org/abs/1609.03499). -Our implementation is based on the WaveNet architecture described in [ClariNet: Parallel Wave Generation in End-to-End Text-to-Speech](https://arxiv.org/abs/1807.07281) and can provide various output distributions, including single Gaussian, mixture of Gaussian, and softmax with linearly quantized channels. - -We implement WaveNet model in paddle fluid with dynamic graph, which is convenient for flexible network architectures. - -## Project Structure -```text -├── configs # yaml configuration files of preset model hyperparameters -├── data.py # dataset and dataloader settings for LJSpeech -├── slurm.py # optional slurm helper functions if you use slurm to train model -├── synthesis.py # script for speech synthesis -├── train.py # script for model training -├── utils.py # helper functions for e.g., model checkpointing -├── wavenet.py # WaveNet model high level APIs -└── wavenet_modules.py # WaveNet model implementation -``` - -## Usage - -There are many hyperparameters to be tuned depending on the specification of model and dataset you are working on. Hyperparameters that are known to work good for the LJSpeech dataset are provided as yaml files in `./configs/` folder. Specifically, we provide `wavenet_ljspeech_single_gaussian.yaml`, `wavenet_ljspeech_mix_gaussian.yaml`, and `wavenet_ljspeech_softmax.yaml` config files for WaveNet with single Gaussian, 10-component mixture of Gaussians, and softmax (with 2048 linearly quantized channels) output distributions, respectively. - -Note that `train.py` and `synthesis.py` all accept a `--config` parameter. To ensure consistency, you should use the same config yaml file for both training and synthesizing. You can also overwrite these preset hyperparameters with command line by updating parameters after `--config`. For example `--config=${yaml} --batch_size=8 --layers=20` can overwrite the corresponding hyperparameters in the `${yaml}` config file. For more details about these hyperparameters, check `utils.add_config_options_to_parser`. - -Note that you also need to specify some additional parameters for `train.py` and `synthesis.py`, and the details can be found in `train.add_options_to_parser` and `synthesis.add_options_to_parser`, respectively. +pip install -r requirements.txt ### Dataset @@ -48,50 +24,15 @@ python -u train.py --config=${yaml} \ --parallel=false --use_gpu=true ``` -#### Save and Load checkpoints - -Our model will save model parameters as checkpoints in `./runs/wavenet/${ModelName}/checkpoint/` every 10000 iterations by default. -The saved checkpoint will have the format of `step-${iteration_number}.pdparams` for model parameters and `step-${iteration_number}.pdopt` for optimizer parameters. - -There are three ways to load a checkpoint and resume training (take an example that you want to load a 500000-iteration checkpoint): -1. Use `--checkpoint=./runs/wavenet/${ModelName}/checkpoint/step-500000` to provide a specific path to load. Note that you only need to provide the base name of the parameter file, which is `step-500000`, no extension name `.pdparams` or `.pdopt` is needed. -2. Use `--iteration=500000`. -3. If you don't specify either `--checkpoint` or `--iteration`, the model will automatically load the latest checkpoint in `./runs/wavenet/${ModelName}/checkpoint`. - ### Train on multiple GPUs ```bash export PYTHONPATH="${PYTHONPATH}:${PWD}/../../.." 
export CUDA_VISIBLE_DEVICES=0,1,2,3 python -u -m paddle.distributed.launch train.py \ - --config=${yaml} \ + --config=./configs/waveflow_ljspeech_sqz16_r64_layer8x8.yaml \ --root=./data/LJSpeech-1.1 \ - --name=${ModelName} --parallel=true --use_gpu=true + --name=test_speed --parallel=true --use_gpu=true ``` Use `export CUDA_VISIBLE_DEVICES=0,1,2,3` to set the GPUs that you want to use to be visible. Then the `paddle.distributed.launch` module will use these visible GPUs to do data parallel training in multiprocessing mode. - -### Monitor with Tensorboard - -By default, the logs are saved in `./runs/wavenet/${ModelName}/logs/`. You can monitor logs by tensorboard. - -```bash -tensorboard --logdir=${log_dir} --port=8888 -``` - -### Synthesize from a checkpoint - -Check the [Save and load checkpoint](#save-and-load-checkpoints) section on how to load a specific checkpoint. -The following example will automatically load the latest checkpoint: - -```bash -export PYTHONPATH="${PYTHONPATH}:${PWD}/../../.." -export CUDA_VISIBLE_DEVICES=0 -python -u synthesis.py --config=${yaml} \ - --root=./data/LJSpeech-1.1 \ - --name=${ModelName} --use_gpu=true \ - --output=./syn_audios \ - --sample=${SAMPLE} -``` - -In this example, `--output` specifies where to save the synthesized audios and `--sample` specifies which sample in the valid dataset (a split from the whole LJSpeech dataset, by default contains the first 16 audio samples) to synthesize based on the mel-spectrograms computed from the ground truth sample audio, e.g., `--sample=0` means to synthesize the first audio in the valid dataset. diff --git a/parakeet/models/waveflow/requirements.txt b/parakeet/models/waveflow/requirements.txt new file mode 100644 index 0000000..f575339 --- /dev/null +++ b/parakeet/models/waveflow/requirements.txt @@ -0,0 +1,3 @@ +paddlepaddle-gpu==1.6.1.post97 +tensorboardX==1.9 +librosa==0.7.1 From 8a9bbc2634637ded7461bf111d4b2c067d9a4a1b Mon Sep 17 00:00:00 2001 From: lifuchen Date: Mon, 16 Dec 2019 09:04:22 +0000 Subject: [PATCH 03/10] add_TransformerTTS --- parakeet/data/batch.py | 2 +- .../transformerTTS/config/synthesis.yaml | 20 + .../transformerTTS/config/train_postnet.yaml | 27 + .../config/train_transformer.yaml | 32 ++ parakeet/models/transformerTTS/layers.py | 170 ++++++ parakeet/models/transformerTTS/module.py | 525 ++++++++++++++++++ parakeet/models/transformerTTS/network.py | 207 +++++++ parakeet/models/transformerTTS/parse.py | 63 +++ parakeet/models/transformerTTS/preprocess.py | 137 +++++ parakeet/models/transformerTTS/synthesis.py | 67 +++ .../models/transformerTTS/train_postnet.py | 135 +++++ .../transformerTTS/train_transformer.py | 166 ++++++ parakeet/models/transformerTTS/utils.py | 42 ++ tests/test_ljspeech.py | 2 +- 14 files changed, 1593 insertions(+), 2 deletions(-) create mode 100644 parakeet/models/transformerTTS/config/synthesis.yaml create mode 100644 parakeet/models/transformerTTS/config/train_postnet.yaml create mode 100644 parakeet/models/transformerTTS/config/train_transformer.yaml create mode 100644 parakeet/models/transformerTTS/layers.py create mode 100644 parakeet/models/transformerTTS/module.py create mode 100644 parakeet/models/transformerTTS/network.py create mode 100644 parakeet/models/transformerTTS/parse.py create mode 100644 parakeet/models/transformerTTS/preprocess.py create mode 100644 parakeet/models/transformerTTS/synthesis.py create mode 100644 parakeet/models/transformerTTS/train_postnet.py create mode 100644 parakeet/models/transformerTTS/train_transformer.py create mode 
100644 parakeet/models/transformerTTS/utils.py diff --git a/parakeet/data/batch.py b/parakeet/data/batch.py index 9303b46..8777472 100644 --- a/parakeet/data/batch.py +++ b/parakeet/data/batch.py @@ -88,7 +88,7 @@ def batch_spec(minibatch, pad_value=0., dtype=np.float32): mono_channel = False lengths = [example.shape[-1] for example in minibatch] # assume (channel, F, n_frame) or (F, n_frame) - max_len = np.max(lengths) + max_len = np.max(lengths) batch = [] for example in minibatch: diff --git a/parakeet/models/transformerTTS/config/synthesis.yaml b/parakeet/models/transformerTTS/config/synthesis.yaml new file mode 100644 index 0000000..c3c3f8c --- /dev/null +++ b/parakeet/models/transformerTTS/config/synthesis.yaml @@ -0,0 +1,20 @@ +audio: + num_mels: 80 + n_fft: 2048 + sr: 22050 + preemphasis: 0.97 + hop_length: 275 + win_length: 1102 + power: 1.2 + min_level_db: -100 + ref_level_db: 20 + outputs_per_step: 1 + +max_len: 50 +transformer_step: 1 +postnet_step: 1 +use_gpu: True + +checkpoint_path: ./checkpoint +log_dir: ./log +sample_path: ./sample \ No newline at end of file diff --git a/parakeet/models/transformerTTS/config/train_postnet.yaml b/parakeet/models/transformerTTS/config/train_postnet.yaml new file mode 100644 index 0000000..90ac94e --- /dev/null +++ b/parakeet/models/transformerTTS/config/train_postnet.yaml @@ -0,0 +1,27 @@ +audio: + num_mels: 80 + n_fft: 2048 + sr: 22050 + preemphasis: 0.97 + hop_length: 275 + win_length: 1102 + power: 1.2 + min_level_db: -100 + ref_level_db: 20 + outputs_per_step: 1 + +network: + hidden_size: 256 + embedding_size: 512 + + +batch_size: 32 +epochs: 10000 +lr: 0.001 +save_step: 500 +use_gpu: True +use_data_parallel: False + +data_path: ../../../dataset/LJSpeech-1.1 +save_path: ./checkpoint +log_dir: ./log \ No newline at end of file diff --git a/parakeet/models/transformerTTS/config/train_transformer.yaml b/parakeet/models/transformerTTS/config/train_transformer.yaml new file mode 100644 index 0000000..17db190 --- /dev/null +++ b/parakeet/models/transformerTTS/config/train_transformer.yaml @@ -0,0 +1,32 @@ +audio: + num_mels: 80 + n_fft: 2048 + sr: 22050 + preemphasis: 0.97 + hop_length: 275 + win_length: 1102 + power: 1.2 + min_level_db: -100 + ref_level_db: 20 + outputs_per_step: 1 + +network: + hidden_size: 256 + embedding_size: 512 + + +batch_size: 32 +epochs: 10000 +lr: 0.001 +save_step: 500 +image_step: 2000 +use_gpu: True +use_data_parallel: False + +data_path: ../../../dataset/LJSpeech-1.1 +save_path: ./checkpoint +log_dir: ./log + + + + \ No newline at end of file diff --git a/parakeet/models/transformerTTS/layers.py b/parakeet/models/transformerTTS/layers.py new file mode 100644 index 0000000..88f110f --- /dev/null +++ b/parakeet/models/transformerTTS/layers.py @@ -0,0 +1,170 @@ +import math +import numpy as np + +import paddle +from paddle import fluid +import paddle.fluid.dygraph as dg + + +class Conv1D(dg.Layer): + """ + A convolution 1D block implemented with Conv2D. Form simplicity and + ensuring the output has the same length as the input, it does not allow + stride > 1. 
+ """ + + def __init__(self, + name_scope, + in_channels, + num_filters, + filter_size=3, + padding=0, + dilation=1, + stride=1, + groups=None, + param_attr=None, + bias_attr=None, + use_cudnn=True, + act=None, + data_format='NCT', + dtype="float32"): + super(Conv1D, self).__init__(name_scope, dtype=dtype) + + self.padding = padding + self.in_channels = in_channels + self.num_filters = num_filters + self.filter_size = filter_size + self.stride = stride + self.dilation = dilation + self.padding = padding + self.act = act + self.data_format = data_format + + self.conv = dg.Conv2D( + self.full_name(), + num_filters=num_filters, + filter_size=(1, filter_size), + stride=(1, stride), + dilation=(1, dilation), + padding=(0, padding), + groups=groups, + param_attr=param_attr, + bias_attr=bias_attr, + use_cudnn=use_cudnn, + act=act, + dtype=dtype) + + def forward(self, x): + """ + Args: + x (Variable): Shape(B, C_in, 1, T), the input, where C_in means + input channels. + Returns: + x (Variable): Shape(B, C_out, 1, T), the outputs, where C_out means + output channels (num_filters). + """ + if self.data_format == 'NTC': + x = fluid.layers.transpose(x, [0, 2, 1]) + x = fluid.layers.unsqueeze(x, [2]) + x = self.conv(x) + x = fluid.layers.squeeze(x, [2]) + if self.data_format == 'NTC': + x = fluid.layers.transpose(x, [0, 2, 1]) + return x + +class Pool1D(dg.Layer): + """ + A Pool 1D block implemented with Pool2D. + """ + def __init__(self, + name_scope, + pool_size=-1, + pool_type='max', + pool_stride=1, + pool_padding=0, + global_pooling=False, + use_cudnn=True, + ceil_mode=False, + exclusive=True, + data_format='NCT', + dtype='float32'): + super(Pool1D, self).__init__(name_scope, dtype=dtype) + self.pool_size = pool_size + self.pool_type = pool_type + self.pool_stride = pool_stride + self.pool_padding = pool_padding + self.global_pooling = global_pooling + self.use_cudnn = use_cudnn + self.ceil_mode = ceil_mode + self.exclusive = exclusive + self.data_format = data_format + self.dtype = dtype + + + self.pool2d = dg.Pool2D(self.full_name(), [1,pool_size], pool_type = pool_type, + pool_stride = [1,pool_stride], pool_padding = [0, pool_padding], + global_pooling = global_pooling, use_cudnn = use_cudnn, + ceil_mode = ceil_mode, exclusive = exclusive, dtype = dtype) + + + def forward(self, x): + """ + Args: + x (Variable): Shape(B, C_in, 1, T), the input, where C_in means + input channels. + Returns: + x (Variable): Shape(B, C_out, 1, T), the outputs, where C_out means + output channels (num_filters). 
+ """ + if self.data_format == 'NTC': + x = fluid.layers.transpose(x, [0, 2, 1]) + x = fluid.layers.unsqueeze(x, [2]) + x = self.pool2d(x) + x = fluid.layers.squeeze(x, [2]) + if self.data_format == 'NTC': + x = fluid.layers.transpose(x, [0, 2, 1]) + return x + +class DynamicGRU(dg.Layer): + def __init__(self, + scope_name, + size, + param_attr=None, + bias_attr=None, + is_reverse=False, + gate_activation='sigmoid', + candidate_activation='tanh', + h_0=None, + origin_mode=False, + init_size=None): + super(DynamicGRU, self).__init__(scope_name) + self.gru_unit = dg.GRUUnit( + self.full_name(), + size * 3, + param_attr=param_attr, + bias_attr=bias_attr, + activation=candidate_activation, + gate_activation=gate_activation, + origin_mode=origin_mode) + self.size = size + self.h_0 = h_0 + self.is_reverse = is_reverse + + def forward(self, inputs): + hidden = self.h_0 + res = [] + for i in range(inputs.shape[1]): + if self.is_reverse: + i = inputs.shape[1] - 1 - i + input_ = inputs[:, i:i + 1, :] + input_ = fluid.layers.reshape( + input_, [-1, input_.shape[2]], inplace=False) + hidden, reset, gate = self.gru_unit(input_, hidden) + hidden_ = fluid.layers.reshape( + hidden, [-1, 1, hidden.shape[1]], inplace=False) + res.append(hidden_) + if self.is_reverse: + res = res[::-1] + res = fluid.layers.concat(res, axis=1) + return res + diff --git a/parakeet/models/transformerTTS/module.py b/parakeet/models/transformerTTS/module.py new file mode 100644 index 0000000..76bdffb --- /dev/null +++ b/parakeet/models/transformerTTS/module.py @@ -0,0 +1,525 @@ +import math +from parakeet.g2p.text.symbols import symbols +import paddle.fluid.dygraph as dg +import paddle.fluid as fluid +import paddle.fluid.layers as layers +from layers import Conv1D, Pool1D, DynamicGRU +import numpy as np + +class FC(dg.Layer): + def __init__(self, name_scope, in_features, out_features, is_bias=True, dtype="float32", gain=1): + super(FC, self).__init__(name_scope) + self.in_features = in_features + self.out_features = out_features + self.is_bias = is_bias + self.dtype = dtype + self.gain = gain + + self.weight = self.create_parameter(fluid.ParamAttr(name='weight'), shape=(in_features, out_features), + dtype=dtype, + default_initializer = fluid.initializer.XavierInitializer()) + #self.weight = gain * self.weight + # mind the implicit conversion to ParamAttr for many cases + if is_bias is not False: + k = math.sqrt(1 / in_features) + self.bias = self.create_parameter(fluid.ParamAttr(name='bias'), shape=(out_features, ), + is_bias=True, + dtype=dtype, + default_initializer = fluid.initializer.Uniform(low=-k, high=k)) + + # 默认初始化权重使用 Xavier 的方法,偏置使用均匀分布,范围是(-\sqrt{k},/sqrt{k}),k=1/infeature + + def forward(self, x): + x = fluid.layers.matmul(x, self.weight) + if hasattr(self, "bias"): + x = fluid.layers.elementwise_add(x, self.bias) + return x + +class Conv(dg.Layer): + def __init__(self, name_scope, in_channels, out_channels, filter_size=1, + padding=0, dilation=1, stride=1, use_cudnn=True, + data_format="NCT", is_bias=True, gain=1): + super(Conv, self).__init__(name_scope) + self.in_channels = in_channels + self.out_channels = out_channels + self.filter_size = filter_size + self.padding = padding + self.dilation = dilation + self.stride = stride + self.use_cudnn = use_cudnn + self.data_format = data_format + self.is_bias = is_bias + self.gain = gain + + self.weight_attr = fluid.ParamAttr(name='weight', initializer=fluid.initializer.XavierInitializer()) + self.bias_attr = None + if is_bias is not False: + k = math.sqrt(1 / 
in_channels) + self.bias_attr = fluid.ParamAttr(name='bias', initializer=fluid.initializer.Uniform(low=-k, high=k)) + + self.conv = Conv1D( self.full_name(), + in_channels = in_channels, + num_filters = out_channels, + filter_size = filter_size, + padding = padding, + dilation = dilation, + stride = stride, + param_attr = self.weight_attr, + bias_attr = self.bias_attr, + use_cudnn = use_cudnn, + data_format = data_format) + + def forward(self, x): + x = self.conv(x) + return x + +class EncoderPrenet(dg.Layer): + def __init__(self, name_scope, embedding_size, num_hidden, use_cudnn=True): + super(EncoderPrenet, self).__init__(name_scope) + self.embedding_size = embedding_size + self.num_hidden = num_hidden + self.use_cudnn = use_cudnn + self.embedding = dg.Embedding(self.full_name(), + size = [len(symbols), embedding_size], + param_attr = fluid.ParamAttr(name='weight'), + padding_idx = None) + self.conv1 = Conv(self.full_name(), + in_channels = embedding_size, + out_channels = num_hidden, + filter_size = 5, + padding = int(np.floor(5/2)), + use_cudnn = use_cudnn, + data_format = "NCT", + gain = math.sqrt(2)) + self.conv2 = Conv(self.full_name(), + in_channels = num_hidden, + out_channels = num_hidden, + filter_size = 5, + padding = int(np.floor(5/2)), + use_cudnn = use_cudnn, + data_format = "NCT", + gain = math.sqrt(2)) + self.conv3 = Conv(self.full_name(), + in_channels = num_hidden, + out_channels = num_hidden, + filter_size = 5, + padding = int(np.floor(5/2)), + use_cudnn = use_cudnn, + data_format = "NCT", + gain = math.sqrt(2)) + + self.batch_norm1 = dg.BatchNorm(self.full_name(), num_hidden, + param_attr = fluid.ParamAttr(name='weight'), + bias_attr = fluid.ParamAttr(name='bias'), + moving_mean_name = 'moving_mean', + moving_variance_name = 'moving_var', + data_layout='NCHW') + self.batch_norm2 = dg.BatchNorm(self.full_name(), num_hidden, + param_attr = fluid.ParamAttr(name='weight'), + bias_attr = fluid.ParamAttr(name='bias'), + moving_mean_name = 'moving_mean', + moving_variance_name = 'moving_var', + data_layout='NCHW') + self.batch_norm3 = dg.BatchNorm(self.full_name(), num_hidden, + param_attr = fluid.ParamAttr(name='weight'), + bias_attr = fluid.ParamAttr(name='bias'), + moving_mean_name = 'moving_mean', + moving_variance_name = 'moving_var', + data_layout='NCHW') + + self.projection = FC(self.full_name(), num_hidden, num_hidden) + + def forward(self, x): + x = self.embedding(fluid.layers.unsqueeze(x, axes=[-1])) #(batch_size, seq_len, embending_size) + x = layers.transpose(x,[0,2,1]) + x = layers.dropout(layers.relu(self.batch_norm1(self.conv1(x))), 0.2) + x = layers.dropout(layers.relu(self.batch_norm2(self.conv2(x))), 0.2) + x = layers.dropout(layers.relu(self.batch_norm3(self.conv3(x))), 0.2) + x = layers.transpose(x,[0,2,1]) #(N,T,C) + x = self.projection(x) + return x + +class FFN(dg.Layer): + def __init__(self, name_scope, num_hidden, use_cudnn=True): + super(FFN, self).__init__(name_scope) + self.num_hidden = num_hidden + self.use_cudnn = use_cudnn + self.w_1 = Conv(self.full_name(), + in_channels = num_hidden, + out_channels = num_hidden * 4, + filter_size = 1, + use_cudnn = use_cudnn, + data_format = "NCT", + gain = math.sqrt(2)) + self.w_2 = Conv(self.full_name(), + in_channels = num_hidden * 4, + out_channels = num_hidden, + filter_size = 1, + use_cudnn = use_cudnn, + data_format = "NCT", + gain = math.sqrt(2)) + self.layer_norm = dg.LayerNorm(self.full_name(), begin_norm_axis=2) + + def forward(self, input): + #FFN Networt + x = layers.transpose(input, [0,2,1]) + x 
= self.w_2(layers.relu(self.w_1(x))) + x = layers.transpose(x,[0,2,1]) + + # dropout + # x = layers.dropout(x, 0.1) + # not sure where dropout should be placed, in paper should before residual, + # but the diagonal alignment did not appear correctly in the attention plot. + + # residual connection + x = x + input + + + #layer normalization + x = self.layer_norm(x) + + return x + +class DecoderPrenet(dg.Layer): + def __init__(self, name_scope, input_size, hidden_size, output_size, dropout_rate=0.5): + super(DecoderPrenet, self).__init__(name_scope) + self.input_size = input_size + self.hidden_size = hidden_size + self.output_size = output_size + self.dropout_rate = dropout_rate + + self.fc1 = FC(self.full_name(), input_size, hidden_size) #in pytorch this gian=1 + self.fc2 = FC(self.full_name(), hidden_size, output_size) + + def forward(self, x): + x = layers.dropout(layers.relu(self.fc1(x)), self.dropout_rate) + x = layers.dropout(layers.relu(self.fc2(x)), self.dropout_rate) + return x + +class ScaledDotProductAttention(dg.Layer): + def __init__(self, name_scope, d_key): + super(ScaledDotProductAttention, self).__init__(name_scope) + + self.d_key = d_key + + # please attention this mask is diff from pytorch + def forward(self, key, value, query, mask=None, query_mask=None): + # Compute attention score + attention = layers.matmul(query, key, transpose_y=True) #transpose the last dim in y + attention = attention / math.sqrt(self.d_key) + + # Mask key to ignore padding + if mask is not None: + attention = attention * mask + mask = (mask == 0).astype(float) * (-2 ** 32 + 1) + attention = attention + mask + + attention = layers.softmax(attention) + # Mask query to ignore padding + # Not sure how to work + if query_mask is not None: + attention = attention * query_mask + + result = layers.matmul(attention, value) + return result, attention + +class MultiheadAttention(dg.Layer): + def __init__(self, name_scope, num_hidden, num_head=4): + super(MultiheadAttention, self).__init__(name_scope) + self.num_hidden = num_hidden + self.num_hidden_per_attn = num_hidden // num_head + self.num_head = num_head + + self.key = FC(self.full_name(), num_hidden, num_hidden, is_bias=False) + self.value = FC(self.full_name(), num_hidden, num_hidden, is_bias=False) + self.query = FC(self.full_name(), num_hidden, num_hidden, is_bias=False) + + self.scal_attn = ScaledDotProductAttention(self.full_name(), self.num_hidden_per_attn) + + self.fc = FC(self.full_name(), num_hidden * 2, num_hidden) + + self.layer_norm = dg.LayerNorm(self.full_name(), begin_norm_axis=2) + + def forward(self, key, value, query_input, mask=None, query_mask=None): + batch_size = key.shape[0] + seq_len_key = key.shape[1] + seq_len_query = query_input.shape[1] + + # repeat masks h times + if query_mask is not None: + query_mask = layers.unsqueeze(query_mask, axes=[-1]) + query_mask = layers.expand(query_mask, [self.num_head, 1, seq_len_key]) + if mask is not None: + mask = layers.expand(mask, (self.num_head, 1, 1)) + + # Make multihead attention + # key & value.shape = (batch_size, seq_len, feature)(feature = num_head * num_hidden_per_attn) + key = layers.reshape(self.key(key), [batch_size, seq_len_key, self.num_head, self.num_hidden_per_attn]) + value = layers.reshape(self.value(value), [batch_size, seq_len_key, self.num_head, self.num_hidden_per_attn]) + query = layers.reshape(self.query(query_input), [batch_size, seq_len_query, self.num_head, self.num_hidden_per_attn]) + + key = layers.reshape(layers.transpose(key, [2, 0, 1, 3]), [-1, 
seq_len_key, self.num_hidden_per_attn]) + value = layers.reshape(layers.transpose(value, [2, 0, 1, 3]), [-1, seq_len_key, self.num_hidden_per_attn]) + query = layers.reshape(layers.transpose(query, [2, 0, 1, 3]), [-1, seq_len_query, self.num_hidden_per_attn]) + + result, attention = self.scal_attn(key, value, query, mask=mask, query_mask=query_mask) + + # concat all multihead result + result = layers.reshape(result, [self.num_head, batch_size, seq_len_query, self.num_hidden_per_attn]) + result = layers.reshape(layers.transpose(result, [1,2,0,3]),[batch_size, seq_len_query, -1]) + #print(result.().shape) + # concat result with input + result = layers.concat([query_input, result], axis=-1) + + result = self.fc(result) + result = result + query_input + + result = self.layer_norm(result) + return result, attention + +class PostConvNet(dg.Layer): + def __init__(self, name_scope, config): + super(PostConvNet, self).__init__(name_scope) + + num_hidden = config.network.hidden_size + self.num_hidden = num_hidden + self.conv1 = Conv(self.full_name(), + in_channels = config.audio.num_mels * config.audio.outputs_per_step, + out_channels = num_hidden, + filter_size = 5, + padding = 4, + use_cudnn = config.use_gpu, + data_format = "NCT", + gain = 5 / 3) + self.conv_list = [Conv(self.full_name(), + in_channels = num_hidden, + out_channels = num_hidden, + filter_size = 5, + padding = 4, + use_cudnn = config.use_gpu, + data_format = "NCT", + gain = 5 / 3) for _ in range(3)] + for i, layer in enumerate(self.conv_list): + self.add_sublayer("conv_list_{}".format(i), layer) + self.conv5 = Conv(self.full_name(), + in_channels = num_hidden, + out_channels = config.audio.num_mels * config.audio.outputs_per_step, + filter_size = 5, + padding = 4, + use_cudnn = config.use_gpu, + data_format = "NCT") + + self.batch_norm_list = [dg.BatchNorm(self.full_name(), num_hidden, + param_attr = fluid.ParamAttr(name='weight'), + bias_attr = fluid.ParamAttr(name='bias'), + moving_mean_name = 'moving_mean', + moving_variance_name = 'moving_var', + data_layout='NCHW') for _ in range(3)] + for i, layer in enumerate(self.batch_norm_list): + self.add_sublayer("batch_norm_list_{}".format(i), layer) + self.batch_norm1 = dg.BatchNorm(self.full_name(), num_hidden, + param_attr = fluid.ParamAttr(name='weight'), + bias_attr = fluid.ParamAttr(name='bias'), + moving_mean_name = 'moving_mean', + moving_variance_name = 'moving_var', + data_layout='NCHW') + + def forward(self, input): + input = layers.dropout(layers.tanh(self.batch_norm1(self.conv1(input)[:, :, :-4])),0.1) + for batch_norm, conv in zip(self.batch_norm_list, self.conv_list): + input = layers.dropout(layers.tanh(batch_norm(conv(input)[:, :, :-4])),0.1) + input = self.conv5(input)[:, :, :-4] + return input + +class CBHG(dg.Layer): + def __init__(self, name_scope, config, K=16, projection_size = 256, num_gru_layers=2, + max_pool_kernel_size=2, is_post=False): + super(CBHG, self).__init__(name_scope) + """ + :param hidden_size: dimension of hidden unit + :param K: # of convolution banks + :param projection_size: dimension of projection unit + :param num_gru_layers: # of layers of GRUcell + :param max_pool_kernel_size: max pooling kernel size + :param is_post: whether post processing or not + """ + hidden_size = config.network.hidden_size + self.hidden_size = hidden_size + self.projection_size = projection_size + self.conv_list = [] + self.conv_list.append(Conv(self.full_name(), + in_channels = projection_size, + out_channels = hidden_size, + filter_size = 1, + padding = 
int(np.floor(1/2)), + data_format = "NCT")) + for i in range(2,K+1): + self.conv_list.append(Conv(self.full_name(), + in_channels = hidden_size, + out_channels = hidden_size, + filter_size = i, + padding = int(np.floor(i/2)), + data_format = "NCT")) + + for i, layer in enumerate(self.conv_list): + self.add_sublayer("conv_list_{}".format(i), layer) + + self.batchnorm_list = [] + for i in range(K): + self.batchnorm_list.append(dg.BatchNorm(self.full_name(), hidden_size, + param_attr = fluid.ParamAttr(name='weight'), + bias_attr = fluid.ParamAttr(name='bias'), + moving_mean_name = 'moving_mean', + moving_variance_name = 'moving_var', + data_layout='NCHW')) + + for i, layer in enumerate(self.batchnorm_list): + self.add_sublayer("batchnorm_list_{}".format(i), layer) + + conv_outdim = hidden_size * K + + self.conv_projection_1 = Conv(self.full_name(), + in_channels = conv_outdim, + out_channels = hidden_size, + filter_size = 3, + padding = int(np.floor(3/2)), + data_format = "NCT") + + self.conv_projection_2 = Conv(self.full_name(), + in_channels = hidden_size, + out_channels = projection_size, + filter_size = 3, + padding = int(np.floor(3/2)), + data_format = "NCT") + + self.batchnorm_proj_1 = dg.BatchNorm(self.full_name(), hidden_size, + param_attr = fluid.ParamAttr(name='weight'), + bias_attr = fluid.ParamAttr(name='bias'), + moving_mean_name = 'moving_mean', + moving_variance_name = 'moving_var', + data_layout='NCHW') + self.batchnorm_proj_2 = dg.BatchNorm(self.full_name(), projection_size, + param_attr = fluid.ParamAttr(name='weight'), + bias_attr = fluid.ParamAttr(name='bias'), + moving_mean_name = 'moving_mean', + moving_variance_name = 'moving_var', + data_layout='NCHW') + self.max_pool = Pool1D(self.full_name(), pool_size = max_pool_kernel_size, + pool_type='max', + pool_stride=1, + pool_padding=1, + data_format = "NCT") + self.highway = Highwaynet(self.full_name(), self.projection_size) + + h_0 = np.zeros((config.batch_size, hidden_size // 2), dtype="float32") + h_0 = dg.to_variable(h_0) + self.fc_forward1 = FC(self.full_name(), hidden_size, hidden_size // 2 * 3) + self.fc_reverse1 = FC(self.full_name(), hidden_size, hidden_size // 2 * 3) + self.gru_forward1 = DynamicGRU(self.full_name(), + size = self.hidden_size // 2, + param_attr = fluid.ParamAttr(name='weight'), + bias_attr = fluid.ParamAttr(name='bias'), + is_reverse = False, + origin_mode = True, + h_0 = h_0) + self.gru_reverse1 = DynamicGRU(self.full_name(), + size = self.hidden_size // 2, + param_attr = fluid.ParamAttr(name='weight'), + bias_attr = fluid.ParamAttr(name='bias'), + is_reverse=True, + origin_mode=True, + h_0 = h_0) + + self.fc_forward2 = FC(self.full_name(), hidden_size, hidden_size // 2 * 3) + self.fc_reverse2 = FC(self.full_name(), hidden_size, hidden_size // 2 * 3) + self.gru_forward2 = DynamicGRU(self.full_name(), + size = self.hidden_size // 2, + param_attr = fluid.ParamAttr(name='weight'), + bias_attr = fluid.ParamAttr(name='bias'), + is_reverse = False, + origin_mode = True, + h_0 = h_0) + self.gru_reverse2 = DynamicGRU(self.full_name(), + size = self.hidden_size // 2, + param_attr = fluid.ParamAttr(name='weight'), + bias_attr = fluid.ParamAttr(name='bias'), + is_reverse=True, + origin_mode=True, + h_0 = h_0) + + def _conv_fit_dim(self, x, filter_size=3): + if filter_size % 2 == 0: + return x[:,:,:-1] + else: + return x + + def forward(self, input_): + # input_.shape = [N, C, T] + + conv_list = [] + conv_input = input_ + + for i, (conv, batchnorm) in enumerate(zip(self.conv_list, self.batchnorm_list)): + 
conv_input = self._conv_fit_dim(conv(conv_input), i+1) + conv_input = layers.relu(batchnorm(conv_input)) + conv_list.append(conv_input) + + conv_cat = layers.concat(conv_list, axis=1) + conv_pool = self.max_pool(conv_cat)[:,:,:-1] + + + conv_proj = layers.relu(self.batchnorm_proj_1(self._conv_fit_dim(self.conv_projection_1(conv_pool)))) + conv_proj = self.batchnorm_proj_2(self._conv_fit_dim(self.conv_projection_2(conv_proj))) + input_ + + # conv_proj.shape = [N, C, T] + highway = layers.transpose(conv_proj, [0,2,1]) + highway = self.highway(highway) + + # highway.shape = [N, T, C] + fc_forward = self.fc_forward1(highway) + fc_reverse = self.fc_reverse1(highway) + out_forward = self.gru_forward1(fc_forward) + out_reverse = self.gru_reverse1(fc_reverse) + out = layers.concat([out_forward, out_reverse], axis=-1) + fc_forward = self.fc_forward2(out) + fc_reverse = self.fc_reverse2(out) + out_forward = self.gru_forward2(fc_forward) + out_reverse = self.gru_reverse2(fc_reverse) + out = layers.concat([out_forward, out_reverse], axis=-1) + out = layers.transpose(out, [0,2,1]) + return out + +class Highwaynet(dg.Layer): + def __init__(self, name_scope, num_units, num_layers=4): + super(Highwaynet, self).__init__(name_scope) + self.num_units = num_units + self.num_layers = num_layers + + self.gates = [] + self.linears = [] + + for i in range(num_layers): + self.linears.append(FC(self.full_name(), num_units, num_units)) + self.gates.append(FC(self.full_name(), num_units, num_units)) + + for i, (linear, gate) in enumerate(zip(self.linears,self.gates)): + self.add_sublayer("linears_{}".format(i), linear) + self.add_sublayer("gates_{}".format(i), gate) + + def forward(self, input_): + out = input_ + + for linear, gate in zip(self.linears, self.gates): + h = fluid.layers.relu(linear(out)) + t_ = fluid.layers.sigmoid(gate(out)) + + c = 1 - t_ + out = h * t_ + out * c + + return out + + + + + + diff --git a/parakeet/models/transformerTTS/network.py b/parakeet/models/transformerTTS/network.py new file mode 100644 index 0000000..ff25ad2 --- /dev/null +++ b/parakeet/models/transformerTTS/network.py @@ -0,0 +1,207 @@ +from module import * +from utils import get_positional_table, get_sinusoid_encoding_table +import paddle.fluid.dygraph as dg +import paddle.fluid as fluid + +class Encoder(dg.Layer): + def __init__(self, name_scope, embedding_size, num_hidden, config): + super(Encoder, self).__init__(name_scope) + self.num_hidden = num_hidden + param = fluid.ParamAttr(name='alpha') + self.alpha = self.create_parameter(param, shape=(1, ), dtype='float32', + default_initializer = fluid.initializer.ConstantInitializer(value=1.0)) + self.pos_inp = get_sinusoid_encoding_table(1024, self.num_hidden, padding_idx=0) + self.pos_emb = dg.Embedding(name_scope=self.full_name(), + size=[1024, num_hidden], + padding_idx=0, + param_attr=fluid.ParamAttr( + name='weight', + initializer=fluid.initializer.NumpyArrayInitializer(self.pos_inp), + trainable=False)) + self.encoder_prenet = EncoderPrenet(name_scope = self.full_name(), + embedding_size = embedding_size, + num_hidden = num_hidden, + use_cudnn=config.use_gpu) + self.layers = [MultiheadAttention(self.full_name(), num_hidden) for _ in range(3)] + for i, layer in enumerate(self.layers): + self.add_sublayer("self_attn_{}".format(i), layer) + self.ffns = [FFN(self.full_name(), num_hidden, use_cudnn = config.use_gpu) for _ in range(3)] + for i, layer in enumerate(self.ffns): + self.add_sublayer("ffns_{}".format(i), layer) + + def forward(self, x, positional): + if 
fluid.framework._dygraph_tracer()._train_mode: + query_mask = (positional != 0).astype(float) + mask = (positional != 0).astype(float) + mask = fluid.layers.expand(fluid.layers.unsqueeze(mask,[1]), [1,x.shape[1], 1]) + else: + query_mask, mask = None, None + + # Encoder pre_network + x = self.encoder_prenet(x) #(N,T,C) + + + # Get positional encoding + positional = self.pos_emb(fluid.layers.unsqueeze(positional, axes=[-1])) + x = positional * self.alpha + x #(N, T, C) + + + # Positional dropout + x = layers.dropout(x, 0.1) + + # Self attention encoder + attentions = list() + for layer, ffn in zip(self.layers, self.ffns): + x, attention = layer(x, x, x, mask = mask, query_mask = query_mask) + x = ffn(x) + attentions.append(attention) + + return x, query_mask, attentions + +class Decoder(dg.Layer): + def __init__(self, name_scope, num_hidden, config): + super(Decoder, self).__init__(name_scope) + self.num_hidden = num_hidden + param = fluid.ParamAttr(name='alpha') + self.alpha = self.create_parameter(param, shape=(1,), dtype='float32', + default_initializer = fluid.initializer.ConstantInitializer(value=1.0)) + self.pos_inp = get_sinusoid_encoding_table(1024, self.num_hidden, padding_idx=0) + self.pos_emb = dg.Embedding(name_scope=self.full_name(), + size=[1024, num_hidden], + padding_idx=0, + param_attr=fluid.ParamAttr( + name='weight', + initializer=fluid.initializer.NumpyArrayInitializer(self.pos_inp), + trainable=False)) + self.decoder_prenet = DecoderPrenet(self.full_name(), + input_size = config.audio.num_mels, + hidden_size = num_hidden * 2, + output_size = num_hidden, + dropout_rate=0.2) + self.linear = FC(self.full_name(), num_hidden, num_hidden) + + self.selfattn_layers = [MultiheadAttention(self.full_name(), num_hidden) for _ in range(3)] + for i, layer in enumerate(self.selfattn_layers): + self.add_sublayer("self_attn_{}".format(i), layer) + self.attn_layers = [MultiheadAttention(self.full_name(), num_hidden) for _ in range(3)] + for i, layer in enumerate(self.attn_layers): + self.add_sublayer("attn_{}".format(i), layer) + self.ffns = [FFN(self.full_name(), num_hidden) for _ in range(3)] + for i, layer in enumerate(self.ffns): + self.add_sublayer("ffns_{}".format(i), layer) + self.mel_linear = FC(self.full_name(), num_hidden, config.audio.num_mels * config.audio.outputs_per_step) + self.stop_linear = FC(self.full_name(), num_hidden, 1, gain = 1) + + self.postconvnet = PostConvNet(self.full_name(), config) + + def forward(self, key, value, query, c_mask, positional): + batch_size = key.shape[0] + decoder_len = query.shape[1] + + # get decoder mask with triangular matrix + + if fluid.framework._dygraph_tracer()._train_mode: + #zeros = np.zeros(positional.shape, dtype=np.float32) + m_mask = (positional != 0).astype(float) + mask = np.repeat(np.expand_dims(m_mask.numpy() == 0, axis=1), decoder_len, axis=1) + mask = mask + np.repeat(np.expand_dims(np.triu(np.ones([decoder_len, decoder_len]), 1), axis=0) ,batch_size, axis=0) + mask = fluid.layers.cast(dg.to_variable(mask == 0), np.float32) + + + # (batch_size, decoder_len, decoder_len) + zero_mask = fluid.layers.expand(fluid.layers.unsqueeze((c_mask != 0).astype(float), axes=2), [1,1,decoder_len]) + # (batch_size, decoder_len, seq_len) + zero_mask = fluid.layers.transpose(zero_mask, [0,2,1]) + + else: + mask = np.repeat(np.expand_dims(np.triu(np.ones([decoder_len, decoder_len]), 1), axis=0) ,batch_size, axis=0) + mask = fluid.layers.cast(dg.to_variable(mask == 0), np.float32) + m_mask, zero_mask = None, None + #import pdb; 
pdb.set_trace() + # Decoder pre-network + query = self.decoder_prenet(query) + + # Centered position + query = self.linear(query) + + # Get position embedding + positional = self.pos_emb(fluid.layers.unsqueeze(positional, axes=[-1])) + query = positional * self.alpha + query + + #positional dropout + query = fluid.layers.dropout(query, 0.1) + + # Attention decoder-decoder, encoder-decoder + selfattn_list = list() + attn_list = list() + + for selfattn, attn, ffn in zip(self.selfattn_layers, self.attn_layers, self.ffns): + query, attn_dec = selfattn(query, query, query, mask = mask, query_mask = m_mask) + query, attn_dot = attn(key, value, query, mask = zero_mask, query_mask = m_mask) + query = ffn(query) + selfattn_list.append(attn_dec) + attn_list.append(attn_dot) + + # Mel linear projection + mel_out = self.mel_linear(query) + # Post Mel Network + postnet_input = layers.transpose(mel_out, [0,2,1]) + out = self.postconvnet(postnet_input) + out = postnet_input + out + out = layers.transpose(out, [0,2,1]) + + # Stop tokens + stop_tokens = self.stop_linear(query) + + return mel_out, out, attn_list, stop_tokens, selfattn_list + +class Model(dg.Layer): + def __init__(self, name_scope, config): + super(Model, self).__init__(name_scope) + self.encoder = Encoder(self.full_name(), config.network.embedding_size, config.network.hidden_size, config) + self.decoder = Decoder(self.full_name(), config.network.hidden_size, config) + self.config = config + + def forward(self, characters, mel_input, pos_text, pos_mel): + # key (batch_size, seq_len, channel) + # c_mask (batch_size, seq_len) + # attns_enc (channel / 2, seq_len, seq_len) + key, c_mask, attns_enc = self.encoder(characters, pos_text) + + # mel_output/postnet_output (batch_size, mel_len, n_mel) + # attn_probs (128, mel_len, seq_len) + # stop_preds (batch_size, mel_len, 1) + # attns_dec (128, mel_len, mel_len) + mel_output, postnet_output, attn_probs, stop_preds, attns_dec = self.decoder(key, key, mel_input, c_mask, pos_mel) + + return mel_output, postnet_output, attn_probs, stop_preds, attns_enc, attns_dec + +class ModelPostNet(dg.Layer): + """ + CBHG Network (mel -> linear) + """ + def __init__(self, name_scope, config): + super(ModelPostNet, self).__init__(name_scope) + self.pre_proj = Conv(self.full_name(), + in_channels = config.audio.num_mels, + out_channels = config.network.hidden_size, + data_format = "NCT") + self.cbhg = CBHG(self.full_name(), config) + self.post_proj = Conv(self.full_name(), + in_channels = config.audio.num_mels, + out_channels = (config.audio.n_fft // 2) + 1, + data_format = "NCT") + + def forward(self, mel): + mel = layers.transpose(mel, [0,2,1]) + mel = self.pre_proj(mel) + mel = self.cbhg(mel) + mag_pred = self.post_proj(mel) + mag_pred = layers.transpose(mag_pred, [0,2,1]) + return mag_pred + + + + + + diff --git a/parakeet/models/transformerTTS/parse.py b/parakeet/models/transformerTTS/parse.py new file mode 100644 index 0000000..0c09d01 --- /dev/null +++ b/parakeet/models/transformerTTS/parse.py @@ -0,0 +1,63 @@ +import jsonargparse + +def add_config_options_to_parser(parser): + parser.add_argument('--audio.num_mels', type=int, default=80, + help="the number of mel bands when calculating mel spectrograms.") + parser.add_argument('--audio.n_fft', type=int, default=2048, + help="the number of fft components.") + parser.add_argument('--audio.sr', type=int, default=22050, + help="the sampling rate of audio data file.") + parser.add_argument('--audio.preemphasis', type=float, default=0.97, + help="the preemphasis 
coefficient.") + parser.add_argument('--audio.hop_length', type=float, default=128, + help="the number of samples to advance between frames.") + parser.add_argument('--audio.win_length', type=float, default=1024, + help="the length (width) of the window function.") + parser.add_argument('--audio.power', type=float, default=1.4, + help="the power to raise before griffin-lim.") + parser.add_argument('--audio.min_level_db', type=int, default=-100, + help="the minimum level db.") + parser.add_argument('--audio.ref_level_db', type=int, default=20, + help="the reference level db.") + parser.add_argument('--audio.outputs_per_step', type=int, default=1, + help="the outputs per step.") + + parser.add_argument('--network.hidden_size', type=int, default=256, + help="the hidden size in network.") + parser.add_argument('--network.embedding_size', type=int, default=512, + help="the embedding vector size.") + + parser.add_argument('--batch_size', type=int, default=32, + help="batch size for training.") + parser.add_argument('--epochs', type=int, default=10000, + help="the number of epoch for training.") + parser.add_argument('--lr', type=float, default=0.001, + help="the learning rate for training.") + parser.add_argument('--save_step', type=int, default=500, + help="checkpointing interval during training.") + parser.add_argument('--image_step', type=int, default=2000, + help="attention image interval during training.") + parser.add_argument('--max_len', type=int, default=400, + help="The max length of audio when synthsis.") + parser.add_argument('--transformer_step', type=int, default=160000, + help="Global step to restore checkpoint of transformer in synthesis.") + parser.add_argument('--postnet_step', type=int, default=100000, + help="Global step to restore checkpoint of postnet in synthesis.") + parser.add_argument('--use_gpu', type=bool, default=True, + help="use gpu or not during training.") + parser.add_argument('--use_data_parallel', type=bool, default=False, + help="use data parallel or not during training.") + + parser.add_argument('--data_path', type=str, default='./dataset/LJSpeech-1.1', + help="the path of dataset.") + parser.add_argument('--checkpoint_path', type=str, default=None, + help="the path to load checkpoint or pretrain model.") + parser.add_argument('--save_path', type=str, default='./checkpoint', + help="the path to save checkpoint.") + parser.add_argument('--log_dir', type=str, default='./log', + help="the directory to save tensorboard log.") + parser.add_argument('--sample_path', type=str, default='./log', + help="the directory to save audio sample in synthesis.") + + + parser.add_argument('-c', '--config', action=jsonargparse.ActionConfigFile) diff --git a/parakeet/models/transformerTTS/preprocess.py b/parakeet/models/transformerTTS/preprocess.py new file mode 100644 index 0000000..61ed353 --- /dev/null +++ b/parakeet/models/transformerTTS/preprocess.py @@ -0,0 +1,137 @@ +from pathlib import Path +import numpy as np +import pandas as pd +import librosa + +from parakeet import g2p +from parakeet import audio + +from parakeet.data.sampler import SequentialSampler, RandomSampler, BatchSampler +from parakeet.data.dataset import Dataset +from parakeet.data.datacargo import DataCargo +from parakeet.data.batch import TextIDBatcher, SpecBatcher + +_ljspeech_processor = audio.AudioProcessor( + sample_rate=22050, + num_mels=80, + min_level_db=-100, + ref_level_db=20, + n_fft=2048, + win_length= int(22050 * 0.05), + hop_length= int(22050 * 0.0125), + power=1.2, + preemphasis=0.97, + 
signal_norm=True, + symmetric_norm=False, + max_norm=1., + mel_fmin=0, + mel_fmax=None, + clip_norm=True, + griffin_lim_iters=60, + do_trim_silence=False, + sound_norm=False) + +class LJSpeech(Dataset): + def __init__(self, root): + super(LJSpeech, self).__init__() + assert isinstance(root, (str, Path)), "root should be a string or Path object" + self.root = root if isinstance(root, Path) else Path(root) + self.metadata = self._prepare_metadata() + + def _prepare_metadata(self): + csv_path = self.root.joinpath("metadata.csv") + metadata = pd.read_csv(csv_path, sep="|", header=None, quoting=3, + names=["fname", "raw_text", "normalized_text"]) + return metadata + + def _get_example(self, metadatum): + """All the code for generating an Example from a metadatum. If you want a + different preprocessing pipeline, you can override this method. + This method may require several processor, each of which has a lot of options. + In this case, you'd better pass a composed transform and pass it to the init + method. + """ + + fname, raw_text, normalized_text = metadatum + wav_path = self.root.joinpath("wavs", fname + ".wav") + + # load -> trim -> preemphasis -> stft -> magnitude -> mel_scale -> logscale -> normalize + wav = _ljspeech_processor.load_wav(str(wav_path)) + mag = _ljspeech_processor.spectrogram(wav).astype(np.float32) + mel = _ljspeech_processor.melspectrogram(wav).astype(np.float32) + phonemes = np.array(g2p.en.text_to_sequence(normalized_text), dtype=np.int64) + return (mag, mel, phonemes) # maybe we need to implement it as a map in the future + + def _batch_examples(self, minibatch): + mag_batch = [] + mel_batch = [] + phoneme_batch = [] + for example in minibatch: + mag, mel, phoneme = example + mag_batch.append(mag) + mel_batch.append(mel) + phoneme_batch.append(phoneme) + mag_batch = SpecBatcher(pad_value=0.)(mag_batch) + mel_batch = SpecBatcher(pad_value=0.)(mel_batch) + phoneme_batch = TextIDBatcher(pad_id=0)(phoneme_batch) + return (mag_batch, mel_batch, phoneme_batch) + + def __getitem__(self, index): + metadatum = self.metadata.iloc[index] + example = self._get_example(metadatum) + return example + + def __iter__(self): + for i in range(len(self)): + yield self[i] + + def __len__(self): + return len(self.metadata) + + +def batch_examples(batch): + texts = [] + mels = [] + mel_inputs = [] + text_lens = [] + pos_texts = [] + pos_mels = [] + for data in batch: + _, mel, text = data + mel_inputs.append(np.concatenate([np.zeros([mel.shape[0], 1], np.float32), mel[:,:-1]], axis=-1)) + text_lens.append(len(text)) + pos_texts.append(np.arange(1, len(text) + 1)) + pos_mels.append(np.arange(1, mel.shape[1] + 1)) + mels.append(mel) + texts.append(text) + + # Sort by text_len in descending order + texts = [i for i,_ in sorted(zip(texts, text_lens), key=lambda x: x[1], reverse=True)] + mels = [i for i,_ in sorted(zip(mels, text_lens), key=lambda x: x[1], reverse=True)] + mel_inputs = [i for i,_ in sorted(zip(mel_inputs, text_lens), key=lambda x: x[1], reverse=True)] + pos_texts = [i for i,_ in sorted(zip(pos_texts, text_lens), key=lambda x: x[1], reverse=True)] + pos_mels = [i for i,_ in sorted(zip(pos_mels, text_lens), key=lambda x: x[1], reverse=True)] + text_lens = sorted(text_lens, reverse=True) + + # Pad sequence with largest len of the batch + texts = TextIDBatcher(pad_id=0)(texts) + pos_texts = TextIDBatcher(pad_id=0)(pos_texts) + pos_mels = TextIDBatcher(pad_id=0)(pos_mels) + mels = np.transpose(SpecBatcher(pad_value=0.)(mels), axes=(0,2,1)) + mel_inputs = 
np.transpose(SpecBatcher(pad_value=0.)(mel_inputs), axes=(0,2,1)) + return (texts, mels, mel_inputs, pos_texts, pos_mels, np.array(text_lens)) + +def batch_examples_postnet(batch): + mels=[] + mags=[] + for data in batch: + mag, mel, _ = data + mels.append(mel) + mags.append(mag) + + mels = np.transpose(SpecBatcher(pad_value=0.)(mels), axes=(0,2,1)) + mags = np.transpose(SpecBatcher(pad_value=0.)(mags), axes=(0,2,1)) + + return (mels, mags) + + diff --git a/parakeet/models/transformerTTS/synthesis.py b/parakeet/models/transformerTTS/synthesis.py new file mode 100644 index 0000000..13e0de0 --- /dev/null +++ b/parakeet/models/transformerTTS/synthesis.py @@ -0,0 +1,67 @@ +import os +from scipy.io.wavfile import write +from parakeet.g2p.en import text_to_sequence +import numpy as np +from network import Model, ModelPostNet +from tqdm import tqdm +from tensorboardX import SummaryWriter +import paddle.fluid as fluid +import paddle.fluid.dygraph as dg +from preprocess import _ljspeech_processor +from pathlib import Path +import jsonargparse +from parse import add_config_options_to_parser +from pprint import pprint + +def load_checkpoint(step, model_path): + model_dict, opti_dict = fluid.dygraph.load_dygraph(os.path.join(model_path, step)) + return model_dict + +def synthesis(text_input, cfg): + place = (fluid.CUDAPlace(0) if cfg.use_gpu else fluid.CPUPlace()) + + # tensorboard + if not os.path.exists(cfg.log_dir): + os.mkdir(cfg.log_dir) + path = os.path.join(cfg.log_dir,'synthesis') + + writer = SummaryWriter(path) + + with dg.guard(place): + model = Model('transtts', cfg) + model_postnet = ModelPostNet('postnet', cfg) + + model.set_dict(load_checkpoint(str(cfg.transformer_step), os.path.join(cfg.checkpoint_path, "transformer"))) + model_postnet.set_dict(load_checkpoint(str(cfg.postnet_step), os.path.join(cfg.checkpoint_path, "postnet"))) + + # init input + text = np.asarray(text_to_sequence(text_input)) + text = fluid.layers.unsqueeze(dg.to_variable(text),[0]) + mel_input = dg.to_variable(np.zeros([1,1,80])).astype(np.float32) + pos_text = np.arange(1, text.shape[1]+1) + pos_text = fluid.layers.unsqueeze(dg.to_variable(pos_text),[0]) + + + model.eval() + model_postnet.eval() + + pbar = tqdm(range(cfg.max_len)) + + for i in pbar: + pos_mel = np.arange(1, mel_input.shape[1]+1) + pos_mel = fluid.layers.unsqueeze(dg.to_variable(pos_mel),[0]) + mel_pred, postnet_pred, attn_probs, stop_preds, attn_enc, attn_dec = model(text, mel_input, pos_text, pos_mel) + mel_input = fluid.layers.concat([mel_input, postnet_pred[:,-1:,:]], axis=1) + mag_pred = model_postnet(postnet_pred) + + wav = _ljspeech_processor.inv_spectrogram(fluid.layers.transpose(fluid.layers.squeeze(mag_pred,[0]), [1,0]).numpy()) + writer.add_audio(text_input, wav, 0, cfg.audio.sr) + if not os.path.exists(cfg.sample_path): + os.mkdir(cfg.sample_path) + write(os.path.join(cfg.sample_path,'test.wav'), cfg.audio.sr, wav) + +if __name__ == '__main__': + parser = jsonargparse.ArgumentParser(description="Synthesis model", formatter_class='default_argparse') + add_config_options_to_parser(parser) + cfg = parser.parse_args('-c ./config/synthesis.yaml'.split()) + synthesis("Transformer model is so fast!", cfg) \ No newline at end of file diff --git a/parakeet/models/transformerTTS/train_postnet.py b/parakeet/models/transformerTTS/train_postnet.py new file mode 100644 index 0000000..6e32f9c --- /dev/null +++ b/parakeet/models/transformerTTS/train_postnet.py @@ -0,0 +1,135 @@ +from network import * +from preprocess import batch_examples_postnet, 
LJSpeech +from tensorboardX import SummaryWriter +import os +from tqdm import tqdm +from parakeet.data.datacargo import DataCargo +from pathlib import Path +import jsonargparse +from parse import add_config_options_to_parser +from pprint import pprint + +class MyDataParallel(dg.parallel.DataParallel): + """ + A data parallel proxy for model. + """ + + def __init__(self, layers, strategy): + super(MyDataParallel, self).__init__(layers, strategy) + + def __getattr__(self, key): + if key in self.__dict__: + return object.__getattribute__(self, key) + elif key is "_layers": + return object.__getattribute__(self, "_sub_layers")["_layers"] + else: + return getattr( + object.__getattribute__(self, "_sub_layers")["_layers"], key) + + +def main(): + parser = jsonargparse.ArgumentParser(description="Train postnet model", formatter_class='default_argparse') + add_config_options_to_parser(parser) + cfg = parser.parse_args('-c ./config/train_postnet.yaml'.split()) + + local_rank = dg.parallel.Env().local_rank + + if local_rank == 0: + # Print the whole config setting. + pprint(jsonargparse.namespace_to_dict(cfg)) + + LJSPEECH_ROOT = Path(cfg.data_path) + dataset = LJSpeech(LJSPEECH_ROOT) + dataloader = DataCargo(dataset, batch_size=cfg.batch_size, shuffle=True, collate_fn=batch_examples_postnet, drop_last=True) + + global_step = 0 + place = (fluid.CUDAPlace(dg.parallel.Env().dev_id) + if cfg.use_data_parallel else fluid.CUDAPlace(0) + if cfg.use_gpu else fluid.CPUPlace()) + + if not os.path.exists(cfg.log_dir): + os.mkdir(cfg.log_dir) + path = os.path.join(cfg.log_dir,'postnet') + writer = SummaryWriter(path) + + with dg.guard(place): + # dataloader + input_fields = { + 'names': ['mel', 'mag'], + 'shapes': + [[cfg.batch_size, None, 80], [cfg.batch_size, None, 257]], + 'dtypes': ['float32', 'float32'], + 'lod_levels': [0, 0] + } + + inputs = [ + fluid.data( + name=input_fields['names'][i], + shape=input_fields['shapes'][i], + dtype=input_fields['dtypes'][i], + lod_level=input_fields['lod_levels'][i]) + for i in range(len(input_fields['names'])) + ] + + reader = fluid.io.DataLoader.from_generator( + feed_list=inputs, + capacity=32, + iterable=True, + use_double_buffer=True, + return_list=True) + + + model = ModelPostNet('postnet', cfg) + + model.train() + optimizer = fluid.optimizer.AdamOptimizer(learning_rate=dg.NoamDecay(1/(4000 *( cfg.lr ** 2)), 4000)) + + if cfg.checkpoint_path is not None: + model_dict, opti_dict = fluid.dygraph.load_dygraph(cfg.checkpoint_path) + model.set_dict(model_dict) + optimizer.set_dict(opti_dict) + print("load checkpoint!!!") + + if cfg.use_data_parallel: + strategy = dg.parallel.prepare_context() + model = MyDataParallel(model, strategy) + + for epoch in range(cfg.epochs): + reader.set_batch_generator(dataloader, place) + pbar = tqdm(reader()) + for i, data in enumerate(pbar): + pbar.set_description('Processing at epoch %d'%epoch) + mel, mag = data + mag = dg.to_variable(mag.numpy()) + mel = dg.to_variable(mel.numpy()) + global_step += 1 + + mag_pred = model(mel) + + loss = layers.mean(layers.abs(layers.elementwise_sub(mag_pred, mag))) + if cfg.use_data_parallel: + loss = model.scale_loss(loss) + + writer.add_scalars('training_loss',{ + 'loss':loss.numpy(), + }, global_step) + + loss.backward() + if cfg.use_data_parallel: + model.apply_collective_grads() + optimizer.minimize(loss, grad_clip = fluid.dygraph_grad_clip.GradClipByGlobalNorm(1)) + model.clear_gradients() + + if global_step % cfg.save_step == 0: + if not os.path.exists(cfg.save_path): + os.mkdir(cfg.save_path) + 
save_path = os.path.join(cfg.save_path,'postnet/%d' % global_step) + dg.save_dygraph(model.state_dict(), save_path) + dg.save_dygraph(optimizer.state_dict(), save_path) + + + + + +if __name__ == '__main__': + main() \ No newline at end of file diff --git a/parakeet/models/transformerTTS/train_transformer.py b/parakeet/models/transformerTTS/train_transformer.py new file mode 100644 index 0000000..0cdbf37 --- /dev/null +++ b/parakeet/models/transformerTTS/train_transformer.py @@ -0,0 +1,166 @@ +from preprocess import batch_examples, LJSpeech +import os +from tqdm import tqdm +import paddle.fluid.dygraph as dg +import paddle.fluid.layers as layers +from network import * +from tensorboardX import SummaryWriter +from parakeet.data.datacargo import DataCargo +from pathlib import Path +import jsonargparse +from parse import add_config_options_to_parser +from pprint import pprint +from matplotlib import cm + +class MyDataParallel(dg.parallel.DataParallel): + """ + A data parallel proxy for model. + """ + + def __init__(self, layers, strategy): + super(MyDataParallel, self).__init__(layers, strategy) + + def __getattr__(self, key): + if key in self.__dict__: + return object.__getattribute__(self, key) + elif key is "_layers": + return object.__getattribute__(self, "_sub_layers")["_layers"] + else: + return getattr( + object.__getattribute__(self, "_sub_layers")["_layers"], key) + + +def main(): + parser = jsonargparse.ArgumentParser(description="Train TransformerTTS model", formatter_class='default_argparse') + add_config_options_to_parser(parser) + cfg = parser.parse_args('-c ./config/train_transformer.yaml'.split()) + + local_rank = dg.parallel.Env().local_rank + + if local_rank == 0: + # Print the whole config setting. + pprint(jsonargparse.namespace_to_dict(cfg)) + + + LJSPEECH_ROOT = Path(cfg.data_path) + dataset = LJSpeech(LJSPEECH_ROOT) + dataloader = DataCargo(dataset, batch_size=cfg.batch_size, shuffle=True, collate_fn=batch_examples, drop_last=True) + global_step = 0 + place = (fluid.CUDAPlace(dg.parallel.Env().dev_id) + if cfg.use_data_parallel else fluid.CUDAPlace(0) + if cfg.use_gpu else fluid.CPUPlace()) + + if not os.path.exists(cfg.log_dir): + os.mkdir(cfg.log_dir) + path = os.path.join(cfg.log_dir,'transformer') + + writer = SummaryWriter(path) if local_rank == 0 else None + + with dg.guard(place): + if cfg.use_data_parallel: + strategy = dg.parallel.prepare_context() + + # dataloader + input_fields = { + 'names': ['character', 'mel', 'mel_input', 'pos_text', 'pos_mel', 'text_len'], + 'shapes': + [[cfg.batch_size, None], [cfg.batch_size, None, 80], [cfg.batch_size, None, 80], [cfg.batch_size, 1], [cfg.batch_size, 1], [cfg.batch_size, 1]], + 'dtypes': ['float32', 'float32', 'float32', 'int64', 'int64', 'int64'], + 'lod_levels': [0, 0, 0, 0, 0, 0] + } + + inputs = [ + fluid.data( + name=input_fields['names'][i], + shape=input_fields['shapes'][i], + dtype=input_fields['dtypes'][i], + lod_level=input_fields['lod_levels'][i]) + for i in range(len(input_fields['names'])) + ] + + reader = fluid.io.DataLoader.from_generator( + feed_list=inputs, + capacity=32, + iterable=True, + use_double_buffer=True, + return_list=True) + + model = Model('transtts', cfg) + + model.train() + optimizer = fluid.optimizer.AdamOptimizer(learning_rate=dg.NoamDecay(1/(4000 *( cfg.lr ** 2)), 4000)) + + if cfg.checkpoint_path is not None: + model_dict, opti_dict = fluid.dygraph.load_dygraph(cfg.checkpoint_path) + model.set_dict(model_dict) + optimizer.set_dict(opti_dict) + print("load checkpoint!!!") + + if 
cfg.use_data_parallel: + model = MyDataParallel(model, strategy) + + for epoch in range(cfg.epochs): + reader.set_batch_generator(dataloader, place) + pbar = tqdm(reader()) + for i, data in enumerate(pbar): + pbar.set_description('Processing at epoch %d'%epoch) + character, mel, mel_input, pos_text, pos_mel, text_length = data + + global_step += 1 + + mel_pred, postnet_pred, attn_probs, stop_preds, attn_enc, attn_dec = model(character, mel_input, pos_text, pos_mel) + + mel_loss = layers.mean(layers.abs(layers.elementwise_sub(mel_pred, mel))) + post_mel_loss = layers.mean(layers.abs(layers.elementwise_sub(postnet_pred, mel))) + loss = mel_loss + post_mel_loss + + if cfg.use_data_parallel: + loss = model.scale_loss(loss) + + writer.add_scalars('training_loss', { + 'mel_loss':mel_loss.numpy(), + 'post_mel_loss':post_mel_loss.numpy(), + }, global_step) + + writer.add_scalars('alphas', { + 'encoder_alpha':model.encoder.alpha.numpy(), + 'decoder_alpha':model.decoder.alpha.numpy(), + }, global_step) + + writer.add_scalar('learning_rate', optimizer._learning_rate.step().numpy(), global_step) + + if global_step % cfg.image_step == 1: + for i, prob in enumerate(attn_probs): + for j in range(4): + x = np.uint8(cm.viridis(prob.numpy()[j*16]) * 255) + writer.add_image('Attention_enc_%d_0'%global_step, x, i*4+j, dataformats="HWC") + + for i, prob in enumerate(attn_enc): + for j in range(4): + x = np.uint8(cm.viridis(prob.numpy()[j*16]) * 255) + writer.add_image('Attention_enc_%d_0'%global_step, x, i*4+j, dataformats="HWC") + + for i, prob in enumerate(attn_dec): + for j in range(4): + x = np.uint8(cm.viridis(prob.numpy()[j*16]) * 255) + writer.add_image('Attention_dec_%d_0'%global_step, x, i*4+j, dataformats="HWC") + + loss.backward() + if cfg.use_data_parallel: + model.apply_collective_grads() + optimizer.minimize(loss, grad_clip = fluid.dygraph_grad_clip.GradClipByGlobalNorm(1)) + model.clear_gradients() + + # save checkpoint + if local_rank==0 and global_step % cfg.save_step == 0: + if not os.path.exists(cfg.save_path): + os.mkdir(cfg.save_path) + save_path = os.path.join(cfg.save_path,'transformer/%d' % global_step) + dg.save_dygraph(model.state_dict(), save_path) + dg.save_dygraph(optimizer.state_dict(), save_path) + if local_rank==0: + writer.close() + + +if __name__ =='__main__': + main() \ No newline at end of file diff --git a/parakeet/models/transformerTTS/utils.py b/parakeet/models/transformerTTS/utils.py new file mode 100644 index 0000000..087cacf --- /dev/null +++ b/parakeet/models/transformerTTS/utils.py @@ -0,0 +1,42 @@ +import numpy as np +import librosa +import os, copy +from scipy import signal + + +def get_positional_table(d_pos_vec, n_position=1024): + position_enc = np.array([ + [pos / np.power(10000, 2*i/d_pos_vec) for i in range(d_pos_vec)] + if pos != 0 else np.zeros(d_pos_vec) for pos in range(n_position)]) + + position_enc[1:, 0::2] = np.sin(position_enc[1:, 0::2]) # dim 2i + position_enc[1:, 1::2] = np.cos(position_enc[1:, 1::2]) # dim 2i+1 + return position_enc + +def get_sinusoid_encoding_table(n_position, d_hid, padding_idx=None): + ''' Sinusoid position encoding table ''' + + def cal_angle(position, hid_idx): + return position / np.power(10000, 2 * (hid_idx // 2) / d_hid) + + def get_posi_angle_vec(position): + return [cal_angle(position, hid_j) for hid_j in range(d_hid)] + + sinusoid_table = np.array([get_posi_angle_vec(pos_i) for pos_i in range(n_position)]) + + sinusoid_table[:, 0::2] = np.sin(sinusoid_table[:, 0::2]) # dim 2i + sinusoid_table[:, 1::2] = 
np.cos(sinusoid_table[:, 1::2]) # dim 2i+1 + + if padding_idx is not None: + # zero vector for padding dimension + sinusoid_table[padding_idx] = 0. + + return sinusoid_table + +def guided_attention(N, T, g=0.2): + '''Guided attention. Refer to page 3 on the paper.''' + W = np.zeros((N, T), dtype=np.float32) + for n_pos in range(W.shape[0]): + for t_pos in range(W.shape[1]): + W[n_pos, t_pos] = 1 - np.exp(-(t_pos / float(T) - n_pos / float(N)) ** 2 / (2 * g * g)) + return W diff --git a/tests/test_ljspeech.py b/tests/test_ljspeech.py index 04db6a9..34f5011 100644 --- a/tests/test_ljspeech.py +++ b/tests/test_ljspeech.py @@ -7,4 +7,4 @@ LJSPEECH_ROOT = Path("/workspace/datasets/LJSpeech-1.1") ljspeech = LJSpeech(LJSPEECH_ROOT) ljspeech_cargo = DataCargo(ljspeech, batch_size=16, shuffle=True) for i, batch in enumerate(ljspeech_cargo): - print(i) \ No newline at end of file + print(i) From 8c22397b5504345f92068c923d732035aff739d5 Mon Sep 17 00:00:00 2001 From: Kexin Zhao Date: Mon, 16 Dec 2019 16:42:39 -0800 Subject: [PATCH 04/10] add working synthesis code --- parakeet/models/waveflow/data.py | 8 +-- parakeet/models/waveflow/synthesis.py | 8 +-- parakeet/models/waveflow/waveflow.py | 44 +++++++++--- parakeet/models/waveflow/waveflow_modules.py | 70 +++++++++++++++++++- 4 files changed, 106 insertions(+), 24 deletions(-) diff --git a/parakeet/models/waveflow/data.py b/parakeet/models/waveflow/data.py index 3c70ce0..ddaf104 100644 --- a/parakeet/models/waveflow/data.py +++ b/parakeet/models/waveflow/data.py @@ -79,17 +79,13 @@ class Subset(dataset.Dataset): mode='constant', constant_values=0) # Normalize audio. - audio = audio / MAX_WAV_VALUE + audio = audio.astype(np.float32) / MAX_WAV_VALUE mel = self.get_mel(audio) + #print("mel = {}, dtype {}, shape {}".format(mel, mel.dtype, mel.shape)) return audio, mel def _batch_examples(self, batch): - audio_batch = [] - mel_batch = [] - for audio, mel in batch: - audio_batch - audios = [sample[0] for sample in batch] mels = [sample[1] for sample in batch] diff --git a/parakeet/models/waveflow/synthesis.py b/parakeet/models/waveflow/synthesis.py index d87a188..e42e170 100644 --- a/parakeet/models/waveflow/synthesis.py +++ b/parakeet/models/waveflow/synthesis.py @@ -8,11 +8,11 @@ import paddle.fluid.dygraph as dg from paddle import fluid import utils -from wavenet import WaveNet +from waveflow import WaveFlow def add_options_to_parser(parser): - parser.add_argument('--model', type=str, default='wavenet', + parser.add_argument('--model', type=str, default='waveflow', help="general name of the model") parser.add_argument('--name', type=str, help="specific name of the training model") @@ -30,7 +30,7 @@ def add_options_to_parser(parser): parser.add_argument('--output', type=str, default="./syn_audios", help="path to write synthesized audio files") - parser.add_argument('--sample', type=int, + parser.add_argument('--sample', type=int, default=None, help="which of the valid samples to synthesize audio") @@ -54,7 +54,7 @@ def synthesize(config): print("Random Seed: ", seed) # Build model. - model = WaveNet(config, checkpoint_dir) + model = WaveFlow(config, checkpoint_dir) model.build(training=False) # Obtain the current iteration. 
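The loader change above converts 16-bit PCM samples to float32 before computing mel features, and the synthesis code in the next file reverses that scaling before writing the result with `scipy.io.wavfile.write`. A minimal sketch of that round trip, assuming `MAX_WAV_VALUE` is 32768.0 (the 16-bit full scale, matching the literal used at synthesis time); the output filename and 22050 Hz rate are illustrative only:

```python
import numpy as np
from scipy.io.wavfile import write

MAX_WAV_VALUE = 32768.0  # assumed 16-bit PCM full scale (2**15)

def pcm16_to_float(samples):
    # mirrors the data.py change: int16 samples -> float32 in roughly [-1, 1)
    return samples.astype(np.float32) / MAX_WAV_VALUE

def float_to_pcm16(audio):
    # mirrors the synthesis path: rescale and cast back to int16 before writing
    return (audio * MAX_WAV_VALUE).astype('int16')

# hypothetical usage: save a synthesized waveform at the LJSpeech sample rate
# write("valid_0.wav", 22050, float_to_pcm16(synthesized_audio))
```
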
diff --git a/parakeet/models/waveflow/waveflow.py b/parakeet/models/waveflow/waveflow.py index b778497..b362c2d 100644 --- a/parakeet/models/waveflow/waveflow.py +++ b/parakeet/models/waveflow/waveflow.py @@ -2,7 +2,8 @@ import itertools import os import time -import librosa +#import librosa +from scipy.io.wavfile import write import numpy as np import paddle.fluid.dygraph as dg from paddle import fluid @@ -156,17 +157,38 @@ class WaveFlow(): output = "{}/{}/iter-{}".format(config.output, config.name, iteration) os.makedirs(output, exist_ok=True) - filename = "{}/valid_{}.wav".format(output, sample) - print("Synthesize sample {}, save as {}".format(sample, filename)) + mels_list = [mels for _, mels in self.validloader()] + if sample is not None: + mels_list = [mels_list[sample]] - mels_list = [mels for _, mels, _ in self.validloader()] - start_time = time.time() - syn_audio = self.waveflow.synthesize(mels_list[sample]) - syn_time = time.time() - start_time - print("audio shape {}, synthesis time {}".format( - syn_audio.shape, syn_time)) - librosa.output.write_wav(filename, syn_audio, - sr=config.sample_rate) + audio_times = [] + inf_times = [] + for sample, mel in enumerate(mels_list): + filename = "{}/valid_{}.wav".format(output, sample) + print("Synthesize sample {}, save as {}".format(sample, filename)) + + start_time = time.time() + audio = self.waveflow.synthesize(mel) + syn_time = time.time() - start_time + + audio_time = audio.shape[0] / 22050 + print("audio time {}, synthesis time {}, speedup: {}".format( + audio_time, syn_time, audio_time / syn_time)) + + #librosa.output.write_wav(filename, syn_audio, + # sr=config.sample_rate) + audio = audio.numpy() * 32768.0 + audio = audio.astype('int16') + write(filename, config.sample_rate, audio) + + audio_times.append(audio_time) + inf_times.append(syn_time) + + total_audio = sum(audio_times) + total_inf = sum(inf_times) + + print("Total audio: {}, total inf time {}, speedup: {}".format( + total_audio, total_inf, total_audio / total_inf)) def save(self, iteration): utils.save_latest_parameters(self.checkpoint_dir, iteration, diff --git a/parakeet/models/waveflow/waveflow_modules.py b/parakeet/models/waveflow/waveflow_modules.py index a4b9c4f..45b46a6 100644 --- a/parakeet/models/waveflow/waveflow_modules.py +++ b/parakeet/models/waveflow/waveflow_modules.py @@ -75,6 +75,16 @@ class Conditioner(dg.Layer): return fluid.layers.squeeze(x, [1]) + def infer(self, x): + x = fluid.layers.unsqueeze(x, 1) + for layer in self.upsample_conv2d: + x = layer(x) + # Trim conv artifacts. 
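+            # Each upsampling layer outputs filter_size - stride more frames
+            # than stride * input_length, so slice that tail off to keep the
+            # upsampled mel time-aligned before applying the leaky ReLU.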
+ time_cutoff = layer._filter_size[1] - layer._stride[1] + x = fluid.layers.leaky_relu(x[:, :, :, :-time_cutoff], alpha=0.4) + + return fluid.layers.squeeze(x, [1]) + class Flow(dg.Layer): def __init__(self, name_scope, config): @@ -183,6 +193,14 @@ class Flow(dg.Layer): return self.end(output) +def debug(x, msg): + y = x.numpy() + print(msg + " :\n", y) + print("shape: ", y.shape) + print("dtype: ", y.dtype) + print("") + + class WaveFlowModule(dg.Layer): def __init__(self, name_scope, config): super(WaveFlowModule, self).__init__(name_scope) @@ -217,7 +235,7 @@ class WaveFlowModule(dg.Layer): if mel.shape[2] > pruned_len: mel = mel[:, :, :pruned_len] - # From [bs, mel_bands, time] to [bs, mel_bands, n_group, time/n_group] + # From [bs, mel_bands, time] to [bs, mel_bands, n_group, time/n_group] mel = fluid.layers.transpose(unfold(mel, self.n_group), [0, 1, 3, 2]) # From [bs, time] to [bs, n_group, time/n_group] audio = fluid.layers.transpose(unfold(audio, self.n_group), [0, 2, 1]) @@ -247,8 +265,54 @@ class WaveFlowModule(dg.Layer): return z, log_s_list - def synthesize(self, mels): - pass + def synthesize(self, mel, sigma=1.0): + #debug(mel, "mel") + mel = self.conditioner.infer(mel) + #debug(mel, "mel after conditioner") + + # From [bs, mel_bands, time] to [bs, mel_bands, n_group, time/n_group] + mel = fluid.layers.transpose(unfold(mel, self.n_group), [0, 1, 3, 2]) + #debug(mel, "after group") + + audio = fluid.layers.gaussian_random( + shape=[mel.shape[0], 1, mel.shape[2], mel.shape[3]], std=sigma) + + #debug(audio, "audio") + + for i in reversed(range(self.n_flows)): + # Permute over the height dimension. + audio_slices = [audio[:, :, j, :] for j in self.perms[i]] + audio = fluid.layers.stack(audio_slices, axis=2) + mel_slices = [mel[:, :, j, :] for j in self.perms[i]] + mel = fluid.layers.stack(mel_slices, axis=2) + + audio_list = [] + audio_0 = audio[:, :, :1, :] + audio_list.append(audio_0) + + for h in range(1, self.n_group): + # inputs: [bs, 1, h, time/n_group] + inputs = fluid.layers.concat(audio_list, axis=2) + conds = mel[:, :, 1:(h+1), :] + outputs = self.flows[i](inputs, conds) + + log_s = outputs[:, :1, (h-1):h, :] + b = outputs[:, 1:, (h-1):h, :] + audio_h = (audio[:, :, h:(h+1), :] - b) / fluid.layers.exp(log_s) + audio_list.append(audio_h) + + audio = fluid.layers.concat(audio_list, axis=2) + #print("audio.shape =", audio.shape) + + # Assume batch size = 1 + # audio: [n_group, time/n_group] + audio = fluid.layers.squeeze(audio, [0, 1]) + # audio: [time] + audio = fluid.layers.reshape( + fluid.layers.transpose(audio, [1, 0]), [-1]) + #print("audio.shape =", audio.shape) + + return audio def start_new_sequence(self): for layer in self.sublayers(): From 9fe6ad11f0dad288b7a312c43fe2a94d035bfb6a Mon Sep 17 00:00:00 2001 From: lifuchen Date: Tue, 17 Dec 2019 06:23:34 +0000 Subject: [PATCH 05/10] Training with multi-GPU --- parakeet/audio/__init__.py | 1 + parakeet/audio/audio.py | 261 ++++++++++++++++++ parakeet/data/datacargo.py | 17 +- .../transformerTTS/config/train_postnet.yaml | 2 +- .../config/train_transformer.yaml | 2 +- parakeet/models/transformerTTS/data.py | 29 ++ parakeet/models/transformerTTS/module.py | 5 +- parakeet/models/transformerTTS/network.py | 18 +- .../models/transformerTTS/train_postnet.py | 86 ++---- .../transformerTTS/train_transformer.py | 122 +++----- 10 files changed, 393 insertions(+), 150 deletions(-) create mode 100644 parakeet/audio/__init__.py create mode 100644 parakeet/audio/audio.py create mode 100644 parakeet/models/transformerTTS/data.py 
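The `WaveFlowModule.synthesize` method added in the patch above generates audio row by row: it starts from Gaussian noise and, for each flow in reverse order, undoes the per-row affine transform using the `log_s` and `b` predicted from the rows already produced. A minimal NumPy sketch of that coupling and its inverse; the forward form `z = x * exp(log_s) + b` is inferred from the inverse used in the diff, and shapes and variable names are illustrative:

```python
import numpy as np

def affine_forward(x_h, log_s, b):
    # density-evaluation direction: audio row -> latent row
    return x_h * np.exp(log_s) + b

def affine_inverse(z_h, log_s, b):
    # synthesis direction, mirroring WaveFlowModule.synthesize:
    #   audio_h = (audio[:, :, h:(h+1), :] - b) / exp(log_s)
    return (z_h - b) / np.exp(log_s)

# round-trip check on a dummy row with shape [batch, 1, 1, time/n_group]
rng = np.random.default_rng(0)
x = rng.standard_normal((1, 1, 1, 64)).astype(np.float32)
log_s = 0.1 * rng.standard_normal(x.shape).astype(np.float32)
b = 0.05 * rng.standard_normal(x.shape).astype(np.float32)

z = affine_forward(x, log_s, b)
assert np.allclose(affine_inverse(z, log_s, b), x, atol=1e-5)
```
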
diff --git a/parakeet/audio/__init__.py b/parakeet/audio/__init__.py new file mode 100644 index 0000000..6212dee --- /dev/null +++ b/parakeet/audio/__init__.py @@ -0,0 +1 @@ +from .audio import AudioProcessor \ No newline at end of file diff --git a/parakeet/audio/audio.py b/parakeet/audio/audio.py new file mode 100644 index 0000000..b29dbf2 --- /dev/null +++ b/parakeet/audio/audio.py @@ -0,0 +1,261 @@ +import librosa +import soundfile as sf +import numpy as np +import scipy.io +import scipy.signal + +class AudioProcessor(object): + def __init__(self, + sample_rate=None, # int, sampling rate + num_mels=None, # int, bands of mel spectrogram + min_level_db=None, # float, minimum level db + ref_level_db=None, # float, reference level dbn + n_fft=None, # int: number of samples in a frame for stft + win_length=None, # int: the same meaning with n_fft + hop_length=None, # int: number of samples between neighboring frame + power=None, # float:power to raise before griffin-lim + preemphasis=None, # float: preemphasis coefficident + signal_norm=None, # + symmetric_norm=False, # bool, apply clip norm in [-max_norm, max_form] + max_norm=None, # float, max norm + mel_fmin=None, # int: mel spectrogram's minimum frequency + mel_fmax=None, # int: mel spectrogram's maximum frequency + clip_norm=True, # bool: clip spectrogram's norm + griffin_lim_iters=None, # int: + do_trim_silence=False, # bool: trim silience + sound_norm=False, + **kwargs): + self.sample_rate = sample_rate + self.num_mels = num_mels + self.min_level_db = min_level_db + self.ref_level_db = ref_level_db + + # stft related + self.n_fft = n_fft + self.win_length = win_length or n_fft + # hop length defaults to 1/4 window_length + self.hop_length = hop_length or 0.25 * self.win_length + + self.power = power + self.preemphasis = float(preemphasis) + + self.griffin_lim_iters = griffin_lim_iters + self.signal_norm = signal_norm + self.symmetric_norm = symmetric_norm + + # mel transform related + self.mel_fmin = mel_fmin + self.mel_fmax = mel_fmax + + self.max_norm = 1.0 if max_norm is None else float(max_norm) + self.clip_norm = clip_norm + self.do_trim_silence = do_trim_silence + + self.sound_norm = sound_norm + self.num_freq, self.frame_length_ms, self.frame_shift_ms = self._stft_parameters() + + def _stft_parameters(self): + """compute frame length and hop length in ms""" + frame_length_ms = self.win_length * 1. / self.sample_rate + frame_shift_ms = self.hop_length * 1. / self.sample_rate + num_freq = 1 + self.n_fft // 2 + return num_freq, frame_length_ms, frame_shift_ms + + def __repr__(self): + """object repr""" + cls_name_str = self.__class__.__name__ + members = vars(self) + dict_str = "\n".join([" {}: {},".format(k, v) for k, v in members.items()]) + repr_str = "{}(\n{})\n".format(cls_name_str, dict_str) + return repr_str + + def save_wav(self, path, wav): + """save audio with scipy.io.wavfile in 16bit integers""" + wav_norm = wav * (32767 / max(0.01, np.max(np.abs(wav)))) + scipy.io.wavfile.write(path, self.sample_rate, wav_norm.as_type(np.int16)) + + def load_wav(self, path, sr=None): + """load wav -> trim_silence -> rescale""" + + x, sr = librosa.load(path, sr=None) + assert self.sample_rate == sr, "audio sample rate: {}Hz != processor sample rate: {}Hz".format(sr, self.sample_rate) + if self.do_trim_silence: + try: + x = self.trim_silence(x) + except ValueError: + print(" [!] File cannot be trimmed for silence - {}".format(path)) + if self.sound_norm: + x = x / x.max() * 0.9 # why 0.9 ? 
+ return x + + def trim_silence(self, wav): + """Trim soilent parts with a threshold and 0.01s margin""" + margin = int(self.sample_rate * 0.01) + wav = wav[margin: -margin] + trimed_wav = librosa.effects.trim(wav, top_db=60, frame_length=self.win_length, hop_length=self.hop_length)[0] + return trimed_wav + + def apply_preemphasis(self, x): + if self.preemphasis == 0.: + raise RuntimeError(" !! Preemphasis coefficient should be positive. ") + return scipy.signal.lfilter([1., -self.preemphasis], [1.], x) + + def apply_inv_preemphasis(self, x): + if self.preemphasis == 0.: + raise RuntimeError(" !! Preemphasis coefficient should be positive. ") + return scipy.signal.lfilter([1.], [1., -self.preemphasis], x) + + def _amplitude_to_db(self, x): + amplitude_min = np.exp(self.min_level_db / 20 * np.log(10)) + return 20 * np.log10(np.maximum(amplitude_min, x)) + + @staticmethod + def _db_to_amplitude(x): + return np.power(10., 0.05 * x) + + def _linear_to_mel(self, spectrogram): + _mel_basis = self._build_mel_basis() + return np.dot(_mel_basis, spectrogram) + + def _mel_to_linear(self, mel_spectrogram): + inv_mel_basis = np.linalg.pinv(self._build_mel_basis()) + return np.maximum(1e-10, np.dot(inv_mel_basis, mel_spectrogram)) + + def _build_mel_basis(self): + """return mel basis for mel scale""" + if self.mel_fmax is not None: + assert self.mel_fmax <= self.sample_rate // 2 + return librosa.filters.mel( + self.sample_rate, + self.n_fft, + n_mels=self.num_mels, + fmin=self.mel_fmin, + fmax=self.mel_fmax) + + def _normalize(self, S): + """put values in [0, self.max_norm] or [-self.max_norm, self,max_norm]""" + if self.signal_norm: + S_norm = (S - self.min_level_db) / (-self.min_level_db) + if self.symmetric_norm: + S_norm = ((2 * self.max_norm) * S_norm) - self.max_norm + if self.clip_norm: + S_norm = np.clip(S_norm, -self.max_norm, self.max_norm) + return S_norm + else: + S_norm = self.max_norm * S_norm + if self.clip_norm: + S_norm = np.clip(S_norm, 0, self.max_norm) + return S_norm + else: + return S + + def _denormalize(self, S): + """denormalize values""" + S_denorm = S + if self.signal_norm: + if self.symmetric_norm: + if self.clip_norm: + S_denorm = np.clip(S_denorm, -self.max_norm, self.max_norm) + S_denorm = (S_denorm + self.max_norm) * (-self.min_level_db) / (2 * self.max_norm) + self.min_level_db + return S_denorm + else: + if self.clip_norm: + S_denorm = np.clip(S_denorm, 0, self.max_norm) + S_denorm = S_denorm * (-self.min_level_db)/ self.max_norm + self.min_level_db + return S_denorm + else: + return S + + def _stft(self, y): + return librosa.stft( + y=y, + n_fft=self.n_fft, + win_length=self.win_length, + hop_length=self.hop_length) + + def _istft(self, S): + return librosa.istft(S, hop_length=self.hop_length, win_length=self.win_length) + + def spectrogram(self, y): + """compute linear spectrogram(amplitude) + preemphasis -> stft -> mag -> amplitude_to_db -> minus_ref_level_db -> normalize + """ + if self.preemphasis: + D = self._stft(self.apply_preemphasis(y)) + else: + D = self._stft(y) + S = self._amplitude_to_db(np.abs(D)) - self.ref_level_db + return self._normalize(S) + + def melspectrogram(self, y): + """compute linear spectrogram(amplitude) + preemphasis -> stft -> mag -> mel_scale -> amplitude_to_db -> minus_ref_level_db -> normalize + """ + if self.preemphasis: + D = self._stft(self.apply_preemphasis(y)) + else: + D = self._stft(y) + S = self._amplitude_to_db(self._linear_to_mel(np.abs(D))) - self.ref_level_db + return self._normalize(S) + + def inv_spectrogram(self, 
spectrogram): + """convert spectrogram back to waveform using griffin_lim in librosa""" + S = self._denormalize(spectrogram) + S = self._db_to_amplitude(S + self.ref_level_db) + if self.preemphasis: + return self.apply_inv_preemphasis(self._griffin_lim(S ** self.power)) + return self._griffin_lim(S ** self.power) + + def inv_melspectrogram(self, mel_spectrogram): + S = self._denormalize(mel_spectrogram) + S = self._db_to_amplitude(S + self.ref_level_db) + S = self._linear_to_mel(np.abs(S)) + if self.preemphasis: + return self.apply_inv_preemphasis(self._griffin_lim(S ** self.power)) + return self._griffin_lim(S ** self.power) + + def out_linear_to_mel(self, linear_spec): + """convert output linear spec to mel spec""" + S = self._denormalize(linear_spec) + S = self._db_to_amplitude(S + self.ref_level_db) + S = self._linear_to_mel(np.abs(S)) + S = self._amplitude_to_db(S) - self.ref_level_db + mel = self._normalize(S) + return mel + + def _griffin_lim(self, S): + angles = np.exp(2j * np.pi * np.random.rand(*S.shape)) + S_complex = np.abs(S).astype(np.complex) + y = self._istft(S_complex * angles) + for _ in range(self.griffin_lim_iters): + angles = np.exp(1j * np.angle(self._stft(y))) + y = self._istft(S_complex * angles) + return y + + @staticmethod + def mulaw_encode(wav, qc): + mu = 2 ** qc - 1 + # wav_abs = np.minimum(np.abs(wav), 1.0) + signal = np.sign(wav) * np.log(1 + mu * np.abs(wav)) / np.log(1. + mu) + # Quantize signal to the specified number of levels. + signal = (signal + 1) / 2 * mu + 0.5 + return np.floor(signal,) + + @staticmethod + def mulaw_decode(wav, qc): + """Recovers waveform from quantized values.""" + mu = 2 ** qc - 1 + x = np.sign(wav) / mu * ((1 + mu) ** np.abs(wav) - 1) + return x + + @staticmethod + def encode_16bits(x): + return np.clip(x * 2**15, -2**15, 2**15 - 1).astype(np.int16) + + @staticmethod + def quantize(x, bits): + return (x + 1.) 
* (2**bits - 1) / 2 + + @staticmethod + def dequantize(x, bits): + return 2 * x / (2**bits - 1) - 1 diff --git a/parakeet/data/datacargo.py b/parakeet/data/datacargo.py index 1d7d8d5..e087a4f 100644 --- a/parakeet/data/datacargo.py +++ b/parakeet/data/datacargo.py @@ -2,7 +2,8 @@ from .sampler import SequentialSampler, RandomSampler, BatchSampler class DataCargo(object): def __init__(self, dataset, batch_size=1, sampler=None, - shuffle=False, batch_sampler=None, drop_last=False): + shuffle=False, batch_sampler=None, collate_fn=None, + drop_last=False): self.dataset = dataset if batch_sampler is not None: @@ -21,13 +22,20 @@ class DataCargo(object): sampler = RandomSampler(dataset) else: sampler = SequentialSampler(dataset) - # auto_collation without custom batch_sampler batch_sampler = BatchSampler(sampler, batch_size, drop_last) + else: + batch_sampler = BatchSampler(sampler, batch_size, drop_last) + + self.batch_sampler = batch_sampler + + if collate_fn is None: + collate_fn = dataset._batch_examples + self.collate_fn = collate_fn self.batch_size = batch_size self.drop_last = drop_last self.sampler = sampler - self.batch_sampler = batch_sampler + def __iter__(self): return DataIterator(self) @@ -57,6 +65,7 @@ class DataIterator(object): self._index_sampler = loader._index_sampler self._sampler_iter = iter(self._index_sampler) + self.collate_fn = loader.collate_fn def __iter__(self): return self @@ -64,7 +73,7 @@ class DataIterator(object): def __next__(self): index = self._next_index() # may raise StopIteration, TODO(chenfeiyu): use dynamic batch size minibatch = [self._dataset[i] for i in index] # we can abstract it, too to use dynamic batch size - minibatch = self._dataset._batch_examples(minibatch) # list[Example] -> Batch + minibatch = self.collate_fn(minibatch) return minibatch def _next_index(self): diff --git a/parakeet/models/transformerTTS/config/train_postnet.yaml b/parakeet/models/transformerTTS/config/train_postnet.yaml index 90ac94e..5753ab1 100644 --- a/parakeet/models/transformerTTS/config/train_postnet.yaml +++ b/parakeet/models/transformerTTS/config/train_postnet.yaml @@ -20,7 +20,7 @@ epochs: 10000 lr: 0.001 save_step: 500 use_gpu: True -use_data_parallel: False +use_data_parallel: True data_path: ../../../dataset/LJSpeech-1.1 save_path: ./checkpoint diff --git a/parakeet/models/transformerTTS/config/train_transformer.yaml b/parakeet/models/transformerTTS/config/train_transformer.yaml index 17db190..3e56a4f 100644 --- a/parakeet/models/transformerTTS/config/train_transformer.yaml +++ b/parakeet/models/transformerTTS/config/train_transformer.yaml @@ -21,7 +21,7 @@ lr: 0.001 save_step: 500 image_step: 2000 use_gpu: True -use_data_parallel: False +use_data_parallel: True data_path: ../../../dataset/LJSpeech-1.1 save_path: ./checkpoint diff --git a/parakeet/models/transformerTTS/data.py b/parakeet/models/transformerTTS/data.py new file mode 100644 index 0000000..f432640 --- /dev/null +++ b/parakeet/models/transformerTTS/data.py @@ -0,0 +1,29 @@ +from pathlib import Path +import numpy as np +from paddle import fluid +from parakeet.data.sampler import DistributedSampler +from parakeet.data.datacargo import DataCargo +from preprocess import batch_examples, LJSpeech, batch_examples_postnet + +class LJSpeechLoader: + def __init__(self, config, nranks, rank, is_postnet=False): + place = fluid.CUDAPlace(rank) if config.use_gpu else fluid.CPUPlace() + + LJSPEECH_ROOT = Path(config.data_path) + dataset = LJSpeech(LJSPEECH_ROOT) + sampler = DistributedSampler(len(dataset), nranks, 
rank) + + assert config.batch_size % nranks == 0 + each_bs = config.batch_size // nranks + if is_postnet: + dataloader = DataCargo(dataset, sampler=sampler, batch_size=each_bs, shuffle=True, collate_fn=batch_examples_postnet, drop_last=True) + else: + dataloader = DataCargo(dataset, sampler=sampler, batch_size=each_bs, shuffle=True, collate_fn=batch_examples, drop_last=True) + + self.reader = fluid.io.DataLoader.from_generator( + capacity=32, + iterable=True, + use_double_buffer=True, + return_list=True) + self.reader.set_batch_generator(dataloader, place) + diff --git a/parakeet/models/transformerTTS/module.py b/parakeet/models/transformerTTS/module.py index 76bdffb..f83bff5 100644 --- a/parakeet/models/transformerTTS/module.py +++ b/parakeet/models/transformerTTS/module.py @@ -130,7 +130,7 @@ class EncoderPrenet(dg.Layer): self.projection = FC(self.full_name(), num_hidden, num_hidden) def forward(self, x): - x = self.embedding(fluid.layers.unsqueeze(x, axes=[-1])) #(batch_size, seq_len, embending_size) + x = self.embedding(x) #(batch_size, seq_len, embending_size) x = layers.transpose(x,[0,2,1]) x = layers.dropout(layers.relu(self.batch_norm1(self.conv1(x))), 0.2) x = layers.dropout(layers.relu(self.batch_norm2(self.conv2(x))), 0.2) @@ -211,8 +211,9 @@ class ScaledDotProductAttention(dg.Layer): # Mask key to ignore padding if mask is not None: attention = attention * mask - mask = (mask == 0).astype(float) * (-2 ** 32 + 1) + mask = (mask == 0).astype(np.float32) * (-2 ** 32 + 1) attention = attention + mask + attention = layers.softmax(attention) # Mask query to ignore padding diff --git a/parakeet/models/transformerTTS/network.py b/parakeet/models/transformerTTS/network.py index ff25ad2..3d356dc 100644 --- a/parakeet/models/transformerTTS/network.py +++ b/parakeet/models/transformerTTS/network.py @@ -7,9 +7,9 @@ class Encoder(dg.Layer): def __init__(self, name_scope, embedding_size, num_hidden, config): super(Encoder, self).__init__(name_scope) self.num_hidden = num_hidden - param = fluid.ParamAttr(name='alpha') - self.alpha = self.create_parameter(param, shape=(1, ), dtype='float32', - default_initializer = fluid.initializer.ConstantInitializer(value=1.0)) + param = fluid.ParamAttr(name='alpha', + initializer=fluid.initializer.Constant(value=1.0)) + self.alpha = self.create_parameter(param, shape=(1, ), dtype='float32') self.pos_inp = get_sinusoid_encoding_table(1024, self.num_hidden, padding_idx=0) self.pos_emb = dg.Embedding(name_scope=self.full_name(), size=[1024, num_hidden], @@ -31,8 +31,8 @@ class Encoder(dg.Layer): def forward(self, x, positional): if fluid.framework._dygraph_tracer()._train_mode: - query_mask = (positional != 0).astype(float) - mask = (positional != 0).astype(float) + query_mask = (positional != 0).astype(np.float32) + mask = (positional != 0).astype(np.float32) mask = fluid.layers.expand(fluid.layers.unsqueeze(mask,[1]), [1,x.shape[1], 1]) else: query_mask, mask = None, None @@ -42,7 +42,7 @@ class Encoder(dg.Layer): # Get positional encoding - positional = self.pos_emb(fluid.layers.unsqueeze(positional, axes=[-1])) + positional = self.pos_emb(positional) x = positional * self.alpha + x #(N, T, C) @@ -102,14 +102,14 @@ class Decoder(dg.Layer): if fluid.framework._dygraph_tracer()._train_mode: #zeros = np.zeros(positional.shape, dtype=np.float32) - m_mask = (positional != 0).astype(float) + m_mask = (positional != 0).astype(np.float32) mask = np.repeat(np.expand_dims(m_mask.numpy() == 0, axis=1), decoder_len, axis=1) mask = mask + 
np.repeat(np.expand_dims(np.triu(np.ones([decoder_len, decoder_len]), 1), axis=0) ,batch_size, axis=0) mask = fluid.layers.cast(dg.to_variable(mask == 0), np.float32) # (batch_size, decoder_len, decoder_len) - zero_mask = fluid.layers.expand(fluid.layers.unsqueeze((c_mask != 0).astype(float), axes=2), [1,1,decoder_len]) + zero_mask = fluid.layers.expand(fluid.layers.unsqueeze((c_mask != 0).astype(np.float32), axes=2), [1,1,decoder_len]) # (batch_size, decoder_len, seq_len) zero_mask = fluid.layers.transpose(zero_mask, [0,2,1]) @@ -125,7 +125,7 @@ class Decoder(dg.Layer): query = self.linear(query) # Get position embedding - positional = self.pos_emb(fluid.layers.unsqueeze(positional, axes=[-1])) + positional = self.pos_emb(positional) query = positional * self.alpha + query #positional dropout diff --git a/parakeet/models/transformerTTS/train_postnet.py b/parakeet/models/transformerTTS/train_postnet.py index 6e32f9c..8beeece 100644 --- a/parakeet/models/transformerTTS/train_postnet.py +++ b/parakeet/models/transformerTTS/train_postnet.py @@ -1,13 +1,12 @@ from network import * -from preprocess import batch_examples_postnet, LJSpeech from tensorboardX import SummaryWriter import os from tqdm import tqdm -from parakeet.data.datacargo import DataCargo from pathlib import Path import jsonargparse from parse import add_config_options_to_parser from pprint import pprint +from data import LJSpeechLoader class MyDataParallel(dg.parallel.DataParallel): """ @@ -27,21 +26,15 @@ class MyDataParallel(dg.parallel.DataParallel): object.__getattribute__(self, "_sub_layers")["_layers"], key) -def main(): - parser = jsonargparse.ArgumentParser(description="Train postnet model", formatter_class='default_argparse') - add_config_options_to_parser(parser) - cfg = parser.parse_args('-c ./config/train_postnet.yaml'.split()) +def main(cfg): + + local_rank = dg.parallel.Env().local_rank if cfg.use_data_parallel else 0 + nranks = dg.parallel.Env().nranks if cfg.use_data_parallel else 1 - local_rank = dg.parallel.Env().local_rank - if local_rank == 0: # Print the whole config setting. 
pprint(jsonargparse.namespace_to_dict(cfg)) - LJSPEECH_ROOT = Path(cfg.data_path) - dataset = LJSpeech(LJSPEECH_ROOT) - dataloader = DataCargo(dataset, batch_size=cfg.batch_size, shuffle=True, collate_fn=batch_examples_postnet, drop_last=True) - global_step = 0 place = (fluid.CUDAPlace(dg.parallel.Env().dev_id) if cfg.use_data_parallel else fluid.CUDAPlace(0) @@ -50,35 +43,10 @@ def main(): if not os.path.exists(cfg.log_dir): os.mkdir(cfg.log_dir) path = os.path.join(cfg.log_dir,'postnet') - writer = SummaryWriter(path) - with dg.guard(place): - # dataloader - input_fields = { - 'names': ['mel', 'mag'], - 'shapes': - [[cfg.batch_size, None, 80], [cfg.batch_size, None, 257]], - 'dtypes': ['float32', 'float32'], - 'lod_levels': [0, 0] - } + writer = SummaryWriter(path) if local_rank == 0 else None - inputs = [ - fluid.data( - name=input_fields['names'][i], - shape=input_fields['shapes'][i], - dtype=input_fields['dtypes'][i], - lod_level=input_fields['lod_levels'][i]) - for i in range(len(input_fields['names'])) - ] - - reader = fluid.io.DataLoader.from_generator( - feed_list=inputs, - capacity=32, - iterable=True, - use_double_buffer=True, - return_list=True) - - + with dg.guard(place): model = ModelPostNet('postnet', cfg) model.train() @@ -94,9 +62,10 @@ def main(): strategy = dg.parallel.prepare_context() model = MyDataParallel(model, strategy) + reader = LJSpeechLoader(cfg, nranks, local_rank, is_postnet=True).reader() + for epoch in range(cfg.epochs): - reader.set_batch_generator(dataloader, place) - pbar = tqdm(reader()) + pbar = tqdm(reader) for i, data in enumerate(pbar): pbar.set_description('Processing at epoch %d'%epoch) mel, mag = data @@ -109,27 +78,30 @@ def main(): loss = layers.mean(layers.abs(layers.elementwise_sub(mag_pred, mag))) if cfg.use_data_parallel: loss = model.scale_loss(loss) - - writer.add_scalars('training_loss',{ - 'loss':loss.numpy(), - }, global_step) - - loss.backward() - if cfg.use_data_parallel: + loss.backward() model.apply_collective_grads() + else: + loss.backward() optimizer.minimize(loss, grad_clip = fluid.dygraph_grad_clip.GradClipByGlobalNorm(1)) model.clear_gradients() - - if global_step % cfg.save_step == 0: - if not os.path.exists(cfg.save_path): - os.mkdir(cfg.save_path) - save_path = os.path.join(cfg.save_path,'postnet/%d' % global_step) - dg.save_dygraph(model.state_dict(), save_path) - dg.save_dygraph(optimizer.state_dict(), save_path) - + if local_rank==0: + writer.add_scalars('training_loss',{ + 'loss':loss.numpy(), + }, global_step) + if global_step % cfg.save_step == 0: + if not os.path.exists(cfg.save_path): + os.mkdir(cfg.save_path) + save_path = os.path.join(cfg.save_path,'postnet/%d' % global_step) + dg.save_dygraph(model.state_dict(), save_path) + dg.save_dygraph(optimizer.state_dict(), save_path) + if local_rank==0: + writer.close() if __name__ == '__main__': - main() \ No newline at end of file + parser = jsonargparse.ArgumentParser(description="Train postnet model", formatter_class='default_argparse') + add_config_options_to_parser(parser) + cfg = parser.parse_args('-c ./config/train_postnet.yaml'.split()) + main(cfg) \ No newline at end of file diff --git a/parakeet/models/transformerTTS/train_transformer.py b/parakeet/models/transformerTTS/train_transformer.py index 0cdbf37..065be6d 100644 --- a/parakeet/models/transformerTTS/train_transformer.py +++ b/parakeet/models/transformerTTS/train_transformer.py @@ -1,16 +1,15 @@ -from preprocess import batch_examples, LJSpeech import os from tqdm import tqdm import paddle.fluid.dygraph 
as dg import paddle.fluid.layers as layers from network import * from tensorboardX import SummaryWriter -from parakeet.data.datacargo import DataCargo from pathlib import Path import jsonargparse from parse import add_config_options_to_parser from pprint import pprint from matplotlib import cm +from data import LJSpeechLoader class MyDataParallel(dg.parallel.DataParallel): """ @@ -30,21 +29,14 @@ class MyDataParallel(dg.parallel.DataParallel): object.__getattribute__(self, "_sub_layers")["_layers"], key) -def main(): - parser = jsonargparse.ArgumentParser(description="Train TransformerTTS model", formatter_class='default_argparse') - add_config_options_to_parser(parser) - cfg = parser.parse_args('-c ./config/train_transformer.yaml'.split()) - - local_rank = dg.parallel.Env().local_rank +def main(cfg): + local_rank = dg.parallel.Env().local_rank if cfg.use_data_parallel else 0 + nranks = dg.parallel.Env().nranks if cfg.use_data_parallel else 1 if local_rank == 0: # Print the whole config setting. pprint(jsonargparse.namespace_to_dict(cfg)) - - LJSPEECH_ROOT = Path(cfg.data_path) - dataset = LJSpeech(LJSPEECH_ROOT) - dataloader = DataCargo(dataset, batch_size=cfg.batch_size, shuffle=True, collate_fn=batch_examples, drop_last=True) global_step = 0 place = (fluid.CUDAPlace(dg.parallel.Env().dev_id) if cfg.use_data_parallel else fluid.CUDAPlace(0) @@ -57,39 +49,13 @@ def main(): writer = SummaryWriter(path) if local_rank == 0 else None with dg.guard(place): - if cfg.use_data_parallel: - strategy = dg.parallel.prepare_context() - - # dataloader - input_fields = { - 'names': ['character', 'mel', 'mel_input', 'pos_text', 'pos_mel', 'text_len'], - 'shapes': - [[cfg.batch_size, None], [cfg.batch_size, None, 80], [cfg.batch_size, None, 80], [cfg.batch_size, 1], [cfg.batch_size, 1], [cfg.batch_size, 1]], - 'dtypes': ['float32', 'float32', 'float32', 'int64', 'int64', 'int64'], - 'lod_levels': [0, 0, 0, 0, 0, 0] - } - - inputs = [ - fluid.data( - name=input_fields['names'][i], - shape=input_fields['shapes'][i], - dtype=input_fields['dtypes'][i], - lod_level=input_fields['lod_levels'][i]) - for i in range(len(input_fields['names'])) - ] - - reader = fluid.io.DataLoader.from_generator( - feed_list=inputs, - capacity=32, - iterable=True, - use_double_buffer=True, - return_list=True) - model = Model('transtts', cfg) model.train() optimizer = fluid.optimizer.AdamOptimizer(learning_rate=dg.NoamDecay(1/(4000 *( cfg.lr ** 2)), 4000)) - + + reader = LJSpeechLoader(cfg, nranks, local_rank).reader() + if cfg.checkpoint_path is not None: model_dict, opti_dict = fluid.dygraph.load_dygraph(cfg.checkpoint_path) model.set_dict(model_dict) @@ -97,11 +63,11 @@ def main(): print("load checkpoint!!!") if cfg.use_data_parallel: + strategy = dg.parallel.prepare_context() model = MyDataParallel(model, strategy) - + for epoch in range(cfg.epochs): - reader.set_batch_generator(dataloader, place) - pbar = tqdm(reader()) + pbar = tqdm(reader) for i, data in enumerate(pbar): pbar.set_description('Processing at epoch %d'%epoch) character, mel, mel_input, pos_text, pos_mel, text_length = data @@ -114,40 +80,41 @@ def main(): post_mel_loss = layers.mean(layers.abs(layers.elementwise_sub(postnet_pred, mel))) loss = mel_loss + post_mel_loss + if local_rank==0: + writer.add_scalars('training_loss', { + 'mel_loss':mel_loss.numpy(), + 'post_mel_loss':post_mel_loss.numpy(), + }, global_step) + + writer.add_scalars('alphas', { + 'encoder_alpha':model.encoder.alpha.numpy(), + 'decoder_alpha':model.decoder.alpha.numpy(), + }, global_step) + 
+ writer.add_scalar('learning_rate', optimizer._learning_rate.step().numpy(), global_step) + + if global_step % cfg.image_step == 1: + for i, prob in enumerate(attn_probs): + for j in range(4): + x = np.uint8(cm.viridis(prob.numpy()[j*16]) * 255) + writer.add_image('Attention_enc_%d_0'%global_step, x, i*4+j, dataformats="HWC") + + for i, prob in enumerate(attn_enc): + for j in range(4): + x = np.uint8(cm.viridis(prob.numpy()[j*16]) * 255) + writer.add_image('Attention_enc_%d_0'%global_step, x, i*4+j, dataformats="HWC") + + for i, prob in enumerate(attn_dec): + for j in range(4): + x = np.uint8(cm.viridis(prob.numpy()[j*16]) * 255) + writer.add_image('Attention_dec_%d_0'%global_step, x, i*4+j, dataformats="HWC") + if cfg.use_data_parallel: loss = model.scale_loss(loss) - - writer.add_scalars('training_loss', { - 'mel_loss':mel_loss.numpy(), - 'post_mel_loss':post_mel_loss.numpy(), - }, global_step) - - writer.add_scalars('alphas', { - 'encoder_alpha':model.encoder.alpha.numpy(), - 'decoder_alpha':model.decoder.alpha.numpy(), - }, global_step) - - writer.add_scalar('learning_rate', optimizer._learning_rate.step().numpy(), global_step) - - if global_step % cfg.image_step == 1: - for i, prob in enumerate(attn_probs): - for j in range(4): - x = np.uint8(cm.viridis(prob.numpy()[j*16]) * 255) - writer.add_image('Attention_enc_%d_0'%global_step, x, i*4+j, dataformats="HWC") - - for i, prob in enumerate(attn_enc): - for j in range(4): - x = np.uint8(cm.viridis(prob.numpy()[j*16]) * 255) - writer.add_image('Attention_enc_%d_0'%global_step, x, i*4+j, dataformats="HWC") - - for i, prob in enumerate(attn_dec): - for j in range(4): - x = np.uint8(cm.viridis(prob.numpy()[j*16]) * 255) - writer.add_image('Attention_dec_%d_0'%global_step, x, i*4+j, dataformats="HWC") - - loss.backward() - if cfg.use_data_parallel: + loss.backward() model.apply_collective_grads() + else: + loss.backward() optimizer.minimize(loss, grad_clip = fluid.dygraph_grad_clip.GradClipByGlobalNorm(1)) model.clear_gradients() @@ -163,4 +130,7 @@ def main(): if __name__ =='__main__': - main() \ No newline at end of file + parser = jsonargparse.ArgumentParser(description="Train TransformerTTS model", formatter_class='default_argparse') + add_config_options_to_parser(parser) + cfg = parser.parse_args('-c ./config/train_transformer.yaml'.split()) + main(cfg) \ No newline at end of file From 0e18d600572ca1e9461cced0f0a470b503c5c900 Mon Sep 17 00:00:00 2001 From: Kexin Zhao Date: Thu, 19 Dec 2019 00:03:06 -0800 Subject: [PATCH 06/10] refine code --- parakeet/models/waveflow/benchmark.py | 71 +++++++++++ ...4_layer8x8.yaml => waveflow_ljspeech.yaml} | 0 ...flow_ljspeech_sqz16_r64_layer8x8_s123.yaml | 24 ---- parakeet/models/waveflow/data.py | 8 +- parakeet/models/waveflow/requirements.txt | 3 - parakeet/models/waveflow/train.py | 25 ---- parakeet/models/waveflow/utils.py | 21 ---- parakeet/models/waveflow/waveflow.py | 56 ++++----- parakeet/models/waveflow/waveflow_modules.py | 113 +++++++++++------- 9 files changed, 170 insertions(+), 151 deletions(-) create mode 100644 parakeet/models/waveflow/benchmark.py rename parakeet/models/waveflow/configs/{waveflow_ljspeech_sqz16_r64_layer8x8.yaml => waveflow_ljspeech.yaml} (100%) delete mode 100644 parakeet/models/waveflow/configs/waveflow_ljspeech_sqz16_r64_layer8x8_s123.yaml delete mode 100644 parakeet/models/waveflow/requirements.txt diff --git a/parakeet/models/waveflow/benchmark.py b/parakeet/models/waveflow/benchmark.py new file mode 100644 index 0000000..b2949d2 --- /dev/null +++ 
b/parakeet/models/waveflow/benchmark.py @@ -0,0 +1,71 @@ +import os +import random +from pprint import pprint + +import jsonargparse +import numpy as np +import paddle.fluid.dygraph as dg +from paddle import fluid + +import utils +from waveflow import WaveFlow + + +def add_options_to_parser(parser): + parser.add_argument('--model', type=str, default='waveflow', + help="general name of the model") + parser.add_argument('--name', type=str, + help="specific name of the training model") + parser.add_argument('--root', type=str, + help="root path of the LJSpeech dataset") + + parser.add_argument('--use_gpu', type=bool, default=True, + help="option to use gpu training") + + parser.add_argument('--iteration', type=int, default=None, + help=("which iteration of checkpoint to load, " + "default to load the latest checkpoint")) + parser.add_argument('--checkpoint', type=str, default=None, + help="path of the checkpoint to load") + + +def benchmark(config): + pprint(jsonargparse.namespace_to_dict(config)) + + # Get checkpoint directory path. + run_dir = os.path.join("runs", config.model, config.name) + checkpoint_dir = os.path.join(run_dir, "checkpoint") + + # Configurate device. + place = fluid.CUDAPlace(0) if config.use_gpu else fluid.CPUPlace() + + with dg.guard(place): + # Fix random seed. + seed = config.seed + random.seed(seed) + np.random.seed(seed) + fluid.default_startup_program().random_seed = seed + fluid.default_main_program().random_seed = seed + print("Random Seed: ", seed) + + # Build model. + model = WaveFlow(config, checkpoint_dir) + model.build(training=False) + + # Run model inference. + model.benchmark() + + +if __name__ == "__main__": + # Create parser. + parser = jsonargparse.ArgumentParser( + description="Synthesize audio using WaveNet model", + formatter_class='default_argparse') + add_options_to_parser(parser) + utils.add_config_options_to_parser(parser) + + # Parse argument from both command line and yaml config file. + # For conflicting updates to the same field, + # the preceding update will be overwritten by the following one. 
+ config = parser.parse_args() + benchmark(config) diff --git a/parakeet/models/waveflow/configs/waveflow_ljspeech_sqz16_r64_layer8x8.yaml b/parakeet/models/waveflow/configs/waveflow_ljspeech.yaml similarity index 100% rename from parakeet/models/waveflow/configs/waveflow_ljspeech_sqz16_r64_layer8x8.yaml rename to parakeet/models/waveflow/configs/waveflow_ljspeech.yaml diff --git a/parakeet/models/waveflow/configs/waveflow_ljspeech_sqz16_r64_layer8x8_s123.yaml b/parakeet/models/waveflow/configs/waveflow_ljspeech_sqz16_r64_layer8x8_s123.yaml deleted file mode 100644 index 7d45212..0000000 --- a/parakeet/models/waveflow/configs/waveflow_ljspeech_sqz16_r64_layer8x8_s123.yaml +++ /dev/null @@ -1,24 +0,0 @@ -valid_size: 16 -segment_length: 16000 -sample_rate: 22050 -fft_window_shift: 256 -fft_window_size: 1024 -fft_size: 1024 -mel_bands: 80 -mel_fmin: 0.0 -mel_fmax: 8000.0 - -seed: 123 -learning_rate: 0.0002 -batch_size: 8 -test_every: 2000 -save_every: 5000 -max_iterations: 2000000 - -sigma: 1.0 -n_flows: 8 -n_group: 16 -n_layers: 8 -n_channels: 64 -kernel_h: 3 -kernel_w: 3 diff --git a/parakeet/models/waveflow/data.py b/parakeet/models/waveflow/data.py index ddaf104..d89fb7b 100644 --- a/parakeet/models/waveflow/data.py +++ b/parakeet/models/waveflow/data.py @@ -4,7 +4,6 @@ import librosa import numpy as np from paddle import fluid -import utils from parakeet.datasets import ljspeech from parakeet.data import dataset from parakeet.data.batch import SpecBatcher, WavBatcher @@ -12,8 +11,6 @@ from parakeet.data.datacargo import DataCargo from parakeet.data.sampler import DistributedSampler, BatchSampler from scipy.io.wavfile import read -MAX_WAV_VALUE = 32768.0 - class Dataset(ljspeech.LJSpeech): def __init__(self, config): @@ -78,10 +75,9 @@ class Subset(dataset.Dataset): audio = np.pad(audio, (0, segment_length - audio.shape[0]), mode='constant', constant_values=0) - # Normalize audio. - audio = audio.astype(np.float32) / MAX_WAV_VALUE + # Normalize audio to the [-1, 1] range. + audio = audio.astype(np.float32) / 32768.0 mel = self.get_mel(audio) - #print("mel = {}, dtype {}, shape {}".format(mel, mel.dtype, mel.shape)) return audio, mel diff --git a/parakeet/models/waveflow/requirements.txt b/parakeet/models/waveflow/requirements.txt deleted file mode 100644 index f575339..0000000 --- a/parakeet/models/waveflow/requirements.txt +++ /dev/null @@ -1,3 +0,0 @@ -paddlepaddle-gpu==1.6.1.post97 -tensorboardX==1.9 -librosa==0.7.1 diff --git a/parakeet/models/waveflow/train.py b/parakeet/models/waveflow/train.py index a125d97..89b787a 100644 --- a/parakeet/models/waveflow/train.py +++ b/parakeet/models/waveflow/train.py @@ -14,8 +14,6 @@ import slurm import utils from waveflow import WaveFlow -MAXIMUM_SAVE_TIME = 10 * 60 - def add_options_to_parser(parser): parser.add_argument('--model', type=str, default='waveflow', @@ -35,8 +33,6 @@ def add_options_to_parser(parser): "default to load the latest checkpoint")) parser.add_argument('--checkpoint', type=str, default=None, help="path of the checkpoint to load") - parser.add_argument('--slurm', type=bool, default=False, - help="whether you are using slurm to submit training jobs") def train(config): @@ -85,13 +81,6 @@ def train(config): else: iteration = int(config.checkpoint.split('/')[-1].split('-')[-1]) - # Get restart command if using slurm. 
- if config.slurm: - resume_command, death_time = slurm.restart_command() - if rank == 0: - print("Restart command:", " ".join(resume_command)) - done = False - while iteration < config.max_iterations: # Run one single training step. model.train_step(iteration) @@ -102,20 +91,6 @@ def train(config): # Run validation step. model.valid_step(iteration) - # Check whether reaching the time limit. - if config.slurm: - done = (death_time is not None and death_time - time.time() < - MAXIMUM_SAVE_TIME) - - if rank == 0 and done: - print("Saving progress before exiting.") - model.save(iteration) - - print("Running restart command:", " ".join(resume_command)) - # Submit restart command. - subprocess.check_call(resume_command) - break - if rank == 0 and iteration % config.save_every == 0: # Save parameters. model.save(iteration) diff --git a/parakeet/models/waveflow/utils.py b/parakeet/models/waveflow/utils.py index 494a409..3baeb60 100644 --- a/parakeet/models/waveflow/utils.py +++ b/parakeet/models/waveflow/utils.py @@ -57,27 +57,6 @@ def add_config_options_to_parser(parser): parser.add_argument('--config', action=jsonargparse.ActionConfigFile) -def pad_to_size(array, length, pad_with=0.0): - """ - Pad an array on the first (length) axis to a given length. - """ - padding = length - array.shape[0] - assert padding >= 0, "Padding required was less than zero" - - paddings = [(0, 0)] * len(array.shape) - paddings[0] = (0, padding) - - return np.pad(array, paddings, mode='constant', constant_values=pad_with) - - -def calculate_context_size(config): - dilations = list( - itertools.islice( - itertools.cycle(config.dilation_block), config.layers)) - config.context_size = sum(dilations) + 1 - print("Context size is", config.context_size) - - def load_latest_checkpoint(checkpoint_dir, rank=0): checkpoint_path = os.path.join(checkpoint_dir, "checkpoint") # Create checkpoint index file if not exist. diff --git a/parakeet/models/waveflow/waveflow.py b/parakeet/models/waveflow/waveflow.py index b362c2d..4935d42 100644 --- a/parakeet/models/waveflow/waveflow.py +++ b/parakeet/models/waveflow/waveflow.py @@ -2,11 +2,10 @@ import itertools import os import time -#import librosa -from scipy.io.wavfile import write import numpy as np import paddle.fluid.dygraph as dg from paddle import fluid +from scipy.io.wavfile import write import utils from data import LJSpeech @@ -29,18 +28,6 @@ class WaveFlow(): self.trainloader = dataset.trainloader self.validloader = dataset.validloader -# if self.rank == 0: -# for i, (audios, mels) in enumerate(self.validloader()): -# print("audios {}, mels {}".format(audios.dtype, mels.dtype)) -# print("{}: rank {}, audios {}, mels {}".format( -# i, self.rank, audios.shape, mels.shape)) -# -# for i, (audios, mels) in enumerate(self.trainloader): -# print("{}: rank {}, audios {}, mels {}".format( -# i, self.rank, audios.shape, mels.shape)) -# -# exit() - waveflow = WaveFlowModule("waveflow", config) # Dry run once to create and initalize all necessary parameters. 
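As an aside on the audio scaling convention used in these changes: `data.py` above maps 16-bit PCM into floats in `[-1, 1]` by dividing by 32768.0, and the synthesis path below multiplies by 32768.0 again before writing int16 wav files. A minimal NumPy sketch of that round trip (sample values are arbitrary):

```python
import numpy as np

pcm = np.array([-32768, -16384, 0, 16384, 32767], dtype=np.int16)

normalized = pcm.astype(np.float32) / 32768.0       # roughly [-1.0, 1.0)
restored = (normalized * 32768.0).astype(np.int16)  # back to int16 samples

assert np.array_equal(pcm, restored)
```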
@@ -96,8 +83,6 @@ class WaveFlow(): else: loss.backward() - current_lr = self.optimizer._learning_rate - self.optimizer.minimize(loss, parameter_list=self.waveflow.parameters()) self.waveflow.clear_gradients() @@ -113,7 +98,6 @@ class WaveFlow(): tb = self.tb_logger tb.add_scalar("Train-Loss-Rank-0", loss_val, iteration) - tb.add_scalar("Learning-Rate", current_lr, iteration) @dg.no_grad def valid_step(self, iteration): @@ -161,34 +145,44 @@ class WaveFlow(): if sample is not None: mels_list = [mels_list[sample]] - audio_times = [] - inf_times = [] for sample, mel in enumerate(mels_list): filename = "{}/valid_{}.wav".format(output, sample) print("Synthesize sample {}, save as {}".format(sample, filename)) start_time = time.time() - audio = self.waveflow.synthesize(mel) + audio = self.waveflow.synthesize(mel, sigma=self.config.sigma) syn_time = time.time() - start_time - audio_time = audio.shape[0] / 22050 - print("audio time {}, synthesis time {}, speedup: {}".format( - audio_time, syn_time, audio_time / syn_time)) + audio = audio[0] + audio_time = audio.shape[0] / self.config.sample_rate + print("audio time {:.4f}, synthesis time {:.4f}".format( + audio_time, syn_time)) - #librosa.output.write_wav(filename, syn_audio, - # sr=config.sample_rate) + # Denormalize audio from [-1, 1] to [-32768, 32768] int16 range. audio = audio.numpy() * 32768.0 audio = audio.astype('int16') write(filename, config.sample_rate, audio) - audio_times.append(audio_time) - inf_times.append(syn_time) + @dg.no_grad + def benchmark(self): + self.waveflow.eval() - total_audio = sum(audio_times) - total_inf = sum(inf_times) + mels_list = [mels for _, mels in self.validloader()] + mel = fluid.layers.concat(mels_list, axis=2) + mel = mel[:, :, :864] + batch_size = 8 + mel = fluid.layers.expand(mel, [batch_size, 1, 1]) - print("Total audio: {}, total inf time {}, speedup: {}".format( - total_audio, total_inf, total_audio / total_inf)) + for i in range(10): + start_time = time.time() + audio = self.waveflow.synthesize(mel, sigma=self.config.sigma) + print("audio.shape = ", audio.shape) + syn_time = time.time() - start_time + + audio_time = audio.shape[1] * batch_size / self.config.sample_rate + print("audio time {:.4f}, synthesis time {:.4f}".format( + audio_time, syn_time)) + print("{} X real-time".format(audio_time / syn_time)) def save(self, iteration): utils.save_latest_parameters(self.checkpoint_dir, iteration, diff --git a/parakeet/models/waveflow/waveflow_modules.py b/parakeet/models/waveflow/waveflow_modules.py index 45b46a6..39cb598 100644 --- a/parakeet/models/waveflow/waveflow_modules.py +++ b/parakeet/models/waveflow/waveflow_modules.py @@ -23,7 +23,6 @@ def set_param_attr(layer, c_in=1): def unfold(x, n_group): length = x.shape[-1] - #assert length % n_group == 0 new_shape = x.shape[:-1] + [length // n_group, n_group] return fluid.layers.reshape(x, new_shape) @@ -192,13 +191,53 @@ class Flow(dg.Layer): return self.end(output) + def infer(self, audio, mel, queues): + audio = self.start(audio) -def debug(x, msg): - y = x.numpy() - print(msg + " :\n", y) - print("shape: ", y.shape) - print("dtype: ", y.dtype) - print("") + for i in range(self.n_layers): + dilation_h = self.dilation_h_list[i] + dilation_w = 2 ** i + + state_size = dilation_h * (self.kernel_h - 1) + queue = queues[i] + + if len(queue) == 0: + for j in range(state_size): + queue.append(fluid.layers.zeros_like(audio)) + + state = queue[0:state_size] + state = fluid.layers.concat([*state, audio], axis=2) + + queue.pop(0) + queue.append(audio) + + # 
Pad height dim (n_group): causal convolution + # Pad width dim (time): dialated non-causal convolution + pad_top, pad_bottom = 0, 0 + pad_left = int((self.kernel_w-1) * dilation_w / 2) + pad_right = int((self.kernel_w-1) * dilation_w / 2) + state = fluid.layers.pad2d(state, + paddings=[pad_top, pad_bottom, pad_left, pad_right]) + + hidden = self.in_layers[i](state) + cond_hidden = self.cond_layers[i](mel) + in_acts = hidden + cond_hidden + out_acts = fluid.layers.tanh(in_acts[:, :self.n_channels, :]) * \ + fluid.layers.sigmoid(in_acts[:, self.n_channels:, :]) + res_skip_acts = self.res_skip_layers[i](out_acts) + + if i < self.n_layers - 1: + audio += res_skip_acts[:, :self.n_channels, :, :] + skip_acts = res_skip_acts[:, self.n_channels:, :, :] + else: + skip_acts = res_skip_acts + + if i == 0: + output = skip_acts + else: + output += skip_acts + + return self.end(output) class WaveFlowModule(dg.Layer): @@ -206,7 +245,9 @@ class WaveFlowModule(dg.Layer): super(WaveFlowModule, self).__init__(name_scope) self.n_flows = config.n_flows self.n_group = config.n_group + self.n_layers = config.n_layers assert self.n_group % 2 == 0 + assert self.n_flows % 2 == 0 self.conditioner = Conditioner(self.full_name()) self.flows = [] @@ -215,14 +256,16 @@ class WaveFlowModule(dg.Layer): self.flows.append(flow) self.add_sublayer("flow_{}".format(i), flow) - self.perms = [[15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0], - [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0], - [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0], - [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0], - [7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8], - [7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8], - [7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8], - [7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8]] + self.perms = [] + half = self.n_group // 2 + for i in range(self.n_flows): + perm = list(range(self.n_group)) + if i < self.n_flows // 2: + perm = perm[::-1] + else: + perm[:half] = reversed(perm[:half]) + perm[half:] = reversed(perm[half:]) + self.perms.append(perm) def forward(self, audio, mel): mel = self.conditioner(mel) @@ -266,19 +309,13 @@ class WaveFlowModule(dg.Layer): return z, log_s_list def synthesize(self, mel, sigma=1.0): - #debug(mel, "mel") mel = self.conditioner.infer(mel) - #debug(mel, "mel after conditioner") - # From [bs, mel_bands, time] to [bs, mel_bands, n_group, time/n_group] mel = fluid.layers.transpose(unfold(mel, self.n_group), [0, 1, 3, 2]) - #debug(mel, "after group") audio = fluid.layers.gaussian_random( shape=[mel.shape[0], 1, mel.shape[2], mel.shape[3]], std=sigma) - #debug(audio, "audio") - for i in reversed(range(self.n_flows)): # Permute over the height dimension. 
audio_slices = [audio[:, :, j, :] for j in self.perms[i]] @@ -287,34 +324,28 @@ class WaveFlowModule(dg.Layer): mel = fluid.layers.stack(mel_slices, axis=2) audio_list = [] - audio_0 = audio[:, :, :1, :] + audio_0 = audio[:, :, 0:1, :] audio_list.append(audio_0) + audio_h = audio_0 + queues = [[] for _ in range(self.n_layers)] for h in range(1, self.n_group): - # inputs: [bs, 1, h, time/n_group] - inputs = fluid.layers.concat(audio_list, axis=2) - conds = mel[:, :, 1:(h+1), :] - outputs = self.flows[i](inputs, conds) + inputs = audio_h + conds = mel[:, :, h:(h+1), :] + outputs = self.flows[i].infer(inputs, conds, queues) - log_s = outputs[:, :1, (h-1):h, :] - b = outputs[:, 1:, (h-1):h, :] - audio_h = (audio[:, :, h:(h+1), :] - b) / fluid.layers.exp(log_s) + log_s = outputs[:, 0:1, :, :] + b = outputs[:, 1:, :, :] + audio_h = (audio[:, :, h:(h+1), :] - b) / \ + fluid.layers.exp(log_s) audio_list.append(audio_h) audio = fluid.layers.concat(audio_list, axis=2) - #print("audio.shape =", audio.shape) - # Assume batch size = 1 - # audio: [n_group, time/n_group] - audio = fluid.layers.squeeze(audio, [0, 1]) - # audio: [time] + # audio: [bs, n_group, time/n_group] + audio = fluid.layers.squeeze(audio, [1]) + # audio: [bs, time] audio = fluid.layers.reshape( - fluid.layers.transpose(audio, [1, 0]), [-1]) - #print("audio.shape =", audio.shape) + fluid.layers.transpose(audio, [0, 2, 1]), [audio.shape[0], -1]) return audio - - def start_new_sequence(self): - for layer in self.sublayers(): - if isinstance(layer, conv.Conv1D): - layer.start_new_sequence() From 4af577ad723788a0dd6c10bba1f357a263a92149 Mon Sep 17 00:00:00 2001 From: zhaokexin01 Date: Thu, 19 Dec 2019 16:34:22 +0800 Subject: [PATCH 07/10] Update README.md --- parakeet/models/waveflow/README.md | 83 ++++++++++++++++++++++++++++-- 1 file changed, 78 insertions(+), 5 deletions(-) diff --git a/parakeet/models/waveflow/README.md b/parakeet/models/waveflow/README.md index 355ca31..d8072b1 100644 --- a/parakeet/models/waveflow/README.md +++ b/parakeet/models/waveflow/README.md @@ -1,6 +1,28 @@ -### Install +# WaveFlow with Paddle Fluid -pip install -r requirements.txt +Paddle fluid implementation of [WaveFlow: A Compact Flow-based Model for Raw Audio](https://arxiv.org/abs/1912.01219). + +## Project Structure +```text +├── configs # yaml configuration files of preset model hyperparameters +├── benchmark.py # benchmark code to test the speed of batched speech synthesis +├── data.py # dataset and dataloader settings for LJSpeech +├── synthesis.py # script for speech synthesis +├── train.py # script for model training +├── utils.py # helper functions for e.g., model checkpointing +├── waveflow.py # WaveFlow model high level APIs +└── waveflow_modules.py # WaveFlow model implementation +``` + +## Usage + +There are many hyperparameters to be tuned depending on the specification of model and dataset you are working on. +We provide `wavenet_ljspeech.yaml` as a hyperparameter set that works well on the LJSpeech dataset. + +Note that `train.py`, `synthesis.py`, and `benchmark.py` all accept a `--config` parameter. To ensure consistency, you should use the same config yaml file for both training, synthesizing and benchmarking. You can also overwrite these preset hyperparameters with command line by updating parameters after `--config`. +For example `--config=${yaml} --batch_size=8` can overwrite the corresponding hyperparameters in the `${yaml}` config file. For more details about these hyperparameters, check `utils.add_config_options_to_parser`. 
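The override behaviour described above comes from jsonargparse's config-file action (see `utils.add_config_options_to_parser`): arguments are applied left to right, so a flag placed after `--config` replaces the value read from the yaml file. A minimal sketch of the mechanism, assuming the config file exists at the path shown:

```python
import jsonargparse

parser = jsonargparse.ArgumentParser()
parser.add_argument('--config', action=jsonargparse.ActionConfigFile)
parser.add_argument('--batch_size', type=int, default=4)

# The later --batch_size=8 wins over whatever batch_size the yaml file specifies.
cfg = parser.parse_args(
    ['--config=./configs/waveflow_ljspeech.yaml', '--batch_size=8'])
print(cfg.batch_size)  # 8
```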
+ +Note that you also need to specify some additional parameters for `train.py`, `synthesis.py`, and `benchmark.py`, and the details can be found in `train.add_options_to_parser`, `synthesis.add_options_to_parser`, and `benchmark.add_options_to_parser`, respectively. ### Dataset @@ -18,21 +40,72 @@ In this example, assume that the path of unzipped LJSpeech dataset is `./data/LJ ```bash export PYTHONPATH="${PYTHONPATH}:${PWD}/../../.." export CUDA_VISIBLE_DEVICES=0 -python -u train.py --config=${yaml} \ +python -u train.py \ + --config=./configs/waveflow_ljspeech.yaml \ --root=./data/LJSpeech-1.1 \ --name=${ModelName} --batch_size=4 \ --parallel=false --use_gpu=true ``` +#### Save and Load checkpoints + +Our model will save model parameters as checkpoints in `./runs/waveflow/${ModelName}/checkpoint/` every 10000 iterations by default. +The saved checkpoint will have the format of `step-${iteration_number}.pdparams` for model parameters and `step-${iteration_number}.pdopt` for optimizer parameters. + +There are three ways to load a checkpoint and resume training (take an example that you want to load a 500000-iteration checkpoint): +1. Use `--checkpoint=./runs/waveflow/${ModelName}/checkpoint/step-500000` to provide a specific path to load. Note that you only need to provide the base name of the parameter file, which is `step-500000`, no extension name `.pdparams` or `.pdopt` is needed. +2. Use `--iteration=500000`. +3. If you don't specify either `--checkpoint` or `--iteration`, the model will automatically load the latest checkpoint in `./runs/waveflow/${ModelName}/checkpoint`. + ### Train on multiple GPUs ```bash export PYTHONPATH="${PYTHONPATH}:${PWD}/../../.." export CUDA_VISIBLE_DEVICES=0,1,2,3 python -u -m paddle.distributed.launch train.py \ - --config=./configs/waveflow_ljspeech_sqz16_r64_layer8x8.yaml \ + --config=./configs/waveflow_ljspeech.yaml \ --root=./data/LJSpeech-1.1 \ - --name=test_speed --parallel=true --use_gpu=true + --name=${ModelName} --parallel=true --use_gpu=true ``` Use `export CUDA_VISIBLE_DEVICES=0,1,2,3` to set the GPUs that you want to use to be visible. Then the `paddle.distributed.launch` module will use these visible GPUs to do data parallel training in multiprocessing mode. + +### Monitor with Tensorboard + +By default, the logs are saved in `./runs/waveflow/${ModelName}/logs/`. You can monitor logs by tensorboard. + +```bash +tensorboard --logdir=${log_dir} --port=8888 +``` + +### Synthesize from a checkpoint + +Check the [Save and load checkpoint](#save-and-load-checkpoints) section on how to load a specific checkpoint. +The following example will automatically load the latest checkpoint: + +```bash +export PYTHONPATH="${PYTHONPATH}:${PWD}/../../.." +export CUDA_VISIBLE_DEVICES=0 +python -u synthesis.py \ + --config=./configs/waveflow_ljspeech.yaml \ + --root=./data/LJSpeech-1.1 \ + --name=${ModelName} --use_gpu=true \ + --output=./syn_audios \ + --sample=${SAMPLE} \ + --sigma=1.0 +``` + +In this example, `--output` specifies where to save the synthesized audios and `--sample` specifies which sample in the valid dataset (a split from the whole LJSpeech dataset, by default contains the first 16 audio samples) to synthesize based on the mel-spectrograms computed from the ground truth sample audio, e.g., `--sample=0` means to synthesize the first audio in the valid dataset. 
+ +### Benchmarking + +Use the following example to benchmark the speed of batched speech synthesis, which reports how many times faster than real-time: + +```bash +export PYTHONPATH="${PYTHONPATH}:${PWD}/../../.." +export CUDA_VISIBLE_DEVICES=0 +python -u benchmark.py \ + --config=./configs/waveflow_ljspeech.yaml \ + --root=./data/LJSpeech-1.1 \ + --name=${ModelName} --use_gpu=true +``` \ No newline at end of file From 91ab2b34c4ca01dc731f8a97cdd3e0e8914c19e5 Mon Sep 17 00:00:00 2001 From: Kexin Zhao Date: Thu, 19 Dec 2019 00:37:43 -0800 Subject: [PATCH 08/10] small change --- .../waveflow/configs/waveflow_ljspeech.yaml | 4 +- parakeet/models/waveflow/slurm.py | 113 ------------------ 2 files changed, 2 insertions(+), 115 deletions(-) delete mode 100644 parakeet/models/waveflow/slurm.py diff --git a/parakeet/models/waveflow/configs/waveflow_ljspeech.yaml b/parakeet/models/waveflow/configs/waveflow_ljspeech.yaml index f9bbc83..d3548c4 100644 --- a/parakeet/models/waveflow/configs/waveflow_ljspeech.yaml +++ b/parakeet/models/waveflow/configs/waveflow_ljspeech.yaml @@ -12,8 +12,8 @@ seed: 1234 learning_rate: 0.0002 batch_size: 8 test_every: 2000 -save_every: 5000 -max_iterations: 2000000 +save_every: 10000 +max_iterations: 3000000 sigma: 1.0 n_flows: 8 diff --git a/parakeet/models/waveflow/slurm.py b/parakeet/models/waveflow/slurm.py deleted file mode 100644 index de1818c..0000000 --- a/parakeet/models/waveflow/slurm.py +++ /dev/null @@ -1,113 +0,0 @@ -""" -Utility module for restarting training when using SLURM. -""" -import subprocess -import os -import sys -import shlex -import re -import time - - -def job_info(): - """Get information about the current job using `scontrol show job`. - Returns a dict mapping parameter names (e.g. "UserId", "RunTime", etc) to - their values, both as strings. - """ - job_id = int(os.environ["SLURM_JOB_ID"]) - - command = ["scontrol", "show", "job", str(job_id)] - output = subprocess.check_output(command).decode("utf-8") - - # Use a regex to extract the parameter names and values - pattern = "([A-Za-z/]*)=([^ \t\n]*)" - return dict(re.findall(pattern, output)) - - -def parse_hours(text): - """Parse a time format HH or DD-HH into a number of hours.""" - hour_chunks = text.split("-") - if len(hour_chunks) == 1: - return int(hour_chunks[0]) - elif len(hour_chunks) == 2: - return 24 * int(hour_chunks[0]) + int(hour_chunks[1]) - else: - raise ValueError("Unexpected hour format (expected HH or " - "DD-HH, but got {}).".format(text)) - - -def parse_time(text): - """Convert slurm time to an integer. - Expects time to be of the form: - "hours:minutes:seconds" or "day-hours:minutes:seconds". - """ - hours, minutes, seconds = text.split(":") - try: - return parse_hours(hours) * 3600 + int(minutes) * 60 + int(seconds) - except ValueError as e: - raise ValueError("Error parsing time {}. Got error {}.".format( - text, str(e))) - - -def restart_command(): - """Using the environment and SLURM command, create a command that, when, - run, will enqueue a repeat of the current job using `sbatch`. - Return the command as a list of strings, suitable for passing to - `subprocess.check_call` or similar functions. - Returns: - resume_command: list, command to run to restart job. - end_time: int or None; the time the job will end or None - if the job has unlimited runtime. - """ - # Make sure `RunTime` could be parsed correctly. 
- while job_info()["RunTime"] == "INVALID": - time.sleep(1) - - # Get all the necessary information by querying SLURM with this job id - info = job_info() - - try: - num_cpus = int(info["CPUs/Task"]) - except KeyError: - num_cpus = int(os.environ["SLURM_CPUS_PER_TASK"]) - - num_tasks = int(os.environ["SLURM_NTASKS"]) - nodes = info["NumNodes"] - gres, partition = info.get("Gres"), info.get("Partition") - stderr, stdout = info.get("StdErr"), info.get("StdOut") - job_name = info.get("JobName") - command = ["sbatch", "--job-name={}".format(job_name), - "--ntasks={}".format(num_tasks), - "--exclude=asimov-186"] - - if partition: - command.extend(["--partition", partition]) - - if gres and gres != "(null)": - command.extend(["--gres", gres]) - num_gpu = int(gres.split(':')[-1]) - print("number of gpu assigned by slurm is {}".format(num_gpu)) - - if stderr: - command.extend(["--error", stderr]) - - if stdout: - command.extend(["--output", stdout]) - - python = subprocess.check_output( - ["/usr/bin/which", "python3"]).decode("utf-8").strip() - dist_setting = ['-m', 'paddle.distributed.launch'] - wrap_cmd = ["srun", python, '-u'] + dist_setting + sys.argv - - command.append( - "--wrap={}".format(" ".join(shlex.quote(arg) for arg in wrap_cmd))) - time_limit_string = info["TimeLimit"] - if time_limit_string.lower() == "unlimited": - print("UNLIMITED detected: restart OFF, infinite learning ON.", - flush=True) - return command, None - time_limit = parse_time(time_limit_string) - runtime = parse_time(info["RunTime"]) - end_time = time.time() + time_limit - runtime - - return command, end_time From 2179d6d5b0595da2aa9131bc44679f5a71131ffc Mon Sep 17 00:00:00 2001 From: lifuchen Date: Fri, 3 Jan 2020 08:25:17 +0000 Subject: [PATCH 09/10] add FastSpeech --- parakeet/models/dataloader/__init__.py | 0 parakeet/models/dataloader/jlspeech.py | 148 +++++++ parakeet/models/fastspeech/__init__.py | 0 .../models/fastspeech/config/fastapeech.yaml | 41 ++ .../models/fastspeech/config/fastspeech.yaml | 43 ++ parakeet/models/fastspeech/dataset.py | 124 ++++++ parakeet/models/fastspeech/modules.py | 117 ++++++ parakeet/models/fastspeech/network.py | 163 ++++++++ parakeet/models/fastspeech/parse.py | 93 +++++ parakeet/models/fastspeech/train.py | 139 +++++++ parakeet/models/fastspeech/utils.py | 32 ++ .../transformerTTS/config/train_postnet.yaml | 5 +- .../config/train_transformer.yaml | 8 +- parakeet/models/transformerTTS/data.py | 8 +- parakeet/models/transformerTTS/layers.py | 14 +- parakeet/models/transformerTTS/module.py | 385 +++--------------- parakeet/models/transformerTTS/network.py | 118 +++--- parakeet/models/transformerTTS/parse.py | 4 +- parakeet/models/transformerTTS/preprocess.py | 16 +- parakeet/models/transformerTTS/synthesis.py | 4 +- .../models/transformerTTS/train_postnet.py | 5 +- .../transformerTTS/train_transformer.py | 18 +- parakeet/modules/dynamicGRU.py | 44 ++ parakeet/modules/feed_forward.py | 40 ++ parakeet/modules/layers.py | 122 ++++++ parakeet/modules/multihead_attention.py | 84 ++++ parakeet/modules/post_convnet.py | 67 +++ parakeet/modules/prenet.py | 26 ++ .../transformerTTS => modules}/utils.py | 31 ++ 29 files changed, 1457 insertions(+), 442 deletions(-) create mode 100644 parakeet/models/dataloader/__init__.py create mode 100644 parakeet/models/dataloader/jlspeech.py create mode 100644 parakeet/models/fastspeech/__init__.py create mode 100644 parakeet/models/fastspeech/config/fastapeech.yaml create mode 100644 parakeet/models/fastspeech/config/fastspeech.yaml create mode 100644 
parakeet/models/fastspeech/dataset.py create mode 100644 parakeet/models/fastspeech/modules.py create mode 100644 parakeet/models/fastspeech/network.py create mode 100644 parakeet/models/fastspeech/parse.py create mode 100644 parakeet/models/fastspeech/train.py create mode 100644 parakeet/models/fastspeech/utils.py create mode 100644 parakeet/modules/dynamicGRU.py create mode 100644 parakeet/modules/feed_forward.py create mode 100644 parakeet/modules/layers.py create mode 100644 parakeet/modules/multihead_attention.py create mode 100644 parakeet/modules/post_convnet.py create mode 100644 parakeet/modules/prenet.py rename parakeet/{models/transformerTTS => modules}/utils.py (58%) diff --git a/parakeet/models/dataloader/__init__.py b/parakeet/models/dataloader/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/parakeet/models/dataloader/jlspeech.py b/parakeet/models/dataloader/jlspeech.py new file mode 100644 index 0000000..7f39bfb --- /dev/null +++ b/parakeet/models/dataloader/jlspeech.py @@ -0,0 +1,148 @@ +from pathlib import Path +import numpy as np +import pandas as pd +import librosa + +from paddle import fluid +from parakeet import g2p +from parakeet import audio +from parakeet.data.sampler import * +from parakeet.data.datacargo import DataCargo +from parakeet.data.dataset import Dataset +from parakeet.data.batch import TextIDBatcher, SpecBatcher + +class LJSpeechLoader: + def __init__(self, config, nranks, rank, is_vocoder=False): + place = fluid.CUDAPlace(rank) if config.use_gpu else fluid.CPUPlace() + + LJSPEECH_ROOT = Path(config.data_path) + dataset = LJSpeech(LJSPEECH_ROOT, config) + sampler = DistributedSampler(len(dataset), nranks, rank) + + assert config.batch_size % nranks == 0 + each_bs = config.batch_size // nranks + if is_vocoder: + dataloader = DataCargo(dataset, sampler=sampler, batch_size=each_bs, shuffle=True, collate_fn=batch_examples_vocoder, drop_last=True) + else: + dataloader = DataCargo(dataset, sampler=sampler, batch_size=each_bs, shuffle=True, collate_fn=batch_examples, drop_last=True) + + self.reader = fluid.io.DataLoader.from_generator( + capacity=32, + iterable=True, + use_double_buffer=True, + return_list=True) + self.reader.set_batch_generator(dataloader, place) + + +class LJSpeech(Dataset): + def __init__(self, root, config): + super(LJSpeech, self).__init__() + assert isinstance(root, (str, Path)), "root should be a string or Path object" + self.root = root if isinstance(root, Path) else Path(root) + self.metadata = self._prepare_metadata() + self.config = config + + def _prepare_metadata(self): + csv_path = self.root.joinpath("metadata.csv") + metadata = pd.read_csv(csv_path, sep="|", header=None, quoting=3, + names=["fname", "raw_text", "normalized_text"]) + return metadata + + def _get_example(self, metadatum): + """All the code for generating an Example from a metadatum. If you want a + different preprocessing pipeline, you can override this method. + This method may require several processor, each of which has a lot of options. + In this case, you'd better pass a composed transform and pass it to the init + method. 
+ """ + + fname, raw_text, normalized_text = metadatum + wav_path = self.root.joinpath("wavs", fname + ".wav") + + _ljspeech_processor = audio.AudioProcessor( + sample_rate=22050, + num_mels=80, + min_level_db=-100, + ref_level_db=20, + n_fft=2048, + win_length= int(22050 * 0.05), + hop_length= int(22050 * 0.0125), + power=1.2, + preemphasis=0.97, + signal_norm=True, + symmetric_norm=False, + max_norm=1., + mel_fmin=0, + mel_fmax=None, + clip_norm=True, + griffin_lim_iters=60, + do_trim_silence=False, + sound_norm=False) + # load -> trim -> preemphasis -> stft -> magnitude -> mel_scale -> logscale -> normalize + wav = _ljspeech_processor.load_wav(str(wav_path)) + mag = _ljspeech_processor.spectrogram(wav).astype(np.float32) + mel = _ljspeech_processor.melspectrogram(wav).astype(np.float32) + phonemes = np.array(g2p.en.text_to_sequence(normalized_text), dtype=np.int64) + return (mag, mel, phonemes) # maybe we need to implement it as a map in the future + + def __getitem__(self, index): + metadatum = self.metadata.iloc[index] + example = self._get_example(metadatum) + return example + + def __iter__(self): + for i in range(len(self)): + yield self[i] + + def __len__(self): + return len(self.metadata) + + +def batch_examples(batch): + texts = [] + mels = [] + mel_inputs = [] + text_lens = [] + pos_texts = [] + pos_mels = [] + for data in batch: + _, mel, text = data + mel_inputs.append(np.concatenate([np.zeros([mel.shape[0], 1], np.float32), mel[:,:-1]], axis=-1)) + text_lens.append(len(text)) + pos_texts.append(np.arange(1, len(text) + 1)) + pos_mels.append(np.arange(1, mel.shape[1] + 1)) + mels.append(mel) + texts.append(text) + + # Sort by text_len in descending order + texts = [i for i,_ in sorted(zip(texts, text_lens), key=lambda x: x[1], reverse=True)] + mels = [i for i,_ in sorted(zip(mels, text_lens), key=lambda x: x[1], reverse=True)] + mel_inputs = [i for i,_ in sorted(zip(mel_inputs, text_lens), key=lambda x: x[1], reverse=True)] + pos_texts = [i for i,_ in sorted(zip(pos_texts, text_lens), key=lambda x: x[1], reverse=True)] + pos_mels = [i for i,_ in sorted(zip(pos_mels, text_lens), key=lambda x: x[1], reverse=True)] + text_lens = sorted(text_lens, reverse=True) + + # Pad sequence with largest len of the batch + texts = TextIDBatcher(pad_id=0)(texts) + pos_texts = TextIDBatcher(pad_id=0)(pos_texts) + pos_mels = TextIDBatcher(pad_id=0)(pos_mels) + mels = np.transpose(SpecBatcher(pad_value=0.)(mels), axes=(0,2,1)) + mel_inputs = np.transpose(SpecBatcher(pad_value=0.)(mel_inputs), axes=(0,2,1)) + return (texts, mels, mel_inputs, pos_texts, pos_mels, np.array(text_lens)) + +def batch_examples_vocoder(batch): + mels=[] + mags=[] + for data in batch: + mag, mel, _ = data + mels.append(mel) + mags.append(mag) + + mels = np.transpose(SpecBatcher(pad_value=0.)(mels), axes=(0,2,1)) + mags = np.transpose(SpecBatcher(pad_value=0.)(mags), axes=(0,2,1)) + + return (mels, mags) + + + + diff --git a/parakeet/models/fastspeech/__init__.py b/parakeet/models/fastspeech/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/parakeet/models/fastspeech/config/fastapeech.yaml b/parakeet/models/fastspeech/config/fastapeech.yaml new file mode 100644 index 0000000..3e62846 --- /dev/null +++ b/parakeet/models/fastspeech/config/fastapeech.yaml @@ -0,0 +1,41 @@ +audio: + num_mels: 80 + n_fft: 2048 + sr: 22050 + preemphasis: 0.97 + hop_length: 275 + win_length: 1102 + power: 1.2 + min_level_db: -100 + ref_level_db: 20 + outputs_per_step: 1 + +encoder_n_layer: 6 +encoder_head: 2 
+encoder_conv1d_filter_size: 1536 +max_sep_len: 2048 +encoder_output_size: 384 +word_vec_dim: 384 +decoder_n_layer: 6 +decoder_head: 2 +decoder_conv1d_filter_size: 1536 +decoder_output_size: 384 +d_model: 384 +duration_predictor_output_size: 256 +duration_predictor_filter_size: 3 +fft_conv1d_filter: 3 +fft_conv1d_padding: 1 + + +batch_size: 32 +epochs: 10000 +lr: 0.001 +save_step: 500 +image_step: 2000 +use_gpu: False +use_data_parallel: False + +data_path: ../../../dataset/LJSpeech-1.1 +transtts_path: ./checkpoint +transformer_step: 70000 +log_dir: ./log \ No newline at end of file diff --git a/parakeet/models/fastspeech/config/fastspeech.yaml b/parakeet/models/fastspeech/config/fastspeech.yaml new file mode 100644 index 0000000..947457b --- /dev/null +++ b/parakeet/models/fastspeech/config/fastspeech.yaml @@ -0,0 +1,43 @@ +audio: + num_mels: 80 + n_fft: 2048 + sr: 22050 + preemphasis: 0.97 + hop_length: 275 + win_length: 1102 + power: 1.2 + min_level_db: -100 + ref_level_db: 20 + outputs_per_step: 1 + +encoder_n_layer: 6 +encoder_head: 2 +encoder_conv1d_filter_size: 1536 +max_sep_len: 2048 +encoder_output_size: 384 +embedding_size: 384 +decoder_n_layer: 6 +decoder_head: 2 +decoder_conv1d_filter_size: 1536 +decoder_output_size: 384 +hidden_size: 384 +duration_predictor_output_size: 256 +duration_predictor_filter_size: 3 +fft_conv1d_filter: 3 +fft_conv1d_padding: 1 +dropout: 0.1 +transformer_head: 4 + +warm_up_step: 4000 +grad_clip_thresh: 0.1 +batch_size: 32 +epochs: 10000 +lr: 0.001 +save_step: 500 +use_gpu: True +use_data_parallel: False + +data_path: ../../../dataset/LJSpeech-1.1 +transtts_path: ../transformerTTS/checkpoint +transformer_step: 20 +log_dir: ./log \ No newline at end of file diff --git a/parakeet/models/fastspeech/dataset.py b/parakeet/models/fastspeech/dataset.py new file mode 100644 index 0000000..b3ee344 --- /dev/null +++ b/parakeet/models/fastspeech/dataset.py @@ -0,0 +1,124 @@ +import torch +from torch.nn import functional as F +from torch.utils.data import Dataset, DataLoader + +import numpy as np +import math +import os + +import hparams +import Audio +from text import text_to_sequence +from utils import process_text, pad_1D, pad_2D + +device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') + + +class FastSpeechDataset(Dataset): + """ LJSpeech """ + + def __init__(self): + self.text = process_text(os.path.join("data", "train.txt")) + + def __len__(self): + return len(self.text) + + def __getitem__(self, idx): + mel_gt_name = os.path.join( + hparams.mel_ground_truth, "ljspeech-mel-%05d.npy" % (idx+1)) + mel_gt_target = np.load(mel_gt_name) + D = np.load(os.path.join(hparams.alignment_path, str(idx)+".npy")) + + character = self.text[idx][0:len(self.text[idx])-1] + character = np.array(text_to_sequence( + character, hparams.text_cleaners)) + + sample = {"text": character, + "mel_target": mel_gt_target, + "D": D} + + return sample + + +def reprocess(batch, cut_list): + texts = [batch[ind]["text"] for ind in cut_list] + mel_targets = [batch[ind]["mel_target"] for ind in cut_list] + Ds = [batch[ind]["D"] for ind in cut_list] + + length_text = np.array([]) + for text in texts: + length_text = np.append(length_text, text.shape[0]) + + src_pos = list() + max_len = int(max(length_text)) + for length_src_row in length_text: + src_pos.append(np.pad([i+1 for i in range(int(length_src_row))], + (0, max_len-int(length_src_row)), 'constant')) + src_pos = np.array(src_pos) + + length_mel = np.array(list()) + for mel in mel_targets: + length_mel = 
np.append(length_mel, mel.shape[0]) + + mel_pos = list() + max_mel_len = int(max(length_mel)) + for length_mel_row in length_mel: + mel_pos.append(np.pad([i+1 for i in range(int(length_mel_row))], + (0, max_mel_len-int(length_mel_row)), 'constant')) + mel_pos = np.array(mel_pos) + + texts = pad_1D(texts) + Ds = pad_1D(Ds) + mel_targets = pad_2D(mel_targets) + + out = {"text": texts, + "mel_target": mel_targets, + "D": Ds, + "mel_pos": mel_pos, + "src_pos": src_pos, + "mel_max_len": max_mel_len} + + return out + + +def collate_fn(batch): + len_arr = np.array([d["text"].shape[0] for d in batch]) + index_arr = np.argsort(-len_arr) + batchsize = len(batch) + real_batchsize = int(math.sqrt(batchsize)) + + cut_list = list() + for i in range(real_batchsize): + cut_list.append(index_arr[i*real_batchsize:(i+1)*real_batchsize]) + + output = list() + for i in range(real_batchsize): + output.append(reprocess(batch, cut_list[i])) + + return output + + +if __name__ == "__main__": + # Test + dataset = FastSpeechDataset() + training_loader = DataLoader(dataset, + batch_size=1, + shuffle=False, + collate_fn=collate_fn, + drop_last=True, + num_workers=0) + total_step = hparams.epochs * len(training_loader) * hparams.batch_size + + cnt = 0 + for i, batchs in enumerate(training_loader): + for j, data_of_batch in enumerate(batchs): + mel_target = torch.from_numpy( + data_of_batch["mel_target"]).float().to(device) + D = torch.from_numpy(data_of_batch["D"]).int().to(device) + # print(mel_target.size()) + # print(D.sum()) + print(cnt) + if mel_target.size(1) == D.sum().item(): + cnt += 1 + + print(cnt) diff --git a/parakeet/models/fastspeech/modules.py b/parakeet/models/fastspeech/modules.py new file mode 100644 index 0000000..6c09f41 --- /dev/null +++ b/parakeet/models/fastspeech/modules.py @@ -0,0 +1,117 @@ +import numpy as np +import math +import utils +import paddle.fluid.dygraph as dg +import paddle.fluid.layers as layers +import paddle.fluid as fluid +from parakeet.modules.layers import Conv1D +from parakeet.modules.multihead_attention import MultiheadAttention +from parakeet.modules.feed_forward import PositionwiseFeedForward + + + +class FFTBlock(dg.Layer): + """FFT Block""" + def __init__(self, d_model, d_inner, n_head, d_k, d_v, filter_size, padding, dropout=0.2): + super(FFTBlock, self).__init__() + self.slf_attn = MultiheadAttention(d_model, d_k, d_v, num_head=n_head, dropout=dropout) + self.pos_ffn = PositionwiseFeedForward(d_model, d_inner, filter_size =filter_size, padding =padding, dropout=dropout) + + def forward(self, enc_input, non_pad_mask=None, slf_attn_mask=None): + enc_output, enc_slf_attn = self.slf_attn(enc_input, enc_input, enc_input, mask=slf_attn_mask) + enc_output *= non_pad_mask + + enc_output = self.pos_ffn(enc_output) + enc_output *= non_pad_mask + + return enc_output, enc_slf_attn + + +class LengthRegulator(dg.Layer): + def __init__(self, input_size, out_channels, filter_size, dropout=0.1): + super(LengthRegulator, self).__init__() + self.duration_predictor = DurationPredictor(input_size=input_size, + out_channels=out_channels, + filter_size=filter_size, + dropout=dropout) + + def LR(self, x, duration_predictor_output, alpha=1.0): + output = [] + batch_size = x.shape[0] + for i in range(batch_size): + output.append(self.expand(x[i:i+1], duration_predictor_output[i:i+1], alpha)) + output = self.pad(output) + return output + + def pad(self, input_ele): + max_len = max([input_ele[i].shape[0] for i in range(len(input_ele))]) + out_list = [] + for i in range(len(input_ele)): + pad_len 
= max_len - input_ele[i].shape[0] + one_batch_padded = layers.pad( + input_ele[i], [0, pad_len, 0, 0], pad_value=0.0) + out_list.append(one_batch_padded) + out_padded = layers.stack(out_list) + return out_padded + + def expand(self, batch, predicted, alpha): + out = [] + time_steps = batch.shape[1] + fertilities = predicted.numpy() + batch = layers.squeeze(batch,[0]) + + + for i in range(time_steps): + if fertilities[0,i]==0: + continue + out.append(layers.expand(batch[i: i + 1, :], [int(fertilities[0,i]), 1])) + out = layers.concat(out, axis=0) + return out + + + def forward(self, x, alpha=1.0, target=None): + duration_predictor_output = self.duration_predictor(x) + if fluid.framework._dygraph_tracer()._train_mode: + output = self.LR(x, target) + return output, duration_predictor_output + else: + duration_predictor_output = layers.round(duration_predictor_output) + output = self.LR(x, duration_predictor_output, alpha) + mel_pos = dg.to_variable([i+1 for i in range(output.shape[1])]) + return output, mel_pos + +class DurationPredictor(dg.Layer): + """ Duration Predictor """ + def __init__(self, input_size, out_channels, filter_size, dropout=0.1): + super(DurationPredictor, self).__init__() + self.input_size = input_size + self.out_channels = out_channels + self.filter_size = filter_size + self.dropout = dropout + + self.conv1 = Conv1D(in_channels = self.input_size, + out_channels = self.out_channels, + filter_size = self.filter_size, + padding=1, + data_format='NTC') + self.conv2 = Conv1D(in_channels = self.out_channels, + out_channels = self.out_channels, + filter_size = self.filter_size, + padding=1, + data_format='NTC') + self.layer_norm1 = dg.LayerNorm(self.out_channels) + self.layer_norm2 = dg.LayerNorm(self.out_channels) + + self.linear =dg.Linear(self.out_channels, 1) + + def forward(self, encoder_output): + + # encoder_output.shape(N, T, C) + out = layers.dropout(layers.relu(self.layer_norm1(self.conv1(encoder_output))), self.dropout) + out = layers.dropout(layers.relu(self.layer_norm2(self.conv2(out))), self.dropout) + out = layers.relu(self.linear(out)) + out = layers.squeeze(out, axes=[-1]) + + return out + + diff --git a/parakeet/models/fastspeech/network.py b/parakeet/models/fastspeech/network.py new file mode 100644 index 0000000..3f00263 --- /dev/null +++ b/parakeet/models/fastspeech/network.py @@ -0,0 +1,163 @@ +from utils import * +from modules import * +import paddle.fluid.dygraph as dg +import paddle.fluid as fluid +from parakeet.g2p.text.symbols import symbols +from parakeet.modules.utils import * +from parakeet.modules.post_convnet import PostConvNet + +class Encoder(dg.Layer): + def __init__(self, + n_src_vocab, + len_max_seq, + d_word_vec, + n_layers, + n_head, + d_k, + d_v, + d_model, + d_inner, + fft_conv1d_kernel, + fft_conv1d_padding, + dropout=0.1): + super(Encoder, self).__init__() + n_position = len_max_seq + 1 + + self.src_word_emb = dg.Embedding(size=[n_src_vocab, d_word_vec], padding_idx=0) + self.pos_inp = get_sinusoid_encoding_table(n_position, d_word_vec, padding_idx=0) + self.position_enc = dg.Embedding(size=[n_position, d_word_vec], + padding_idx=0, + param_attr=fluid.ParamAttr( + initializer=fluid.initializer.NumpyArrayInitializer(self.pos_inp), + trainable=False)) + self.layer_stack = [FFTBlock(d_model, d_inner, n_head, d_k, d_v, fft_conv1d_kernel, fft_conv1d_padding, dropout=dropout) for _ in range(n_layers)] + for i, layer in enumerate(self.layer_stack): + self.add_sublayer('fft_{}'.format(i), layer) + + def forward(self, character, text_pos): + 
enc_slf_attn_list = [] + # -- prepare masks + # shape character (N, T) + slf_attn_mask = get_attn_key_pad_mask(seq_k=character, seq_q=character) + non_pad_mask = get_non_pad_mask(character) + + # -- Forward + enc_output = self.src_word_emb(character) + self.position_enc(text_pos) #(N, T, C) + + for enc_layer in self.layer_stack: + enc_output, enc_slf_attn = enc_layer( + enc_output, + non_pad_mask=non_pad_mask, + slf_attn_mask=slf_attn_mask) + enc_slf_attn_list += [enc_slf_attn] + + return enc_output, non_pad_mask, enc_slf_attn_list + +class Decoder(dg.Layer): + def __init__(self, + len_max_seq, + d_word_vec, + n_layers, + n_head, + d_k, + d_v, + d_model, + d_inner, + fft_conv1d_kernel, + fft_conv1d_padding, + dropout=0.1): + super(Decoder, self).__init__() + + n_position = len_max_seq + 1 + self.pos_inp = get_sinusoid_encoding_table(n_position, d_word_vec, padding_idx=0) + self.position_enc = dg.Embedding(size=[n_position, d_word_vec], + padding_idx=0, + param_attr=fluid.ParamAttr( + initializer=fluid.initializer.NumpyArrayInitializer(self.pos_inp), + trainable=False)) + self.layer_stack = [FFTBlock(d_model, d_inner, n_head, d_k, d_v, fft_conv1d_kernel, fft_conv1d_padding, dropout=dropout) for _ in range(n_layers)] + for i, layer in enumerate(self.layer_stack): + self.add_sublayer('fft_{}'.format(i), layer) + + def forward(self, enc_seq, enc_pos): + dec_slf_attn_list = [] + + # -- Prepare masks + slf_attn_mask = get_attn_key_pad_mask(seq_k=enc_pos, seq_q=enc_pos) + non_pad_mask = get_non_pad_mask(enc_pos) + + # -- Forward + dec_output = enc_seq + self.position_enc(enc_pos) + + for dec_layer in self.layer_stack: + dec_output, dec_slf_attn = dec_layer( + dec_output, + non_pad_mask=non_pad_mask, + slf_attn_mask=slf_attn_mask) + dec_slf_attn_list += [dec_slf_attn] + + return dec_output, dec_slf_attn_list + +class FastSpeech(dg.Layer): + def __init__(self, cfg): + " FastSpeech" + super(FastSpeech, self).__init__() + + self.encoder = Encoder(n_src_vocab=len(symbols)+1, + len_max_seq=cfg.max_sep_len, + d_word_vec=cfg.embedding_size, + n_layers=cfg.encoder_n_layer, + n_head=cfg.encoder_head, + d_k=64, + d_v=64, + d_model=cfg.hidden_size, + d_inner=cfg.encoder_conv1d_filter_size, + fft_conv1d_kernel=cfg.fft_conv1d_filter, + fft_conv1d_padding=cfg.fft_conv1d_padding, + dropout=0.1) + self.length_regulator = LengthRegulator(input_size=cfg.hidden_size, + out_channels=cfg.duration_predictor_output_size, + filter_size=cfg.duration_predictor_filter_size, + dropout=cfg.dropout) + self.decoder = Decoder(len_max_seq=cfg.max_sep_len, + d_word_vec=cfg.embedding_size, + n_layers=cfg.decoder_n_layer, + n_head=cfg.decoder_head, + d_k=64, + d_v=64, + d_model=cfg.hidden_size, + d_inner=cfg.decoder_conv1d_filter_size, + fft_conv1d_kernel=cfg.fft_conv1d_filter, + fft_conv1d_padding=cfg.fft_conv1d_padding, + dropout=0.1) + self.mel_linear = dg.Linear(cfg.decoder_output_size, cfg.audio.num_mels) + self.postnet = PostConvNet(n_mels=80, + num_hidden=512, + filter_size=5, + padding=int(5 / 2), + num_conv=5, + outputs_per_step=1, + use_cudnn=True, + dropout=0.1) + + def forward(self, character, text_pos, mel_pos=None, length_target=None, alpha=1.0): + encoder_output, non_pad_mask, enc_slf_attn_list = self.encoder(character, text_pos) + if fluid.framework._dygraph_tracer()._train_mode: + + length_regulator_output, duration_predictor_output = self.length_regulator(encoder_output, + target=length_target, + alpha=alpha) + decoder_output, dec_slf_attn_list = self.decoder(length_regulator_output, mel_pos) + + mel_output = 
self.mel_linear(decoder_output) + mel_output_postnet = self.postnet(mel_output) + mel_output + + return mel_output, mel_output_postnet, duration_predictor_output, enc_slf_attn_list, dec_slf_attn_list + else: + length_regulator_output, decoder_pos = self.length_regulator(encoder_output, alpha=alpha) + decoder_output = self.decoder(length_regulator_output, decoder_pos) + + mel_output = self.mel_linear(decoder_output) + mel_output_postnet = self.postnet(mel_output) + mel_output + + return mel_output, mel_output_postnet \ No newline at end of file diff --git a/parakeet/models/fastspeech/parse.py b/parakeet/models/fastspeech/parse.py new file mode 100644 index 0000000..a6a8b2f --- /dev/null +++ b/parakeet/models/fastspeech/parse.py @@ -0,0 +1,93 @@ +import jsonargparse + +def add_config_options_to_parser(parser): + parser.add_argument('--audio.num_mels', type=int, default=80, + help="the number of mel bands when calculating mel spectrograms.") + parser.add_argument('--audio.n_fft', type=int, default=2048, + help="the number of fft components.") + parser.add_argument('--audio.sr', type=int, default=22050, + help="the sampling rate of audio data file.") + parser.add_argument('--audio.preemphasis', type=float, default=0.97, + help="the preemphasis coefficient.") + parser.add_argument('--audio.hop_length', type=float, default=128, + help="the number of samples to advance between frames.") + parser.add_argument('--audio.win_length', type=float, default=1024, + help="the length (width) of the window function.") + parser.add_argument('--audio.power', type=float, default=1.4, + help="the power to raise before griffin-lim.") + parser.add_argument('--audio.min_level_db', type=int, default=-100, + help="the minimum level db.") + parser.add_argument('--audio.ref_level_db', type=int, default=20, + help="the reference level db.") + parser.add_argument('--audio.outputs_per_step', type=int, default=1, + help="the outputs per step.") + + parser.add_argument('--embedding_size', type=int, default=256, + help="the dim size of embedding.") + parser.add_argument('--encoder_n_layer', type=int, default=6, + help="the number of FFT Block in encoder.") + parser.add_argument('--encoder_head', type=int, default=2, + help="the attention head number in encoder.") + parser.add_argument('--encoder_conv1d_filter_size', type=int, default=1024, + help="the filter size of conv1d in encoder.") + parser.add_argument('--max_sep_len', type=int, default=2048, + help="the max length of sequence.") + parser.add_argument('--encoder_output_size', type=int, default=256, + help="the output channel size of encoder.") + parser.add_argument('--decoder_n_layer', type=int, default=6, + help="the number of FFT Block in decoder.") + parser.add_argument('--decoder_head', type=int, default=2, + help="the attention head number in decoder.") + parser.add_argument('--decoder_conv1d_filter_size', type=int, default=1024, + help="the filter size of conv1d in decoder.") + parser.add_argument('--decoder_output_size', type=int, default=256, + help="the output channel size of decoder.") + parser.add_argument('--hidden_size', type=int, default=256, + help="the hidden size in model.") + parser.add_argument('--duration_predictor_output_size', type=int, default=256, + help="the output size of duration predictior.") + parser.add_argument('--duration_predictor_filter_size', type=int, default=3, + help="the filter size of conv1d in duration prediction.") + parser.add_argument('--fft_conv1d_filter', type=int, default=3, + help="the filter size of conv1d in fft.") + 
parser.add_argument('--fft_conv1d_padding', type=int, default=1, + help="the padding size of conv1d in fft.") + parser.add_argument('--dropout', type=float, default=0.1, + help="the dropout in network.") + parser.add_argument('--transformer_head', type=int, default=4, + help="the attention head num of transformerTTS.") + + parser.add_argument('--warm_up_step', type=int, default=4000, + help="the warm up step of learning rate.") + parser.add_argument('--grad_clip_thresh', type=float, default=1.0, + help="the threshold of grad clip.") + parser.add_argument('--batch_size', type=int, default=32, + help="batch size for training.") + parser.add_argument('--epochs', type=int, default=10000, + help="the number of epoch for training.") + parser.add_argument('--lr', type=float, default=0.001, + help="the learning rate for training.") + parser.add_argument('--save_step', type=int, default=500, + help="checkpointing interval during training.") + parser.add_argument('--use_gpu', type=bool, default=True, + help="use gpu or not during training.") + parser.add_argument('--use_data_parallel', type=bool, default=False, + help="use data parallel or not during training.") + + parser.add_argument('--data_path', type=str, default='./dataset/LJSpeech-1.1', + help="the path of dataset.") + parser.add_argument('--checkpoint_path', type=str, default=None, + help="the path to load checkpoint or pretrain model.") + parser.add_argument('--save_path', type=str, default='./checkpoint', + help="the path to save checkpoint.") + parser.add_argument('--log_dir', type=str, default='./log', + help="the directory to save tensorboard log.") + parser.add_argument('--sample_path', type=str, default='./sample', + help="the directory to save audio sample in synthesis.") + parser.add_argument('--transtts_path', type=str, default='./log', + help="the directory to load pretrain transformerTTS model.") + parser.add_argument('--transformer_step', type=int, default=70000, + help="the step to load transformerTTS model.") + + + parser.add_argument('-c', '--config', action=jsonargparse.ActionConfigFile) diff --git a/parakeet/models/fastspeech/train.py b/parakeet/models/fastspeech/train.py new file mode 100644 index 0000000..2af299d --- /dev/null +++ b/parakeet/models/fastspeech/train.py @@ -0,0 +1,139 @@ +import numpy as np +import argparse +import os +import time +import math +import jsonargparse +from pathlib import Path +from tqdm import tqdm +from tensorboardX import SummaryWriter +import paddle.fluid.dygraph as dg +import paddle.fluid.layers as layers +import paddle.fluid as fluid +from parse import add_config_options_to_parser +from pprint import pprint +from network import FastSpeech +from utils import get_alignment +from parakeet.models.dataloader.jlspeech import LJSpeechLoader +from parakeet.models.transformerTTS.network import TransformerTTS + +class MyDataParallel(dg.parallel.DataParallel): + """ + A data parallel proxy for model. 
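+    Attribute access that misses on the wrapper is forwarded to the wrapped
+    model (self._layers), so the underlying model keeps exposing its own
+    methods and attributes after being wrapped for data parallel training.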
+ """ + + def __init__(self, layers, strategy): + super(MyDataParallel, self).__init__(layers, strategy) + + def __getattr__(self, key): + if key in self.__dict__: + return object.__getattribute__(self, key) + elif key is "_layers": + return object.__getattribute__(self, "_sub_layers")["_layers"] + else: + return getattr( + object.__getattribute__(self, "_sub_layers")["_layers"], key) + +def main(cfg): + + local_rank = dg.parallel.Env().local_rank if cfg.use_data_parallel else 0 + nranks = dg.parallel.Env().nranks if cfg.use_data_parallel else 1 + + if local_rank == 0: + # Print the whole config setting. + pprint(jsonargparse.namespace_to_dict(cfg)) + + global_step = 0 + place = (fluid.CUDAPlace(dg.parallel.Env().dev_id) + if cfg.use_data_parallel else fluid.CUDAPlace(0) + if cfg.use_gpu else fluid.CPUPlace()) + + if not os.path.exists(cfg.log_dir): + os.mkdir(cfg.log_dir) + path = os.path.join(cfg.log_dir,'fastspeech') + + writer = SummaryWriter(path) if local_rank == 0 else None + + with dg.guard(place): + transformerTTS = TransformerTTS(cfg) + model_path = os.path.join(cfg.transtts_path, "transformer") + model_dict, _ = fluid.dygraph.load_dygraph(os.path.join(model_path, str(cfg.transformer_step))) + #for param in transformerTTS.state_dict(): + # print(param) + + transformerTTS.set_dict(model_dict) + transformerTTS.eval() + + model = FastSpeech(cfg) + model.train() + optimizer = fluid.optimizer.AdamOptimizer(learning_rate=dg.NoamDecay(1/(cfg.warm_up_step *( cfg.lr ** 2)), cfg.warm_up_step)) + + reader = LJSpeechLoader(cfg, nranks, local_rank).reader() + + if cfg.checkpoint_path is not None: + model_dict, opti_dict = fluid.dygraph.load_dygraph(cfg.checkpoint_path) + model.set_dict(model_dict) + optimizer.set_dict(opti_dict) + print("load checkpoint!!!") + + if cfg.use_data_parallel: + strategy = dg.parallel.prepare_context() + model = MyDataParallel(model, strategy) + + for epoch in range(cfg.epochs): + pbar = tqdm(reader) + + for i, data in enumerate(pbar): + pbar.set_description('Processing at epoch %d'%epoch) + character, mel, mel_input, pos_text, pos_mel, text_length = data + + _, _, attn_probs, _, _, _ = transformerTTS(character, mel_input, pos_text, pos_mel) + alignment = dg.to_variable(get_alignment(attn_probs, cfg.transformer_head)).astype(np.float32) + + global_step += 1 + + #Forward + result= model(character, + pos_text, + mel_pos=pos_mel, + length_target=alignment) + mel_output, mel_output_postnet, duration_predictor_output, _, _ = result + mel_loss = layers.mse_loss(mel_output, mel) + mel_postnet_loss = layers.mse_loss(mel_output_postnet, mel) + duration_loss = layers.mean(layers.abs(layers.elementwise_sub(duration_predictor_output, alignment))) + total_loss = mel_loss + mel_postnet_loss + duration_loss + + if local_rank==0: + print('epoch:{}, step:{}, mel_loss:{}, mel_postnet_loss:{}, duration_loss:{}'.format(epoch, global_step, mel_loss.numpy(), mel_postnet_loss.numpy(), duration_loss.numpy())) + + writer.add_scalar('mel_loss', mel_loss.numpy(), global_step) + writer.add_scalar('post_mel_loss', mel_postnet_loss.numpy(), global_step) + writer.add_scalar('duration_loss', duration_loss.numpy(), global_step) + writer.add_scalar('learning_rate', optimizer._learning_rate.step().numpy(), global_step) + + + if cfg.use_data_parallel: + total_loss = model.scale_loss(total_loss) + total_loss.backward() + model.apply_collective_grads() + else: + total_loss.backward() + optimizer.minimize(total_loss, grad_clip = fluid.dygraph_grad_clip.GradClipByGlobalNorm(cfg.grad_clip_thresh)) + 
model.clear_gradients() + + # save checkpoint + if local_rank==0 and global_step % cfg.save_step == 0: + if not os.path.exists(cfg.save_path): + os.mkdir(cfg.save_path) + save_path = os.path.join(cfg.save_path,'fastspeech/%d' % global_step) + dg.save_dygraph(model.state_dict(), save_path) + dg.save_dygraph(optimizer.state_dict(), save_path) + if local_rank==0: + writer.close() + + +if __name__ =='__main__': + parser = jsonargparse.ArgumentParser(description="Train Fastspeech model", formatter_class='default_argparse') + add_config_options_to_parser(parser) + cfg = parser.parse_args('-c config/fastspeech.yaml'.split()) + main(cfg) \ No newline at end of file diff --git a/parakeet/models/fastspeech/utils.py b/parakeet/models/fastspeech/utils.py new file mode 100644 index 0000000..7517a13 --- /dev/null +++ b/parakeet/models/fastspeech/utils.py @@ -0,0 +1,32 @@ +import numpy as np + +def get_alignment(attn_probs, n_head): + max_F = 0 + assert attn_probs[0].shape[0] % n_head == 0 + batch_size = int(attn_probs[0].shape[0] // n_head) + for i in range(len(attn_probs)): + multi_attn = attn_probs[i].numpy() + for j in range(n_head): + attn = multi_attn[j*batch_size:(j+1)*batch_size] + F = score_F(attn) + if max_F < F: + max_F = F + max_attn = attn + alignment = compute_duration(max_attn) + return alignment + +def score_F(attn): + max = np.max(attn, axis=-1) + mean = np.mean(max) + return mean + +def compute_duration(attn): + alignment = np.zeros([attn.shape[0],attn.shape[2]]) + for i in range(attn.shape[0]): + for j in range(attn.shape[1]): + max_index = attn[i,j].tolist().index(attn[i,j].max()) + alignment[i,max_index] += 1 + + return alignment + + diff --git a/parakeet/models/transformerTTS/config/train_postnet.yaml b/parakeet/models/transformerTTS/config/train_postnet.yaml index 5753ab1..7937c5e 100644 --- a/parakeet/models/transformerTTS/config/train_postnet.yaml +++ b/parakeet/models/transformerTTS/config/train_postnet.yaml @@ -10,9 +10,8 @@ audio: ref_level_db: 20 outputs_per_step: 1 -network: - hidden_size: 256 - embedding_size: 512 +hidden_size: 256 +embedding_size: 512 batch_size: 32 diff --git a/parakeet/models/transformerTTS/config/train_transformer.yaml b/parakeet/models/transformerTTS/config/train_transformer.yaml index 3e56a4f..038848b 100644 --- a/parakeet/models/transformerTTS/config/train_transformer.yaml +++ b/parakeet/models/transformerTTS/config/train_transformer.yaml @@ -10,15 +10,15 @@ audio: ref_level_db: 20 outputs_per_step: 1 -network: - hidden_size: 256 - embedding_size: 512 + +hidden_size: 384 #256 +embedding_size: 384 #512 batch_size: 32 epochs: 10000 lr: 0.001 -save_step: 500 +save_step: 10 image_step: 2000 use_gpu: True use_data_parallel: True diff --git a/parakeet/models/transformerTTS/data.py b/parakeet/models/transformerTTS/data.py index f432640..8fa9182 100644 --- a/parakeet/models/transformerTTS/data.py +++ b/parakeet/models/transformerTTS/data.py @@ -3,10 +3,10 @@ import numpy as np from paddle import fluid from parakeet.data.sampler import DistributedSampler from parakeet.data.datacargo import DataCargo -from preprocess import batch_examples, LJSpeech, batch_examples_postnet +from preprocess import batch_examples, LJSpeech, batch_examples_vocoder class LJSpeechLoader: - def __init__(self, config, nranks, rank, is_postnet=False): + def __init__(self, config, nranks, rank, is_vocoder=False): place = fluid.CUDAPlace(rank) if config.use_gpu else fluid.CPUPlace() LJSPEECH_ROOT = Path(config.data_path) @@ -15,8 +15,8 @@ class LJSpeechLoader: assert config.batch_size % 
nranks == 0 each_bs = config.batch_size // nranks - if is_postnet: - dataloader = DataCargo(dataset, sampler=sampler, batch_size=each_bs, shuffle=True, collate_fn=batch_examples_postnet, drop_last=True) + if is_vocoder: + dataloader = DataCargo(dataset, sampler=sampler, batch_size=each_bs, shuffle=True, collate_fn=batch_examples_vocoder, drop_last=True) else: dataloader = DataCargo(dataset, sampler=sampler, batch_size=each_bs, shuffle=True, collate_fn=batch_examples, drop_last=True) diff --git a/parakeet/models/transformerTTS/layers.py b/parakeet/models/transformerTTS/layers.py index 88f110f..7a8e97e 100644 --- a/parakeet/models/transformerTTS/layers.py +++ b/parakeet/models/transformerTTS/layers.py @@ -14,7 +14,6 @@ class Conv1D(dg.Layer): """ def __init__(self, - name_scope, in_channels, num_filters, filter_size=3, @@ -28,7 +27,7 @@ class Conv1D(dg.Layer): act=None, data_format='NCT', dtype="float32"): - super(Conv1D, self).__init__(name_scope, dtype=dtype) + super(Conv1D, self).__init__(dtype=dtype) self.padding = padding self.in_channels = in_channels @@ -41,7 +40,7 @@ class Conv1D(dg.Layer): self.data_format = data_format self.conv = dg.Conv2D( - self.full_name(), + in_channels=in_channels, num_filters=num_filters, filter_size=(1, filter_size), stride=(1, stride), @@ -77,7 +76,6 @@ class Pool1D(dg.Layer): A Pool 1D block implemented with Pool2D. """ def __init__(self, - name_scope, pool_size=-1, pool_type='max', pool_stride=1, @@ -88,7 +86,7 @@ class Pool1D(dg.Layer): exclusive=True, data_format='NCT', dtype='float32'): - super(Pool1D, self).__init__(name_scope, dtype=dtype) + super(Pool1D, self).__init__(dtype=dtype) self.pool_size = pool_size self.pool_type = pool_type self.pool_stride = pool_stride @@ -101,7 +99,7 @@ class Pool1D(dg.Layer): self.dtype = dtype - self.pool2d = dg.Pool2D(self.full_name(), [1,pool_size], pool_type = pool_type, + self.pool2d = dg.Pool2D([1,pool_size], pool_type = pool_type, pool_stride = [1,pool_stride], pool_padding = [0, pool_padding], global_pooling = global_pooling, use_cudnn = use_cudnn, ceil_mode = ceil_mode, exclusive = exclusive, dtype = dtype) @@ -127,7 +125,6 @@ class Pool1D(dg.Layer): class DynamicGRU(dg.Layer): def __init__(self, - scope_name, size, param_attr=None, bias_attr=None, @@ -137,9 +134,8 @@ class DynamicGRU(dg.Layer): h_0=None, origin_mode=False, init_size=None): - super(DynamicGRU, self).__init__(scope_name) + super(DynamicGRU, self).__init__() self.gru_unit = dg.GRUUnit( - self.full_name(), size * 3, param_attr=param_attr, bias_attr=bias_attr, diff --git a/parakeet/models/transformerTTS/module.py b/parakeet/models/transformerTTS/module.py index f83bff5..8e003da 100644 --- a/parakeet/models/transformerTTS/module.py +++ b/parakeet/models/transformerTTS/module.py @@ -3,339 +3,63 @@ from parakeet.g2p.text.symbols import symbols import paddle.fluid.dygraph as dg import paddle.fluid as fluid import paddle.fluid.layers as layers -from layers import Conv1D, Pool1D, DynamicGRU +from parakeet.modules.layers import Conv1D, Pool1D +from parakeet.modules.dynamicGRU import DynamicGRU import numpy as np -class FC(dg.Layer): - def __init__(self, name_scope, in_features, out_features, is_bias=True, dtype="float32", gain=1): - super(FC, self).__init__(name_scope) - self.in_features = in_features - self.out_features = out_features - self.is_bias = is_bias - self.dtype = dtype - self.gain = gain - - self.weight = self.create_parameter(fluid.ParamAttr(name='weight'), shape=(in_features, out_features), - dtype=dtype, - default_initializer = 
fluid.initializer.XavierInitializer()) - #self.weight = gain * self.weight - # mind the implicit conversion to ParamAttr for many cases - if is_bias is not False: - k = math.sqrt(1 / in_features) - self.bias = self.create_parameter(fluid.ParamAttr(name='bias'), shape=(out_features, ), - is_bias=True, - dtype=dtype, - default_initializer = fluid.initializer.Uniform(low=-k, high=k)) - - # 默认初始化权重使用 Xavier 的方法,偏置使用均匀分布,范围是(-\sqrt{k},/sqrt{k}),k=1/infeature - - def forward(self, x): - x = fluid.layers.matmul(x, self.weight) - if hasattr(self, "bias"): - x = fluid.layers.elementwise_add(x, self.bias) - return x - -class Conv(dg.Layer): - def __init__(self, name_scope, in_channels, out_channels, filter_size=1, - padding=0, dilation=1, stride=1, use_cudnn=True, - data_format="NCT", is_bias=True, gain=1): - super(Conv, self).__init__(name_scope) - self.in_channels = in_channels - self.out_channels = out_channels - self.filter_size = filter_size - self.padding = padding - self.dilation = dilation - self.stride = stride - self.use_cudnn = use_cudnn - self.data_format = data_format - self.is_bias = is_bias - self.gain = gain - - self.weight_attr = fluid.ParamAttr(name='weight', initializer=fluid.initializer.XavierInitializer()) - self.bias_attr = None - if is_bias is not False: - k = math.sqrt(1 / in_channels) - self.bias_attr = fluid.ParamAttr(name='bias', initializer=fluid.initializer.Uniform(low=-k, high=k)) - - self.conv = Conv1D( self.full_name(), - in_channels = in_channels, - num_filters = out_channels, - filter_size = filter_size, - padding = padding, - dilation = dilation, - stride = stride, - param_attr = self.weight_attr, - bias_attr = self.bias_attr, - use_cudnn = use_cudnn, - data_format = data_format) - - def forward(self, x): - x = self.conv(x) - return x class EncoderPrenet(dg.Layer): - def __init__(self, name_scope, embedding_size, num_hidden, use_cudnn=True): - super(EncoderPrenet, self).__init__(name_scope) + def __init__(self, embedding_size, num_hidden, use_cudnn=True): + super(EncoderPrenet, self).__init__() self.embedding_size = embedding_size self.num_hidden = num_hidden self.use_cudnn = use_cudnn - self.embedding = dg.Embedding(self.full_name(), - size = [len(symbols), embedding_size], + self.embedding = dg.Embedding( size = [len(symbols), embedding_size], param_attr = fluid.ParamAttr(name='weight'), padding_idx = None) - self.conv1 = Conv(self.full_name(), - in_channels = embedding_size, + self.conv_list = [] + self.conv_list.append(Conv1D(in_channels = embedding_size, out_channels = num_hidden, filter_size = 5, padding = int(np.floor(5/2)), use_cudnn = use_cudnn, - data_format = "NCT", - gain = math.sqrt(2)) - self.conv2 = Conv(self.full_name(), - in_channels = num_hidden, - out_channels = num_hidden, - filter_size = 5, - padding = int(np.floor(5/2)), - use_cudnn = use_cudnn, - data_format = "NCT", - gain = math.sqrt(2)) - self.conv3 = Conv(self.full_name(), - in_channels = num_hidden, - out_channels = num_hidden, - filter_size = 5, - padding = int(np.floor(5/2)), - use_cudnn = use_cudnn, - data_format = "NCT", - gain = math.sqrt(2)) - - self.batch_norm1 = dg.BatchNorm(self.full_name(), num_hidden, - param_attr = fluid.ParamAttr(name='weight'), - bias_attr = fluid.ParamAttr(name='bias'), - moving_mean_name = 'moving_mean', - moving_variance_name = 'moving_var', - data_layout='NCHW') - self.batch_norm2 = dg.BatchNorm(self.full_name(), num_hidden, - param_attr = fluid.ParamAttr(name='weight'), - bias_attr = fluid.ParamAttr(name='bias'), - moving_mean_name = 'moving_mean', - 
moving_variance_name = 'moving_var', - data_layout='NCHW') - self.batch_norm3 = dg.BatchNorm(self.full_name(), num_hidden, - param_attr = fluid.ParamAttr(name='weight'), - bias_attr = fluid.ParamAttr(name='bias'), - moving_mean_name = 'moving_mean', - moving_variance_name = 'moving_var', - data_layout='NCHW') + data_format = "NCT")) + for _ in range(2): + self.conv_list = Conv1D(in_channels = num_hidden, + out_channels = num_hidden, + filter_size = 5, + padding = int(np.floor(5/2)), + use_cudnn = use_cudnn, + data_format = "NCT") - self.projection = FC(self.full_name(), num_hidden, num_hidden) - - def forward(self, x): - x = self.embedding(x) #(batch_size, seq_len, embending_size) - x = layers.transpose(x,[0,2,1]) - x = layers.dropout(layers.relu(self.batch_norm1(self.conv1(x))), 0.2) - x = layers.dropout(layers.relu(self.batch_norm2(self.conv2(x))), 0.2) - x = layers.dropout(layers.relu(self.batch_norm3(self.conv3(x))), 0.2) - x = layers.transpose(x,[0,2,1]) #(N,T,C) - x = self.projection(x) - return x - -class FFN(dg.Layer): - def __init__(self, name_scope, num_hidden, use_cudnn=True): - super(FFN, self).__init__(name_scope) - self.num_hidden = num_hidden - self.use_cudnn = use_cudnn - self.w_1 = Conv(self.full_name(), - in_channels = num_hidden, - out_channels = num_hidden * 4, - filter_size = 1, - use_cudnn = use_cudnn, - data_format = "NCT", - gain = math.sqrt(2)) - self.w_2 = Conv(self.full_name(), - in_channels = num_hidden * 4, - out_channels = num_hidden, - filter_size = 1, - use_cudnn = use_cudnn, - data_format = "NCT", - gain = math.sqrt(2)) - self.layer_norm = dg.LayerNorm(self.full_name(), begin_norm_axis=2) - - def forward(self, input): - #FFN Networt - x = layers.transpose(input, [0,2,1]) - x = self.w_2(layers.relu(self.w_1(x))) - x = layers.transpose(x,[0,2,1]) - - # dropout - # x = layers.dropout(x, 0.1) - # not sure where dropout should be placed, in paper should before residual, - # but the diagonal alignment did not appear correctly in the attention plot. 
- - # residual connection - x = x + input - - - #layer normalization - x = self.layer_norm(x) - - return x - -class DecoderPrenet(dg.Layer): - def __init__(self, name_scope, input_size, hidden_size, output_size, dropout_rate=0.5): - super(DecoderPrenet, self).__init__(name_scope) - self.input_size = input_size - self.hidden_size = hidden_size - self.output_size = output_size - self.dropout_rate = dropout_rate - - self.fc1 = FC(self.full_name(), input_size, hidden_size) #in pytorch this gian=1 - self.fc2 = FC(self.full_name(), hidden_size, output_size) - - def forward(self, x): - x = layers.dropout(layers.relu(self.fc1(x)), self.dropout_rate) - x = layers.dropout(layers.relu(self.fc2(x)), self.dropout_rate) - return x - -class ScaledDotProductAttention(dg.Layer): - def __init__(self, name_scope, d_key): - super(ScaledDotProductAttention, self).__init__(name_scope) - - self.d_key = d_key - - # please attention this mask is diff from pytorch - def forward(self, key, value, query, mask=None, query_mask=None): - # Compute attention score - attention = layers.matmul(query, key, transpose_y=True) #transpose the last dim in y - attention = attention / math.sqrt(self.d_key) - - # Mask key to ignore padding - if mask is not None: - attention = attention * mask - mask = (mask == 0).astype(np.float32) * (-2 ** 32 + 1) - attention = attention + mask - - - attention = layers.softmax(attention) - # Mask query to ignore padding - # Not sure how to work - if query_mask is not None: - attention = attention * query_mask - - result = layers.matmul(attention, value) - return result, attention - -class MultiheadAttention(dg.Layer): - def __init__(self, name_scope, num_hidden, num_head=4): - super(MultiheadAttention, self).__init__(name_scope) - self.num_hidden = num_hidden - self.num_hidden_per_attn = num_hidden // num_head - self.num_head = num_head - - self.key = FC(self.full_name(), num_hidden, num_hidden, is_bias=False) - self.value = FC(self.full_name(), num_hidden, num_hidden, is_bias=False) - self.query = FC(self.full_name(), num_hidden, num_hidden, is_bias=False) - - self.scal_attn = ScaledDotProductAttention(self.full_name(), self.num_hidden_per_attn) - - self.fc = FC(self.full_name(), num_hidden * 2, num_hidden) - - self.layer_norm = dg.LayerNorm(self.full_name(), begin_norm_axis=2) - - def forward(self, key, value, query_input, mask=None, query_mask=None): - batch_size = key.shape[0] - seq_len_key = key.shape[1] - seq_len_query = query_input.shape[1] - - # repeat masks h times - if query_mask is not None: - query_mask = layers.unsqueeze(query_mask, axes=[-1]) - query_mask = layers.expand(query_mask, [self.num_head, 1, seq_len_key]) - if mask is not None: - mask = layers.expand(mask, (self.num_head, 1, 1)) - - # Make multihead attention - # key & value.shape = (batch_size, seq_len, feature)(feature = num_head * num_hidden_per_attn) - key = layers.reshape(self.key(key), [batch_size, seq_len_key, self.num_head, self.num_hidden_per_attn]) - value = layers.reshape(self.value(value), [batch_size, seq_len_key, self.num_head, self.num_hidden_per_attn]) - query = layers.reshape(self.query(query_input), [batch_size, seq_len_query, self.num_head, self.num_hidden_per_attn]) - - key = layers.reshape(layers.transpose(key, [2, 0, 1, 3]), [-1, seq_len_key, self.num_hidden_per_attn]) - value = layers.reshape(layers.transpose(value, [2, 0, 1, 3]), [-1, seq_len_key, self.num_hidden_per_attn]) - query = layers.reshape(layers.transpose(query, [2, 0, 1, 3]), [-1, seq_len_query, self.num_hidden_per_attn]) - - result, 
attention = self.scal_attn(key, value, query, mask=mask, query_mask=query_mask) - - # concat all multihead result - result = layers.reshape(result, [self.num_head, batch_size, seq_len_query, self.num_hidden_per_attn]) - result = layers.reshape(layers.transpose(result, [1,2,0,3]),[batch_size, seq_len_query, -1]) - #print(result.().shape) - # concat result with input - result = layers.concat([query_input, result], axis=-1) - - result = self.fc(result) - result = result + query_input - - result = self.layer_norm(result) - return result, attention - -class PostConvNet(dg.Layer): - def __init__(self, name_scope, config): - super(PostConvNet, self).__init__(name_scope) - - num_hidden = config.network.hidden_size - self.num_hidden = num_hidden - self.conv1 = Conv(self.full_name(), - in_channels = config.audio.num_mels * config.audio.outputs_per_step, - out_channels = num_hidden, - filter_size = 5, - padding = 4, - use_cudnn = config.use_gpu, - data_format = "NCT", - gain = 5 / 3) - self.conv_list = [Conv(self.full_name(), - in_channels = num_hidden, - out_channels = num_hidden, - filter_size = 5, - padding = 4, - use_cudnn = config.use_gpu, - data_format = "NCT", - gain = 5 / 3) for _ in range(3)] for i, layer in enumerate(self.conv_list): self.add_sublayer("conv_list_{}".format(i), layer) - self.conv5 = Conv(self.full_name(), - in_channels = num_hidden, - out_channels = config.audio.num_mels * config.audio.outputs_per_step, - filter_size = 5, - padding = 4, - use_cudnn = config.use_gpu, - data_format = "NCT") - self.batch_norm_list = [dg.BatchNorm(self.full_name(), num_hidden, + self.batch_norm_list = [dg.BatchNorm(num_hidden, param_attr = fluid.ParamAttr(name='weight'), bias_attr = fluid.ParamAttr(name='bias'), moving_mean_name = 'moving_mean', moving_variance_name = 'moving_var', data_layout='NCHW') for _ in range(3)] + for i, layer in enumerate(self.batch_norm_list): self.add_sublayer("batch_norm_list_{}".format(i), layer) - self.batch_norm1 = dg.BatchNorm(self.full_name(), num_hidden, - param_attr = fluid.ParamAttr(name='weight'), - bias_attr = fluid.ParamAttr(name='bias'), - moving_mean_name = 'moving_mean', - moving_variance_name = 'moving_var', - data_layout='NCHW') - def forward(self, input): - input = layers.dropout(layers.tanh(self.batch_norm1(self.conv1(input)[:, :, :-4])),0.1) + self.projection = dg.Linear(num_hidden, num_hidden) + + def forward(self, x): + x = self.embedding(x) #(batch_size, seq_len, embending_size) + x = layers.transpose(x,[0,2,1]) for batch_norm, conv in zip(self.batch_norm_list, self.conv_list): - input = layers.dropout(layers.tanh(batch_norm(conv(input)[:, :, :-4])),0.1) - input = self.conv5(input)[:, :, :-4] - return input + x = layers.dropout(layers.relu(batch_norm(conv(x))), 0.2) + x = layers.transpose(x,[0,2,1]) #(N,T,C) + x = self.projection(x) + return x class CBHG(dg.Layer): - def __init__(self, name_scope, config, K=16, projection_size = 256, num_gru_layers=2, + def __init__(self, hidden_size, batch_size, K=16, projection_size = 256, num_gru_layers=2, max_pool_kernel_size=2, is_post=False): - super(CBHG, self).__init__(name_scope) + super(CBHG, self).__init__() """ :param hidden_size: dimension of hidden unit :param K: # of convolution banks @@ -344,19 +68,16 @@ class CBHG(dg.Layer): :param max_pool_kernel_size: max pooling kernel size :param is_post: whether post processing or not """ - hidden_size = config.network.hidden_size self.hidden_size = hidden_size self.projection_size = projection_size self.conv_list = [] - 
self.conv_list.append(Conv(self.full_name(), - in_channels = projection_size, + self.conv_list.append(Conv1D(in_channels = projection_size, out_channels = hidden_size, filter_size = 1, padding = int(np.floor(1/2)), data_format = "NCT")) for i in range(2,K+1): - self.conv_list.append(Conv(self.full_name(), - in_channels = hidden_size, + self.conv_list.append(Conv1D(in_channels = hidden_size, out_channels = hidden_size, filter_size = i, padding = int(np.floor(i/2)), @@ -367,7 +88,7 @@ class CBHG(dg.Layer): self.batchnorm_list = [] for i in range(K): - self.batchnorm_list.append(dg.BatchNorm(self.full_name(), hidden_size, + self.batchnorm_list.append(dg.BatchNorm(hidden_size, param_attr = fluid.ParamAttr(name='weight'), bias_attr = fluid.ParamAttr(name='bias'), moving_mean_name = 'moving_mean', @@ -379,69 +100,63 @@ class CBHG(dg.Layer): conv_outdim = hidden_size * K - self.conv_projection_1 = Conv(self.full_name(), - in_channels = conv_outdim, + self.conv_projection_1 = Conv1D(in_channels = conv_outdim, out_channels = hidden_size, filter_size = 3, padding = int(np.floor(3/2)), data_format = "NCT") - self.conv_projection_2 = Conv(self.full_name(), - in_channels = hidden_size, + self.conv_projection_2 = Conv1D(in_channels = hidden_size, out_channels = projection_size, filter_size = 3, padding = int(np.floor(3/2)), data_format = "NCT") - self.batchnorm_proj_1 = dg.BatchNorm(self.full_name(), hidden_size, + self.batchnorm_proj_1 = dg.BatchNorm(hidden_size, param_attr = fluid.ParamAttr(name='weight'), bias_attr = fluid.ParamAttr(name='bias'), moving_mean_name = 'moving_mean', moving_variance_name = 'moving_var', data_layout='NCHW') - self.batchnorm_proj_2 = dg.BatchNorm(self.full_name(), projection_size, + self.batchnorm_proj_2 = dg.BatchNorm(projection_size, param_attr = fluid.ParamAttr(name='weight'), bias_attr = fluid.ParamAttr(name='bias'), moving_mean_name = 'moving_mean', moving_variance_name = 'moving_var', data_layout='NCHW') - self.max_pool = Pool1D(self.full_name(), pool_size = max_pool_kernel_size, + self.max_pool = Pool1D(pool_size = max_pool_kernel_size, pool_type='max', pool_stride=1, pool_padding=1, data_format = "NCT") - self.highway = Highwaynet(self.full_name(), self.projection_size) + self.highway = Highwaynet(self.projection_size) - h_0 = np.zeros((config.batch_size, hidden_size // 2), dtype="float32") + h_0 = np.zeros((batch_size, hidden_size // 2), dtype="float32") h_0 = dg.to_variable(h_0) - self.fc_forward1 = FC(self.full_name(), hidden_size, hidden_size // 2 * 3) - self.fc_reverse1 = FC(self.full_name(), hidden_size, hidden_size // 2 * 3) - self.gru_forward1 = DynamicGRU(self.full_name(), - size = self.hidden_size // 2, + self.fc_forward1 = dg.Linear(hidden_size, hidden_size // 2 * 3) + self.fc_reverse1 = dg.Linear(hidden_size, hidden_size // 2 * 3) + self.gru_forward1 = DynamicGRU(size = self.hidden_size // 2, param_attr = fluid.ParamAttr(name='weight'), bias_attr = fluid.ParamAttr(name='bias'), is_reverse = False, origin_mode = True, h_0 = h_0) - self.gru_reverse1 = DynamicGRU(self.full_name(), - size = self.hidden_size // 2, + self.gru_reverse1 = DynamicGRU(size = self.hidden_size // 2, param_attr = fluid.ParamAttr(name='weight'), bias_attr = fluid.ParamAttr(name='bias'), is_reverse=True, origin_mode=True, h_0 = h_0) - self.fc_forward2 = FC(self.full_name(), hidden_size, hidden_size // 2 * 3) - self.fc_reverse2 = FC(self.full_name(), hidden_size, hidden_size // 2 * 3) - self.gru_forward2 = DynamicGRU(self.full_name(), - size = self.hidden_size // 2, + self.fc_forward2 
= dg.Linear(hidden_size, hidden_size // 2 * 3) + self.fc_reverse2 = dg.Linear(hidden_size, hidden_size // 2 * 3) + self.gru_forward2 = DynamicGRU(size = self.hidden_size // 2, param_attr = fluid.ParamAttr(name='weight'), bias_attr = fluid.ParamAttr(name='bias'), is_reverse = False, origin_mode = True, h_0 = h_0) - self.gru_reverse2 = DynamicGRU(self.full_name(), - size = self.hidden_size // 2, + self.gru_reverse2 = DynamicGRU(size = self.hidden_size // 2, param_attr = fluid.ParamAttr(name='weight'), bias_attr = fluid.ParamAttr(name='bias'), is_reverse=True, @@ -491,8 +206,8 @@ class CBHG(dg.Layer): return out class Highwaynet(dg.Layer): - def __init__(self, name_scope, num_units, num_layers=4): - super(Highwaynet, self).__init__(name_scope) + def __init__(self, num_units, num_layers=4): + super(Highwaynet, self).__init__() self.num_units = num_units self.num_layers = num_layers @@ -500,8 +215,8 @@ class Highwaynet(dg.Layer): self.linears = [] for i in range(num_layers): - self.linears.append(FC(self.full_name(), num_units, num_units)) - self.gates.append(FC(self.full_name(), num_units, num_units)) + self.linears.append(dg.Linear(num_units, num_units)) + self.gates.append(dg.Linear(num_units, num_units)) for i, (linear, gate) in enumerate(zip(self.linears,self.gates)): self.add_sublayer("linears_{}".format(i), linear) diff --git a/parakeet/models/transformerTTS/network.py b/parakeet/models/transformerTTS/network.py index 3d356dc..0536f68 100644 --- a/parakeet/models/transformerTTS/network.py +++ b/parakeet/models/transformerTTS/network.py @@ -1,39 +1,42 @@ -from module import * -from utils import get_positional_table, get_sinusoid_encoding_table +from parakeet.models.transformerTTS.module import * import paddle.fluid.dygraph as dg import paddle.fluid as fluid +from parakeet.modules.layers import Conv1D +from parakeet.modules.utils import * +from parakeet.modules.multihead_attention import MultiheadAttention +from parakeet.modules.feed_forward import PositionwiseFeedForward +from parakeet.modules.prenet import PreNet +from parakeet.modules.post_convnet import PostConvNet + class Encoder(dg.Layer): - def __init__(self, name_scope, embedding_size, num_hidden, config): - super(Encoder, self).__init__(name_scope) + def __init__(self, embedding_size, num_hidden, config): + super(Encoder, self).__init__() self.num_hidden = num_hidden param = fluid.ParamAttr(name='alpha', initializer=fluid.initializer.Constant(value=1.0)) - self.alpha = self.create_parameter(param, shape=(1, ), dtype='float32') + self.alpha = self.create_parameter(shape=(1, ), attr=param, dtype='float32') self.pos_inp = get_sinusoid_encoding_table(1024, self.num_hidden, padding_idx=0) - self.pos_emb = dg.Embedding(name_scope=self.full_name(), - size=[1024, num_hidden], + self.pos_emb = dg.Embedding(size=[1024, num_hidden], padding_idx=0, param_attr=fluid.ParamAttr( name='weight', initializer=fluid.initializer.NumpyArrayInitializer(self.pos_inp), trainable=False)) - self.encoder_prenet = EncoderPrenet(name_scope = self.full_name(), - embedding_size = embedding_size, + self.encoder_prenet = EncoderPrenet(embedding_size = embedding_size, num_hidden = num_hidden, use_cudnn=config.use_gpu) - self.layers = [MultiheadAttention(self.full_name(), num_hidden) for _ in range(3)] + self.layers = [MultiheadAttention(num_hidden, num_hidden, num_hidden) for _ in range(3)] for i, layer in enumerate(self.layers): self.add_sublayer("self_attn_{}".format(i), layer) - self.ffns = [FFN(self.full_name(), num_hidden, use_cudnn = config.use_gpu) for _ in 
range(3)] + self.ffns = [PositionwiseFeedForward(num_hidden, num_hidden*4, filter_size=1, use_cudnn = config.use_gpu) for _ in range(3)] for i, layer in enumerate(self.ffns): self.add_sublayer("ffns_{}".format(i), layer) def forward(self, x, positional): if fluid.framework._dygraph_tracer()._train_mode: - query_mask = (positional != 0).astype(np.float32) - mask = (positional != 0).astype(np.float32) - mask = fluid.layers.expand(fluid.layers.unsqueeze(mask,[1]), [1,x.shape[1], 1]) + query_mask = get_non_pad_mask(positional) + mask = get_attn_key_pad_mask(positional, x) else: query_mask, mask = None, None @@ -59,65 +62,60 @@ class Encoder(dg.Layer): return x, query_mask, attentions class Decoder(dg.Layer): - def __init__(self, name_scope, num_hidden, config): - super(Decoder, self).__init__(name_scope) + def __init__(self, num_hidden, config): + super(Decoder, self).__init__() self.num_hidden = num_hidden param = fluid.ParamAttr(name='alpha') - self.alpha = self.create_parameter(param, shape=(1,), dtype='float32', + self.alpha = self.create_parameter(shape=(1,), attr=param, dtype='float32', default_initializer = fluid.initializer.ConstantInitializer(value=1.0)) self.pos_inp = get_sinusoid_encoding_table(1024, self.num_hidden, padding_idx=0) - self.pos_emb = dg.Embedding(name_scope=self.full_name(), - size=[1024, num_hidden], + self.pos_emb = dg.Embedding(size=[1024, num_hidden], padding_idx=0, param_attr=fluid.ParamAttr( name='weight', initializer=fluid.initializer.NumpyArrayInitializer(self.pos_inp), trainable=False)) - self.decoder_prenet = DecoderPrenet(self.full_name(), - input_size = config.audio.num_mels, + self.decoder_prenet = PreNet(input_size = config.audio.num_mels, hidden_size = num_hidden * 2, output_size = num_hidden, dropout_rate=0.2) - self.linear = FC(self.full_name(), num_hidden, num_hidden) + self.linear = dg.Linear(num_hidden, num_hidden) - self.selfattn_layers = [MultiheadAttention(self.full_name(), num_hidden) for _ in range(3)] + self.selfattn_layers = [MultiheadAttention(num_hidden, num_hidden, num_hidden) for _ in range(3)] for i, layer in enumerate(self.selfattn_layers): self.add_sublayer("self_attn_{}".format(i), layer) - self.attn_layers = [MultiheadAttention(self.full_name(), num_hidden) for _ in range(3)] + self.attn_layers = [MultiheadAttention(num_hidden, num_hidden, num_hidden) for _ in range(3)] for i, layer in enumerate(self.attn_layers): self.add_sublayer("attn_{}".format(i), layer) - self.ffns = [FFN(self.full_name(), num_hidden) for _ in range(3)] + self.ffns = [PositionwiseFeedForward(num_hidden, num_hidden*4, filter_size=1) for _ in range(3)] for i, layer in enumerate(self.ffns): self.add_sublayer("ffns_{}".format(i), layer) - self.mel_linear = FC(self.full_name(), num_hidden, config.audio.num_mels * config.audio.outputs_per_step) - self.stop_linear = FC(self.full_name(), num_hidden, 1, gain = 1) + self.mel_linear = dg.Linear(num_hidden, config.audio.num_mels * config.audio.outputs_per_step) + self.stop_linear = dg.Linear(num_hidden, 1) - self.postconvnet = PostConvNet(self.full_name(), config) + self.postconvnet = PostConvNet(config.audio.num_mels, config.hidden_size, + filter_size = 5, padding = 4, num_conv=5, + outputs_per_step=config.audio.outputs_per_step, + use_cudnn = config.use_gpu) def forward(self, key, value, query, c_mask, positional): - batch_size = key.shape[0] - decoder_len = query.shape[1] # get decoder mask with triangular matrix if fluid.framework._dygraph_tracer()._train_mode: - #zeros = np.zeros(positional.shape, dtype=np.float32) - 
m_mask = (positional != 0).astype(np.float32) - mask = np.repeat(np.expand_dims(m_mask.numpy() == 0, axis=1), decoder_len, axis=1) - mask = mask + np.repeat(np.expand_dims(np.triu(np.ones([decoder_len, decoder_len]), 1), axis=0) ,batch_size, axis=0) - mask = fluid.layers.cast(dg.to_variable(mask == 0), np.float32) + m_mask = get_non_pad_mask(positional) + mask = get_attn_key_pad_mask(positional, query) + triu_tensor = dg.to_variable(get_triu_tensor(query.numpy(), query.numpy())).astype(np.float32) + mask = mask + triu_tensor + mask = fluid.layers.cast(mask != 0, np.float32) - # (batch_size, decoder_len, decoder_len) - zero_mask = fluid.layers.expand(fluid.layers.unsqueeze((c_mask != 0).astype(np.float32), axes=2), [1,1,decoder_len]) - # (batch_size, decoder_len, seq_len) - zero_mask = fluid.layers.transpose(zero_mask, [0,2,1]) - + # (batch_size, decoder_len, encoder_len) + zero_mask = get_attn_key_pad_mask(layers.squeeze(c_mask,[-1]), query) else: - mask = np.repeat(np.expand_dims(np.triu(np.ones([decoder_len, decoder_len]), 1), axis=0) ,batch_size, axis=0) - mask = fluid.layers.cast(dg.to_variable(mask == 0), np.float32) + mask = get_triu_tensor(query.numpy(), query.numpy()).astype(np.float32) + mask = fluid.layers.cast(dg.to_variable(mask != 0), np.float32) m_mask, zero_mask = None, None - #import pdb; pdb.set_trace() # Decoder pre-network query = self.decoder_prenet(query) @@ -145,21 +143,21 @@ class Decoder(dg.Layer): # Mel linear projection mel_out = self.mel_linear(query) # Post Mel Network - postnet_input = layers.transpose(mel_out, [0,2,1]) - out = self.postconvnet(postnet_input) - out = postnet_input + out - out = layers.transpose(out, [0,2,1]) + out = self.postconvnet(mel_out) + out = mel_out + out # Stop tokens stop_tokens = self.stop_linear(query) + stop_tokens = layers.squeeze(stop_tokens, [-1]) + stop_tokens = layers.sigmoid(stop_tokens) return mel_out, out, attn_list, stop_tokens, selfattn_list -class Model(dg.Layer): - def __init__(self, name_scope, config): - super(Model, self).__init__(name_scope) - self.encoder = Encoder(self.full_name(), config.network.embedding_size, config.network.hidden_size, config) - self.decoder = Decoder(self.full_name(), config.network.hidden_size, config) +class TransformerTTS(dg.Layer): + def __init__(self, config): + super(TransformerTTS, self).__init__() + self.encoder = Encoder(config.embedding_size, config.hidden_size, config) + self.decoder = Decoder(config.hidden_size, config) self.config = config def forward(self, characters, mel_input, pos_text, pos_mel): @@ -180,16 +178,16 @@ class ModelPostNet(dg.Layer): """ CBHG Network (mel -> linear) """ - def __init__(self, name_scope, config): - super(ModelPostNet, self).__init__(name_scope) - self.pre_proj = Conv(self.full_name(), - in_channels = config.audio.num_mels, - out_channels = config.network.hidden_size, + def __init__(self, config): + super(ModelPostNet, self).__init__() + self.pre_proj = Conv1D(in_channels = config.audio.num_mels, + out_channels = config.hidden_size, + filter_size=1, data_format = "NCT") - self.cbhg = CBHG(self.full_name(), config) - self.post_proj = Conv(self.full_name(), - in_channels = config.audio.num_mels, + self.cbhg = CBHG(config.hidden_size, config.batch_size) + self.post_proj = Conv1D(in_channels = config.hidden_size, out_channels = (config.audio.n_fft // 2) + 1, + filter_size=1, data_format = "NCT") def forward(self, mel): diff --git a/parakeet/models/transformerTTS/parse.py b/parakeet/models/transformerTTS/parse.py index 0c09d01..87a67e9 100644 --- 
a/parakeet/models/transformerTTS/parse.py +++ b/parakeet/models/transformerTTS/parse.py @@ -22,9 +22,9 @@ def add_config_options_to_parser(parser): parser.add_argument('--audio.outputs_per_step', type=int, default=1, help="the outputs per step.") - parser.add_argument('--network.hidden_size', type=int, default=256, + parser.add_argument('--hidden_size', type=int, default=256, help="the hidden size in network.") - parser.add_argument('--network.embedding_size', type=int, default=512, + parser.add_argument('--embedding_size', type=int, default=512, help="the embedding vector size.") parser.add_argument('--batch_size', type=int, default=32, diff --git a/parakeet/models/transformerTTS/preprocess.py b/parakeet/models/transformerTTS/preprocess.py index 61ed353..b128b00 100644 --- a/parakeet/models/transformerTTS/preprocess.py +++ b/parakeet/models/transformerTTS/preprocess.py @@ -62,20 +62,6 @@ class LJSpeech(Dataset): phonemes = np.array(g2p.en.text_to_sequence(normalized_text), dtype=np.int64) return (mag, mel, phonemes) # maybe we need to implement it as a map in the future - def _batch_examples(self, minibatch): - mag_batch = [] - mel_batch = [] - phoneme_batch = [] - for example in minibatch: - mag, mel, phoneme = example - mag_batch.append(mag) - mel_batch.append(mel) - phoneme_batch.append(phoneme) - mag_batch = SpecBatcher(pad_value=0.)(mag_batch) - mel_batch = SpecBatcher(pad_value=0.)(mel_batch) - phoneme_batch = TextIDBatcher(pad_id=0)(phoneme_batch) - return (mag_batch, mel_batch, phoneme_batch) - def __getitem__(self, index): metadatum = self.metadata.iloc[index] example = self._get_example(metadatum) @@ -121,7 +107,7 @@ def batch_examples(batch): mel_inputs = np.transpose(SpecBatcher(pad_value=0.)(mel_inputs), axes=(0,2,1)) return (texts, mels, mel_inputs, pos_texts, pos_mels, np.array(text_lens)) -def batch_examples_postnet(batch): +def batch_examples_vocoder(batch): mels=[] mags=[] for data in batch: diff --git a/parakeet/models/transformerTTS/synthesis.py b/parakeet/models/transformerTTS/synthesis.py index 13e0de0..9c89d16 100644 --- a/parakeet/models/transformerTTS/synthesis.py +++ b/parakeet/models/transformerTTS/synthesis.py @@ -28,8 +28,8 @@ def synthesis(text_input, cfg): writer = SummaryWriter(path) with dg.guard(place): - model = Model('transtts', cfg) - model_postnet = ModelPostNet('postnet', cfg) + model = Model(cfg) + model_postnet = ModelPostNet(cfg) model.set_dict(load_checkpoint(str(cfg.transformer_step), os.path.join(cfg.checkpoint_path, "transformer"))) model_postnet.set_dict(load_checkpoint(str(cfg.postnet_step), os.path.join(cfg.checkpoint_path, "postnet"))) diff --git a/parakeet/models/transformerTTS/train_postnet.py b/parakeet/models/transformerTTS/train_postnet.py index 8beeece..2f893f2 100644 --- a/parakeet/models/transformerTTS/train_postnet.py +++ b/parakeet/models/transformerTTS/train_postnet.py @@ -47,7 +47,7 @@ def main(cfg): writer = SummaryWriter(path) if local_rank == 0 else None with dg.guard(place): - model = ModelPostNet('postnet', cfg) + model = ModelPostNet(cfg) model.train() optimizer = fluid.optimizer.AdamOptimizer(learning_rate=dg.NoamDecay(1/(4000 *( cfg.lr ** 2)), 4000)) @@ -62,7 +62,7 @@ def main(cfg): strategy = dg.parallel.prepare_context() model = MyDataParallel(model, strategy) - reader = LJSpeechLoader(cfg, nranks, local_rank, is_postnet=True).reader() + reader = LJSpeechLoader(cfg, nranks, local_rank, is_vocoder=True).reader() for epoch in range(cfg.epochs): pbar = tqdm(reader) @@ -74,7 +74,6 @@ def main(cfg): global_step += 1 
mag_pred = model(mel) - loss = layers.mean(layers.abs(layers.elementwise_sub(mag_pred, mag))) if cfg.use_data_parallel: loss = model.scale_loss(loss) diff --git a/parakeet/models/transformerTTS/train_transformer.py b/parakeet/models/transformerTTS/train_transformer.py index 065be6d..fc522ae 100644 --- a/parakeet/models/transformerTTS/train_transformer.py +++ b/parakeet/models/transformerTTS/train_transformer.py @@ -9,7 +9,8 @@ import jsonargparse from parse import add_config_options_to_parser from pprint import pprint from matplotlib import cm -from data import LJSpeechLoader +from parakeet.modules.utils import cross_entropy +from parakeet.models.dataloader.jlspeech import LJSpeechLoader class MyDataParallel(dg.parallel.DataParallel): """ @@ -49,7 +50,7 @@ def main(cfg): writer = SummaryWriter(path) if local_rank == 0 else None with dg.guard(place): - model = Model('transtts', cfg) + model = TransformerTTS(cfg) model.train() optimizer = fluid.optimizer.AdamOptimizer(learning_rate=dg.NoamDecay(1/(4000 *( cfg.lr ** 2)), 4000)) @@ -75,15 +76,22 @@ def main(cfg): global_step += 1 mel_pred, postnet_pred, attn_probs, stop_preds, attn_enc, attn_dec = model(character, mel_input, pos_text, pos_mel) - + + label = np.zeros(stop_preds.shape).astype(np.float32) + text_length = text_length.numpy() + for i in range(label.shape[0]): + label[i][text_length[i] - 1] = 1 + mel_loss = layers.mean(layers.abs(layers.elementwise_sub(mel_pred, mel))) post_mel_loss = layers.mean(layers.abs(layers.elementwise_sub(postnet_pred, mel))) - loss = mel_loss + post_mel_loss + stop_loss = cross_entropy(stop_preds, dg.to_variable(label)) + loss = mel_loss + post_mel_loss + stop_loss if local_rank==0: writer.add_scalars('training_loss', { 'mel_loss':mel_loss.numpy(), 'post_mel_loss':post_mel_loss.numpy(), + 'stop_loss':stop_loss.numpy() }, global_step) writer.add_scalars('alphas', { @@ -97,7 +105,7 @@ def main(cfg): for i, prob in enumerate(attn_probs): for j in range(4): x = np.uint8(cm.viridis(prob.numpy()[j*16]) * 255) - writer.add_image('Attention_enc_%d_0'%global_step, x, i*4+j, dataformats="HWC") + writer.add_image('Attention_%d_0'%global_step, x, i*4+j, dataformats="HWC") for i, prob in enumerate(attn_enc): for j in range(4): diff --git a/parakeet/modules/dynamicGRU.py b/parakeet/modules/dynamicGRU.py new file mode 100644 index 0000000..44a6e7f --- /dev/null +++ b/parakeet/modules/dynamicGRU.py @@ -0,0 +1,44 @@ +import paddle.fluid.dygraph as dg +import paddle.fluid.layers as layers + +class DynamicGRU(dg.Layer): + def __init__(self, + size, + param_attr=None, + bias_attr=None, + is_reverse=False, + gate_activation='sigmoid', + candidate_activation='tanh', + h_0=None, + origin_mode=False, + init_size=None): + super(DynamicGRU, self).__init__() + self.gru_unit = dg.GRUUnit( + size * 3, + param_attr=param_attr, + bias_attr=bias_attr, + activation=candidate_activation, + gate_activation=gate_activation, + origin_mode=origin_mode) + self.size = size + self.h_0 = h_0 + self.is_reverse = is_reverse + + def forward(self, inputs): + hidden = self.h_0 + res = [] + for i in range(inputs.shape[1]): + if self.is_reverse: + i = inputs.shape[1] - 1 - i + input_ = inputs[:, i:i + 1, :] + input_ = layers.reshape( + input_, [-1, input_.shape[2]], inplace=False) + hidden, reset, gate = self.gru_unit(input_, hidden) + hidden_ = layers.reshape( + hidden, [-1, 1, hidden.shape[1]], inplace=False) + res.append(hidden_) + if self.is_reverse: + res = res[::-1] + res = layers.concat(res, axis=1) + return res + diff --git 
a/parakeet/modules/feed_forward.py b/parakeet/modules/feed_forward.py new file mode 100644 index 0000000..d197c6e --- /dev/null +++ b/parakeet/modules/feed_forward.py @@ -0,0 +1,40 @@ +import paddle.fluid.dygraph as dg +import paddle.fluid.layers as layers +from parakeet.modules.layers import Conv1D + +class PositionwiseFeedForward(dg.Layer): + ''' A two-feed-forward-layer module ''' + def __init__(self, d_in, num_hidden, filter_size, padding=0, use_cudnn=True, dropout=0.1): + super(PositionwiseFeedForward, self).__init__() + self.num_hidden = num_hidden + self.use_cudnn = use_cudnn + self.dropout = dropout + + self.w_1 = Conv1D(in_channels = d_in, + out_channels = num_hidden, + filter_size = filter_size, + padding=padding, + use_cudnn = use_cudnn, + data_format = "NTC") + self.w_2 = Conv1D(in_channels = num_hidden, + out_channels = d_in, + filter_size = filter_size, + padding=padding, + use_cudnn = use_cudnn, + data_format = "NTC") + self.layer_norm = dg.LayerNorm(d_in) + + def forward(self, input): + #FFN Networt + x = self.w_2(layers.relu(self.w_1(input))) + + # dropout + x = layers.dropout(x, self.dropout) + + # residual connection + x = x + input + + #layer normalization + x = self.layer_norm(x) + + return x \ No newline at end of file diff --git a/parakeet/modules/layers.py b/parakeet/modules/layers.py new file mode 100644 index 0000000..c62f0b1 --- /dev/null +++ b/parakeet/modules/layers.py @@ -0,0 +1,122 @@ +import math +import numpy as np + +import paddle +from paddle import fluid +import paddle.fluid.dygraph as dg + + +class Conv1D(dg.Layer): + """ + A convolution 1D block implemented with Conv2D. Form simplicity and + ensuring the output has the same length as the input, it does not allow + stride > 1. + """ + + def __init__(self, + in_channels, + out_channels, + filter_size=3, + padding=0, + dilation=1, + stride=1, + groups=None, + param_attr=None, + bias_attr=None, + use_cudnn=True, + act=None, + data_format='NCT', + dtype="float32"): + super(Conv1D, self).__init__(dtype=dtype) + + self.padding = padding + self.in_channels = in_channels + self.num_filters = out_channels + self.filter_size = filter_size + self.stride = stride + self.dilation = dilation + self.padding = padding + self.act = act + self.data_format = data_format + + self.conv = dg.Conv2D( + num_channels=in_channels, + num_filters=out_channels, + filter_size=(1, filter_size), + stride=(1, stride), + dilation=(1, dilation), + padding=(0, padding), + groups=groups, + param_attr=param_attr, + bias_attr=bias_attr, + use_cudnn=use_cudnn, + act=act, + dtype=dtype) + + def forward(self, x): + """ + Args: + x (Variable): Shape(B, C_in, 1, T), the input, where C_in means + input channels. + Returns: + x (Variable): Shape(B, C_out, 1, T), the outputs, where C_out means + output channels (num_filters). + """ + if self.data_format == 'NTC': + x = fluid.layers.transpose(x, [0, 2, 1]) + x = fluid.layers.unsqueeze(x, [2]) + x = self.conv(x) + x = fluid.layers.squeeze(x, [2]) + if self.data_format == 'NTC': + x = fluid.layers.transpose(x, [0, 2, 1]) + return x + +class Pool1D(dg.Layer): + """ + A Pool 1D block implemented with Pool2D. 
+ """ + def __init__(self, + pool_size=-1, + pool_type='max', + pool_stride=1, + pool_padding=0, + global_pooling=False, + use_cudnn=True, + ceil_mode=False, + exclusive=True, + data_format='NCT'): + super(Pool1D, self).__init__() + self.pool_size = pool_size + self.pool_type = pool_type + self.pool_stride = pool_stride + self.pool_padding = pool_padding + self.global_pooling = global_pooling + self.use_cudnn = use_cudnn + self.ceil_mode = ceil_mode + self.exclusive = exclusive + self.data_format = data_format + + + self.pool2d = dg.Pool2D([1,pool_size], pool_type = pool_type, + pool_stride = [1,pool_stride], pool_padding = [0, pool_padding], + global_pooling = global_pooling, use_cudnn = use_cudnn, + ceil_mode = ceil_mode, exclusive = exclusive) + + + def forward(self, x): + """ + Args: + x (Variable): Shape(B, C_in, 1, T), the input, where C_in means + input channels. + Returns: + x (Variable): Shape(B, C_out, 1, T), the outputs, where C_out means + output channels (num_filters). + """ + if self.data_format == 'NTC': + x = fluid.layers.transpose(x, [0, 2, 1]) + x = fluid.layers.unsqueeze(x, [2]) + x = self.pool2d(x) + x = fluid.layers.squeeze(x, [2]) + if self.data_format == 'NTC': + x = fluid.layers.transpose(x, [0, 2, 1]) + return x diff --git a/parakeet/modules/multihead_attention.py b/parakeet/modules/multihead_attention.py new file mode 100644 index 0000000..6b86e51 --- /dev/null +++ b/parakeet/modules/multihead_attention.py @@ -0,0 +1,84 @@ +import math +import numpy as np +import paddle.fluid.dygraph as dg +import paddle.fluid.layers as layers + +class ScaledDotProductAttention(dg.Layer): + def __init__(self, d_key): + super(ScaledDotProductAttention, self).__init__() + + self.d_key = d_key + + # please attention this mask is diff from pytorch + def forward(self, key, value, query, mask=None, query_mask=None): + # Compute attention score + attention = layers.matmul(query, key, transpose_y=True) #transpose the last dim in y + attention = attention / math.sqrt(self.d_key) + + # Mask key to ignore padding + if mask is not None: + attention = attention * (mask == 0).astype(np.float32) + mask = mask * (-2 ** 32 + 1) + attention = attention + mask + + + attention = layers.softmax(attention) + attention = layers.dropout(attention, 0.0) + # Mask query to ignore padding + # Not sure how to work + if query_mask is not None: + attention = attention * query_mask + + result = layers.matmul(attention, value) + return result, attention + +class MultiheadAttention(dg.Layer): + def __init__(self, num_hidden, d_k, d_q, num_head=4, dropout=0.1): + super(MultiheadAttention, self).__init__() + self.num_hidden = num_hidden + self.num_head = num_head + self.d_k = d_k + self.d_q = d_q + self.dropout = dropout + + self.key = dg.Linear(num_hidden, num_head * d_k) + self.value = dg.Linear(num_hidden, num_head * d_k) + self.query = dg.Linear(num_hidden, num_head * d_q) + + self.scal_attn = ScaledDotProductAttention(d_k) + + self.fc = dg.Linear(num_head * d_q, num_hidden) + + self.layer_norm = dg.LayerNorm(num_hidden) + + def forward(self, key, value, query_input, mask=None, query_mask=None): + batch_size = key.shape[0] + seq_len_key = key.shape[1] + seq_len_query = query_input.shape[1] + + # repeat masks h times + if query_mask is not None: + query_mask = layers.expand(query_mask, [self.num_head, 1, seq_len_key]) + if mask is not None: + mask = layers.expand(mask, (self.num_head, 1, 1)) + + # Make multihead attention + # key & value.shape = (batch_size, seq_len, feature)(feature = num_head * 
num_hidden_per_attn) + key = layers.reshape(self.key(key), [batch_size, seq_len_key, self.num_head, self.d_k]) + value = layers.reshape(self.value(value), [batch_size, seq_len_key, self.num_head, self.d_k]) + query = layers.reshape(self.query(query_input), [batch_size, seq_len_query, self.num_head, self.d_q]) + + key = layers.reshape(layers.transpose(key, [2, 0, 1, 3]), [-1, seq_len_key, self.d_k]) + value = layers.reshape(layers.transpose(value, [2, 0, 1, 3]), [-1, seq_len_key, self.d_k]) + query = layers.reshape(layers.transpose(query, [2, 0, 1, 3]), [-1, seq_len_query, self.d_q]) + result, attention = self.scal_attn(key, value, query, mask=mask, query_mask=query_mask) + + # concat all multihead result + result = layers.reshape(result, [self.num_head, batch_size, seq_len_query, self.d_q]) + result = layers.reshape(layers.transpose(result, [1,2,0,3]),[batch_size, seq_len_query, -1]) + + result = layers.dropout(self.fc(result), self.dropout) + result = result + query_input + + result = self.layer_norm(result) + return result, attention \ No newline at end of file diff --git a/parakeet/modules/post_convnet.py b/parakeet/modules/post_convnet.py new file mode 100644 index 0000000..fb7d531 --- /dev/null +++ b/parakeet/modules/post_convnet.py @@ -0,0 +1,67 @@ +import paddle.fluid.dygraph as dg +import paddle.fluid as fluid +import paddle.fluid.layers as layers +from parakeet.modules.layers import Conv1D + +class PostConvNet(dg.Layer): + def __init__(self, + n_mels=80, + num_hidden=512, + filter_size=5, + padding=0, + num_conv=5, + outputs_per_step=1, + use_cudnn=True, + dropout=0.1): + super(PostConvNet, self).__init__() + + self.dropout = dropout + self.conv_list = [] + self.conv_list.append(Conv1D(in_channels = n_mels * outputs_per_step, + out_channels = num_hidden, + filter_size = filter_size, + padding = padding, + use_cudnn = use_cudnn, + data_format = "NCT")) + + for _ in range(1, num_conv-1): + self.conv_list.append(Conv1D(in_channels = num_hidden, + out_channels = num_hidden, + filter_size = filter_size, + padding = padding, + use_cudnn = use_cudnn, + data_format = "NCT") ) + + self.conv_list.append(Conv1D(in_channels = num_hidden, + out_channels = n_mels * outputs_per_step, + filter_size = filter_size, + padding = padding, + use_cudnn = use_cudnn, + data_format = "NCT")) + + for i, layer in enumerate(self.conv_list): + self.add_sublayer("conv_list_{}".format(i), layer) + + self.batch_norm_list = [dg.BatchNorm(num_hidden, + param_attr = fluid.ParamAttr(name='weight'), + bias_attr = fluid.ParamAttr(name='bias'), + moving_mean_name = 'moving_mean', + moving_variance_name = 'moving_var', + data_layout='NCHW') for _ in range(num_conv-1)] + self.batch_norm_list.append(dg.BatchNorm(n_mels * outputs_per_step, + param_attr = fluid.ParamAttr(name='weight'), + bias_attr = fluid.ParamAttr(name='bias'), + moving_mean_name = 'moving_mean', + moving_variance_name = 'moving_var', + data_layout='NCHW')) + for i, layer in enumerate(self.batch_norm_list): + self.add_sublayer("batch_norm_list_{}".format(i), layer) + + + def forward(self, input): + input = layers.transpose(input, [0,2,1]) + len = input.shape[-1] + for batch_norm, conv in zip(self.batch_norm_list, self.conv_list): + input = layers.dropout(layers.tanh(batch_norm(conv(input)[:,:,:len])), self.dropout) + input = layers.transpose(input, [0,2,1]) + return input \ No newline at end of file diff --git a/parakeet/modules/prenet.py b/parakeet/modules/prenet.py new file mode 100644 index 0000000..1f4249e --- /dev/null +++ b/parakeet/modules/prenet.py 
@@ -0,0 +1,26 @@ +import paddle.fluid.dygraph as dg +import paddle.fluid.layers as layers + +class PreNet(dg.Layer): + """ + Pre Net before passing through the network + """ + def __init__(self, input_size, hidden_size, output_size, dropout_rate=0.2): + """ + :param input_size: dimension of input + :param hidden_size: dimension of hidden unit + :param output_size: dimension of output + """ + super(PreNet, self).__init__() + self.input_size = input_size + self.hidden_size = hidden_size + self.output_size = output_size + self.dropout_rate = dropout_rate + + self.linear1 = dg.Linear(input_size, hidden_size) + self.linear2 = dg.Linear(hidden_size, output_size) + + def forward(self, x): + x = layers.dropout(layers.relu(self.linear1(x)), self.dropout_rate) + x = layers.dropout(layers.relu(self.linear2(x)), self.dropout_rate) + return x diff --git a/parakeet/models/transformerTTS/utils.py b/parakeet/modules/utils.py similarity index 58% rename from parakeet/models/transformerTTS/utils.py rename to parakeet/modules/utils.py index 087cacf..626d5f2 100644 --- a/parakeet/models/transformerTTS/utils.py +++ b/parakeet/modules/utils.py @@ -2,6 +2,7 @@ import numpy as np import librosa import os, copy from scipy import signal +import paddle.fluid.layers as layers def get_positional_table(d_pos_vec, n_position=1024): @@ -33,6 +34,28 @@ def get_sinusoid_encoding_table(n_position, d_hid, padding_idx=None): return sinusoid_table +def get_non_pad_mask(seq): + return layers.unsqueeze((seq != 0).astype(np.float32),[-1]) + +def get_attn_key_pad_mask(seq_k, seq_q): + ''' For masking out the padding part of key sequence. ''' + + # Expand to fit the shape of key query attention matrix. + len_q = seq_q.shape[1] + padding_mask = (seq_k != 0).astype(np.float32) + padding_mask = layers.expand(layers.unsqueeze(padding_mask,[1]), [1, len_q, 1]) + return padding_mask + +def get_triu_tensor(seq_k, seq_q): + ''' For make a triu tensor ''' + len_k = seq_k.shape[1] + len_q = seq_q.shape[1] + batch_size = seq_k.shape[0] + triu_tensor = np.triu(np.ones([len_k, len_q]), 1) + triu_tensor = np.repeat(np.expand_dims(triu_tensor, axis=0) ,batch_size, axis=0) + + return triu_tensor + def guided_attention(N, T, g=0.2): '''Guided attention. Refer to page 3 on the paper.''' W = np.zeros((N, T), dtype=np.float32) @@ -40,3 +63,11 @@ def guided_attention(N, T, g=0.2): for t_pos in range(W.shape[1]): W[n_pos, t_pos] = 1 - np.exp(-(t_pos / float(T) - n_pos / float(N)) ** 2 / (2 * g * g)) return W + + +def cross_entropy(input, label, position_weight=5.0, epsilon=0.0001): + input = -1 * label * layers.log(input + epsilon) - (1-label) * layers.log(1 - input + epsilon) + label = input * (label * (position_weight - 1) + 1) + return layers.reduce_sum(label, dim=[0, 1]) + + From e0aef2e081e1c97fe9ac3b33c1ec501066a83e7b Mon Sep 17 00:00:00 2001 From: lifuchen Date: Wed, 8 Jan 2020 03:55:06 +0000 Subject: [PATCH 10/10] rebuild code and TransformerTTS is right. FastSpeech will later. 
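For reference, the mask helpers and the position-weighted stop-token loss added to `parakeet/modules/utils.py` in the hunk above can be checked outside of Paddle with a small NumPy sketch. The formulas and shapes below mirror the diff; the NumPy framing itself is only illustrative.

```python
# Illustrative NumPy sketch of the helpers added in parakeet/modules/utils.py.
# The formulas mirror the diff; only the NumPy translation is an assumption here.
import numpy as np

def get_non_pad_mask(seq):                              # seq: (B, T) position ids, 0 = pad
    return (seq != 0).astype(np.float32)[..., None]     # (B, T, 1)

def get_attn_key_pad_mask(seq_k, seq_q):                # 1.0 where the key is a real token
    len_q = seq_q.shape[1]
    mask = (seq_k != 0).astype(np.float32)[:, None, :]  # (B, 1, T_k)
    return np.repeat(mask, len_q, axis=1)                # (B, T_q, T_k)

def get_triu_tensor(seq_k, seq_q):                      # 1.0 on strictly-future positions
    batch_size, len_k, len_q = seq_k.shape[0], seq_k.shape[1], seq_q.shape[1]
    triu = np.triu(np.ones([len_k, len_q]), 1)
    return np.repeat(triu[None, :, :], batch_size, axis=0)   # (B, T_k, T_q)

def stop_token_cross_entropy(pred, label, position_weight=5.0, epsilon=0.0001):
    # element-wise binary cross entropy over (B, T); the single stop frame
    # (label == 1) is up-weighted by position_weight, then summed over batch and time
    bce = -label * np.log(pred + epsilon) - (1 - label) * np.log(1 - pred + epsilon)
    return np.sum(bce * (label * (position_weight - 1) + 1))

# toy shape check: two padded position sequences
pos = np.array([[1, 2, 3, 0], [1, 2, 0, 0]])
print(get_non_pad_mask(pos).shape)            # (2, 4, 1)
print(get_attn_key_pad_mask(pos, pos).shape)  # (2, 4, 4)
print(get_triu_tensor(pos, pos).shape)        # (2, 4, 4)
```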
--- parakeet/audio/audio.py | 4 +- parakeet/models/dataloader/jlspeech.py | 62 ++++----- parakeet/models/fastspeech/dataset.py | 124 ------------------ parakeet/models/fastspeech/modules.py | 47 ++++++- parakeet/models/fastspeech/network.py | 51 +++++++ parakeet/models/fastspeech/parse.py | 4 +- parakeet/models/fastspeech/train.py | 4 +- .../transformerTTS/config/train_postnet.yaml | 3 +- .../config/train_transformer.yaml | 13 +- parakeet/models/transformerTTS/data.py | 29 ---- parakeet/models/transformerTTS/module.py | 17 +-- parakeet/models/transformerTTS/network.py | 17 +-- parakeet/models/transformerTTS/parse.py | 8 +- .../models/transformerTTS/train_postnet.py | 11 +- .../transformerTTS/train_transformer.py | 16 ++- parakeet/modules/dynamicGRU.py | 8 ++ parakeet/modules/feed_forward.py | 24 +++- parakeet/modules/layers.py | 36 +++++ parakeet/modules/multihead_attention.py | 38 +++++- parakeet/modules/post_convnet.py | 20 ++- parakeet/modules/prenet.py | 11 +- 21 files changed, 297 insertions(+), 250 deletions(-) delete mode 100644 parakeet/models/fastspeech/dataset.py delete mode 100644 parakeet/models/transformerTTS/data.py diff --git a/parakeet/audio/audio.py b/parakeet/audio/audio.py index b29dbf2..6b84701 100644 --- a/parakeet/audio/audio.py +++ b/parakeet/audio/audio.py @@ -9,7 +9,7 @@ class AudioProcessor(object): sample_rate=None, # int, sampling rate num_mels=None, # int, bands of mel spectrogram min_level_db=None, # float, minimum level db - ref_level_db=None, # float, reference level dbn + ref_level_db=None, # float, reference level db n_fft=None, # int: number of samples in a frame for stft win_length=None, # int: the same meaning with n_fft hop_length=None, # int: number of samples between neighboring frame @@ -22,7 +22,7 @@ class AudioProcessor(object): mel_fmax=None, # int: mel spectrogram's maximum frequency clip_norm=True, # bool: clip spectrogram's norm griffin_lim_iters=None, # int: - do_trim_silence=False, # bool: trim silience + do_trim_silence=False, # bool: trim silence sound_norm=False, **kwargs): self.sample_rate = sample_rate diff --git a/parakeet/models/dataloader/jlspeech.py b/parakeet/models/dataloader/jlspeech.py index 7f39bfb..ef55b0f 100644 --- a/parakeet/models/dataloader/jlspeech.py +++ b/parakeet/models/dataloader/jlspeech.py @@ -12,19 +12,19 @@ from parakeet.data.dataset import Dataset from parakeet.data.batch import TextIDBatcher, SpecBatcher class LJSpeechLoader: - def __init__(self, config, nranks, rank, is_vocoder=False): + def __init__(self, config, nranks, rank, is_vocoder=False, shuffle=True): place = fluid.CUDAPlace(rank) if config.use_gpu else fluid.CPUPlace() LJSPEECH_ROOT = Path(config.data_path) dataset = LJSpeech(LJSPEECH_ROOT, config) - sampler = DistributedSampler(len(dataset), nranks, rank) + sampler = DistributedSampler(len(dataset), nranks, rank, shuffle=shuffle) assert config.batch_size % nranks == 0 each_bs = config.batch_size // nranks if is_vocoder: - dataloader = DataCargo(dataset, sampler=sampler, batch_size=each_bs, shuffle=True, collate_fn=batch_examples_vocoder, drop_last=True) + dataloader = DataCargo(dataset, sampler=sampler, batch_size=each_bs, shuffle=shuffle, collate_fn=batch_examples_vocoder, drop_last=True) else: - dataloader = DataCargo(dataset, sampler=sampler, batch_size=each_bs, shuffle=True, collate_fn=batch_examples, drop_last=True) + dataloader = DataCargo(dataset, sampler=sampler, batch_size=each_bs, shuffle=shuffle, collate_fn=batch_examples, drop_last=True) self.reader = fluid.io.DataLoader.from_generator( 
capacity=32, @@ -41,6 +41,25 @@ class LJSpeech(Dataset): self.root = root if isinstance(root, Path) else Path(root) self.metadata = self._prepare_metadata() self.config = config + self._ljspeech_processor = audio.AudioProcessor( + sample_rate=config.audio.sr, + num_mels=config.audio.num_mels, + min_level_db=config.audio.min_level_db, + ref_level_db=config.audio.ref_level_db, + n_fft=config.audio.n_fft, + win_length= config.audio.win_length, + hop_length= config.audio.hop_length, + power=config.audio.power, + preemphasis=config.audio.preemphasis, + signal_norm=True, + symmetric_norm=False, + max_norm=1., + mel_fmin=0, + mel_fmax=None, + clip_norm=True, + griffin_lim_iters=60, + do_trim_silence=False, + sound_norm=False) def _prepare_metadata(self): csv_path = self.root.joinpath("metadata.csv") @@ -59,29 +78,10 @@ class LJSpeech(Dataset): fname, raw_text, normalized_text = metadatum wav_path = self.root.joinpath("wavs", fname + ".wav") - _ljspeech_processor = audio.AudioProcessor( - sample_rate=22050, - num_mels=80, - min_level_db=-100, - ref_level_db=20, - n_fft=2048, - win_length= int(22050 * 0.05), - hop_length= int(22050 * 0.0125), - power=1.2, - preemphasis=0.97, - signal_norm=True, - symmetric_norm=False, - max_norm=1., - mel_fmin=0, - mel_fmax=None, - clip_norm=True, - griffin_lim_iters=60, - do_trim_silence=False, - sound_norm=False) # load -> trim -> preemphasis -> stft -> magnitude -> mel_scale -> logscale -> normalize - wav = _ljspeech_processor.load_wav(str(wav_path)) - mag = _ljspeech_processor.spectrogram(wav).astype(np.float32) - mel = _ljspeech_processor.melspectrogram(wav).astype(np.float32) + wav = self._ljspeech_processor.load_wav(str(wav_path)) + mag = self._ljspeech_processor.spectrogram(wav).astype(np.float32) + mel = self._ljspeech_processor.melspectrogram(wav).astype(np.float32) phonemes = np.array(g2p.en.text_to_sequence(normalized_text), dtype=np.int64) return (mag, mel, phonemes) # maybe we need to implement it as a map in the future @@ -123,11 +123,11 @@ def batch_examples(batch): text_lens = sorted(text_lens, reverse=True) # Pad sequence with largest len of the batch - texts = TextIDBatcher(pad_id=0)(texts) - pos_texts = TextIDBatcher(pad_id=0)(pos_texts) - pos_mels = TextIDBatcher(pad_id=0)(pos_mels) - mels = np.transpose(SpecBatcher(pad_value=0.)(mels), axes=(0,2,1)) - mel_inputs = np.transpose(SpecBatcher(pad_value=0.)(mel_inputs), axes=(0,2,1)) + texts = TextIDBatcher(pad_id=0)(texts) #(B, T) + pos_texts = TextIDBatcher(pad_id=0)(pos_texts) #(B,T) + pos_mels = TextIDBatcher(pad_id=0)(pos_mels) #(B,T) + mels = np.transpose(SpecBatcher(pad_value=0.)(mels), axes=(0,2,1)) #(B,T,num_mels) + mel_inputs = np.transpose(SpecBatcher(pad_value=0.)(mel_inputs), axes=(0,2,1))#(B,T,num_mels) return (texts, mels, mel_inputs, pos_texts, pos_mels, np.array(text_lens)) def batch_examples_vocoder(batch): diff --git a/parakeet/models/fastspeech/dataset.py b/parakeet/models/fastspeech/dataset.py deleted file mode 100644 index b3ee344..0000000 --- a/parakeet/models/fastspeech/dataset.py +++ /dev/null @@ -1,124 +0,0 @@ -import torch -from torch.nn import functional as F -from torch.utils.data import Dataset, DataLoader - -import numpy as np -import math -import os - -import hparams -import Audio -from text import text_to_sequence -from utils import process_text, pad_1D, pad_2D - -device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') - - -class FastSpeechDataset(Dataset): - """ LJSpeech """ - - def __init__(self): - self.text = process_text(os.path.join("data", 
"train.txt")) - - def __len__(self): - return len(self.text) - - def __getitem__(self, idx): - mel_gt_name = os.path.join( - hparams.mel_ground_truth, "ljspeech-mel-%05d.npy" % (idx+1)) - mel_gt_target = np.load(mel_gt_name) - D = np.load(os.path.join(hparams.alignment_path, str(idx)+".npy")) - - character = self.text[idx][0:len(self.text[idx])-1] - character = np.array(text_to_sequence( - character, hparams.text_cleaners)) - - sample = {"text": character, - "mel_target": mel_gt_target, - "D": D} - - return sample - - -def reprocess(batch, cut_list): - texts = [batch[ind]["text"] for ind in cut_list] - mel_targets = [batch[ind]["mel_target"] for ind in cut_list] - Ds = [batch[ind]["D"] for ind in cut_list] - - length_text = np.array([]) - for text in texts: - length_text = np.append(length_text, text.shape[0]) - - src_pos = list() - max_len = int(max(length_text)) - for length_src_row in length_text: - src_pos.append(np.pad([i+1 for i in range(int(length_src_row))], - (0, max_len-int(length_src_row)), 'constant')) - src_pos = np.array(src_pos) - - length_mel = np.array(list()) - for mel in mel_targets: - length_mel = np.append(length_mel, mel.shape[0]) - - mel_pos = list() - max_mel_len = int(max(length_mel)) - for length_mel_row in length_mel: - mel_pos.append(np.pad([i+1 for i in range(int(length_mel_row))], - (0, max_mel_len-int(length_mel_row)), 'constant')) - mel_pos = np.array(mel_pos) - - texts = pad_1D(texts) - Ds = pad_1D(Ds) - mel_targets = pad_2D(mel_targets) - - out = {"text": texts, - "mel_target": mel_targets, - "D": Ds, - "mel_pos": mel_pos, - "src_pos": src_pos, - "mel_max_len": max_mel_len} - - return out - - -def collate_fn(batch): - len_arr = np.array([d["text"].shape[0] for d in batch]) - index_arr = np.argsort(-len_arr) - batchsize = len(batch) - real_batchsize = int(math.sqrt(batchsize)) - - cut_list = list() - for i in range(real_batchsize): - cut_list.append(index_arr[i*real_batchsize:(i+1)*real_batchsize]) - - output = list() - for i in range(real_batchsize): - output.append(reprocess(batch, cut_list[i])) - - return output - - -if __name__ == "__main__": - # Test - dataset = FastSpeechDataset() - training_loader = DataLoader(dataset, - batch_size=1, - shuffle=False, - collate_fn=collate_fn, - drop_last=True, - num_workers=0) - total_step = hparams.epochs * len(training_loader) * hparams.batch_size - - cnt = 0 - for i, batchs in enumerate(training_loader): - for j, data_of_batch in enumerate(batchs): - mel_target = torch.from_numpy( - data_of_batch["mel_target"]).float().to(device) - D = torch.from_numpy(data_of_batch["D"]).int().to(device) - # print(mel_target.size()) - # print(D.sum()) - print(cnt) - if mel_target.size(1) == D.sum().item(): - cnt += 1 - - print(cnt) diff --git a/parakeet/models/fastspeech/modules.py b/parakeet/models/fastspeech/modules.py index 6c09f41..621b5c1 100644 --- a/parakeet/models/fastspeech/modules.py +++ b/parakeet/models/fastspeech/modules.py @@ -11,20 +11,33 @@ from parakeet.modules.feed_forward import PositionwiseFeedForward class FFTBlock(dg.Layer): - """FFT Block""" def __init__(self, d_model, d_inner, n_head, d_k, d_v, filter_size, padding, dropout=0.2): super(FFTBlock, self).__init__() self.slf_attn = MultiheadAttention(d_model, d_k, d_v, num_head=n_head, dropout=dropout) self.pos_ffn = PositionwiseFeedForward(d_model, d_inner, filter_size =filter_size, padding =padding, dropout=dropout) def forward(self, enc_input, non_pad_mask=None, slf_attn_mask=None): - enc_output, enc_slf_attn = self.slf_attn(enc_input, enc_input, enc_input, 
mask=slf_attn_mask) - enc_output *= non_pad_mask + """ + Feed Forward Transformer block in FastSpeech. + + Args: + enc_input (Variable): Shape(B, T, C), dtype: float32. The embedding characters input. + T means the timesteps of input. + non_pad_mask (Variable): Shape(B, T, 1), dtype: int64. The mask of sequence. + slf_attn_mask (Variable): Shape(B, len_q, len_k), dtype: int64. The mask of self attention. + len_q means the sequence length of query, len_k means the sequence length of key. - enc_output = self.pos_ffn(enc_output) - enc_output *= non_pad_mask + Returns: + output (Variable), Shape(B, T, C), the output after self-attention & ffn. + slf_attn (Variable), Shape(B * n_head, T, T), the self attention. + """ + output, slf_attn = self.slf_attn(enc_input, enc_input, enc_input, mask=slf_attn_mask) + output *= non_pad_mask - return enc_output, enc_slf_attn + output = self.pos_ffn(output) + output *= non_pad_mask + + return output, slf_attn class LengthRegulator(dg.Layer): @@ -70,6 +83,20 @@ class LengthRegulator(dg.Layer): def forward(self, x, alpha=1.0, target=None): + """ + Length Regulator block in FastSpeech. + + Args: + x (Variable): Shape(B, T, C), dtype: float32. The encoder output. + alpha (Constant): dtype: float32. The hyperparameter to determine the length of + the expanded sequence mel, thereby controlling the voice speed. + target (Variable): (Variable, optional): Shape(B, T_text), + dtype: int64. The duration of phoneme compute from pretrained transformerTTS. + + Returns: + output (Variable), Shape(B, T, C), the output after exppand. + duration_predictor_output (Variable), Shape(B, T, C), the output of duration predictor. + """ duration_predictor_output = self.duration_predictor(x) if fluid.framework._dygraph_tracer()._train_mode: output = self.LR(x, target) @@ -81,7 +108,6 @@ class LengthRegulator(dg.Layer): return output, mel_pos class DurationPredictor(dg.Layer): - """ Duration Predictor """ def __init__(self, input_size, out_channels, filter_size, dropout=0.1): super(DurationPredictor, self).__init__() self.input_size = input_size @@ -105,7 +131,14 @@ class DurationPredictor(dg.Layer): self.linear =dg.Linear(self.out_channels, 1) def forward(self, encoder_output): + """ + Duration Predictor block in FastSpeech. + Args: + encoder_output (Variable): Shape(B, T, C), dtype: float32. The encoder output. + Returns: + out (Variable), Shape(B, T, C), the output of duration predictor. + """ # encoder_output.shape(N, T, C) out = layers.dropout(layers.relu(self.layer_norm1(self.conv1(encoder_output))), self.dropout) out = layers.dropout(layers.relu(self.layer_norm2(self.conv2(out))), self.dropout) diff --git a/parakeet/models/fastspeech/network.py b/parakeet/models/fastspeech/network.py index 3f00263..2f8dc9a 100644 --- a/parakeet/models/fastspeech/network.py +++ b/parakeet/models/fastspeech/network.py @@ -35,6 +35,20 @@ class Encoder(dg.Layer): self.add_sublayer('fft_{}'.format(i), layer) def forward(self, character, text_pos): + """ + Encoder layer of FastSpeech. + + Args: + character (Variable): Shape(B, T_text), dtype: float32. The input text + characters. T_text means the timesteps of input characters. + text_pos (Variable): Shape(B, T_text), dtype: int64. The input text + position. T_text means the timesteps of input characters. + + Returns: + enc_output (Variable), Shape(B, text_T, C), the encoder output. + non_pad_mask (Variable), Shape(B, T_text, 1), the mask with non pad. 
+ enc_slf_attn_list (list), Len(n_layers), Shape(B * n_head, text_T, text_T), the encoder self attention list. + """ enc_slf_attn_list = [] # -- prepare masks # shape character (N, T) @@ -80,6 +94,18 @@ class Decoder(dg.Layer): self.add_sublayer('fft_{}'.format(i), layer) def forward(self, enc_seq, enc_pos): + """ + Decoder layer of FastSpeech. + + Args: + enc_seq (Variable), Shape(B, text_T, C), dtype: float32. + The output of length regulator. + enc_pos (Variable, optional): Shape(B, T_mel), + dtype: int64. The spectrum position. T_mel means the timesteps of input spectrum. + Returns: + dec_output (Variable), Shape(B, mel_T, C), the decoder output. + dec_slf_attn_list (Variable), Shape(B, mel_T, mel_T), the decoder self attention list. + """ dec_slf_attn_list = [] # -- Prepare masks @@ -141,6 +167,31 @@ class FastSpeech(dg.Layer): dropout=0.1) def forward(self, character, text_pos, mel_pos=None, length_target=None, alpha=1.0): + """ + FastSpeech model. + + Args: + character (Variable): Shape(B, T_text), dtype: float32. The input text + characters. T_text means the timesteps of input characters. + text_pos (Variable): Shape(B, T_text), dtype: int64. The input text + position. T_text means the timesteps of input characters. + mel_pos (Variable, optional): Shape(B, T_mel), + dtype: int64. The spectrum position. T_mel means the timesteps of input spectrum. + length_target (Variable, optional): Shape(B, T_text), + dtype: int64. The duration of phoneme compute from pretrained transformerTTS. + alpha (Constant): + dtype: float32. The hyperparameter to determine the length of the expanded sequence + mel, thereby controlling the voice speed. + + Returns: + mel_output (Variable), Shape(B, mel_T, C), the mel output before postnet. + mel_output_postnet (Variable), Shape(B, mel_T, C), the mel output after postnet. + duration_predictor_output (Variable), Shape(B, text_T), the duration of phoneme compute + with duration predictor. + enc_slf_attn_list (Variable), Shape(B, text_T, text_T), the encoder self attention list. + dec_slf_attn_list (Variable), Shape(B, mel_T, mel_T), the decoder self attention list. 
+ """ + encoder_output, non_pad_mask, enc_slf_attn_list = self.encoder(character, text_pos) if fluid.framework._dygraph_tracer()._train_mode: diff --git a/parakeet/models/fastspeech/parse.py b/parakeet/models/fastspeech/parse.py index a6a8b2f..4132674 100644 --- a/parakeet/models/fastspeech/parse.py +++ b/parakeet/models/fastspeech/parse.py @@ -9,9 +9,9 @@ def add_config_options_to_parser(parser): help="the sampling rate of audio data file.") parser.add_argument('--audio.preemphasis', type=float, default=0.97, help="the preemphasis coefficient.") - parser.add_argument('--audio.hop_length', type=float, default=128, + parser.add_argument('--audio.hop_length', type=int, default=128, help="the number of samples to advance between frames.") - parser.add_argument('--audio.win_length', type=float, default=1024, + parser.add_argument('--audio.win_length', type=int, default=1024, help="the length (width) of the window function.") parser.add_argument('--audio.power', type=float, default=1.4, help="the power to raise before griffin-lim.") diff --git a/parakeet/models/fastspeech/train.py b/parakeet/models/fastspeech/train.py index 2af299d..243631c 100644 --- a/parakeet/models/fastspeech/train.py +++ b/parakeet/models/fastspeech/train.py @@ -66,8 +66,8 @@ def main(cfg): model = FastSpeech(cfg) model.train() - optimizer = fluid.optimizer.AdamOptimizer(learning_rate=dg.NoamDecay(1/(cfg.warm_up_step *( cfg.lr ** 2)), cfg.warm_up_step)) - + optimizer = fluid.optimizer.AdamOptimizer(learning_rate=dg.NoamDecay(1/(cfg.warm_up_step *( cfg.lr ** 2)), cfg.warm_up_step), + parameter_list=model.parameters()) reader = LJSpeechLoader(cfg, nranks, local_rank).reader() if cfg.checkpoint_path is not None: diff --git a/parakeet/models/transformerTTS/config/train_postnet.yaml b/parakeet/models/transformerTTS/config/train_postnet.yaml index 7937c5e..74e1b5a 100644 --- a/parakeet/models/transformerTTS/config/train_postnet.yaml +++ b/parakeet/models/transformerTTS/config/train_postnet.yaml @@ -13,7 +13,8 @@ audio: hidden_size: 256 embedding_size: 512 - +warm_up_step: 4000 +grad_clip_thresh: 1.0 batch_size: 32 epochs: 10000 lr: 0.001 diff --git a/parakeet/models/transformerTTS/config/train_transformer.yaml b/parakeet/models/transformerTTS/config/train_transformer.yaml index 038848b..0fbde62 100644 --- a/parakeet/models/transformerTTS/config/train_transformer.yaml +++ b/parakeet/models/transformerTTS/config/train_transformer.yaml @@ -11,22 +11,23 @@ audio: outputs_per_step: 1 -hidden_size: 384 #256 -embedding_size: 384 #512 +hidden_size: 256 +embedding_size: 512 +warm_up_step: 4000 +grad_clip_thresh: 1.0 batch_size: 32 epochs: 10000 lr: 0.001 -save_step: 10 +save_step: 1000 image_step: 2000 use_gpu: True -use_data_parallel: True +use_data_parallel: False data_path: ../../../dataset/LJSpeech-1.1 save_path: ./checkpoint log_dir: ./log - - +#checkpoint_path: ./checkpoint/transformer/1 \ No newline at end of file diff --git a/parakeet/models/transformerTTS/data.py b/parakeet/models/transformerTTS/data.py deleted file mode 100644 index 8fa9182..0000000 --- a/parakeet/models/transformerTTS/data.py +++ /dev/null @@ -1,29 +0,0 @@ -from pathlib import Path -import numpy as np -from paddle import fluid -from parakeet.data.sampler import DistributedSampler -from parakeet.data.datacargo import DataCargo -from preprocess import batch_examples, LJSpeech, batch_examples_vocoder - -class LJSpeechLoader: - def __init__(self, config, nranks, rank, is_vocoder=False): - place = fluid.CUDAPlace(rank) if config.use_gpu else fluid.CPUPlace() - - 
LJSPEECH_ROOT = Path(config.data_path) - dataset = LJSpeech(LJSPEECH_ROOT) - sampler = DistributedSampler(len(dataset), nranks, rank) - - assert config.batch_size % nranks == 0 - each_bs = config.batch_size // nranks - if is_vocoder: - dataloader = DataCargo(dataset, sampler=sampler, batch_size=each_bs, shuffle=True, collate_fn=batch_examples_vocoder, drop_last=True) - else: - dataloader = DataCargo(dataset, sampler=sampler, batch_size=each_bs, shuffle=True, collate_fn=batch_examples, drop_last=True) - - self.reader = fluid.io.DataLoader.from_generator( - capacity=32, - iterable=True, - use_double_buffer=True, - return_list=True) - self.reader.set_batch_generator(dataloader, place) - diff --git a/parakeet/models/transformerTTS/module.py b/parakeet/models/transformerTTS/module.py index 8e003da..ecacb1b 100644 --- a/parakeet/models/transformerTTS/module.py +++ b/parakeet/models/transformerTTS/module.py @@ -3,11 +3,12 @@ from parakeet.g2p.text.symbols import symbols import paddle.fluid.dygraph as dg import paddle.fluid as fluid import paddle.fluid.layers as layers -from parakeet.modules.layers import Conv1D, Pool1D +from parakeet.modules.layers import Conv, Pool1D from parakeet.modules.dynamicGRU import DynamicGRU import numpy as np + class EncoderPrenet(dg.Layer): def __init__(self, embedding_size, num_hidden, use_cudnn=True): super(EncoderPrenet, self).__init__() @@ -18,19 +19,19 @@ class EncoderPrenet(dg.Layer): param_attr = fluid.ParamAttr(name='weight'), padding_idx = None) self.conv_list = [] - self.conv_list.append(Conv1D(in_channels = embedding_size, + self.conv_list.append(Conv(in_channels = embedding_size, out_channels = num_hidden, filter_size = 5, padding = int(np.floor(5/2)), use_cudnn = use_cudnn, data_format = "NCT")) for _ in range(2): - self.conv_list = Conv1D(in_channels = num_hidden, + self.conv_list.append(Conv(in_channels = num_hidden, out_channels = num_hidden, filter_size = 5, padding = int(np.floor(5/2)), use_cudnn = use_cudnn, - data_format = "NCT") + data_format = "NCT")) for i, layer in enumerate(self.conv_list): self.add_sublayer("conv_list_{}".format(i), layer) @@ -71,13 +72,13 @@ class CBHG(dg.Layer): self.hidden_size = hidden_size self.projection_size = projection_size self.conv_list = [] - self.conv_list.append(Conv1D(in_channels = projection_size, + self.conv_list.append(Conv(in_channels = projection_size, out_channels = hidden_size, filter_size = 1, padding = int(np.floor(1/2)), data_format = "NCT")) for i in range(2,K+1): - self.conv_list.append(Conv1D(in_channels = hidden_size, + self.conv_list.append(Conv(in_channels = hidden_size, out_channels = hidden_size, filter_size = i, padding = int(np.floor(i/2)), @@ -100,13 +101,13 @@ class CBHG(dg.Layer): conv_outdim = hidden_size * K - self.conv_projection_1 = Conv1D(in_channels = conv_outdim, + self.conv_projection_1 = Conv(in_channels = conv_outdim, out_channels = hidden_size, filter_size = 3, padding = int(np.floor(3/2)), data_format = "NCT") - self.conv_projection_2 = Conv1D(in_channels = hidden_size, + self.conv_projection_2 = Conv(in_channels = hidden_size, out_channels = projection_size, filter_size = 3, padding = int(np.floor(3/2)), diff --git a/parakeet/models/transformerTTS/network.py b/parakeet/models/transformerTTS/network.py index 0536f68..5f353f8 100644 --- a/parakeet/models/transformerTTS/network.py +++ b/parakeet/models/transformerTTS/network.py @@ -20,13 +20,12 @@ class Encoder(dg.Layer): self.pos_emb = dg.Embedding(size=[1024, num_hidden], padding_idx=0, param_attr=fluid.ParamAttr( - 
name='weight', initializer=fluid.initializer.NumpyArrayInitializer(self.pos_inp), trainable=False)) self.encoder_prenet = EncoderPrenet(embedding_size = embedding_size, num_hidden = num_hidden, use_cudnn=config.use_gpu) - self.layers = [MultiheadAttention(num_hidden, num_hidden, num_hidden) for _ in range(3)] + self.layers = [MultiheadAttention(num_hidden, num_hidden//4, num_hidden//4) for _ in range(3)] for i, layer in enumerate(self.layers): self.add_sublayer("self_attn_{}".format(i), layer) self.ffns = [PositionwiseFeedForward(num_hidden, num_hidden*4, filter_size=1, use_cudnn = config.use_gpu) for _ in range(3)] @@ -40,6 +39,7 @@ class Encoder(dg.Layer): else: query_mask, mask = None, None + # Encoder pre_network x = self.encoder_prenet(x) #(N,T,C) @@ -81,10 +81,10 @@ class Decoder(dg.Layer): dropout_rate=0.2) self.linear = dg.Linear(num_hidden, num_hidden) - self.selfattn_layers = [MultiheadAttention(num_hidden, num_hidden, num_hidden) for _ in range(3)] + self.selfattn_layers = [MultiheadAttention(num_hidden, num_hidden//4, num_hidden//4) for _ in range(3)] for i, layer in enumerate(self.selfattn_layers): self.add_sublayer("self_attn_{}".format(i), layer) - self.attn_layers = [MultiheadAttention(num_hidden, num_hidden, num_hidden) for _ in range(3)] + self.attn_layers = [MultiheadAttention(num_hidden, num_hidden//4, num_hidden//4) for _ in range(3)] for i, layer in enumerate(self.attn_layers): self.add_sublayer("attn_{}".format(i), layer) self.ffns = [PositionwiseFeedForward(num_hidden, num_hidden*4, filter_size=1) for _ in range(3)] @@ -104,18 +104,18 @@ class Decoder(dg.Layer): if fluid.framework._dygraph_tracer()._train_mode: m_mask = get_non_pad_mask(positional) - mask = get_attn_key_pad_mask(positional, query) + mask = get_attn_key_pad_mask((positional==0).astype(np.float32), query) triu_tensor = dg.to_variable(get_triu_tensor(query.numpy(), query.numpy())).astype(np.float32) mask = mask + triu_tensor - mask = fluid.layers.cast(mask != 0, np.float32) + mask = fluid.layers.cast(mask == 0, np.float32) - # (batch_size, decoder_len, encoder_len) zero_mask = get_attn_key_pad_mask(layers.squeeze(c_mask,[-1]), query) else: mask = get_triu_tensor(query.numpy(), query.numpy()).astype(np.float32) - mask = fluid.layers.cast(dg.to_variable(mask != 0), np.float32) + mask = fluid.layers.cast(dg.to_variable(mask == 0), np.float32) m_mask, zero_mask = None, None + # Decoder pre-network query = self.decoder_prenet(query) @@ -164,6 +164,7 @@ class TransformerTTS(dg.Layer): # key (batch_size, seq_len, channel) # c_mask (batch_size, seq_len) # attns_enc (channel / 2, seq_len, seq_len) + key, c_mask, attns_enc = self.encoder(characters, pos_text) # mel_output/postnet_output (batch_size, mel_len, n_mel) diff --git a/parakeet/models/transformerTTS/parse.py b/parakeet/models/transformerTTS/parse.py index 87a67e9..584ea63 100644 --- a/parakeet/models/transformerTTS/parse.py +++ b/parakeet/models/transformerTTS/parse.py @@ -9,9 +9,9 @@ def add_config_options_to_parser(parser): help="the sampling rate of audio data file.") parser.add_argument('--audio.preemphasis', type=float, default=0.97, help="the preemphasis coefficient.") - parser.add_argument('--audio.hop_length', type=float, default=128, + parser.add_argument('--audio.hop_length', type=int, default=128, help="the number of samples to advance between frames.") - parser.add_argument('--audio.win_length', type=float, default=1024, + parser.add_argument('--audio.win_length', type=int, default=1024, help="the length (width) of the window function.") 
parser.add_argument('--audio.power', type=float, default=1.4, help="the power to raise before griffin-lim.") @@ -27,6 +27,10 @@ def add_config_options_to_parser(parser): parser.add_argument('--embedding_size', type=int, default=512, help="the embedding vector size.") + parser.add_argument('--warm_up_step', type=int, default=4000, + help="the warm up step of learning rate.") + parser.add_argument('--grad_clip_thresh', type=float, default=1.0, + help="the threshold of grad clip.") parser.add_argument('--batch_size', type=int, default=32, help="batch size for training.") parser.add_argument('--epochs', type=int, default=10000, diff --git a/parakeet/models/transformerTTS/train_postnet.py b/parakeet/models/transformerTTS/train_postnet.py index 2f893f2..fe0f379 100644 --- a/parakeet/models/transformerTTS/train_postnet.py +++ b/parakeet/models/transformerTTS/train_postnet.py @@ -6,7 +6,7 @@ from pathlib import Path import jsonargparse from parse import add_config_options_to_parser from pprint import pprint -from data import LJSpeechLoader +from parakeet.models.dataloader.jlspeech import LJSpeechLoader class MyDataParallel(dg.parallel.DataParallel): """ @@ -50,7 +50,9 @@ def main(cfg): model = ModelPostNet(cfg) model.train() - optimizer = fluid.optimizer.AdamOptimizer(learning_rate=dg.NoamDecay(1/(4000 *( cfg.lr ** 2)), 4000)) + optimizer = fluid.optimizer.AdamOptimizer(learning_rate=dg.NoamDecay(1/(cfg.warm_up_step *( cfg.lr ** 2)), cfg.warm_up_step), + parameter_list=model.parameters()) + if cfg.checkpoint_path is not None: model_dict, opti_dict = fluid.dygraph.load_dygraph(cfg.checkpoint_path) @@ -75,13 +77,16 @@ def main(cfg): mag_pred = model(mel) loss = layers.mean(layers.abs(layers.elementwise_sub(mag_pred, mag))) + if cfg.use_data_parallel: loss = model.scale_loss(loss) loss.backward() model.apply_collective_grads() else: loss.backward() - optimizer.minimize(loss, grad_clip = fluid.dygraph_grad_clip.GradClipByGlobalNorm(1)) + optimizer.minimize(loss, grad_clip = fluid.dygraph_grad_clip.GradClipByGlobalNorm(cfg.grad_clip_thresh)) + print("===============",model.pre_proj.conv.weight.numpy()) + print("===============",model.pre_proj.conv.weight.gradient()) model.clear_gradients() if local_rank==0: diff --git a/parakeet/models/transformerTTS/train_transformer.py b/parakeet/models/transformerTTS/train_transformer.py index fc522ae..8b177cd 100644 --- a/parakeet/models/transformerTTS/train_transformer.py +++ b/parakeet/models/transformerTTS/train_transformer.py @@ -34,6 +34,9 @@ def main(cfg): local_rank = dg.parallel.Env().local_rank if cfg.use_data_parallel else 0 nranks = dg.parallel.Env().nranks if cfg.use_data_parallel else 1 + fluid.default_startup_program().random_seed = 1 + fluid.default_main_program().random_seed = 1 + if local_rank == 0: # Print the whole config setting. 
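Both training scripts in this patch now derive the Noam schedule from `cfg.warm_up_step` and `cfg.lr` instead of the hard-coded 4000. Assuming `dg.NoamDecay(d_model, warmup)` follows the standard Transformer schedule, the `1/(warm_up_step * lr**2)` scale makes the learning rate peak at exactly `cfg.lr` after `warm_up_step` steps, as in this sketch:

```python
# Sketch only: assumes dg.NoamDecay(d_model, warmup) implements the usual
# Transformer schedule  lr(step) = d_model**-0.5 * min(step**-0.5, step * warmup**-1.5).
# With d_model = 1 / (warm_up_step * lr**2) the schedule peaks at exactly `lr`
# when step == warm_up_step and then decays roughly as 1/sqrt(step).
def noam_lr(step, lr=0.001, warm_up_step=4000):
    d_model = 1.0 / (warm_up_step * lr ** 2)
    return d_model ** -0.5 * min(step ** -0.5, step * warm_up_step ** -1.5)

for step in (1, 1000, 4000, 16000):
    print(step, noam_lr(step))  # ~2.5e-07, 2.5e-04, 1.0e-03, 5.0e-04
```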
pprint(jsonargparse.namespace_to_dict(cfg)) @@ -53,7 +56,8 @@ def main(cfg): model = TransformerTTS(cfg) model.train() - optimizer = fluid.optimizer.AdamOptimizer(learning_rate=dg.NoamDecay(1/(4000 *( cfg.lr ** 2)), 4000)) + optimizer = fluid.optimizer.AdamOptimizer(learning_rate=dg.NoamDecay(1/(cfg.warm_up_step *( cfg.lr ** 2)), cfg.warm_up_step), + parameter_list=model.parameters()) reader = LJSpeechLoader(cfg, nranks, local_rank).reader() @@ -69,6 +73,8 @@ def main(cfg): for epoch in range(cfg.epochs): pbar = tqdm(reader) + + for i, data in enumerate(pbar): pbar.set_description('Processing at epoch %d'%epoch) character, mel, mel_input, pos_text, pos_mel, text_length = data @@ -86,7 +92,7 @@ def main(cfg): post_mel_loss = layers.mean(layers.abs(layers.elementwise_sub(postnet_pred, mel))) stop_loss = cross_entropy(stop_preds, dg.to_variable(label)) loss = mel_loss + post_mel_loss + stop_loss - + if local_rank==0: writer.add_scalars('training_loss', { 'mel_loss':mel_loss.numpy(), @@ -116,16 +122,16 @@ def main(cfg): for j in range(4): x = np.uint8(cm.viridis(prob.numpy()[j*16]) * 255) writer.add_image('Attention_dec_%d_0'%global_step, x, i*4+j, dataformats="HWC") - + if cfg.use_data_parallel: loss = model.scale_loss(loss) loss.backward() model.apply_collective_grads() else: loss.backward() - optimizer.minimize(loss, grad_clip = fluid.dygraph_grad_clip.GradClipByGlobalNorm(1)) + optimizer.minimize(loss, grad_clip = fluid.dygraph_grad_clip.GradClipByGlobalNorm(cfg.grad_clip_thresh)) model.clear_gradients() - + # save checkpoint if local_rank==0 and global_step % cfg.save_step == 0: if not os.path.exists(cfg.save_path): diff --git a/parakeet/modules/dynamicGRU.py b/parakeet/modules/dynamicGRU.py index 44a6e7f..e84c598 100644 --- a/parakeet/modules/dynamicGRU.py +++ b/parakeet/modules/dynamicGRU.py @@ -25,6 +25,14 @@ class DynamicGRU(dg.Layer): self.is_reverse = is_reverse def forward(self, inputs): + """ + Dynamic GRU block. + + Args: + input (Variable): Shape(B, T, C), dtype: float32. The input value. + Returns: + output (Variable), Shape(B, T, C), the result compute by GRU. + """ hidden = self.h_0 res = [] for i in range(inputs.shape[1]): diff --git a/parakeet/modules/feed_forward.py b/parakeet/modules/feed_forward.py index d197c6e..452c482 100644 --- a/parakeet/modules/feed_forward.py +++ b/parakeet/modules/feed_forward.py @@ -1,6 +1,9 @@ import paddle.fluid.dygraph as dg import paddle.fluid.layers as layers -from parakeet.modules.layers import Conv1D +import paddle.fluid as fluid +import math +from parakeet.modules.layers import Conv + class PositionwiseFeedForward(dg.Layer): ''' A two-feed-forward-layer module ''' @@ -9,14 +12,15 @@ class PositionwiseFeedForward(dg.Layer): self.num_hidden = num_hidden self.use_cudnn = use_cudnn self.dropout = dropout - - self.w_1 = Conv1D(in_channels = d_in, + + self.w_1 = Conv(in_channels = d_in, out_channels = num_hidden, filter_size = filter_size, padding=padding, use_cudnn = use_cudnn, data_format = "NTC") - self.w_2 = Conv1D(in_channels = num_hidden, + + self.w_2 = Conv(in_channels = num_hidden, out_channels = d_in, filter_size = filter_size, padding=padding, @@ -25,6 +29,14 @@ class PositionwiseFeedForward(dg.Layer): self.layer_norm = dg.LayerNorm(d_in) def forward(self, input): + """ + Feed Forward Network. + + Args: + input (Variable): Shape(B, T, C), dtype: float32. The input value. + Returns: + output (Variable), Shape(B, T, C), the result after FFN. 
+ """ #FFN Networt x = self.w_2(layers.relu(self.w_1(input))) @@ -35,6 +47,6 @@ class PositionwiseFeedForward(dg.Layer): x = x + input #layer normalization - x = self.layer_norm(x) + output = self.layer_norm(x) - return x \ No newline at end of file + return output \ No newline at end of file diff --git a/parakeet/modules/layers.py b/parakeet/modules/layers.py index c62f0b1..29a10db 100644 --- a/parakeet/modules/layers.py +++ b/parakeet/modules/layers.py @@ -6,6 +6,42 @@ from paddle import fluid import paddle.fluid.dygraph as dg +class Conv(dg.Layer): + def __init__(self, in_channels, out_channels, filter_size=1, + padding=0, dilation=1, stride=1, use_cudnn=True, + data_format="NCT", is_bias=True): + super(Conv, self).__init__() + self.in_channels = in_channels + self.out_channels = out_channels + self.filter_size = filter_size + self.padding = padding + self.dilation = dilation + self.stride = stride + self.use_cudnn = use_cudnn + self.data_format = data_format + self.is_bias = is_bias + + self.weight_attr = fluid.ParamAttr(initializer=fluid.initializer.XavierInitializer()) + self.bias_attr = None + if is_bias is not False: + k = math.sqrt(1 / in_channels) + self.bias_attr = fluid.ParamAttr(initializer=fluid.initializer.Uniform(low=-k, high=k)) + + self.conv = Conv1D( in_channels = in_channels, + out_channels = out_channels, + filter_size = filter_size, + padding = padding, + dilation = dilation, + stride = stride, + param_attr = self.weight_attr, + bias_attr = self.bias_attr, + use_cudnn = use_cudnn, + data_format = data_format) + + def forward(self, x): + x = self.conv(x) + return x + class Conv1D(dg.Layer): """ A convolution 1D block implemented with Conv2D. Form simplicity and diff --git a/parakeet/modules/multihead_attention.py b/parakeet/modules/multihead_attention.py index 6b86e51..b2592bb 100644 --- a/parakeet/modules/multihead_attention.py +++ b/parakeet/modules/multihead_attention.py @@ -10,22 +10,35 @@ class ScaledDotProductAttention(dg.Layer): self.d_key = d_key # please attention this mask is diff from pytorch - def forward(self, key, value, query, mask=None, query_mask=None): + def forward(self, key, value, query, mask=None, query_mask=None, dropout=0.1): + """ + Scaled Dot Product Attention. + + Args: + key (Variable): Shape(B, T, C), dtype: float32. The input key of attention. + value (Variable): Shape(B, T, C), dtype: float32. The input value of attention. + query (Variable): Shape(B, T, C), dtype: float32. The input query of attention. + mask (Variable): Shape(B, len_q, len_k), dtype: float32. The mask of key. + query_mask (Variable): Shape(B, len_q, 1), dtype: float32. The mask of query. + dropout (Constant): dtype: float32. The probability of dropout. + Returns: + result (Variable), Shape(B, T, C), the result of mutihead attention. + attention (Variable), Shape(n_head * B, T, C), the attention of key. 
+ """ # Compute attention score attention = layers.matmul(query, key, transpose_y=True) #transpose the last dim in y attention = attention / math.sqrt(self.d_key) # Mask key to ignore padding if mask is not None: - attention = attention * (mask == 0).astype(np.float32) - mask = mask * (-2 ** 32 + 1) + attention = attention * mask + mask = (mask == 0).astype(np.float32) * (-2 ** 32 + 1) attention = attention + mask attention = layers.softmax(attention) - attention = layers.dropout(attention, 0.0) + attention = layers.dropout(attention, dropout) # Mask query to ignore padding - # Not sure how to work if query_mask is not None: attention = attention * query_mask @@ -52,6 +65,19 @@ class MultiheadAttention(dg.Layer): self.layer_norm = dg.LayerNorm(num_hidden) def forward(self, key, value, query_input, mask=None, query_mask=None): + """ + Multihead Attention. + + Args: + key (Variable): Shape(B, T, C), dtype: float32. The input key of attention. + value (Variable): Shape(B, T, C), dtype: float32. The input value of attention. + query_input (Variable): Shape(B, T, C), dtype: float32. The input query of attention. + mask (Variable): Shape(B, len_q, len_k), dtype: float32. The mask of key. + query_mask (Variable): Shape(B, len_q, 1), dtype: float32. The mask of query. + Returns: + result (Variable), Shape(B, T, C), the result of mutihead attention. + attention (Variable), Shape(n_head * B, T, C), the attention of key. + """ batch_size = key.shape[0] seq_len_key = key.shape[1] seq_len_query = query_input.shape[1] @@ -62,6 +88,7 @@ class MultiheadAttention(dg.Layer): if mask is not None: mask = layers.expand(mask, (self.num_head, 1, 1)) + # Make multihead attention # key & value.shape = (batch_size, seq_len, feature)(feature = num_head * num_hidden_per_attn) key = layers.reshape(self.key(key), [batch_size, seq_len_key, self.num_head, self.d_k]) @@ -71,6 +98,7 @@ class MultiheadAttention(dg.Layer): key = layers.reshape(layers.transpose(key, [2, 0, 1, 3]), [-1, seq_len_key, self.d_k]) value = layers.reshape(layers.transpose(value, [2, 0, 1, 3]), [-1, seq_len_key, self.d_k]) query = layers.reshape(layers.transpose(query, [2, 0, 1, 3]), [-1, seq_len_query, self.d_q]) + result, attention = self.scal_attn(key, value, query, mask=mask, query_mask=query_mask) # concat all multihead result diff --git a/parakeet/modules/post_convnet.py b/parakeet/modules/post_convnet.py index fb7d531..559d70e 100644 --- a/parakeet/modules/post_convnet.py +++ b/parakeet/modules/post_convnet.py @@ -1,7 +1,7 @@ import paddle.fluid.dygraph as dg import paddle.fluid as fluid import paddle.fluid.layers as layers -from parakeet.modules.layers import Conv1D +from parakeet.modules.layers import Conv class PostConvNet(dg.Layer): def __init__(self, @@ -17,7 +17,7 @@ class PostConvNet(dg.Layer): self.dropout = dropout self.conv_list = [] - self.conv_list.append(Conv1D(in_channels = n_mels * outputs_per_step, + self.conv_list.append(Conv(in_channels = n_mels * outputs_per_step, out_channels = num_hidden, filter_size = filter_size, padding = padding, @@ -25,14 +25,14 @@ class PostConvNet(dg.Layer): data_format = "NCT")) for _ in range(1, num_conv-1): - self.conv_list.append(Conv1D(in_channels = num_hidden, + self.conv_list.append(Conv(in_channels = num_hidden, out_channels = num_hidden, filter_size = filter_size, padding = padding, use_cudnn = use_cudnn, data_format = "NCT") ) - self.conv_list.append(Conv1D(in_channels = num_hidden, + self.conv_list.append(Conv(in_channels = num_hidden, out_channels = n_mels * outputs_per_step, filter_size 
                             padding = padding,
@@ -59,9 +59,17 @@ class PostConvNet(dg.Layer):
     def forward(self, input):
+        """
+        Post Conv Net.
+
+        Args:
+            input (Variable): Shape(B, T, C), dtype: float32. The input value.
+        Returns:
+            output (Variable), Shape(B, T, C), the result after postconvnet.
+        """
         input = layers.transpose(input, [0,2,1])
         len = input.shape[-1]
         for batch_norm, conv in zip(self.batch_norm_list, self.conv_list):
             input = layers.dropout(layers.tanh(batch_norm(conv(input)[:,:,:len])), self.dropout)
-        input = layers.transpose(input, [0,2,1])
-        return input
\ No newline at end of file
+        output = layers.transpose(input, [0,2,1])
+        return output
\ No newline at end of file
diff --git a/parakeet/modules/prenet.py b/parakeet/modules/prenet.py
index 1f4249e..4ea50e1 100644
--- a/parakeet/modules/prenet.py
+++ b/parakeet/modules/prenet.py
@@ -2,9 +2,6 @@ import paddle.fluid.dygraph as dg
 import paddle.fluid.layers as layers
 
 class PreNet(dg.Layer):
-    """
-    Pre Net before passing through the network
-    """
     def __init__(self, input_size, hidden_size, output_size, dropout_rate=0.2):
         """
         :param input_size: dimension of input
@@ -21,6 +18,14 @@ class PreNet(dg.Layer):
         self.linear2 = dg.Linear(hidden_size, output_size)
 
     def forward(self, x):
+        """
+        Pre Net before passing through the network.
+
+        Args:
+            x (Variable): Shape(B, T, C), dtype: float32. The input value.
+        Returns:
+            x (Variable), Shape(B, T, C), the result after prenet.
+        """
         x = layers.dropout(layers.relu(self.linear1(x)), self.dropout_rate)
         x = layers.dropout(layers.relu(self.linear2(x)), self.dropout_rate)
         return x