Merge branch 'update_waveflow' into 'master'

Update waveflow

See merge request !21
liuyibing01 authored on 2020-02-24 11:07:13 +08:00 · commit 25883dcd3e
14 changed files with 458 additions and 302 deletions

.pre-commit-config.yaml (new file, 27 lines)

@@ -0,0 +1,27 @@
-   repo: https://github.com/PaddlePaddle/mirrors-yapf.git
    sha: 0d79c0c469bab64f7229c9aca2b1186ef47f0e37
    hooks:
    -   id: yapf
        files: \.py$
-   repo: https://github.com/pre-commit/pre-commit-hooks
    sha: a11d9314b22d8f8c7556443875b731ef05965464
    hooks:
    -   id: check-merge-conflict
    -   id: check-symlinks
    -   id: detect-private-key
        files: (?!.*paddle)^.*$
    -   id: end-of-file-fixer
        files: \.md$
    -   id: trailing-whitespace
        files: \.md$
-   repo: https://github.com/Lucas-C/pre-commit-hooks
    sha: v1.0.1
    hooks:
    -   id: forbid-crlf
        files: \.md$
    -   id: remove-crlf
        files: \.md$
    -   id: forbid-tabs
        files: \.md$
    -   id: remove-tabs
        files: \.md$
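A note on the config above: the `files:` values are Python regular expressions matched against repository paths. The `detect-private-key` filter uses a negative lookahead so any path containing "paddle" is skipped. A quick check of that pattern (the sample paths are made up for illustration):

```python
import re

# Same pattern as the detect-private-key `files:` filter above.
pattern = re.compile(r"(?!.*paddle)^.*$")

print(bool(pattern.match("parakeet/models/waveflow/waveflow.py")))  # True: checked
print(bool(pattern.match("third_party/paddle/key.pem")))            # False: skipped
```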

train.py (DeepVoice 3 example)

@@ -28,22 +28,21 @@ if __name__ == "__main__":
     parser = argparse.ArgumentParser(
         description="Train a deepvoice 3 model with LJSpeech dataset.")
     parser.add_argument("-c", "--config", type=str, help="experiment config")
-    parser.add_argument("-s",
-                        "--data",
-                        type=str,
-                        default="/workspace/datasets/LJSpeech-1.1/",
-                        help="The path of the LJSpeech dataset.")
+    parser.add_argument(
+        "-s",
+        "--data",
+        type=str,
+        default="/workspace/datasets/LJSpeech-1.1/",
+        help="The path of the LJSpeech dataset.")
     parser.add_argument("-r", "--resume", type=str, help="checkpoint to load")
-    parser.add_argument("-o",
-                        "--output",
-                        type=str,
-                        default="result",
-                        help="The directory to save result.")
-    parser.add_argument("-g",
-                        "--device",
-                        type=int,
-                        default=-1,
-                        help="device to use")
+    parser.add_argument(
+        "-o",
+        "--output",
+        type=str,
+        default="result",
+        help="The directory to save result.")
+    parser.add_argument(
+        "-g", "--device", type=int, default=-1, help="device to use")
     args, _ = parser.parse_known_args()
     with open(args.config, 'rt') as f:
         config = ruamel.yaml.safe_load(f)
@@ -84,18 +83,16 @@ if __name__ == "__main__":
     train_config = config["train"]
     batch_size = train_config["batch_size"]
     text_lengths = [len(example[2]) for example in meta]
-    sampler = PartialyRandomizedSimilarTimeLengthSampler(
-        text_lengths, batch_size)
+    sampler = PartialyRandomizedSimilarTimeLengthSampler(text_lengths,
+                                                         batch_size)
 
     # some hyperparameters affect how we process data, so create a data collector!
     model_config = config["model"]
     downsample_factor = model_config["downsample_factor"]
     r = model_config["outputs_per_step"]
     collector = DataCollector(downsample_factor=downsample_factor, r=r)
-    ljspeech_loader = DataCargo(ljspeech,
-                                batch_fn=collector,
-                                batch_size=batch_size,
-                                sampler=sampler)
+    ljspeech_loader = DataCargo(
+        ljspeech, batch_fn=collector, batch_size=batch_size, sampler=sampler)
 
     # =========================model=========================
     if args.device == -1:
@@ -131,15 +128,14 @@ if __name__ == "__main__":
     window_ahead = model_config["window_ahead"]
     key_projection = model_config["key_projection"]
     value_projection = model_config["value_projection"]
-    dv3 = make_model(n_speakers, speaker_dim, speaker_embed_std, embed_dim,
-                     padding_idx, embedding_std, max_positions, n_vocab,
-                     freeze_embedding, filter_size, encoder_channels,
-                     n_mels, decoder_channels, r,
-                     trainable_positional_encodings, use_memory_mask,
-                     query_position_rate, key_position_rate,
-                     window_backward, window_ahead, key_projection,
-                     value_projection, downsample_factor, linear_dim,
-                     use_decoder_states, converter_channels, dropout)
+    dv3 = make_model(
+        n_speakers, speaker_dim, speaker_embed_std, embed_dim, padding_idx,
+        embedding_std, max_positions, n_vocab, freeze_embedding,
+        filter_size, encoder_channels, n_mels, decoder_channels, r,
+        trainable_positional_encodings, use_memory_mask,
+        query_position_rate, key_position_rate, window_backward,
+        window_ahead, key_projection, value_projection, downsample_factor,
+        linear_dim, use_decoder_states, converter_channels, dropout)
 
     # =========================loss=========================
     loss_config = config["loss"]
@@ -149,13 +145,14 @@ if __name__ == "__main__":
     priority_freq_weight = loss_config["priority_freq_weight"]
     binary_divergence_weight = loss_config["binary_divergence_weight"]
     guided_attention_sigma = loss_config["guided_attention_sigma"]
-    criterion = TTSLoss(masked_weight=masked_weight,
-                        priority_bin=priority_bin,
-                        priority_weight=priority_freq_weight,
-                        binary_divergence_weight=binary_divergence_weight,
-                        guided_attention_sigma=guided_attention_sigma,
-                        downsample_factor=downsample_factor,
-                        r=r)
+    criterion = TTSLoss(
+        masked_weight=masked_weight,
+        priority_bin=priority_bin,
+        priority_weight=priority_freq_weight,
+        binary_divergence_weight=binary_divergence_weight,
+        guided_attention_sigma=guided_attention_sigma,
+        downsample_factor=downsample_factor,
+        r=r)
 
     # =========================lr_scheduler=========================
     lr_config = config["lr_scheduler"]
@@ -169,11 +166,12 @@ if __name__ == "__main__":
     beta1 = optim_config["beta1"]
     beta2 = optim_config["beta2"]
     epsilon = optim_config["epsilon"]
-    optim = fluid.optimizer.Adam(lr_scheduler,
-                                 beta1,
-                                 beta2,
-                                 epsilon=epsilon,
-                                 parameter_list=dv3.parameters())
+    optim = fluid.optimizer.Adam(
+        lr_scheduler,
+        beta1,
+        beta2,
+        epsilon=epsilon,
+        parameter_list=dv3.parameters())
     gradient_clipper = fluid.dygraph_grad_clip.GradClipByGlobalNorm(0.1)
 
     # generation
@@ -183,8 +181,8 @@ if __name__ == "__main__":
     # =========================link(dataloader, paddle)=========================
     # CAUTION: it does not return a DataLoader
-    loader = fluid.io.DataLoader.from_generator(capacity=10,
-                                                return_list=True)
+    loader = fluid.io.DataLoader.from_generator(
+        capacity=10, return_list=True)
     loader.set_batch_generator(ljspeech_loader, places=place)
 
     # tensorboard & checkpoint preparation
@@ -247,22 +245,23 @@ if __name__ == "__main__":
                 # TODO: clean code
                 # train state saving, the first sentence in the batch
                 if global_step % snap_interval == 0:
-                    save_state(state_dir,
-                               writer,
-                               global_step,
-                               mel_input=downsampled_mel_specs,
-                               mel_output=mel_outputs,
-                               lin_input=lin_specs,
-                               lin_output=linear_outputs,
-                               alignments=alignments,
-                               win_length=win_length,
-                               hop_length=hop_length,
-                               min_level_db=min_level_db,
-                               ref_level_db=ref_level_db,
-                               power=power,
-                               n_iter=n_iter,
-                               preemphasis=preemphasis,
-                               sample_rate=sample_rate)
+                    save_state(
+                        state_dir,
+                        writer,
+                        global_step,
+                        mel_input=downsampled_mel_specs,
+                        mel_output=mel_outputs,
+                        lin_input=lin_specs,
+                        lin_output=linear_outputs,
+                        alignments=alignments,
+                        win_length=win_length,
+                        hop_length=hop_length,
+                        min_level_db=min_level_db,
+                        ref_level_db=ref_level_db,
+                        power=power,
+                        n_iter=n_iter,
+                        preemphasis=preemphasis,
+                        sample_rate=sample_rate)
 
                 # evaluation
                 if global_step % eval_interval == 0:
@@ -275,27 +274,28 @@ if __name__ == "__main__":
                         "Some have accepted this as a miracle without any physical explanation.",
                     ]
                     for idx, sent in enumerate(sentences):
-                        wav, attn = eval_model(dv3, sent,
-                                               replace_pronounciation_prob,
-                                               min_level_db, ref_level_db,
-                                               power, n_iter, win_length,
-                                               hop_length, preemphasis)
+                        wav, attn = eval_model(
+                            dv3, sent, replace_pronounciation_prob,
+                            min_level_db, ref_level_db, power, n_iter,
+                            win_length, hop_length, preemphasis)
                         wav_path = os.path.join(
                             state_dir, "waveform",
                             "eval_sample_{:09d}.wav".format(global_step))
                         sf.write(wav_path, wav, sample_rate)
-                        writer.add_audio("eval_sample_{}".format(idx),
-                                         wav,
-                                         global_step,
-                                         sample_rate=sample_rate)
+                        writer.add_audio(
+                            "eval_sample_{}".format(idx),
+                            wav,
+                            global_step,
+                            sample_rate=sample_rate)
                         attn_path = os.path.join(
                             state_dir, "alignments",
                             "eval_sample_attn_{:09d}.png".format(global_step))
                         plot_alignment(attn, attn_path)
-                        writer.add_image("eval_sample_attn{}".format(idx),
-                                         cm.viridis(attn),
-                                         global_step,
-                                         dataformats="HWC")
+                        writer.add_image(
+                            "eval_sample_attn{}".format(idx),
+                            cm.viridis(attn),
+                            global_step,
+                            dataformats="HWC")
 
                 # save checkpoint
                 if global_step % save_interval == 0:
@@ -311,4 +311,4 @@ if __name__ == "__main__":
             global_step += 1
         # epoch report
         writer.add_scalar("epoch_average_loss", epoch_loss / i, j)
-        epoch_loss = 0.
+        epoch_loss = 0.

README.md (WaveFlow)

@@ -16,10 +16,10 @@ Paddle fluid implementation of [WaveFlow: A Compact Flow-based Model for Raw Audio]
## Usage

There are many hyperparameters to be tuned depending on the specification of the model and the dataset you are working on.
We provide `waveflow_ljspeech.yaml` as a hyperparameter set that works well on the LJSpeech dataset.
Note that `train.py`, `synthesis.py`, and `benchmark.py` all accept a `--config` parameter. To ensure consistency, you should use the same config yaml file for training, synthesis, and benchmarking. You can also overwrite these preset hyperparameters on the command line by appending parameters after `--config`.
For example, `--config=${yaml} --batch_size=8` overwrites the corresponding hyperparameter in the `${yaml}` config file. For more details about these hyperparameters, check `utils.add_config_options_to_parser`.
Note that you also need to specify some additional parameters for `train.py`, `synthesis.py`, and `benchmark.py`; the details can be found in `train.add_options_to_parser`, `synthesis.add_options_to_parser`, and `benchmark.add_options_to_parser`, respectively.
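The override order is implemented by parsing the command line first and then back-filling unset fields from the YAML file (see `utils.add_yaml_config` introduced in this merge request). A minimal sketch of that behavior, assuming a `waveflow_ljspeech.yaml` in the working directory and a made-up `batch_size` field:

```python
import argparse
import ruamel.yaml

parser = argparse.ArgumentParser()
parser.add_argument('--config', type=str, help="Path to the config file.")
parser.add_argument('--batch_size', type=int, default=None)
config = parser.parse_args(['--config=waveflow_ljspeech.yaml', '--batch_size=8'])

# Back-fill from YAML, as utils.add_yaml_config does: a YAML value is used
# only when the corresponding CLI field was left at None, so --batch_size=8
# wins over the batch_size entry in the YAML file.
with open(config.config, 'rt') as f:
    yaml_cfg = ruamel.yaml.safe_load(f)
cfg_vars = vars(config)
for k, v in yaml_cfg.items():
    if k in cfg_vars and cfg_vars[k] is not None:
        continue
    cfg_vars[k] = v
```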
@@ -50,10 +50,10 @@ python -u train.py \
#### Save and Load checkpoints

Our model will save model parameters as checkpoints in `./runs/waveflow/${ModelName}/checkpoint/` every 10000 iterations by default.
The saved checkpoint will have the format of `step-${iteration_number}.pdparams` for model parameters and `step-${iteration_number}.pdopt` for optimizer parameters.
There are three ways to load a checkpoint and resume training (say you want to load a checkpoint from iteration 500000), as sketched after this list:
1. Use `--checkpoint=./runs/waveflow/${ModelName}/checkpoint/step-500000` to provide a specific path to load. Note that you only need to provide the base name of the parameter file, which is `step-500000`; no extension name `.pdparams` or `.pdopt` is needed.
2. Use `--iteration=500000`.
3. If you don't specify either `--checkpoint` or `--iteration`, the model will automatically load the latest checkpoint in `./runs/waveflow/${ModelName}/checkpoint`.
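A sketch of how these three rules resolve to a parameter file base name. The real logic lives in `utils.load_parameters` and `utils.load_latest_checkpoint`; the `"checkpoint"` record file name and the helper below are assumptions for illustration:

```python
import os

def load_latest_checkpoint(checkpoint_dir):
    # Hypothetical stand-in: utils.save_latest_checkpoint writes a
    # "model_checkpoint_path: step-N" record that the real loader reads back.
    with open(os.path.join(checkpoint_dir, "checkpoint"), 'rt') as f:
        return int(f.read().strip().split("step-")[-1])

def resolve_checkpoint(checkpoint_dir, checkpoint=None, iteration=None):
    if checkpoint is not None:      # rule 1: explicit --checkpoint base name
        return checkpoint
    if iteration is None:           # rule 3: neither flag, fall back to latest
        iteration = load_latest_checkpoint(checkpoint_dir)
    return os.path.join(checkpoint_dir, "step-{}".format(iteration))  # rule 2
```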
@@ -108,4 +108,4 @@ python -u benchmark.py \
    --config=./configs/waveflow_ljspeech.yaml \
    --root=./data/LJSpeech-1.1 \
    --name=${ModelName} --use_gpu=true
```

benchmark.py (WaveFlow example)

@@ -2,35 +2,47 @@ import os
 import random
 from pprint import pprint
 
-import jsonargparse
+import argparse
 import numpy as np
 import paddle.fluid.dygraph as dg
 from paddle import fluid
 
 import utils
-from waveflow import WaveFlow
+from parakeet.models.waveflow import WaveFlow
 
 
 def add_options_to_parser(parser):
-    parser.add_argument('--model', type=str, default='waveflow',
-                        help="general name of the model")
-    parser.add_argument('--name', type=str,
-                        help="specific name of the training model")
-    parser.add_argument('--root', type=str,
-                        help="root path of the LJSpeech dataset")
-    parser.add_argument('--use_gpu', type=bool, default=True,
-                        help="option to use gpu training")
-    parser.add_argument('--iteration', type=int, default=None,
-                        help=("which iteration of checkpoint to load, "
-                              "default to load the latest checkpoint"))
-    parser.add_argument('--checkpoint', type=str, default=None,
-                        help="path of the checkpoint to load")
+    parser.add_argument(
+        '--model',
+        type=str,
+        default='waveflow',
+        help="general name of the model")
+    parser.add_argument(
+        '--name', type=str, help="specific name of the training model")
+    parser.add_argument(
+        '--root', type=str, help="root path of the LJSpeech dataset")
+    parser.add_argument(
+        '--use_gpu',
+        type=bool,
+        default=True,
+        help="option to use gpu training")
+    parser.add_argument(
+        '--iteration',
+        type=int,
+        default=None,
+        help=("which iteration of checkpoint to load, "
+              "default to load the latest checkpoint"))
+    parser.add_argument(
+        '--checkpoint',
+        type=str,
+        default=None,
+        help="path of the checkpoint to load")
 
 
 def benchmark(config):
-    pprint(jsonargparse.namespace_to_dict(config))
+    pprint(vars(config))
 
     # Get checkpoint directory path.
     run_dir = os.path.join("runs", config.model, config.name)
@@ -47,7 +59,7 @@ def benchmark(config):
     fluid.default_startup_program().random_seed = seed
     fluid.default_main_program().random_seed = seed
     print("Random Seed: ", seed)
 
     # Build model.
     model = WaveFlow(config, checkpoint_dir)
     model.build(training=False)
@@ -58,9 +70,8 @@ def benchmark(config):
 if __name__ == "__main__":
     # Create parser.
-    parser = jsonargparse.ArgumentParser(
-        description="Synthesize audio using WaveNet model",
-        formatter_class='default_argparse')
+    parser = argparse.ArgumentParser(
+        description="Synthesize audio using WaveNet model")
     add_options_to_parser(parser)
     utils.add_config_options_to_parser(parser)
@@ -68,4 +79,5 @@ if __name__ == "__main__":
     # For conflicting updates to the same field,
     # the preceding update will be overwritten by the following one.
     config = parser.parse_args()
+    config = utils.add_yaml_config(config)
     benchmark(config)

synthesis.py (WaveFlow example)

@@ -2,40 +2,58 @@ import os
 import random
 from pprint import pprint
 
-import jsonargparse
+import argparse
 import numpy as np
 import paddle.fluid.dygraph as dg
 from paddle import fluid
 
 import utils
-from waveflow import WaveFlow
+from parakeet.models.waveflow import WaveFlow
 
 
 def add_options_to_parser(parser):
-    parser.add_argument('--model', type=str, default='waveflow',
-                        help="general name of the model")
-    parser.add_argument('--name', type=str,
-                        help="specific name of the training model")
-    parser.add_argument('--root', type=str,
-                        help="root path of the LJSpeech dataset")
-    parser.add_argument('--use_gpu', type=bool, default=True,
-                        help="option to use gpu training")
-    parser.add_argument('--iteration', type=int, default=None,
-                        help=("which iteration of checkpoint to load, "
-                              "default to load the latest checkpoint"))
-    parser.add_argument('--checkpoint', type=str, default=None,
-                        help="path of the checkpoint to load")
-    parser.add_argument('--output', type=str, default="./syn_audios",
-                        help="path to write synthesized audio files")
-    parser.add_argument('--sample', type=int, default=None,
-                        help="which of the valid samples to synthesize audio")
+    parser.add_argument(
+        '--model',
+        type=str,
+        default='waveflow',
+        help="general name of the model")
+    parser.add_argument(
+        '--name', type=str, help="specific name of the training model")
+    parser.add_argument(
+        '--root', type=str, help="root path of the LJSpeech dataset")
+    parser.add_argument(
+        '--use_gpu',
+        type=bool,
+        default=True,
+        help="option to use gpu training")
+    parser.add_argument(
+        '--iteration',
+        type=int,
+        default=None,
+        help=("which iteration of checkpoint to load, "
+              "default to load the latest checkpoint"))
+    parser.add_argument(
+        '--checkpoint',
+        type=str,
+        default=None,
+        help="path of the checkpoint to load")
+    parser.add_argument(
+        '--output',
+        type=str,
+        default="./syn_audios",
+        help="path to write synthesized audio files")
+    parser.add_argument(
+        '--sample',
+        type=int,
+        default=None,
+        help="which of the valid samples to synthesize audio")
 
 
 def synthesize(config):
-    pprint(jsonargparse.namespace_to_dict(config))
+    pprint(vars(config))
 
     # Get checkpoint directory path.
     run_dir = os.path.join("runs", config.model, config.name)
@@ -52,7 +70,7 @@ def synthesize(config):
     fluid.default_startup_program().random_seed = seed
     fluid.default_main_program().random_seed = seed
     print("Random Seed: ", seed)
 
     # Build model.
     model = WaveFlow(config, checkpoint_dir)
     model.build(training=False)
@@ -72,9 +90,8 @@ def synthesize(config):
 if __name__ == "__main__":
     # Create parser.
-    parser = jsonargparse.ArgumentParser(
-        description="Synthesize audio using WaveNet model",
-        formatter_class='default_argparse')
+    parser = argparse.ArgumentParser(
+        description="Synthesize audio using WaveNet model")
     add_options_to_parser(parser)
     utils.add_config_options_to_parser(parser)
@@ -82,4 +99,5 @@ if __name__ == "__main__":
     # For conflicting updates to the same field,
     # the preceding update will be overwritten by the following one.
     config = parser.parse_args()
+    config = utils.add_yaml_config(config)
     synthesize(config)

train.py (WaveFlow example)

@@ -4,34 +4,48 @@ import subprocess
 import time
 from pprint import pprint
 
-import jsonargparse
+import argparse
 import numpy as np
 import paddle.fluid.dygraph as dg
 from paddle import fluid
 from tensorboardX import SummaryWriter
 
-import slurm
 import utils
-from waveflow import WaveFlow
+from parakeet.models.waveflow import WaveFlow
 
 
 def add_options_to_parser(parser):
-    parser.add_argument('--model', type=str, default='waveflow',
-                        help="general name of the model")
-    parser.add_argument('--name', type=str,
-                        help="specific name of the training model")
-    parser.add_argument('--root', type=str,
-                        help="root path of the LJSpeech dataset")
-    parser.add_argument('--parallel', type=bool, default=True,
-                        help="option to use data parallel training")
-    parser.add_argument('--use_gpu', type=bool, default=True,
-                        help="option to use gpu training")
-    parser.add_argument('--iteration', type=int, default=None,
-                        help=("which iteration of checkpoint to load, "
-                              "default to load the latest checkpoint"))
-    parser.add_argument('--checkpoint', type=str, default=None,
-                        help="path of the checkpoint to load")
+    parser.add_argument(
+        '--model',
+        type=str,
+        default='waveflow',
+        help="general name of the model")
+    parser.add_argument(
+        '--name', type=str, help="specific name of the training model")
+    parser.add_argument(
+        '--root', type=str, help="root path of the LJSpeech dataset")
+    parser.add_argument(
+        '--parallel',
+        type=utils.str2bool,
+        default=True,
+        help="option to use data parallel training")
+    parser.add_argument(
+        '--use_gpu',
+        type=utils.str2bool,
+        default=True,
+        help="option to use gpu training")
+    parser.add_argument(
+        '--iteration',
+        type=int,
+        default=None,
+        help=("which iteration of checkpoint to load, "
+              "default to load the latest checkpoint"))
+    parser.add_argument(
+        '--checkpoint',
+        type=str,
+        default=None,
+        help="path of the checkpoint to load")
@@ -45,12 +59,13 @@ def train(config):
     if rank == 0:
         # Print the whole config setting.
-        pprint(jsonargparse.namespace_to_dict(config))
+        pprint(vars(config))
 
         # Make checkpoint directory.
         run_dir = os.path.join("runs", config.model, config.name)
         checkpoint_dir = os.path.join(run_dir, "checkpoint")
-        os.makedirs(checkpoint_dir, exist_ok=True)
+        if not os.path.exists(checkpoint_dir):
+            os.makedirs(checkpoint_dir)
 
         # Create tensorboard logger.
         tb = SummaryWriter(os.path.join(run_dir, "logs")) \
@@ -102,8 +117,8 @@ def train(config):
 if __name__ == "__main__":
     # Create parser.
-    parser = jsonargparse.ArgumentParser(description="Train WaveFlow model",
-                                         formatter_class='default_argparse')
+    parser = argparse.ArgumentParser(description="Train WaveFlow model")
+    #formatter_class='default_argparse')
     add_options_to_parser(parser)
     utils.add_config_options_to_parser(parser)
@@ -111,4 +126,5 @@ if __name__ == "__main__":
     # For conflicting updates to the same field,
     # the preceding update will be overwritten by the following one.
     config = parser.parse_args()
+    config = utils.add_yaml_config(config)
     train(config)

utils.py (WaveFlow example)

@@ -2,59 +2,96 @@ import itertools
 import os
 import time
 
-import jsonargparse
+import argparse
+import ruamel.yaml
 import numpy as np
 import paddle.fluid.dygraph as dg
 
 
+def str2bool(v):
+    return v.lower() in ("true", "t", "1")
+
+
 def add_config_options_to_parser(parser):
-    parser.add_argument('--valid_size', type=int,
-                        help="size of the valid dataset")
-    parser.add_argument('--segment_length', type=int,
-                        help="the length of audio clip for training")
-    parser.add_argument('--sample_rate', type=int,
-                        help="sampling rate of audio data file")
-    parser.add_argument('--fft_window_shift', type=int,
-                        help="the shift of fft window for each frame")
-    parser.add_argument('--fft_window_size', type=int,
-                        help="the size of fft window for each frame")
-    parser.add_argument('--fft_size', type=int,
-                        help="the size of fft filter on each frame")
-    parser.add_argument('--mel_bands', type=int,
-                        help="the number of mel bands when calculating mel spectrograms")
-    parser.add_argument('--mel_fmin', type=float,
-                        help="lowest frequency in calculating mel spectrograms")
-    parser.add_argument('--mel_fmax', type=float,
-                        help="highest frequency in calculating mel spectrograms")
-    parser.add_argument('--seed', type=int,
-                        help="seed of random initialization for the model")
+    parser.add_argument(
+        '--valid_size', type=int, help="size of the valid dataset")
+    parser.add_argument(
+        '--segment_length',
+        type=int,
+        help="the length of audio clip for training")
+    parser.add_argument(
+        '--sample_rate', type=int, help="sampling rate of audio data file")
+    parser.add_argument(
+        '--fft_window_shift',
+        type=int,
+        help="the shift of fft window for each frame")
+    parser.add_argument(
+        '--fft_window_size',
+        type=int,
+        help="the size of fft window for each frame")
+    parser.add_argument(
+        '--fft_size', type=int, help="the size of fft filter on each frame")
+    parser.add_argument(
+        '--mel_bands',
+        type=int,
+        help="the number of mel bands when calculating mel spectrograms")
+    parser.add_argument(
+        '--mel_fmin',
+        type=float,
+        help="lowest frequency in calculating mel spectrograms")
+    parser.add_argument(
+        '--mel_fmax',
+        type=float,
+        help="highest frequency in calculating mel spectrograms")
+    parser.add_argument(
+        '--seed', type=int, help="seed of random initialization for the model")
     parser.add_argument('--learning_rate', type=float)
-    parser.add_argument('--batch_size', type=int,
-                        help="batch size for training")
-    parser.add_argument('--test_every', type=int,
-                        help="test interval during training")
-    parser.add_argument('--save_every', type=int,
-                        help="checkpointing interval during training")
-    parser.add_argument('--max_iterations', type=int,
-                        help="maximum training iterations")
-    parser.add_argument('--sigma', type=float,
-                        help="standard deviation of the latent Gaussian variable")
-    parser.add_argument('--n_flows', type=int,
-                        help="number of flows")
-    parser.add_argument('--n_group', type=int,
-                        help="number of adjacent audio samples to squeeze into one column")
-    parser.add_argument('--n_layers', type=int,
-                        help="number of conv2d layer in one wavenet-like flow architecture")
-    parser.add_argument('--n_channels', type=int,
-                        help="number of residual channels in flow")
-    parser.add_argument('--kernel_h', type=int,
-                        help="height of the kernel in the conv2d layer")
-    parser.add_argument('--kernel_w', type=int,
-                        help="width of the kernel in the conv2d layer")
-    parser.add_argument('--config', action=jsonargparse.ActionConfigFile)
+    parser.add_argument(
+        '--batch_size', type=int, help="batch size for training")
+    parser.add_argument(
+        '--test_every', type=int, help="test interval during training")
+    parser.add_argument(
+        '--save_every',
+        type=int,
+        help="checkpointing interval during training")
+    parser.add_argument(
+        '--max_iterations', type=int, help="maximum training iterations")
+    parser.add_argument(
+        '--sigma',
+        type=float,
+        help="standard deviation of the latent Gaussian variable")
+    parser.add_argument('--n_flows', type=int, help="number of flows")
+    parser.add_argument(
+        '--n_group',
+        type=int,
+        help="number of adjacent audio samples to squeeze into one column")
+    parser.add_argument(
+        '--n_layers',
+        type=int,
+        help="number of conv2d layer in one wavenet-like flow architecture")
+    parser.add_argument(
+        '--n_channels', type=int, help="number of residual channels in flow")
+    parser.add_argument(
+        '--kernel_h',
+        type=int,
+        help="height of the kernel in the conv2d layer")
+    parser.add_argument(
+        '--kernel_w', type=int, help="width of the kernel in the conv2d layer")
+    parser.add_argument('--config', type=str, help="Path to the config file.")
+
+
+def add_yaml_config(config):
+    with open(config.config, 'rt') as f:
+        yaml_cfg = ruamel.yaml.safe_load(f)
+    cfg_vars = vars(config)
+    for k, v in yaml_cfg.items():
+        if k in cfg_vars and cfg_vars[k] is not None:
+            continue
+        cfg_vars[k] = v
+    return config
 
 
 def load_latest_checkpoint(checkpoint_dir, rank=0):
@@ -84,8 +121,12 @@ def save_latest_checkpoint(checkpoint_dir, iteration):
         handle.write("model_checkpoint_path: step-{}".format(iteration))
 
 
-def load_parameters(checkpoint_dir, rank, model, optimizer=None,
-                    iteration=None, file_path=None):
+def load_parameters(checkpoint_dir,
+                    rank,
+                    model,
+                    optimizer=None,
+                    iteration=None,
+                    file_path=None):
     if file_path is None:
         if iteration is None:
             iteration = load_latest_checkpoint(checkpoint_dir, rank)
@@ -99,7 +140,7 @@ def load_parameters(checkpoint_dir, rank, model, optimizer=None,
     if optimizer and optimizer_dict:
         optimizer.set_dict(optimizer_dict)
         print("[checkpoint] Rank {}: loaded optimizer state from {}".format(
             rank, file_path))
 
 
 def save_latest_parameters(checkpoint_dir, iteration, model, optimizer=None):

ljspeech.py (parakeet/datasets)

@@ -5,24 +5,29 @@ import librosa
 from .. import g2p
 from ..data.sampler import SequentialSampler, RandomSampler, BatchSampler
-from ..data.dataset import Dataset
+from ..data.dataset import DatasetMixin
 from ..data.datacargo import DataCargo
 from ..data.batch import TextIDBatcher, SpecBatcher
 
 
-class LJSpeech(Dataset):
+class LJSpeech(DatasetMixin):
     def __init__(self, root):
         super(LJSpeech, self).__init__()
-        assert isinstance(root, (str, Path)), "root should be a string or Path object"
+        assert isinstance(root, (
+            str, Path)), "root should be a string or Path object"
         self.root = root if isinstance(root, Path) else Path(root)
         self.metadata = self._prepare_metadata()
 
     def _prepare_metadata(self):
         csv_path = self.root.joinpath("metadata.csv")
-        metadata = pd.read_csv(csv_path, sep="|", header=None, quoting=3,
-                               names=["fname", "raw_text", "normalized_text"])
+        metadata = pd.read_csv(
+            csv_path,
+            sep="|",
+            header=None,
+            quoting=3,
+            names=["fname", "raw_text", "normalized_text"])
         return metadata
 
     def _get_example(self, metadatum):
         """All the code for generating an Example from a metadatum. If you want a
         different preprocessing pipeline, you can override this method.
@@ -30,28 +35,32 @@ class LJSpeech(Dataset):
         In this case, you'd better pass a composed transform and pass it to the init
         method.
         """
         fname, raw_text, normalized_text = metadatum
         wav_path = self.root.joinpath("wavs", fname + ".wav")
 
         # load -> trim -> preemphasis -> stft -> magnitude -> mel_scale -> logscale -> normalize
-        wav, sample_rate = librosa.load(wav_path, sr=None) # we would rather use functor to hold its parameters
+        wav, sample_rate = librosa.load(
+            wav_path,
+            sr=None)  # we would rather use functor to hold its parameters
         trimed, _ = librosa.effects.trim(wav)
         preemphasized = librosa.effects.preemphasis(trimed)
         D = librosa.stft(preemphasized)
         mag, phase = librosa.magphase(D)
         mel = librosa.feature.melspectrogram(S=mag)
 
         mag = librosa.amplitude_to_db(S=mag)
         mel = librosa.amplitude_to_db(S=mel)
 
         ref_db = 20
         max_db = 100
         mel = np.clip((mel - ref_db + max_db) / max_db, 1e-8, 1)
         mag = np.clip((mag - ref_db + max_db) / max_db, 1e-8, 1)
 
-        phonemes = np.array(g2p.en.text_to_sequence(normalized_text), dtype=np.int64)
-        return (mag, mel, phonemes) # maybe we need to implement it as a map in the future
+        phonemes = np.array(
+            g2p.en.text_to_sequence(normalized_text), dtype=np.int64)
+        return (mag, mel, phonemes
+                )  # maybe we need to implement it as a map in the future
 
     def _batch_examples(self, minibatch):
         mag_batch = []
@@ -71,12 +80,10 @@ class LJSpeech(Dataset):
         metadatum = self.metadata.iloc[index]
         example = self._get_example(metadatum)
         return example
 
     def __iter__(self):
         for i in range(len(self)):
             yield self[i]
 
     def __len__(self):
         return len(self.metadata)

__init__.py (parakeet/models/waveflow, new file)

@@ -0,0 +1 @@
from parakeet.models.waveflow.waveflow import WaveFlow
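This one-line package init re-exports the WaveFlow wrapper class, which is what lets the example scripts in this merge request shorten their import:

```python
# With the re-export in place, callers (e.g. the updated train.py,
# synthesis.py, and benchmark.py) can write:
from parakeet.models.waveflow import WaveFlow

# instead of spelling out the full module path:
from parakeet.models.waveflow.waveflow import WaveFlow
```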

data.py (parakeet/models/waveflow)

@@ -5,10 +5,9 @@ import numpy as np
 from paddle import fluid
 
 from parakeet.datasets import ljspeech
-from parakeet.data import dataset
-from parakeet.data.batch import SpecBatcher, WavBatcher
-from parakeet.data.datacargo import DataCargo
-from parakeet.data.sampler import DistributedSampler, BatchSampler
+from parakeet.data import SpecBatcher, WavBatcher
+from parakeet.data import DataCargo, DatasetMixin
+from parakeet.data import DistributedSampler, BatchSampler
 from scipy.io.wavfile import read
@@ -27,7 +26,7 @@ class Dataset(ljspeech.LJSpeech):
         return audio
 
 
-class Subset(dataset.Dataset):
+class Subset(DatasetMixin):
     def __init__(self, dataset, indices, valid):
         self.dataset = dataset
         self.indices = indices
@@ -36,18 +35,18 @@ class Subset(dataset.Dataset):
     def get_mel(self, audio):
         spectrogram = librosa.core.stft(
-            audio, n_fft=self.config.fft_size,
+            audio,
+            n_fft=self.config.fft_size,
             hop_length=self.config.fft_window_shift,
             win_length=self.config.fft_window_size)
         spectrogram_magnitude = np.abs(spectrogram)
 
         # mel_filter_bank shape: [n_mels, 1 + n_fft/2]
-        mel_filter_bank = librosa.filters.mel(
-            sr=self.config.sample_rate,
-            n_fft=self.config.fft_size,
-            n_mels=self.config.mel_bands,
-            fmin=self.config.mel_fmin,
-            fmax=self.config.mel_fmax)
+        mel_filter_bank = librosa.filters.mel(sr=self.config.sample_rate,
+                                              n_fft=self.config.fft_size,
+                                              n_mels=self.config.mel_bands,
+                                              fmin=self.config.mel_fmin,
+                                              fmax=self.config.mel_fmax)
 
         # mel shape: [n_mels, num_frames]
         mel = np.dot(mel_filter_bank, spectrogram_magnitude)
@@ -67,13 +66,14 @@ class Subset(dataset.Dataset):
                 pass
             else:
                 # audio shape: [len]
                 if audio.shape[0] >= segment_length:
                     max_audio_start = audio.shape[0] - segment_length
                     audio_start = random.randint(0, max_audio_start)
-                    audio = audio[audio_start : (audio_start + segment_length)]
+                    audio = audio[audio_start:(audio_start + segment_length)]
                 else:
                     audio = np.pad(audio, (0, segment_length - audio.shape[0]),
-                                   mode='constant', constant_values=0)
+                                   mode='constant',
+                                   constant_values=0)
 
             # Normalize audio to the [-1, 1] range.
             audio = audio.astype(np.float32) / 32768.0
@@ -109,17 +109,17 @@ class LJSpeech:
         # Train dataset.
         trainset = Subset(ds, train_indices, valid=False)
         sampler = DistributedSampler(len(trainset), nranks, rank)
         total_bs = config.batch_size
         assert total_bs % nranks == 0
-        train_sampler = BatchSampler(sampler, total_bs // nranks,
-                                     drop_last=True)
+        train_sampler = BatchSampler(
+            sampler, total_bs // nranks, drop_last=True)
         trainloader = DataCargo(trainset, batch_sampler=train_sampler)
 
         trainreader = fluid.io.PyReader(capacity=50, return_list=True)
         trainreader.decorate_batch_generator(trainloader, place)
         self.trainloader = (data for _ in iter(int, 1)
                             for data in trainreader())
 
         # Valid dataset.
         validset = Subset(ds, valid_indices, valid=True)
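The `iter(int, 1)` expression in `self.trainloader` above deserves a note: the two-argument form of `iter` calls `int()` until it returns the sentinel `1`, which never happens since `int()` is always `0`, so it yields an endless stream and the generator keeps restarting `trainreader()` over the data. A minimal demonstration:

```python
# iter(callable, sentinel) keeps calling int() until it returns 1; since
# int() always returns 0, this is an infinite iterator, and the inner loop
# restarts after every pass, i.e. an endless epoch-cycling loader.
stream = (x for _ in iter(int, 1) for x in range(3))
print([next(stream) for _ in range(7)])  # [0, 1, 2, 0, 1, 2, 0]
```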
@@ -127,5 +127,5 @@ class LJSpeech:
         validloader = DataCargo(validset, batch_size=1, shuffle=False)
         validreader = fluid.io.PyReader(capacity=20, return_list=True)
         validreader.decorate_batch_generator(validloader, place)
         self.validloader = validreader

waveflow.py (parakeet/models/waveflow)

@@ -8,13 +8,18 @@ from paddle import fluid
 from scipy.io.wavfile import write
 
 import utils
-from data import LJSpeech
-from waveflow_modules import WaveFlowLoss, WaveFlowModule
+from .data import LJSpeech
+from .waveflow_modules import WaveFlowLoss, WaveFlowModule
 
 
 class WaveFlow():
-    def __init__(self, config, checkpoint_dir, parallel=False, rank=0,
-                 nranks=1, tb_logger=None):
+    def __init__(self,
+                 config,
+                 checkpoint_dir,
+                 parallel=False,
+                 rank=0,
+                 nranks=1,
+                 tb_logger=None):
         self.config = config
         self.checkpoint_dir = checkpoint_dir
         self.parallel = parallel
@@ -24,12 +29,12 @@ class WaveFlow():
     def build(self, training=True):
         config = self.config
         dataset = LJSpeech(config, self.nranks, self.rank)
         self.trainloader = dataset.trainloader
         self.validloader = dataset.validloader
 
-        waveflow = WaveFlowModule("waveflow", config)
+        waveflow = WaveFlowModule(config)
 
         # Dry run once to create and initialize all necessary parameters.
         audio = dg.to_variable(np.random.randn(1, 16000).astype(np.float32))
         mel = dg.to_variable(
@@ -38,29 +43,36 @@ class WaveFlow():
         if training:
             optimizer = fluid.optimizer.AdamOptimizer(
-                learning_rate=config.learning_rate)
+                learning_rate=config.learning_rate,
+                parameter_list=waveflow.parameters())
 
             # Load parameters.
-            utils.load_parameters(self.checkpoint_dir, self.rank,
-                                  waveflow, optimizer,
-                                  iteration=config.iteration,
-                                  file_path=config.checkpoint)
+            utils.load_parameters(
+                self.checkpoint_dir,
+                self.rank,
+                waveflow,
+                optimizer,
+                iteration=config.iteration,
+                file_path=config.checkpoint)
             print("Rank {}: checkpoint loaded.".format(self.rank))
 
             # Data parallelism.
             if self.parallel:
                 strategy = dg.parallel.prepare_context()
                 waveflow = dg.parallel.DataParallel(waveflow, strategy)
 
             self.waveflow = waveflow
             self.optimizer = optimizer
             self.criterion = WaveFlowLoss(config.sigma)
         else:
             # Load parameters.
-            utils.load_parameters(self.checkpoint_dir, self.rank, waveflow,
-                                  iteration=config.iteration,
-                                  file_path=config.checkpoint)
+            utils.load_parameters(
+                self.checkpoint_dir,
+                self.rank,
+                waveflow,
+                iteration=config.iteration,
+                file_path=config.checkpoint)
             print("Rank {}: checkpoint loaded.".format(self.rank))
 
             self.waveflow = waveflow
@@ -83,7 +95,8 @@ class WaveFlow():
         else:
             loss.backward()
-            self.optimizer.minimize(loss, parameter_list=self.waveflow.parameters())
+            self.optimizer.minimize(
+                loss, parameter_list=self.waveflow.parameters())
             self.waveflow.clear_gradients()
 
         graph_time = time.time()
@@ -139,7 +152,8 @@ class WaveFlow():
         sample = config.sample
 
         output = "{}/{}/iter-{}".format(config.output, config.name, iteration)
-        os.makedirs(output, exist_ok=True)
+        if not os.path.exists(output):
+            os.makedirs(output)
 
         mels_list = [mels for _, mels in self.validloader()]
         if sample is not None:
@@ -148,16 +162,16 @@ class WaveFlow():
         for sample, mel in enumerate(mels_list):
             filename = "{}/valid_{}.wav".format(output, sample)
             print("Synthesize sample {}, save as {}".format(sample, filename))
 
             start_time = time.time()
             audio = self.waveflow.synthesize(mel, sigma=self.config.sigma)
             syn_time = time.time() - start_time
 
             audio = audio[0]
             audio_time = audio.shape[0] / self.config.sample_rate
-            print("audio time {:.4f}, synthesis time {:.4f}".format(
-                audio_time, syn_time))
+            print("audio time {:.4f}, synthesis time {:.4f}".format(audio_time,
+                                                                    syn_time))
 
             # Denormalize audio from [-1, 1] to [-32768, 32768] int16 range.
             audio = audio.numpy() * 32768.0
             audio = audio.astype('int16')
@@ -180,8 +194,8 @@ class WaveFlow():
         syn_time = time.time() - start_time
 
         audio_time = audio.shape[1] * batch_size / self.config.sample_rate
-        print("audio time {:.4f}, synthesis time {:.4f}".format(
-            audio_time, syn_time))
+        print("audio time {:.4f}, synthesis time {:.4f}".format(audio_time,
+                                                                syn_time))
         print("{} X real-time".format(audio_time / syn_time))
 
     def save(self, iteration):
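The `X real-time` figure printed above is simply audio duration divided by wall-clock synthesis time; the numbers below are made up for illustration:

```python
# Hypothetical numbers, just to show the arithmetic behind the report:
sample_rate = 22050           # Hz (LJSpeech uses 22050)
n_samples = 110250            # samples synthesized in one pass
syn_time = 2.5                # measured wall-clock seconds

audio_time = n_samples / sample_rate   # 5.0 seconds of audio
print("{} X real-time".format(audio_time / syn_time))  # 2.0 X real-time
```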

waveflow_modules.py (parakeet/models/waveflow)

@@ -3,26 +3,27 @@ import itertools
 import numpy as np
 import paddle.fluid.dygraph as dg
 from paddle import fluid
-from parakeet.modules import conv, modules, weight_norm
+from parakeet.modules import weight_norm
 
 
-def set_param_attr(layer, c_in=1):
-    if isinstance(layer, (weight_norm.Conv2DTranspose, weight_norm.Conv2D)):
-        k = np.sqrt(1.0 / (c_in * np.prod(layer._filter_size)))
+def get_param_attr(layer_type, filter_size, c_in=1):
+    if layer_type == "weight_norm":
+        k = np.sqrt(1.0 / (c_in * np.prod(filter_size)))
         weight_init = fluid.initializer.UniformInitializer(low=-k, high=k)
         bias_init = fluid.initializer.UniformInitializer(low=-k, high=k)
-    elif isinstance(layer, dg.Conv2D):
+    elif layer_type == "common":
        weight_init = fluid.initializer.ConstantInitializer(0.0)
        bias_init = fluid.initializer.ConstantInitializer(0.0)
     else:
         raise TypeError("Unsupported layer type.")
 
-    layer._param_attr = fluid.ParamAttr(initializer=weight_init)
-    layer._bias_attr = fluid.ParamAttr(initializer=bias_init)
+    param_attr = fluid.ParamAttr(initializer=weight_init)
+    bias_attr = fluid.ParamAttr(initializer=bias_init)
+    return param_attr, bias_attr
 
 
 def unfold(x, n_group):
     length = x.shape[-1]
     new_shape = x.shape[:-1] + [length // n_group, n_group]
     return fluid.layers.reshape(x, new_shape)
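`unfold` is the "squeeze" step of WaveFlow: it folds runs of `n_group` adjacent samples on the trailing time axis into a new last dimension. The same reshape in plain numpy, on made-up data:

```python
import numpy as np

x = np.arange(16)        # pretend shape [time] with time = 16
n_group = 4
unfolded = x.reshape(x.shape[:-1] + (16 // n_group, n_group))
print(unfolded.shape)    # (4, 4): [time/n_group, n_group]
print(unfolded[0])       # [0 1 2 3]: adjacent samples become one group
```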
@@ -48,20 +49,23 @@ class WaveFlowLoss:
 class Conditioner(dg.Layer):
-    def __init__(self, name_scope):
-        super(Conditioner, self).__init__(name_scope)
+    def __init__(self):
+        super(Conditioner, self).__init__()
         upsample_factors = [16, 16]
         self.upsample_conv2d = []
         for s in upsample_factors:
             in_channel = 1
-            conv_trans2d = modules.Conv2DTranspose(
-                self.full_name(),
+            param_attr, bias_attr = get_param_attr(
+                "weight_norm", (3, 2 * s), c_in=in_channel)
+            conv_trans2d = weight_norm.Conv2DTranspose(
+                num_channels=in_channel,
                 num_filters=1,
                 filter_size=(3, 2 * s),
                 padding=(1, s // 2),
-                stride=(1, s))
-            set_param_attr(conv_trans2d, c_in=in_channel)
+                stride=(1, s),
+                param_attr=param_attr,
+                bias_attr=bias_attr)
             self.upsample_conv2d.append(conv_trans2d)
 
         for i, layer in enumerate(self.upsample_conv2d):
@@ -86,8 +90,8 @@ class Conditioner(dg.Layer):
 class Flow(dg.Layer):
-    def __init__(self, name_scope, config):
-        super(Flow, self).__init__(name_scope)
+    def __init__(self, config):
+        super(Flow, self).__init__()
         self.n_layers = config.n_layers
         self.n_channels = config.n_channels
         self.kernel_h = config.kernel_h
@@ -95,27 +99,34 @@ class Flow(dg.Layer):
         # Transform audio: [batch, 1, n_group, time/n_group]
         # => [batch, n_channels, n_group, time/n_group]
+        param_attr, bias_attr = get_param_attr("weight_norm", (1, 1), c_in=1)
         self.start = weight_norm.Conv2D(
-            self.full_name(),
+            num_channels=1,
             num_filters=self.n_channels,
-            filter_size=(1, 1))
-        set_param_attr(self.start, c_in=1)
+            filter_size=(1, 1),
+            param_attr=param_attr,
+            bias_attr=bias_attr)
 
         # Initializing last layer to 0 makes the affine coupling layers
         # do nothing at first. This helps with training stability
         # output shape: [batch, 2, n_group, time/n_group]
+        param_attr, bias_attr = get_param_attr(
+            "common", (1, 1), c_in=self.n_channels)
         self.end = dg.Conv2D(
-            self.full_name(),
+            num_channels=self.n_channels,
             num_filters=2,
-            filter_size=(1, 1))
-        set_param_attr(self.end)
+            filter_size=(1, 1),
+            param_attr=param_attr,
+            bias_attr=bias_attr)
 
         # receptive fields: (kernel - 1) * sum(dilations) + 1 >= squeeze
-        dilation_dict = {8: [1, 1, 1, 1, 1, 1, 1, 1],
-                         16: [1, 1, 1, 1, 1, 1, 1, 1],
-                         32: [1, 2, 4, 1, 2, 4, 1, 2],
-                         64: [1, 2, 4, 8, 16, 1, 2, 4],
-                         128: [1, 2, 4, 8, 16, 32, 64, 1]}
+        dilation_dict = {
+            8: [1, 1, 1, 1, 1, 1, 1, 1],
+            16: [1, 1, 1, 1, 1, 1, 1, 1],
+            32: [1, 2, 4, 1, 2, 4, 1, 2],
+            64: [1, 2, 4, 8, 16, 1, 2, 4],
+            128: [1, 2, 4, 8, 16, 32, 64, 1]
+        }
         self.dilation_h_list = dilation_dict[config.n_group]
 
         self.in_layers = []
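The receptive-field comment above can be checked directly: each dilation list must give the stack of causal height-convolutions a receptive field covering the whole squeezed `n_group` column. A quick verification, assuming `kernel_h = 3` (the actual config value is not shown in this diff):

```python
kernel_h = 3  # assumption for illustration; taken from the model config
dilation_dict = {
    8: [1, 1, 1, 1, 1, 1, 1, 1],
    16: [1, 1, 1, 1, 1, 1, 1, 1],
    32: [1, 2, 4, 1, 2, 4, 1, 2],
    64: [1, 2, 4, 8, 16, 1, 2, 4],
    128: [1, 2, 4, 8, 16, 32, 64, 1],
}
for n_group, dilations in dilation_dict.items():
    receptive_field = (kernel_h - 1) * sum(dilations) + 1
    print(n_group, receptive_field, receptive_field >= n_group)  # always True
```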
@@ -123,32 +134,42 @@ class Flow(dg.Layer):
         self.res_skip_layers = []
         for i in range(self.n_layers):
             dilation_h = self.dilation_h_list[i]
-            dilation_w = 2 ** i
+            dilation_w = 2**i
 
+            param_attr, bias_attr = get_param_attr(
+                "weight_norm", (self.kernel_h, self.kernel_w),
+                c_in=self.n_channels)
             in_layer = weight_norm.Conv2D(
-                self.full_name(),
+                num_channels=self.n_channels,
                 num_filters=2 * self.n_channels,
                 filter_size=(self.kernel_h, self.kernel_w),
-                dilation=(dilation_h, dilation_w))
-            set_param_attr(in_layer, c_in=self.n_channels)
+                dilation=(dilation_h, dilation_w),
+                param_attr=param_attr,
+                bias_attr=bias_attr)
             self.in_layers.append(in_layer)
 
+            param_attr, bias_attr = get_param_attr(
+                "weight_norm", (1, 1), c_in=config.mel_bands)
             cond_layer = weight_norm.Conv2D(
-                self.full_name(),
+                num_channels=config.mel_bands,
                 num_filters=2 * self.n_channels,
-                filter_size=(1, 1))
-            set_param_attr(cond_layer, c_in=config.mel_bands)
+                filter_size=(1, 1),
+                param_attr=param_attr,
+                bias_attr=bias_attr)
             self.cond_layers.append(cond_layer)
 
             if i < self.n_layers - 1:
                 res_skip_channels = 2 * self.n_channels
             else:
                 res_skip_channels = self.n_channels
+            param_attr, bias_attr = get_param_attr(
+                "weight_norm", (1, 1), c_in=self.n_channels)
             res_skip_layer = weight_norm.Conv2D(
-                self.full_name(),
+                num_channels=self.n_channels,
                 num_filters=res_skip_channels,
-                filter_size=(1, 1))
-            set_param_attr(res_skip_layer, c_in=self.n_channels)
+                filter_size=(1, 1),
+                param_attr=param_attr,
+                bias_attr=bias_attr)
             self.res_skip_layers.append(res_skip_layer)
 
             self.add_sublayer("in_layer_{}".format(i), in_layer)
@@ -162,14 +183,14 @@ class Flow(dg.Layer):
         for i in range(self.n_layers):
             dilation_h = self.dilation_h_list[i]
-            dilation_w = 2 ** i
+            dilation_w = 2**i
 
             # Pad height dim (n_group): causal convolution
             # Pad width dim (time): dilated non-causal convolution
             pad_top, pad_bottom = (self.kernel_h - 1) * dilation_h, 0
-            pad_left = pad_right = int((self.kernel_w-1) * dilation_w / 2)
-            audio_pad = fluid.layers.pad2d(audio,
-                paddings=[pad_top, pad_bottom, pad_left, pad_right])
+            pad_left = pad_right = int((self.kernel_w - 1) * dilation_w / 2)
+            audio_pad = fluid.layers.pad2d(
+                audio, paddings=[pad_top, pad_bottom, pad_left, pad_right])
 
             hidden = self.in_layers[i](audio_pad)
             cond_hidden = self.cond_layers[i](mel)
@@ -196,7 +217,7 @@ class Flow(dg.Layer):
         for i in range(self.n_layers):
             dilation_h = self.dilation_h_list[i]
-            dilation_w = 2 ** i
+            dilation_w = 2**i
 
             state_size = dilation_h * (self.kernel_h - 1)
             queue = queues[i]
@@ -206,7 +227,7 @@ class Flow(dg.Layer):
                 queue.append(fluid.layers.zeros_like(audio))
 
             state = queue[0:state_size]
-            state = fluid.layers.concat([*state, audio], axis=2)
+            state = fluid.layers.concat(state + [audio], axis=2)
 
             queue.pop(0)
             queue.append(audio)
@@ -214,10 +235,10 @@ class Flow(dg.Layer):
             # Pad height dim (n_group): causal convolution
             # Pad width dim (time): dilated non-causal convolution
             pad_top, pad_bottom = 0, 0
-            pad_left = int((self.kernel_w-1) * dilation_w / 2)
-            pad_right = int((self.kernel_w-1) * dilation_w / 2)
-            state = fluid.layers.pad2d(state,
-                paddings=[pad_top, pad_bottom, pad_left, pad_right])
+            pad_left = int((self.kernel_w - 1) * dilation_w / 2)
+            pad_right = int((self.kernel_w - 1) * dilation_w / 2)
+            state = fluid.layers.pad2d(
+                state, paddings=[pad_top, pad_bottom, pad_left, pad_right])
 
             hidden = self.in_layers[i](state)
             cond_hidden = self.cond_layers[i](mel)
@@ -241,20 +262,20 @@ class Flow(dg.Layer):
 class WaveFlowModule(dg.Layer):
-    def __init__(self, name_scope, config):
-        super(WaveFlowModule, self).__init__(name_scope)
+    def __init__(self, config):
+        super(WaveFlowModule, self).__init__()
         self.n_flows = config.n_flows
         self.n_group = config.n_group
         self.n_layers = config.n_layers
         assert self.n_group % 2 == 0
         assert self.n_flows % 2 == 0
 
-        self.conditioner = Conditioner(self.full_name())
+        self.conditioner = Conditioner()
         self.flows = []
         for i in range(self.n_flows):
-            flow = Flow(self.full_name(), config)
+            flow = Flow(config)
             self.flows.append(flow)
             self.add_sublayer("flow_{}".format(i), flow)
 
         self.perms = []
         half = self.n_group // 2
@@ -266,7 +287,7 @@ class WaveFlowModule(dg.Layer):
                 perm[:half] = reversed(perm[:half])
                 perm[half:] = reversed(perm[half:])
             self.perms.append(perm)
 
     def forward(self, audio, mel):
         mel = self.conditioner(mel)
         assert mel.shape[2] >= audio.shape[1]
@@ -277,14 +298,13 @@ class WaveFlowModule(dg.Layer):
             audio = audio[:, :pruned_len]
         if mel.shape[2] > pruned_len:
             mel = mel[:, :, :pruned_len]
 
         # From [bs, mel_bands, time] to [bs, mel_bands, n_group, time/n_group]
         mel = fluid.layers.transpose(unfold(mel, self.n_group), [0, 1, 3, 2])
         # From [bs, time] to [bs, n_group, time/n_group]
         audio = fluid.layers.transpose(unfold(audio, self.n_group), [0, 2, 1])
         # [bs, 1, n_group, time/n_group]
         audio = fluid.layers.unsqueeze(audio, 1)
 
         log_s_list = []
         for i in range(self.n_flows):
             inputs = audio[:, :, :-1, :]
@@ -305,7 +325,6 @@ class WaveFlowModule(dg.Layer):
         mel = fluid.layers.stack(mel_slices, axis=2)
 
         z = fluid.layers.squeeze(audio, [1])
         return z, log_s_list
 
     def synthesize(self, mel, sigma=1.0):
@@ -331,7 +350,7 @@ class WaveFlowModule(dg.Layer):
             for h in range(1, self.n_group):
                 inputs = audio_h
-                conds = mel[:, :, h:(h+1), :]
+                conds = mel[:, :, h:(h + 1), :]
                 outputs = self.flows[i].infer(inputs, conds, queues)
 
                 log_s = outputs[:, 0:1, :, :]

weight_norm.py (parakeet/modules)

@@ -40,8 +40,8 @@ def norm_except(param, dim, power):
 def compute_weight(v, g, dim, power):
     assert len(g.shape) == 1, "magnitude should be a vector"
-    v_normalized = F.elementwise_div(v, (norm_except(v, dim, power) + 1e-12),
-                                     axis=dim)
+    v_normalized = F.elementwise_div(
+        v, (norm_except(v, dim, power) + 1e-12), axis=dim)
     weight = F.elementwise_mul(v_normalized, g, axis=dim)
     return weight
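`compute_weight` reconstitutes a weight-normalized parameter as w = g * v / ||v||, where the norm is taken over every axis except `dim` (that is what `norm_except` computes). The same decomposition in plain numpy, with made-up shapes:

```python
import numpy as np

v = np.random.randn(4, 3, 3)   # e.g. [out_channels, kh, kw], direction tensor
g = np.random.randn(4)         # one magnitude per slice along dim=0
norm = np.sqrt((v**2).sum(axis=(1, 2)))   # norm over all axes except dim
w = v / (norm + 1e-12)[:, None, None] * g[:, None, None]
# each slice w[i] now has L2 norm |g[i]|:
print(np.allclose(np.sqrt((w**2).sum(axis=(1, 2))), np.abs(g)))  # True
```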
@@ -63,20 +63,21 @@ class WeightNormWrapper(dg.Layer):
         original_weight = getattr(layer, param_name)
         self.add_parameter(
             w_v,
-            self.create_parameter(shape=original_weight.shape,
-                                  dtype=original_weight.dtype))
+            self.create_parameter(
+                shape=original_weight.shape, dtype=original_weight.dtype))
         F.assign(original_weight, getattr(self, w_v))
         delattr(layer, param_name)
         temp = norm_except(getattr(self, w_v), self.dim, self.power)
         self.add_parameter(
-            w_g, self.create_parameter(shape=temp.shape, dtype=temp.dtype))
+            w_g, self.create_parameter(
+                shape=temp.shape, dtype=temp.dtype))
         F.assign(temp, getattr(self, w_g))
 
         # also set this when setting up
-        setattr(
-            self.layer, self.param_name,
-            compute_weight(getattr(self, w_v), getattr(self, w_g), self.dim,
-                           self.power))
+        setattr(self.layer, self.param_name,
+                compute_weight(
+                    getattr(self, w_v),
+                    getattr(self, w_g), self.dim, self.power))
 
         self.weigth_norm_applied = True
@@ -84,10 +85,10 @@ class WeightNormWrapper(dg.Layer):
     def hook(self):
         w_v = self.param_name + "_v"
         w_g = self.param_name + "_g"
-        setattr(
-            self.layer, self.param_name,
-            compute_weight(getattr(self, w_v), getattr(self, w_g), self.dim,
-                           self.power))
+        setattr(self.layer, self.param_name,
+                compute_weight(
+                    getattr(self, w_v),
+                    getattr(self, w_g), self.dim, self.power))
 
     def remove_weight_norm(self):
         self.hook()