Merge branch 'update_waveflow' into 'master'

Update waveflow

See merge request !21

commit 25883dcd3e
@@ -0,0 +1,27 @@
+-   repo: https://github.com/PaddlePaddle/mirrors-yapf.git
+    sha: 0d79c0c469bab64f7229c9aca2b1186ef47f0e37
+    hooks:
+    -   id: yapf
+        files: \.py$
+-   repo: https://github.com/pre-commit/pre-commit-hooks
+    sha: a11d9314b22d8f8c7556443875b731ef05965464
+    hooks:
+    -   id: check-merge-conflict
+    -   id: check-symlinks
+    -   id: detect-private-key
+        files: (?!.*paddle)^.*$
+    -   id: end-of-file-fixer
+        files: \.md$
+    -   id: trailing-whitespace
+        files: \.md$
+-   repo: https://github.com/Lucas-C/pre-commit-hooks
+    sha: v1.0.1
+    hooks:
+    -   id: forbid-crlf
+        files: \.md$
+    -   id: remove-crlf
+        files: \.md$
+    -   id: forbid-tabs
+        files: \.md$
+    -   id: remove-tabs
+        files: \.md$
@@ -28,22 +28,21 @@ if __name__ == "__main__":
     parser = argparse.ArgumentParser(
         description="Train a deepvoice 3 model with LJSpeech dataset.")
     parser.add_argument("-c", "--config", type=str, help="experiment config")
-    parser.add_argument("-s",
-                        "--data",
-                        type=str,
-                        default="/workspace/datasets/LJSpeech-1.1/",
-                        help="The path of the LJSpeech dataset.")
+    parser.add_argument(
+        "-s",
+        "--data",
+        type=str,
+        default="/workspace/datasets/LJSpeech-1.1/",
+        help="The path of the LJSpeech dataset.")
     parser.add_argument("-r", "--resume", type=str, help="checkpoint to load")
-    parser.add_argument("-o",
-                        "--output",
-                        type=str,
-                        default="result",
-                        help="The directory to save result.")
-    parser.add_argument("-g",
-                        "--device",
-                        type=int,
-                        default=-1,
-                        help="device to use")
+    parser.add_argument(
+        "-o",
+        "--output",
+        type=str,
+        default="result",
+        help="The directory to save result.")
+    parser.add_argument(
+        "-g", "--device", type=int, default=-1, help="device to use")
     args, _ = parser.parse_known_args()
     with open(args.config, 'rt') as f:
         config = ruamel.yaml.safe_load(f)
@@ -84,18 +83,16 @@ if __name__ == "__main__":
     train_config = config["train"]
     batch_size = train_config["batch_size"]
     text_lengths = [len(example[2]) for example in meta]
-    sampler = PartialyRandomizedSimilarTimeLengthSampler(
-        text_lengths, batch_size)
+    sampler = PartialyRandomizedSimilarTimeLengthSampler(text_lengths,
+                                                         batch_size)

     # some hyperparameters affect how we process data, so create a data collector!
     model_config = config["model"]
     downsample_factor = model_config["downsample_factor"]
     r = model_config["outputs_per_step"]
     collector = DataCollector(downsample_factor=downsample_factor, r=r)
-    ljspeech_loader = DataCargo(ljspeech,
-                                batch_fn=collector,
-                                batch_size=batch_size,
-                                sampler=sampler)
+    ljspeech_loader = DataCargo(
+        ljspeech, batch_fn=collector, batch_size=batch_size, sampler=sampler)

     # =========================model=========================
     if args.device == -1:
@@ -131,15 +128,14 @@ if __name__ == "__main__":
     window_ahead = model_config["window_ahead"]
     key_projection = model_config["key_projection"]
    value_projection = model_config["value_projection"]
-    dv3 = make_model(n_speakers, speaker_dim, speaker_embed_std, embed_dim,
-                     padding_idx, embedding_std, max_positions, n_vocab,
-                     freeze_embedding, filter_size, encoder_channels,
-                     n_mels, decoder_channels, r,
-                     trainable_positional_encodings, use_memory_mask,
-                     query_position_rate, key_position_rate,
-                     window_backward, window_ahead, key_projection,
-                     value_projection, downsample_factor, linear_dim,
-                     use_decoder_states, converter_channels, dropout)
+    dv3 = make_model(
+        n_speakers, speaker_dim, speaker_embed_std, embed_dim, padding_idx,
+        embedding_std, max_positions, n_vocab, freeze_embedding,
+        filter_size, encoder_channels, n_mels, decoder_channels, r,
+        trainable_positional_encodings, use_memory_mask,
+        query_position_rate, key_position_rate, window_backward,
+        window_ahead, key_projection, value_projection, downsample_factor,
+        linear_dim, use_decoder_states, converter_channels, dropout)

     # =========================loss=========================
     loss_config = config["loss"]
@@ -149,13 +145,14 @@ if __name__ == "__main__":
     priority_freq_weight = loss_config["priority_freq_weight"]
     binary_divergence_weight = loss_config["binary_divergence_weight"]
     guided_attention_sigma = loss_config["guided_attention_sigma"]
-    criterion = TTSLoss(masked_weight=masked_weight,
-                        priority_bin=priority_bin,
-                        priority_weight=priority_freq_weight,
-                        binary_divergence_weight=binary_divergence_weight,
-                        guided_attention_sigma=guided_attention_sigma,
-                        downsample_factor=downsample_factor,
-                        r=r)
+    criterion = TTSLoss(
+        masked_weight=masked_weight,
+        priority_bin=priority_bin,
+        priority_weight=priority_freq_weight,
+        binary_divergence_weight=binary_divergence_weight,
+        guided_attention_sigma=guided_attention_sigma,
+        downsample_factor=downsample_factor,
+        r=r)

     # =========================lr_scheduler=========================
     lr_config = config["lr_scheduler"]
@@ -169,11 +166,12 @@ if __name__ == "__main__":
     beta1 = optim_config["beta1"]
     beta2 = optim_config["beta2"]
     epsilon = optim_config["epsilon"]
-    optim = fluid.optimizer.Adam(lr_scheduler,
-                                 beta1,
-                                 beta2,
-                                 epsilon=epsilon,
-                                 parameter_list=dv3.parameters())
+    optim = fluid.optimizer.Adam(
+        lr_scheduler,
+        beta1,
+        beta2,
+        epsilon=epsilon,
+        parameter_list=dv3.parameters())
     gradient_clipper = fluid.dygraph_grad_clip.GradClipByGlobalNorm(0.1)

     # generation
@@ -183,8 +181,8 @@ if __name__ == "__main__":

     # =========================link(dataloader, paddle)=========================
     # CAUTION: it does not return a DataLoader
-    loader = fluid.io.DataLoader.from_generator(capacity=10,
-                                                return_list=True)
+    loader = fluid.io.DataLoader.from_generator(
+        capacity=10, return_list=True)
     loader.set_batch_generator(ljspeech_loader, places=place)

     # tensorboard & checkpoint preparation
@@ -247,22 +245,23 @@ if __name__ == "__main__":
             # TODO: clean code
             # train state saving, the first sentence in the batch
             if global_step % snap_interval == 0:
-                save_state(state_dir,
-                           writer,
-                           global_step,
-                           mel_input=downsampled_mel_specs,
-                           mel_output=mel_outputs,
-                           lin_input=lin_specs,
-                           lin_output=linear_outputs,
-                           alignments=alignments,
-                           win_length=win_length,
-                           hop_length=hop_length,
-                           min_level_db=min_level_db,
-                           ref_level_db=ref_level_db,
-                           power=power,
-                           n_iter=n_iter,
-                           preemphasis=preemphasis,
-                           sample_rate=sample_rate)
+                save_state(
+                    state_dir,
+                    writer,
+                    global_step,
+                    mel_input=downsampled_mel_specs,
+                    mel_output=mel_outputs,
+                    lin_input=lin_specs,
+                    lin_output=linear_outputs,
+                    alignments=alignments,
+                    win_length=win_length,
+                    hop_length=hop_length,
+                    min_level_db=min_level_db,
+                    ref_level_db=ref_level_db,
+                    power=power,
+                    n_iter=n_iter,
+                    preemphasis=preemphasis,
+                    sample_rate=sample_rate)

             # evaluation
             if global_step % eval_interval == 0:
@@ -275,27 +274,28 @@ if __name__ == "__main__":
                     "Some have accepted this as a miracle without any physical explanation.",
                 ]
                 for idx, sent in enumerate(sentences):
-                    wav, attn = eval_model(dv3, sent,
-                                           replace_pronounciation_prob,
-                                           min_level_db, ref_level_db,
-                                           power, n_iter, win_length,
-                                           hop_length, preemphasis)
+                    wav, attn = eval_model(
+                        dv3, sent, replace_pronounciation_prob,
+                        min_level_db, ref_level_db, power, n_iter,
+                        win_length, hop_length, preemphasis)
                     wav_path = os.path.join(
                         state_dir, "waveform",
                         "eval_sample_{:09d}.wav".format(global_step))
                     sf.write(wav_path, wav, sample_rate)
-                    writer.add_audio("eval_sample_{}".format(idx),
-                                     wav,
-                                     global_step,
-                                     sample_rate=sample_rate)
+                    writer.add_audio(
+                        "eval_sample_{}".format(idx),
+                        wav,
+                        global_step,
+                        sample_rate=sample_rate)
                     attn_path = os.path.join(
                         state_dir, "alignments",
                         "eval_sample_attn_{:09d}.png".format(global_step))
                     plot_alignment(attn, attn_path)
-                    writer.add_image("eval_sample_attn{}".format(idx),
-                                     cm.viridis(attn),
-                                     global_step,
-                                     dataformats="HWC")
+                    writer.add_image(
+                        "eval_sample_attn{}".format(idx),
+                        cm.viridis(attn),
+                        global_step,
+                        dataformats="HWC")

             # save checkpoint
             if global_step % save_interval == 0:
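A note on the `writer.add_image` call in this hunk: `cm.viridis` turns the 2-D attention matrix (values in [0, 1]) into an RGBA image with the channel axis last, which is why `dataformats="HWC"` is passed. A minimal sketch of the shapes involved (the attention shape here is hypothetical):

```python
import numpy as np
from matplotlib import cm

attn = np.random.rand(64, 128)  # hypothetical [decoder_steps, encoder_steps]
img = cm.viridis(attn)          # -> shape (64, 128, 4): RGBA floats in [0, 1]
print(img.shape)                # channels-last layout, hence dataformats="HWC"
```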
@@ -311,4 +311,4 @@ if __name__ == "__main__":
             global_step += 1
         # epoch report
         writer.add_scalar("epoch_average_loss", epoch_loss / i, j)
         epoch_loss = 0.
@@ -16,10 +16,10 @@ Paddle fluid implementation of [WaveFlow: A Compact Flow-based Model for Raw Aud

 ## Usage

 There are many hyperparameters to be tuned depending on the model specification and the dataset you are working on.
 We provide `wavenet_ljspeech.yaml` as a hyperparameter set that works well on the LJSpeech dataset.

 Note that `train.py`, `synthesis.py`, and `benchmark.py` all accept a `--config` parameter. To ensure consistency, you should use the same config yaml file for training, synthesizing, and benchmarking. You can also override these preset hyperparameters from the command line by appending parameters after `--config`.
 For example, `--config=${yaml} --batch_size=8` overrides the corresponding hyperparameter in the `${yaml}` config file. For more details about these hyperparameters, check `utils.add_config_options_to_parser`.

 Note that you also need to specify some additional parameters for `train.py`, `synthesis.py`, and `benchmark.py`; the details can be found in `train.add_options_to_parser`, `synthesis.add_options_to_parser`, and `benchmark.add_options_to_parser`, respectively.
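The `--config`-plus-overrides behavior described above is implemented in this merge request by the new `add_yaml_config` helper in the `utils` hunk further down: any option the command line leaves at `None` is filled in from the YAML file, so explicit CLI values always win. A minimal sketch of that merge rule (the name `merge_yaml_into_args` is illustrative, not from the diff):

```python
import argparse

def merge_yaml_into_args(args, yaml_cfg):
    # Keep values the user set explicitly; fill the rest from YAML.
    cfg_vars = vars(args)
    for key, value in yaml_cfg.items():
        if key in cfg_vars and cfg_vars[key] is not None:
            continue
        cfg_vars[key] = value
    return args

args = argparse.Namespace(batch_size=8, learning_rate=None)
print(merge_yaml_into_args(args, {"batch_size": 2, "learning_rate": 0.001}))
# Namespace(batch_size=8, learning_rate=0.001) -- the CLI batch_size wins
```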
@@ -50,10 +50,10 @@ python -u train.py \
 #### Save and Load checkpoints

 Our model will save model parameters as checkpoints in `./runs/waveflow/${ModelName}/checkpoint/` every 10000 iterations by default.
 Saved checkpoints use the format `step-${iteration_number}.pdparams` for model parameters and `step-${iteration_number}.pdopt` for optimizer parameters.

 There are three ways to load a checkpoint and resume training (suppose, for example, that you want to load a 500000-iteration checkpoint):
 1. Use `--checkpoint=./runs/waveflow/${ModelName}/checkpoint/step-500000` to provide a specific path to load. Note that you only need to provide the base name of the parameter file, which is `step-500000`; the `.pdparams` and `.pdopt` extensions are not needed.
 2. Use `--iteration=500000`.
 3. If you specify neither `--checkpoint` nor `--iteration`, the model will automatically load the latest checkpoint in `./runs/waveflow/${ModelName}/checkpoint`.

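The three loading options above are resolved in a fixed order by `utils.load_parameters` (see the utils hunk later in this diff): an explicit `--checkpoint` path wins, then `--iteration`, then the latest recorded checkpoint. A sketch of that lookup order (`resolve_checkpoint` and `latest_fn` are illustrative; `latest_fn` stands in for the `load_latest_checkpoint` helper):

```python
import os

def resolve_checkpoint(checkpoint_dir, checkpoint=None, iteration=None,
                       latest_fn=None):
    if checkpoint is not None:               # 1. explicit --checkpoint
        return checkpoint                    #    base name, no extension
    if iteration is None:                    # 3. fall back to the latest
        iteration = latest_fn(checkpoint_dir)
    return os.path.join(checkpoint_dir, "step-{}".format(iteration))

print(resolve_checkpoint("ckpt", iteration=500000))  # ckpt/step-500000
```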
@@ -108,4 +108,4 @@ python -u benchmark.py \
     --config=./configs/waveflow_ljspeech.yaml \
     --root=./data/LJSpeech-1.1 \
     --name=${ModelName} --use_gpu=true
 ```
@@ -2,35 +2,47 @@ import os
 import random
 from pprint import pprint

-import jsonargparse
+import argparse
 import numpy as np
 import paddle.fluid.dygraph as dg
 from paddle import fluid

 import utils
-from waveflow import WaveFlow
+from parakeet.models.waveflow import WaveFlow


 def add_options_to_parser(parser):
-    parser.add_argument('--model', type=str, default='waveflow',
-                        help="general name of the model")
-    parser.add_argument('--name', type=str,
-                        help="specific name of the training model")
-    parser.add_argument('--root', type=str,
-                        help="root path of the LJSpeech dataset")
+    parser.add_argument(
+        '--model',
+        type=str,
+        default='waveflow',
+        help="general name of the model")
+    parser.add_argument(
+        '--name', type=str, help="specific name of the training model")
+    parser.add_argument(
+        '--root', type=str, help="root path of the LJSpeech dataset")

-    parser.add_argument('--use_gpu', type=bool, default=True,
-                        help="option to use gpu training")
+    parser.add_argument(
+        '--use_gpu',
+        type=bool,
+        default=True,
+        help="option to use gpu training")

-    parser.add_argument('--iteration', type=int, default=None,
-                        help=("which iteration of checkpoint to load, "
-                              "default to load the latest checkpoint"))
-    parser.add_argument('--checkpoint', type=str, default=None,
-                        help="path of the checkpoint to load")
+    parser.add_argument(
+        '--iteration',
+        type=int,
+        default=None,
+        help=("which iteration of checkpoint to load, "
+              "default to load the latest checkpoint"))
+    parser.add_argument(
+        '--checkpoint',
+        type=str,
+        default=None,
+        help="path of the checkpoint to load")


 def benchmark(config):
-    pprint(jsonargparse.namespace_to_dict(config))
+    pprint(vars(config))

     # Get checkpoint directory path.
     run_dir = os.path.join("runs", config.model, config.name)
@@ -47,7 +59,7 @@ def benchmark(config):
     fluid.default_startup_program().random_seed = seed
     fluid.default_main_program().random_seed = seed
     print("Random Seed: ", seed)

     # Build model.
     model = WaveFlow(config, checkpoint_dir)
     model.build(training=False)
@@ -58,9 +70,8 @@ def benchmark(config):

 if __name__ == "__main__":
     # Create parser.
-    parser = jsonargparse.ArgumentParser(
-        description="Synthesize audio using WaveNet model",
-        formatter_class='default_argparse')
+    parser = argparse.ArgumentParser(
+        description="Synthesize audio using WaveNet model")
     add_options_to_parser(parser)
     utils.add_config_options_to_parser(parser)

@@ -68,4 +79,5 @@ if __name__ == "__main__":
     # For conflicting updates to the same field,
     # the preceding update will be overwritten by the following one.
     config = parser.parse_args()
+    config = utils.add_yaml_config(config)
     benchmark(config)
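One behavioral note on `--use_gpu` above: `argparse` with `type=bool` cannot turn the string "false" into `False`, because any non-empty string is truthy. The train.py hunk further down switches its boolean flags to the new `utils.str2bool` for exactly this reason, while this script keeps `type=bool` in this revision. A small demonstration:

```python
import argparse

def str2bool(v):  # same rule as the helper added to utils in this change
    return v.lower() in ("true", "t", "1")

parser = argparse.ArgumentParser()
parser.add_argument('--plain', type=bool, default=True)
parser.add_argument('--fixed', type=str2bool, default=True)
args = parser.parse_args(['--plain', 'false', '--fixed', 'false'])
print(args.plain)  # True  -- bool("false") is truthy
print(args.fixed)  # False -- parsed as intended
```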
@@ -2,40 +2,58 @@ import os
 import random
 from pprint import pprint

-import jsonargparse
+import argparse
 import numpy as np
 import paddle.fluid.dygraph as dg
 from paddle import fluid

 import utils
-from waveflow import WaveFlow
+from parakeet.models.waveflow import WaveFlow


 def add_options_to_parser(parser):
-    parser.add_argument('--model', type=str, default='waveflow',
-                        help="general name of the model")
-    parser.add_argument('--name', type=str,
-                        help="specific name of the training model")
-    parser.add_argument('--root', type=str,
-                        help="root path of the LJSpeech dataset")
+    parser.add_argument(
+        '--model',
+        type=str,
+        default='waveflow',
+        help="general name of the model")
+    parser.add_argument(
+        '--name', type=str, help="specific name of the training model")
+    parser.add_argument(
+        '--root', type=str, help="root path of the LJSpeech dataset")

-    parser.add_argument('--use_gpu', type=bool, default=True,
-                        help="option to use gpu training")
+    parser.add_argument(
+        '--use_gpu',
+        type=bool,
+        default=True,
+        help="option to use gpu training")

-    parser.add_argument('--iteration', type=int, default=None,
-                        help=("which iteration of checkpoint to load, "
-                              "default to load the latest checkpoint"))
-    parser.add_argument('--checkpoint', type=str, default=None,
-                        help="path of the checkpoint to load")
+    parser.add_argument(
+        '--iteration',
+        type=int,
+        default=None,
+        help=("which iteration of checkpoint to load, "
+              "default to load the latest checkpoint"))
+    parser.add_argument(
+        '--checkpoint',
+        type=str,
+        default=None,
+        help="path of the checkpoint to load")

-    parser.add_argument('--output', type=str, default="./syn_audios",
-                        help="path to write synthesized audio files")
-    parser.add_argument('--sample', type=int, default=None,
-                        help="which of the valid samples to synthesize audio")
+    parser.add_argument(
+        '--output',
+        type=str,
+        default="./syn_audios",
+        help="path to write synthesized audio files")
+    parser.add_argument(
+        '--sample',
+        type=int,
+        default=None,
+        help="which of the valid samples to synthesize audio")


 def synthesize(config):
-    pprint(jsonargparse.namespace_to_dict(config))
+    pprint(vars(config))

     # Get checkpoint directory path.
     run_dir = os.path.join("runs", config.model, config.name)
@@ -52,7 +70,7 @@ def synthesize(config):
     fluid.default_startup_program().random_seed = seed
     fluid.default_main_program().random_seed = seed
     print("Random Seed: ", seed)

     # Build model.
     model = WaveFlow(config, checkpoint_dir)
     model.build(training=False)
@@ -72,9 +90,8 @@ def synthesize(config):

 if __name__ == "__main__":
     # Create parser.
-    parser = jsonargparse.ArgumentParser(
-        description="Synthesize audio using WaveNet model",
-        formatter_class='default_argparse')
+    parser = argparse.ArgumentParser(
+        description="Synthesize audio using WaveNet model")
     add_options_to_parser(parser)
     utils.add_config_options_to_parser(parser)

@@ -82,4 +99,5 @@ if __name__ == "__main__":
     # For conflicting updates to the same field,
     # the preceding update will be overwritten by the following one.
     config = parser.parse_args()
+    config = utils.add_yaml_config(config)
     synthesize(config)
@@ -4,34 +4,48 @@ import subprocess
 import time
 from pprint import pprint

-import jsonargparse
+import argparse
 import numpy as np
 import paddle.fluid.dygraph as dg
 from paddle import fluid
 from tensorboardX import SummaryWriter

-import slurm
 import utils
-from waveflow import WaveFlow
+from parakeet.models.waveflow import WaveFlow


 def add_options_to_parser(parser):
-    parser.add_argument('--model', type=str, default='waveflow',
-                        help="general name of the model")
-    parser.add_argument('--name', type=str,
-                        help="specific name of the training model")
-    parser.add_argument('--root', type=str,
-                        help="root path of the LJSpeech dataset")
+    parser.add_argument(
+        '--model',
+        type=str,
+        default='waveflow',
+        help="general name of the model")
+    parser.add_argument(
+        '--name', type=str, help="specific name of the training model")
+    parser.add_argument(
+        '--root', type=str, help="root path of the LJSpeech dataset")

-    parser.add_argument('--parallel', type=bool, default=True,
-                        help="option to use data parallel training")
-    parser.add_argument('--use_gpu', type=bool, default=True,
-                        help="option to use gpu training")
+    parser.add_argument(
+        '--parallel',
+        type=utils.str2bool,
+        default=True,
+        help="option to use data parallel training")
+    parser.add_argument(
+        '--use_gpu',
+        type=utils.str2bool,
+        default=True,
+        help="option to use gpu training")

-    parser.add_argument('--iteration', type=int, default=None,
-                        help=("which iteration of checkpoint to load, "
-                              "default to load the latest checkpoint"))
-    parser.add_argument('--checkpoint', type=str, default=None,
-                        help="path of the checkpoint to load")
+    parser.add_argument(
+        '--iteration',
+        type=int,
+        default=None,
+        help=("which iteration of checkpoint to load, "
+              "default to load the latest checkpoint"))
+    parser.add_argument(
+        '--checkpoint',
+        type=str,
+        default=None,
+        help="path of the checkpoint to load")


@@ -45,12 +59,13 @@ def train(config):

     if rank == 0:
         # Print the whole config setting.
-        pprint(jsonargparse.namespace_to_dict(config))
+        pprint(vars(config))

     # Make checkpoint directory.
     run_dir = os.path.join("runs", config.model, config.name)
     checkpoint_dir = os.path.join(run_dir, "checkpoint")
-    os.makedirs(checkpoint_dir, exist_ok=True)
+    if not os.path.exists(checkpoint_dir):
+        os.makedirs(checkpoint_dir)

     # Create tensorboard logger.
     tb = SummaryWriter(os.path.join(run_dir, "logs")) \
@@ -102,8 +117,8 @@ def train(config):

 if __name__ == "__main__":
     # Create parser.
-    parser = jsonargparse.ArgumentParser(description="Train WaveFlow model",
-                                         formatter_class='default_argparse')
+    parser = argparse.ArgumentParser(description="Train WaveFlow model")
+    #formatter_class='default_argparse')
     add_options_to_parser(parser)
     utils.add_config_options_to_parser(parser)

@@ -111,4 +126,5 @@ if __name__ == "__main__":
     # For conflicting updates to the same field,
     # the preceding update will be overwritten by the following one.
     config = parser.parse_args()
-    train(config)
+    config = utils.add_yaml_config(config)
+    train(config)
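The hunk above trades `os.makedirs(checkpoint_dir, exist_ok=True)` for a check-then-create pair, presumably because `exist_ok` is Python 3 only. Note that check-then-create is racy under the `--parallel` option: another rank can create the directory between the check and the call. A race-tolerant sketch that works on Python 2 as well (`makedirs_compat` is illustrative, not part of the diff):

```python
import errno
import os

def makedirs_compat(path):
    try:
        os.makedirs(path)           # may fail if another rank won the race
    except OSError as e:
        if e.errno != errno.EEXIST:
            raise                   # re-raise real errors (permissions, ...)
```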
@@ -2,59 +2,96 @@ import itertools
 import os
 import time

-import jsonargparse
+import argparse
+import ruamel.yaml
 import numpy as np
 import paddle.fluid.dygraph as dg


+def str2bool(v):
+    return v.lower() in ("true", "t", "1")
+
+
 def add_config_options_to_parser(parser):
-    parser.add_argument('--valid_size', type=int,
-                        help="size of the valid dataset")
-    parser.add_argument('--segment_length', type=int,
-                        help="the length of audio clip for training")
-    parser.add_argument('--sample_rate', type=int,
-                        help="sampling rate of audio data file")
-    parser.add_argument('--fft_window_shift', type=int,
-                        help="the shift of fft window for each frame")
-    parser.add_argument('--fft_window_size', type=int,
-                        help="the size of fft window for each frame")
-    parser.add_argument('--fft_size', type=int,
-                        help="the size of fft filter on each frame")
-    parser.add_argument('--mel_bands', type=int,
-                        help="the number of mel bands when calculating mel spectrograms")
-    parser.add_argument('--mel_fmin', type=float,
-                        help="lowest frequency in calculating mel spectrograms")
-    parser.add_argument('--mel_fmax', type=float,
-                        help="highest frequency in calculating mel spectrograms")
+    parser.add_argument(
+        '--valid_size', type=int, help="size of the valid dataset")
+    parser.add_argument(
+        '--segment_length',
+        type=int,
+        help="the length of audio clip for training")
+    parser.add_argument(
+        '--sample_rate', type=int, help="sampling rate of audio data file")
+    parser.add_argument(
+        '--fft_window_shift',
+        type=int,
+        help="the shift of fft window for each frame")
+    parser.add_argument(
+        '--fft_window_size',
+        type=int,
+        help="the size of fft window for each frame")
+    parser.add_argument(
+        '--fft_size', type=int, help="the size of fft filter on each frame")
+    parser.add_argument(
+        '--mel_bands',
+        type=int,
+        help="the number of mel bands when calculating mel spectrograms")
+    parser.add_argument(
+        '--mel_fmin',
+        type=float,
+        help="lowest frequency in calculating mel spectrograms")
+    parser.add_argument(
+        '--mel_fmax',
+        type=float,
+        help="highest frequency in calculating mel spectrograms")

-    parser.add_argument('--seed', type=int,
-                        help="seed of random initialization for the model")
+    parser.add_argument(
+        '--seed', type=int, help="seed of random initialization for the model")
     parser.add_argument('--learning_rate', type=float)
-    parser.add_argument('--batch_size', type=int,
-                        help="batch size for training")
-    parser.add_argument('--test_every', type=int,
-                        help="test interval during training")
-    parser.add_argument('--save_every', type=int,
-                        help="checkpointing interval during training")
-    parser.add_argument('--max_iterations', type=int,
-                        help="maximum training iterations")
+    parser.add_argument(
+        '--batch_size', type=int, help="batch size for training")
+    parser.add_argument(
+        '--test_every', type=int, help="test interval during training")
+    parser.add_argument(
+        '--save_every',
+        type=int,
+        help="checkpointing interval during training")
+    parser.add_argument(
+        '--max_iterations', type=int, help="maximum training iterations")

-    parser.add_argument('--sigma', type=float,
-                        help="standard deviation of the latent Gaussian variable")
-    parser.add_argument('--n_flows', type=int,
-                        help="number of flows")
-    parser.add_argument('--n_group', type=int,
-                        help="number of adjacent audio samples to squeeze into one column")
-    parser.add_argument('--n_layers', type=int,
-                        help="number of conv2d layer in one wavenet-like flow architecture")
-    parser.add_argument('--n_channels', type=int,
-                        help="number of residual channels in flow")
-    parser.add_argument('--kernel_h', type=int,
-                        help="height of the kernel in the conv2d layer")
-    parser.add_argument('--kernel_w', type=int,
-                        help="width of the kernel in the conv2d layer")
+    parser.add_argument(
+        '--sigma',
+        type=float,
+        help="standard deviation of the latent Gaussian variable")
+    parser.add_argument('--n_flows', type=int, help="number of flows")
+    parser.add_argument(
+        '--n_group',
+        type=int,
+        help="number of adjacent audio samples to squeeze into one column")
+    parser.add_argument(
+        '--n_layers',
+        type=int,
+        help="number of conv2d layer in one wavenet-like flow architecture")
+    parser.add_argument(
+        '--n_channels', type=int, help="number of residual channels in flow")
+    parser.add_argument(
+        '--kernel_h',
+        type=int,
+        help="height of the kernel in the conv2d layer")
+    parser.add_argument(
+        '--kernel_w', type=int, help="width of the kernel in the conv2d layer")

-    parser.add_argument('--config', action=jsonargparse.ActionConfigFile)
+    parser.add_argument('--config', type=str, help="Path to the config file.")
+
+
+def add_yaml_config(config):
+    with open(config.config, 'rt') as f:
+        yaml_cfg = ruamel.yaml.safe_load(f)
+    cfg_vars = vars(config)
+    for k, v in yaml_cfg.items():
+        if k in cfg_vars and cfg_vars[k] is not None:
+            continue
+        cfg_vars[k] = v
+    return config


 def load_latest_checkpoint(checkpoint_dir, rank=0):
@@ -84,8 +121,12 @@ def save_latest_checkpoint(checkpoint_dir, iteration):
         handle.write("model_checkpoint_path: step-{}".format(iteration))


-def load_parameters(checkpoint_dir, rank, model, optimizer=None,
-                    iteration=None, file_path=None):
+def load_parameters(checkpoint_dir,
+                    rank,
+                    model,
+                    optimizer=None,
+                    iteration=None,
+                    file_path=None):
     if file_path is None:
         if iteration is None:
             iteration = load_latest_checkpoint(checkpoint_dir, rank)
@@ -99,7 +140,7 @@ def load_parameters(checkpoint_dir, rank, model, optimizer=None,
     if optimizer and optimizer_dict:
         optimizer.set_dict(optimizer_dict)
         print("[checkpoint] Rank {}: loaded optimizer state from {}".format(
             rank, file_path))


 def save_latest_parameters(checkpoint_dir, iteration, model, optimizer=None):
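`save_latest_checkpoint` above records the newest iteration as a single line such as `model_checkpoint_path: step-500000`, and `load_latest_checkpoint` reads it back. A sketch of the read side, assuming the index file is simply named `checkpoint` (its actual name is not visible in this excerpt):

```python
import os

def read_latest_iteration(checkpoint_dir):
    index_path = os.path.join(checkpoint_dir, "checkpoint")  # assumed name
    with open(index_path, "r") as f:
        line = f.read().strip()  # e.g. "model_checkpoint_path: step-500000"
    return int(line.split("step-")[-1])
```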
@@ -5,24 +5,29 @@ import librosa
 from .. import g2p

 from ..data.sampler import SequentialSampler, RandomSampler, BatchSampler
-from ..data.dataset import Dataset
+from ..data.dataset import DatasetMixin
 from ..data.datacargo import DataCargo
 from ..data.batch import TextIDBatcher, SpecBatcher


-class LJSpeech(Dataset):
+class LJSpeech(DatasetMixin):
     def __init__(self, root):
         super(LJSpeech, self).__init__()
-        assert isinstance(root, (str, Path)), "root should be a string or Path object"
+        assert isinstance(root, (
+            str, Path)), "root should be a string or Path object"
         self.root = root if isinstance(root, Path) else Path(root)
         self.metadata = self._prepare_metadata()

     def _prepare_metadata(self):
         csv_path = self.root.joinpath("metadata.csv")
-        metadata = pd.read_csv(csv_path, sep="|", header=None, quoting=3,
-                               names=["fname", "raw_text", "normalized_text"])
+        metadata = pd.read_csv(
+            csv_path,
+            sep="|",
+            header=None,
+            quoting=3,
+            names=["fname", "raw_text", "normalized_text"])
         return metadata

     def _get_example(self, metadatum):
         """All the code for generating an Example from a metadatum. If you want a
         different preprocessing pipeline, you can override this method.
@@ -30,28 +35,32 @@ class LJSpeech(DatasetMixin):
         In this case, you'd better pass a composed transform and pass it to the init
         method.
         """

         fname, raw_text, normalized_text = metadatum
         wav_path = self.root.joinpath("wavs", fname + ".wav")

         # load -> trim -> preemphasis -> stft -> magnitude -> mel_scale -> logscale -> normalize
-        wav, sample_rate = librosa.load(wav_path, sr=None)  # we would rather use functor to hold its parameters
+        wav, sample_rate = librosa.load(
+            wav_path,
+            sr=None)  # we would rather use functor to hold its parameters
         trimed, _ = librosa.effects.trim(wav)
         preemphasized = librosa.effects.preemphasis(trimed)
         D = librosa.stft(preemphasized)
         mag, phase = librosa.magphase(D)
         mel = librosa.feature.melspectrogram(S=mag)

         mag = librosa.amplitude_to_db(S=mag)
         mel = librosa.amplitude_to_db(S=mel)

         ref_db = 20
         max_db = 100
         mel = np.clip((mel - ref_db + max_db) / max_db, 1e-8, 1)
         mag = np.clip((mag - ref_db + max_db) / max_db, 1e-8, 1)

-        phonemes = np.array(g2p.en.text_to_sequence(normalized_text), dtype=np.int64)
-        return (mag, mel, phonemes)  # maybe we need to implement it as a map in the future
+        phonemes = np.array(
+            g2p.en.text_to_sequence(normalized_text), dtype=np.int64)
+        return (mag, mel, phonemes
+                )  # maybe we need to implement it as a map in the future

     def _batch_examples(self, minibatch):
         mag_batch = []
@@ -71,12 +80,10 @@ class LJSpeech(DatasetMixin):
         metadatum = self.metadata.iloc[index]
         example = self._get_example(metadatum)
         return example

     def __iter__(self):
         for i in range(len(self)):
             yield self[i]

     def __len__(self):
         return len(self.metadata)
-
-
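The `(x - ref_db + max_db) / max_db` normalization in `_get_example` maps decibel magnitudes into [0, 1] given `ref_db = 20` and `max_db = 100`: 20 dB lands exactly at 1.0, 0 dB at 0.8, and anything at or below -80 dB is clipped to the 1e-8 floor. A worked check:

```python
import numpy as np

ref_db, max_db = 20, 100
for db in (20.0, 0.0, -80.0):
    print(db, np.clip((db - ref_db + max_db) / max_db, 1e-8, 1))
# 20.0 -> 1.0, 0.0 -> 0.8, -80.0 -> 1e-08 (clipped floor)
```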
@@ -0,0 +1 @@
+from parakeet.models.waveflow.waveflow import WaveFlow
@@ -5,10 +5,9 @@ import numpy as np
 from paddle import fluid

 from parakeet.datasets import ljspeech
-from parakeet.data import dataset
-from parakeet.data.batch import SpecBatcher, WavBatcher
-from parakeet.data.datacargo import DataCargo
-from parakeet.data.sampler import DistributedSampler, BatchSampler
+from parakeet.data import SpecBatcher, WavBatcher
+from parakeet.data import DataCargo, DatasetMixin
+from parakeet.data import DistributedSampler, BatchSampler
 from scipy.io.wavfile import read


@@ -27,7 +26,7 @@ class Dataset(ljspeech.LJSpeech):
         return audio


-class Subset(dataset.Dataset):
+class Subset(DatasetMixin):
     def __init__(self, dataset, indices, valid):
         self.dataset = dataset
         self.indices = indices
@@ -36,18 +35,18 @@ class Subset(DatasetMixin):

     def get_mel(self, audio):
         spectrogram = librosa.core.stft(
-            audio, n_fft=self.config.fft_size,
+            audio,
+            n_fft=self.config.fft_size,
             hop_length=self.config.fft_window_shift,
             win_length=self.config.fft_window_size)
         spectrogram_magnitude = np.abs(spectrogram)

         # mel_filter_bank shape: [n_mels, 1 + n_fft/2]
-        mel_filter_bank = librosa.filters.mel(
-            sr=self.config.sample_rate,
-            n_fft=self.config.fft_size,
-            n_mels=self.config.mel_bands,
-            fmin=self.config.mel_fmin,
-            fmax=self.config.mel_fmax)
+        mel_filter_bank = librosa.filters.mel(sr=self.config.sample_rate,
+                                              n_fft=self.config.fft_size,
+                                              n_mels=self.config.mel_bands,
+                                              fmin=self.config.mel_fmin,
+                                              fmax=self.config.mel_fmax)
         # mel shape: [n_mels, num_frames]
         mel = np.dot(mel_filter_bank, spectrogram_magnitude)

@@ -67,13 +66,14 @@ class Subset(DatasetMixin):
                 pass
             else:
                 # audio shape: [len]
                 if audio.shape[0] >= segment_length:
                     max_audio_start = audio.shape[0] - segment_length
                     audio_start = random.randint(0, max_audio_start)
-                    audio = audio[audio_start : (audio_start + segment_length)]
+                    audio = audio[audio_start:(audio_start + segment_length)]
                 else:
                     audio = np.pad(audio, (0, segment_length - audio.shape[0]),
-                                   mode='constant', constant_values=0)
+                                   mode='constant',
+                                   constant_values=0)

             # Normalize audio to the [-1, 1] range.
             audio = audio.astype(np.float32) / 32768.0
@@ -109,17 +109,17 @@ class LJSpeech:

         # Train dataset.
         trainset = Subset(ds, train_indices, valid=False)
         sampler = DistributedSampler(len(trainset), nranks, rank)
         total_bs = config.batch_size
         assert total_bs % nranks == 0
-        train_sampler = BatchSampler(sampler, total_bs // nranks,
-                                     drop_last=True)
+        train_sampler = BatchSampler(
+            sampler, total_bs // nranks, drop_last=True)
         trainloader = DataCargo(trainset, batch_sampler=train_sampler)

         trainreader = fluid.io.PyReader(capacity=50, return_list=True)
         trainreader.decorate_batch_generator(trainloader, place)
         self.trainloader = (data for _ in iter(int, 1)
                             for data in trainreader())

         # Valid dataset.
         validset = Subset(ds, valid_indices, valid=True)
@@ -127,5 +127,5 @@ class LJSpeech:
         validloader = DataCargo(validset, batch_size=1, shuffle=False)

         validreader = fluid.io.PyReader(capacity=20, return_list=True)
         validreader.decorate_batch_generator(validloader, place)
         self.validloader = validreader
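The `self.trainloader = (data for _ in iter(int, 1) for data in trainreader())` line above relies on the two-argument form of `iter`: `iter(int, 1)` calls `int()` forever and never sees the sentinel `1` (since `int()` returns 0), so the outer loop restarts the reader indefinitely and the generator yields an endless stream of training batches. A self-contained illustration:

```python
endless = (item for _ in iter(int, 1) for item in range(3))
print([next(endless) for _ in range(7)])  # [0, 1, 2, 0, 1, 2, 0]
```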
@@ -8,13 +8,18 @@ from paddle import fluid
 from scipy.io.wavfile import write

 import utils
-from data import LJSpeech
-from waveflow_modules import WaveFlowLoss, WaveFlowModule
+from .data import LJSpeech
+from .waveflow_modules import WaveFlowLoss, WaveFlowModule


 class WaveFlow():
-    def __init__(self, config, checkpoint_dir, parallel=False, rank=0,
-                 nranks=1, tb_logger=None):
+    def __init__(self,
+                 config,
+                 checkpoint_dir,
+                 parallel=False,
+                 rank=0,
+                 nranks=1,
+                 tb_logger=None):
         self.config = config
         self.checkpoint_dir = checkpoint_dir
         self.parallel = parallel
@@ -24,12 +29,12 @@ class WaveFlow():

     def build(self, training=True):
         config = self.config
         dataset = LJSpeech(config, self.nranks, self.rank)
         self.trainloader = dataset.trainloader
         self.validloader = dataset.validloader

-        waveflow = WaveFlowModule("waveflow", config)
+        waveflow = WaveFlowModule(config)

         # Dry run once to create and initialize all necessary parameters.
         audio = dg.to_variable(np.random.randn(1, 16000).astype(np.float32))
         mel = dg.to_variable(
@@ -38,29 +43,36 @@ class WaveFlow():

         if training:
             optimizer = fluid.optimizer.AdamOptimizer(
-                learning_rate=config.learning_rate)
+                learning_rate=config.learning_rate,
+                parameter_list=waveflow.parameters())

             # Load parameters.
-            utils.load_parameters(self.checkpoint_dir, self.rank,
-                                  waveflow, optimizer,
-                                  iteration=config.iteration,
-                                  file_path=config.checkpoint)
+            utils.load_parameters(
+                self.checkpoint_dir,
+                self.rank,
+                waveflow,
+                optimizer,
+                iteration=config.iteration,
+                file_path=config.checkpoint)
             print("Rank {}: checkpoint loaded.".format(self.rank))

             # Data parallelism.
             if self.parallel:
                 strategy = dg.parallel.prepare_context()
                 waveflow = dg.parallel.DataParallel(waveflow, strategy)

             self.waveflow = waveflow
             self.optimizer = optimizer
             self.criterion = WaveFlowLoss(config.sigma)

         else:
             # Load parameters.
-            utils.load_parameters(self.checkpoint_dir, self.rank, waveflow,
-                                  iteration=config.iteration,
-                                  file_path=config.checkpoint)
+            utils.load_parameters(
+                self.checkpoint_dir,
+                self.rank,
+                waveflow,
+                iteration=config.iteration,
+                file_path=config.checkpoint)
             print("Rank {}: checkpoint loaded.".format(self.rank))

             self.waveflow = waveflow
@@ -83,7 +95,8 @@ class WaveFlow():
         else:
             loss.backward()

-        self.optimizer.minimize(loss, parameter_list=self.waveflow.parameters())
+        self.optimizer.minimize(
+            loss, parameter_list=self.waveflow.parameters())
         self.waveflow.clear_gradients()

         graph_time = time.time()
@@ -139,7 +152,8 @@ class WaveFlow():
         sample = config.sample

         output = "{}/{}/iter-{}".format(config.output, config.name, iteration)
-        os.makedirs(output, exist_ok=True)
+        if not os.path.exists(output):
+            os.makedirs(output)

         mels_list = [mels for _, mels in self.validloader()]
         if sample is not None:
@@ -148,16 +162,16 @@ class WaveFlow():
         for sample, mel in enumerate(mels_list):
             filename = "{}/valid_{}.wav".format(output, sample)
             print("Synthesize sample {}, save as {}".format(sample, filename))

             start_time = time.time()
             audio = self.waveflow.synthesize(mel, sigma=self.config.sigma)
             syn_time = time.time() - start_time

             audio = audio[0]
             audio_time = audio.shape[0] / self.config.sample_rate
-            print("audio time {:.4f}, synthesis time {:.4f}".format(
-                audio_time, syn_time))
+            print("audio time {:.4f}, synthesis time {:.4f}".format(audio_time,
+                                                                    syn_time))

             # Denormalize audio from [-1, 1] to [-32768, 32768] int16 range.
             audio = audio.numpy() * 32768.0
             audio = audio.astype('int16')
@@ -180,8 +194,8 @@ class WaveFlow():
         syn_time = time.time() - start_time

         audio_time = audio.shape[1] * batch_size / self.config.sample_rate
-        print("audio time {:.4f}, synthesis time {:.4f}".format(
-            audio_time, syn_time))
+        print("audio time {:.4f}, synthesis time {:.4f}".format(audio_time,
+                                                                syn_time))
         print("{} X real-time".format(audio_time / syn_time))

     def save(self, iteration):
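On the `audio.numpy() * 32768.0` / `astype('int16')` denormalization in `synthesize` above: int16 tops out at 32767, so a sample at exactly +1.0 overflows on conversion. A sketch with an explicit clip added (the clip is an addition, not part of this diff):

```python
import numpy as np

audio = np.array([-1.0, 0.0, 0.5, 1.0], dtype=np.float32)
pcm = np.clip(audio * 32768.0, -32768, 32767).astype('int16')
print(pcm)  # [-32768      0  16384  32767]
```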
@@ -3,26 +3,27 @@ import itertools

import numpy as np
import paddle.fluid.dygraph as dg
from paddle import fluid

-from parakeet.modules import conv, modules, weight_norm
+from parakeet.modules import weight_norm


-def set_param_attr(layer, c_in=1):
-    if isinstance(layer, (weight_norm.Conv2DTranspose, weight_norm.Conv2D)):
-        k = np.sqrt(1.0 / (c_in * np.prod(layer._filter_size)))
+def get_param_attr(layer_type, filter_size, c_in=1):
+    if layer_type == "weight_norm":
+        k = np.sqrt(1.0 / (c_in * np.prod(filter_size)))
        weight_init = fluid.initializer.UniformInitializer(low=-k, high=k)
        bias_init = fluid.initializer.UniformInitializer(low=-k, high=k)
-    elif isinstance(layer, dg.Conv2D):
+    elif layer_type == "common":
        weight_init = fluid.initializer.ConstantInitializer(0.0)
        bias_init = fluid.initializer.ConstantInitializer(0.0)
    else:
        raise TypeError("Unsupported layer type.")

-    layer._param_attr = fluid.ParamAttr(initializer=weight_init)
-    layer._bias_attr = fluid.ParamAttr(initializer=bias_init)
+    param_attr = fluid.ParamAttr(initializer=weight_init)
+    bias_attr = fluid.ParamAttr(initializer=bias_init)
+    return param_attr, bias_attr


def unfold(x, n_group):
    length = x.shape[-1]
    new_shape = x.shape[:-1] + [length // n_group, n_group]
    return fluid.layers.reshape(x, new_shape)
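Two things worth noting in this hunk. First, get_param_attr builds the same fan-in-scaled uniform initializer (k = sqrt(1 / fan_in), as in Kaiming-style conv defaults) but now returns ParamAttr objects instead of mutating private layer fields. Second, unfold only reshapes: it folds the trailing time axis into (time // n_group, n_group). A numpy sketch of that reshape, with illustrative values:

    import numpy as np

    # A [batch, 16] signal with n_group=8 becomes [batch, 2, 8]:
    # two groups of eight consecutive samples each.
    x = np.arange(16).reshape(1, 16)
    n_group = 8
    unfolded = x.reshape(x.shape[:-1] + (x.shape[-1] // n_group, n_group))
    print(unfolded.shape)  # (1, 2, 8)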
@@ -48,20 +49,23 @@ class WaveFlowLoss:


class Conditioner(dg.Layer):
-    def __init__(self, name_scope):
-        super(Conditioner, self).__init__(name_scope)
+    def __init__(self):
+        super(Conditioner, self).__init__()
        upsample_factors = [16, 16]

        self.upsample_conv2d = []
        for s in upsample_factors:
            in_channel = 1
-            conv_trans2d = modules.Conv2DTranspose(
-                self.full_name(),
+            param_attr, bias_attr = get_param_attr(
+                "weight_norm", (3, 2 * s), c_in=in_channel)
+            conv_trans2d = weight_norm.Conv2DTranspose(
+                num_channels=in_channel,
                num_filters=1,
                filter_size=(3, 2 * s),
                padding=(1, s // 2),
-                stride=(1, s))
-            set_param_attr(conv_trans2d, c_in=in_channel)
+                stride=(1, s),
+                param_attr=param_attr,
+                bias_attr=bias_attr)
            self.upsample_conv2d.append(conv_trans2d)

        for i, layer in enumerate(self.upsample_conv2d):
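The two stride-16 transposed convolutions upsample the mel frame axis by 16 * 16 = 256x, which would match a 256-sample hop between frames (an assumption about the feature-extraction config, not stated in this hunk). A sketch of the length arithmetic:

    # Each transposed conv multiplies the time axis by its stride.
    upsample_factors = [16, 16]
    mel_frames = 100  # illustrative value
    samples = mel_frames
    for s in upsample_factors:
        samples *= s
    print(samples)  # 25600 samples, i.e. 256 per mel frame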
@@ -86,8 +90,8 @@ class Conditioner(dg.Layer):


class Flow(dg.Layer):
-    def __init__(self, name_scope, config):
-        super(Flow, self).__init__(name_scope)
+    def __init__(self, config):
+        super(Flow, self).__init__()
        self.n_layers = config.n_layers
        self.n_channels = config.n_channels
        self.kernel_h = config.kernel_h
@@ -95,27 +99,34 @@ class Flow(dg.Layer):

        # Transform audio: [batch, 1, n_group, time/n_group]
        # => [batch, n_channels, n_group, time/n_group]
+        param_attr, bias_attr = get_param_attr("weight_norm", (1, 1), c_in=1)
        self.start = weight_norm.Conv2D(
-            self.full_name(),
+            num_channels=1,
            num_filters=self.n_channels,
-            filter_size=(1, 1))
-        set_param_attr(self.start, c_in=1)
+            filter_size=(1, 1),
+            param_attr=param_attr,
+            bias_attr=bias_attr)

        # Initializing last layer to 0 makes the affine coupling layers
        # do nothing at first. This helps with training stability.
        # output shape: [batch, 2, n_group, time/n_group]
+        param_attr, bias_attr = get_param_attr(
+            "common", (1, 1), c_in=self.n_channels)
        self.end = dg.Conv2D(
-            self.full_name(),
+            num_channels=self.n_channels,
            num_filters=2,
-            filter_size=(1, 1))
-        set_param_attr(self.end)
+            filter_size=(1, 1),
+            param_attr=param_attr,
+            bias_attr=bias_attr)

        # receptive fields: (kernel - 1) * sum(dilations) + 1 >= squeeze
-        dilation_dict = {8: [1, 1, 1, 1, 1, 1, 1, 1],
-                         16: [1, 1, 1, 1, 1, 1, 1, 1],
-                         32: [1, 2, 4, 1, 2, 4, 1, 2],
-                         64: [1, 2, 4, 8, 16, 1, 2, 4],
-                         128: [1, 2, 4, 8, 16, 32, 64, 1]}
+        dilation_dict = {
+            8: [1, 1, 1, 1, 1, 1, 1, 1],
+            16: [1, 1, 1, 1, 1, 1, 1, 1],
+            32: [1, 2, 4, 1, 2, 4, 1, 2],
+            64: [1, 2, 4, 8, 16, 1, 2, 4],
+            128: [1, 2, 4, 8, 16, 32, 64, 1]
+        }
        self.dilation_h_list = dilation_dict[config.n_group]

        self.in_layers = []
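The comment above states the constraint these dilation schedules satisfy: the vertical receptive field (kernel_h - 1) * sum(dilations) + 1 must cover the squeezed height n_group. A quick check over the table (kernel_h = 3 is an illustrative value; the real one comes from the config):

    kernel_h = 3
    dilation_dict = {
        8: [1, 1, 1, 1, 1, 1, 1, 1],
        16: [1, 1, 1, 1, 1, 1, 1, 1],
        32: [1, 2, 4, 1, 2, 4, 1, 2],
        64: [1, 2, 4, 8, 16, 1, 2, 4],
        128: [1, 2, 4, 8, 16, 32, 64, 1],
    }
    for n_group, dilations in dilation_dict.items():
        receptive = (kernel_h - 1) * sum(dilations) + 1
        assert receptive >= n_group, (n_group, receptive)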
@@ -123,32 +134,42 @@ class Flow(dg.Layer):
        self.res_skip_layers = []
        for i in range(self.n_layers):
            dilation_h = self.dilation_h_list[i]
-            dilation_w = 2 ** i
+            dilation_w = 2**i

+            param_attr, bias_attr = get_param_attr(
+                "weight_norm", (self.kernel_h, self.kernel_w),
+                c_in=self.n_channels)
            in_layer = weight_norm.Conv2D(
-                self.full_name(),
+                num_channels=self.n_channels,
                num_filters=2 * self.n_channels,
                filter_size=(self.kernel_h, self.kernel_w),
-                dilation=(dilation_h, dilation_w))
-            set_param_attr(in_layer, c_in=self.n_channels)
+                dilation=(dilation_h, dilation_w),
+                param_attr=param_attr,
+                bias_attr=bias_attr)
            self.in_layers.append(in_layer)

+            param_attr, bias_attr = get_param_attr(
+                "weight_norm", (1, 1), c_in=config.mel_bands)
            cond_layer = weight_norm.Conv2D(
-                self.full_name(),
+                num_channels=config.mel_bands,
                num_filters=2 * self.n_channels,
-                filter_size=(1, 1))
-            set_param_attr(cond_layer, c_in=config.mel_bands)
+                filter_size=(1, 1),
+                param_attr=param_attr,
+                bias_attr=bias_attr)
            self.cond_layers.append(cond_layer)

            if i < self.n_layers - 1:
                res_skip_channels = 2 * self.n_channels
            else:
                res_skip_channels = self.n_channels
+            param_attr, bias_attr = get_param_attr(
+                "weight_norm", (1, 1), c_in=self.n_channels)
            res_skip_layer = weight_norm.Conv2D(
-                self.full_name(),
+                num_channels=self.n_channels,
                num_filters=res_skip_channels,
-                filter_size=(1, 1))
-            set_param_attr(res_skip_layer, c_in=self.n_channels)
+                filter_size=(1, 1),
+                param_attr=param_attr,
+                bias_attr=bias_attr)
            self.res_skip_layers.append(res_skip_layer)

            self.add_sublayer("in_layer_{}".format(i), in_layer)
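The res_skip_channels switch follows the WaveNet convention: every layer but the last emits 2 * n_channels (half fed back as a residual, half accumulated as skip output), while the last layer only needs the skip half. A hedged numpy sketch of that split (function and variable names are illustrative, not repository code):

    import numpy as np

    def res_skip_split(x, out, i, n_layers, n_channels):
        # Non-final layers: first half is the residual added back to x,
        # second half is the skip contribution.
        if i < n_layers - 1:
            return x + out[:, :n_channels], out[:, n_channels:]
        # Final layer: the whole output is skip.
        return x, out

    x = np.zeros((1, 4, 8, 10))
    out = np.ones((1, 8, 8, 10))
    residual, skip = res_skip_split(x, out, i=0, n_layers=8, n_channels=4)
    print(residual.shape, skip.shape)  # (1, 4, 8, 10) (1, 4, 8, 10)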
@@ -162,14 +183,14 @@ class Flow(dg.Layer):

        for i in range(self.n_layers):
            dilation_h = self.dilation_h_list[i]
-            dilation_w = 2 ** i
+            dilation_w = 2**i

            # Pad height dim (n_group): causal convolution
            # Pad width dim (time): dilated non-causal convolution
            pad_top, pad_bottom = (self.kernel_h - 1) * dilation_h, 0
-            pad_left = pad_right = int((self.kernel_w-1) * dilation_w / 2)
-            audio_pad = fluid.layers.pad2d(audio,
-                paddings=[pad_top, pad_bottom, pad_left, pad_right])
+            pad_left = pad_right = int((self.kernel_w - 1) * dilation_w / 2)
+            audio_pad = fluid.layers.pad2d(
+                audio, paddings=[pad_top, pad_bottom, pad_left, pad_right])

            hidden = self.in_layers[i](audio_pad)
            cond_hidden = self.cond_layers[i](mel)
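The padding above is asymmetric on purpose: the height (n_group) axis is padded only at the top so each output row depends only on earlier rows (causal over the squeezed dimension), while the width (time) axis is padded symmetrically for a non-causal dilated convolution. A small sketch of the arithmetic:

    def conv_pads(kernel_h, kernel_w, dilation_h, dilation_w):
        # [top, bottom, left, right] as expected by pad2d.
        pad_top = (kernel_h - 1) * dilation_h
        pad_left = pad_right = (kernel_w - 1) * dilation_w // 2
        return [pad_top, 0, pad_left, pad_right]

    print(conv_pads(3, 3, 1, 4))  # [2, 0, 4, 4]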
@@ -196,7 +217,7 @@ class Flow(dg.Layer):

        for i in range(self.n_layers):
            dilation_h = self.dilation_h_list[i]
-            dilation_w = 2 ** i
+            dilation_w = 2**i

            state_size = dilation_h * (self.kernel_h - 1)
            queue = queues[i]
@@ -206,7 +227,7 @@ class Flow(dg.Layer):
                queue.append(fluid.layers.zeros_like(audio))

            state = queue[0:state_size]
-            state = fluid.layers.concat([*state, audio], axis=2)
+            state = fluid.layers.concat(state + [audio], axis=2)

            queue.pop(0)
            queue.append(audio)
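The queue here is a rolling buffer of previously generated rows: it is primed with zeros, consumed as the causal state for the current step, and then advanced by one row. A minimal sketch of the same pattern with a deque (illustrative values, not repository code):

    from collections import deque

    state_size = 2  # (kernel_h - 1) * dilation_h, illustrative
    queue = deque([0] * state_size, maxlen=state_size)
    for row in [1, 2, 3, 4]:
        state = list(queue) + [row]  # causal context plus current row
        print(state)  # [0, 0, 1], [0, 1, 2], [1, 2, 3], [2, 3, 4]
        queue.append(row)  # maxlen drops the oldest entry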
@@ -214,10 +235,10 @@ class Flow(dg.Layer):

            # Pad height dim (n_group): causal convolution
            # Pad width dim (time): dilated non-causal convolution
            pad_top, pad_bottom = 0, 0
-            pad_left = int((self.kernel_w-1) * dilation_w / 2)
-            pad_right = int((self.kernel_w-1) * dilation_w / 2)
-            state = fluid.layers.pad2d(state,
-                paddings=[pad_top, pad_bottom, pad_left, pad_right])
+            pad_left = int((self.kernel_w - 1) * dilation_w / 2)
+            pad_right = int((self.kernel_w - 1) * dilation_w / 2)
+            state = fluid.layers.pad2d(
+                state, paddings=[pad_top, pad_bottom, pad_left, pad_right])

            hidden = self.in_layers[i](state)
            cond_hidden = self.cond_layers[i](mel)
@@ -241,20 +262,20 @@ class Flow(dg.Layer):


class WaveFlowModule(dg.Layer):
-    def __init__(self, name_scope, config):
-        super(WaveFlowModule, self).__init__(name_scope)
+    def __init__(self, config):
+        super(WaveFlowModule, self).__init__()
        self.n_flows = config.n_flows
        self.n_group = config.n_group
        self.n_layers = config.n_layers
        assert self.n_group % 2 == 0
        assert self.n_flows % 2 == 0

-        self.conditioner = Conditioner(self.full_name())
+        self.conditioner = Conditioner()
        self.flows = []
        for i in range(self.n_flows):
-            flow = Flow(self.full_name(), config)
+            flow = Flow(config)
            self.flows.append(flow)
            self.add_sublayer("flow_{}".format(i), flow)

        self.perms = []
        half = self.n_group // 2
@@ -266,7 +287,7 @@ class WaveFlowModule(dg.Layer):
            perm[:half] = reversed(perm[:half])
            perm[half:] = reversed(perm[half:])
            self.perms.append(perm)

    def forward(self, audio, mel):
        mel = self.conditioner(mel)
        assert mel.shape[2] >= audio.shape[1]
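Each flow reverses the two halves of the row order so that successive flows condition on different bipartitions of the squeezed axis. A sketch of the half-reversal shown above, for n_group = 8 (illustrative value):

    n_group = 8
    half = n_group // 2
    perm = list(range(n_group))
    perm[:half] = reversed(perm[:half])
    perm[half:] = reversed(perm[half:])
    print(perm)  # [3, 2, 1, 0, 7, 6, 5, 4]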
@@ -277,14 +298,13 @@ class WaveFlowModule(dg.Layer):
            audio = audio[:, :pruned_len]
        if mel.shape[2] > pruned_len:
            mel = mel[:, :, :pruned_len]

        # From [bs, mel_bands, time] to [bs, mel_bands, n_group, time/n_group]
        mel = fluid.layers.transpose(unfold(mel, self.n_group), [0, 1, 3, 2])
        # From [bs, time] to [bs, n_group, time/n_group]
        audio = fluid.layers.transpose(unfold(audio, self.n_group), [0, 2, 1])
        # [bs, 1, n_group, time/n_group]
        audio = fluid.layers.unsqueeze(audio, 1)

        log_s_list = []
        for i in range(self.n_flows):
            inputs = audio[:, :, :-1, :]
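The pruning above trims both audio and mel to a common length that divides evenly by n_group, so the unfold/transpose pair can reshape without remainder. A sketch of the shape bookkeeping (the numbers are illustrative; pruned_len is defined just above this hunk):

    n_group = 8
    audio_len = 12345  # illustrative length
    pruned_len = audio_len // n_group * n_group
    print(pruned_len)  # 12344, evenly divisible by n_group
    # audio: [bs, 12344] -> unfold -> [bs, 1543, 8]
    #        -> transpose -> [bs, 8, 1543] -> unsqueeze -> [bs, 1, 8, 1543]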
@@ -305,7 +325,6 @@ class WaveFlowModule(dg.Layer):
            mel = fluid.layers.stack(mel_slices, axis=2)

        z = fluid.layers.squeeze(audio, [1])

        return z, log_s_list

    def synthesize(self, mel, sigma=1.0):
@@ -331,7 +350,7 @@ class WaveFlowModule(dg.Layer):

            for h in range(1, self.n_group):
                inputs = audio_h
-                conds = mel[:, :, h:(h+1), :]
+                conds = mel[:, :, h:(h + 1), :]
                outputs = self.flows[i].infer(inputs, conds, queues)

                log_s = outputs[:, 0:1, :, :]
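Synthesis generates the squeezed rows autoregressively: row h is produced from the rows before it, with conds sliced to the matching mel row, and the affine coupling is inverted rather than applied. A hedged numpy sketch of inverting an affine coupling (the scale-then-shift ordering is an assumption; the exact convention depends on the implementation):

    import numpy as np

    # Forward coupling: out = audio * exp(log_s) + b
    # Inverse used at synthesis: audio = (out - b) * exp(-log_s)
    log_s = np.array([0.1])
    b = np.array([0.5])
    audio = np.array([0.3])
    out = audio * np.exp(log_s) + b
    recovered = (out - b) * np.exp(-log_s)
    print(np.allclose(recovered, audio))  # True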
@@ -40,8 +40,8 @@ def norm_except(param, dim, power):


def compute_weight(v, g, dim, power):
    assert len(g.shape) == 1, "magnitude should be a vector"
-    v_normalized = F.elementwise_div(v, (norm_except(v, dim, power) + 1e-12),
-                                     axis=dim)
+    v_normalized = F.elementwise_div(
+        v, (norm_except(v, dim, power) + 1e-12), axis=dim)
    weight = F.elementwise_mul(v_normalized, g, axis=dim)
    return weight
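compute_weight re-parameterizes a tensor as w = g * v / ||v||, where the norm is taken over every axis except dim. A numpy sketch of the same formula for a 2-D weight with dim=0 and power=2:

    import numpy as np

    v = np.random.randn(4, 3)
    g = np.random.randn(4)
    # L2 norm over all axes except dim=0: one value per output row.
    norm = np.sqrt((v ** 2).sum(axis=1, keepdims=True))
    w = g[:, None] * v / (norm + 1e-12)
    print(w.shape)  # (4, 3)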
@@ -63,20 +63,21 @@ class WeightNormWrapper(dg.Layer):
            original_weight = getattr(layer, param_name)
            self.add_parameter(
                w_v,
-                self.create_parameter(shape=original_weight.shape,
-                                      dtype=original_weight.dtype))
+                self.create_parameter(
+                    shape=original_weight.shape, dtype=original_weight.dtype))
            F.assign(original_weight, getattr(self, w_v))
            delattr(layer, param_name)
            temp = norm_except(getattr(self, w_v), self.dim, self.power)
            self.add_parameter(
-                w_g, self.create_parameter(shape=temp.shape, dtype=temp.dtype))
+                w_g, self.create_parameter(
+                    shape=temp.shape, dtype=temp.dtype))
            F.assign(temp, getattr(self, w_g))

            # also set this when setting up
-            setattr(
-                self.layer, self.param_name,
-                compute_weight(getattr(self, w_v), getattr(self, w_g), self.dim,
-                               self.power))
+            setattr(self.layer, self.param_name,
+                    compute_weight(
+                        getattr(self, w_v),
+                        getattr(self, w_g), self.dim, self.power))

            self.weigth_norm_applied = True
@@ -84,10 +85,10 @@ class WeightNormWrapper(dg.Layer):
    def hook(self):
        w_v = self.param_name + "_v"
        w_g = self.param_name + "_g"
-        setattr(
-            self.layer, self.param_name,
-            compute_weight(getattr(self, w_v), getattr(self, w_g), self.dim,
-                           self.power))
+        setattr(self.layer, self.param_name,
+                compute_weight(
+                    getattr(self, w_v),
+                    getattr(self, w_g), self.dim, self.power))

    def remove_weight_norm(self):
        self.hook()