refine code

This commit is contained in:
Kexin Zhao 2019-12-19 00:03:06 -08:00
parent 8c22397b55
commit 0e18d60057
9 changed files with 170 additions and 151 deletions

View File

@ -0,0 +1,71 @@
import os
import random
from pprint import pprint
import jsonargparse
import numpy as np
import paddle.fluid.dygraph as dg
from paddle import fluid
import utils
from waveflow import WaveFlow
def add_options_to_parser(parser):
    """Register this script's command-line options on ``parser``.

    Adds ``--model``, ``--name``, ``--root``, ``--use_gpu``, ``--iteration``
    and ``--checkpoint``, in that order, by calling ``parser.add_argument``
    once per option. Nothing is returned; ``parser`` is modified in place.
    """
    # (flag, add_argument keyword arguments), listed in registration order.
    option_table = (
        ('--model', {
            'type': str,
            'default': 'waveflow',
            'help': "general name of the model",
        }),
        ('--name', {
            'type': str,
            'help': "specific name of the training model",
        }),
        ('--root', {
            'type': str,
            'help': "root path of the LJSpeech dataset",
        }),
        ('--use_gpu', {
            'type': bool,
            'default': True,
            'help': "option to use gpu training",
        }),
        ('--iteration', {
            'type': int,
            'default': None,
            'help': ("which iteration of checkpoint to load, "
                     "default to load the latest checkpoint"),
        }),
        ('--checkpoint', {
            'type': str,
            'default': None,
            'help': "path of the checkpoint to load",
        }),
    )
    for flag, settings in option_table:
        parser.add_argument(flag, **settings)
def benchmark(config):
# Get checkpoint directory path.
run_dir = os.path.join("runs", config.model,
checkpoint_dir = os.path.join(run_dir, "checkpoint")
# Configure device.
place = fluid.CUDAPlace(0) if config.use_gpu else fluid.CPUPlace()
with dg.guard(place):
# Fix random seed.
seed = config.seed
fluid.default_startup_program().random_seed = seed
fluid.default_main_program().random_seed = seed
print("Random Seed: ", seed)
# Build model.
model = WaveFlow(config, checkpoint_dir)
# Run model inference.
if __name__ == "__main__":
# Create parser.
parser = jsonargparse.ArgumentParser(
description="Synthesize audio using WaveNet model",
# Parse argument from both command line and yaml config file.
# For conflicting updates to the same field,
# the preceding update will be overwritten by the following one.
config = parser.parse_args()

View File

@ -1,24 +0,0 @@
valid_size: 16
segment_length: 16000
sample_rate: 22050
fft_window_shift: 256
fft_window_size: 1024
fft_size: 1024
mel_bands: 80
mel_fmin: 0.0
mel_fmax: 8000.0
seed: 123
learning_rate: 0.0002
batch_size: 8
test_every: 2000
save_every: 5000
max_iterations: 2000000
sigma: 1.0
n_flows: 8
n_group: 16
n_layers: 8
n_channels: 64
kernel_h: 3
kernel_w: 3

View File

@ -4,7 +4,6 @@ import librosa
import numpy as np
from paddle import fluid
import utils
from parakeet.datasets import ljspeech
from import dataset
from import SpecBatcher, WavBatcher
@ -12,8 +11,6 @@ from import DataCargo
from import DistributedSampler, BatchSampler
from import read
MAX_WAV_VALUE = 32768.0
class Dataset(ljspeech.LJSpeech):
def __init__(self, config):
@ -78,10 +75,9 @@ class Subset(dataset.Dataset):
audio = np.pad(audio, (0, segment_length - audio.shape[0]),
mode='constant', constant_values=0)
# Normalize audio.
audio = audio.astype(np.float32) / MAX_WAV_VALUE
# Normalize audio to the [-1, 1] range.
audio = audio.astype(np.float32) / 32768.0
mel = self.get_mel(audio)
#print("mel = {}, dtype {}, shape {}".format(mel, mel.dtype, mel.shape))
return audio, mel

View File

@ -1,3 +0,0 @@

View File

@ -14,8 +14,6 @@ import slurm
import utils
from waveflow import WaveFlow
def add_options_to_parser(parser):
parser.add_argument('--model', type=str, default='waveflow',
@ -35,8 +33,6 @@ def add_options_to_parser(parser):
"default to load the latest checkpoint"))
parser.add_argument('--checkpoint', type=str, default=None,
help="path of the checkpoint to load")
parser.add_argument('--slurm', type=bool, default=False,
help="whether you are using slurm to submit training jobs")
def train(config):
@ -85,13 +81,6 @@ def train(config):
iteration = int(config.checkpoint.split('/')[-1].split('-')[-1])
# Get restart command if using slurm.
if config.slurm:
resume_command, death_time = slurm.restart_command()
if rank == 0:
print("Restart command:", " ".join(resume_command))
done = False
while iteration < config.max_iterations:
# Run one single training step.
@ -102,20 +91,6 @@ def train(config):
# Run validation step.
# Check whether reaching the time limit.
if config.slurm:
done = (death_time is not None and death_time - time.time() <
if rank == 0 and done:
print("Saving progress before exiting.")
print("Running restart command:", " ".join(resume_command))
# Submit restart command.
if rank == 0 and iteration % config.save_every == 0:
# Save parameters.

View File

@ -57,27 +57,6 @@ def add_config_options_to_parser(parser):
parser.add_argument('--config', action=jsonargparse.ActionConfigFile)
def pad_to_size(array, length, pad_with=0.0):
    """
    Pad an array on the first (length) axis to a given length.

    Padding is appended after the existing entries of axis 0 and filled
    with ``pad_with``; every other axis is left untouched. An assertion
    fails if ``array`` already has more than ``length`` entries on axis 0.
    """
    shortfall = length - array.shape[0]
    assert shortfall >= 0, "Padding required was less than zero"

    # Only the leading axis grows; the remaining axes get (0, 0).
    pad_widths = [(0, shortfall)] + [(0, 0)] * (len(array.shape) - 1)
    return np.pad(array, pad_widths, mode='constant',
                  constant_values=pad_with)
def calculate_context_size(config):
    """Derive ``config.context_size`` from the configured dilations.

    The values in ``config.dilation_block`` are repeated cyclically until
    ``config.layers`` of them have been taken. The context size is the sum
    of those values plus one; it is stored on ``config.context_size`` and
    printed. Nothing is returned.
    """
    # Imported locally so the function is self-contained.
    import itertools

    # cycle() never terminates, so it must be truncated with islice()
    # before list() can materialize it.
    dilations = list(
        itertools.islice(
            itertools.cycle(config.dilation_block), config.layers))
    config.context_size = sum(dilations) + 1
    print("Context size is", config.context_size)
def load_latest_checkpoint(checkpoint_dir, rank=0):
checkpoint_path = os.path.join(checkpoint_dir, "checkpoint")
# Create checkpoint index file if not exist.

View File

@ -2,11 +2,10 @@ import itertools
import os
import time
#import librosa
from import write
import numpy as np
import paddle.fluid.dygraph as dg
from paddle import fluid
from import write
import utils
from data import LJSpeech
@ -29,18 +28,6 @@ class WaveFlow():
self.trainloader = dataset.trainloader
self.validloader = dataset.validloader
# if self.rank == 0:
# for i, (audios, mels) in enumerate(self.validloader()):
# print("audios {}, mels {}".format(audios.dtype, mels.dtype))
# print("{}: rank {}, audios {}, mels {}".format(
# i, self.rank, audios.shape, mels.shape))
# for i, (audios, mels) in enumerate(self.trainloader):
# print("{}: rank {}, audios {}, mels {}".format(
# i, self.rank, audios.shape, mels.shape))
# exit()
waveflow = WaveFlowModule("waveflow", config)
# Dry run once to create and initialize all necessary parameters.
@ -96,8 +83,6 @@ class WaveFlow():
current_lr = self.optimizer._learning_rate
self.optimizer.minimize(loss, parameter_list=self.waveflow.parameters())
@ -113,7 +98,6 @@ class WaveFlow():
tb = self.tb_logger
tb.add_scalar("Train-Loss-Rank-0", loss_val, iteration)
tb.add_scalar("Learning-Rate", current_lr, iteration)
def valid_step(self, iteration):
@ -161,34 +145,44 @@ class WaveFlow():
if sample is not None:
mels_list = [mels_list[sample]]
audio_times = []
inf_times = []
for sample, mel in enumerate(mels_list):
filename = "{}/valid_{}.wav".format(output, sample)
print("Synthesize sample {}, save as {}".format(sample, filename))
start_time = time.time()
audio = self.waveflow.synthesize(mel)
audio = self.waveflow.synthesize(mel, sigma=self.config.sigma)
syn_time = time.time() - start_time
audio_time = audio.shape[0] / 22050
print("audio time {}, synthesis time {}, speedup: {}".format(
audio_time, syn_time, audio_time / syn_time))
audio = audio[0]
audio_time = audio.shape[0] / self.config.sample_rate
print("audio time {:.4f}, synthesis time {:.4f}".format(
audio_time, syn_time))
#librosa.output.write_wav(filename, syn_audio,
# sr=config.sample_rate)
# Denormalize audio from [-1, 1] to [-32768, 32768] int16 range.
audio = audio.numpy() * 32768.0
audio = audio.astype('int16')
write(filename, config.sample_rate, audio)
def benchmark(self):
total_audio = sum(audio_times)
total_inf = sum(inf_times)
mels_list = [mels for _, mels in self.validloader()]
mel = fluid.layers.concat(mels_list, axis=2)
mel = mel[:, :, :864]
batch_size = 8
mel = fluid.layers.expand(mel, [batch_size, 1, 1])
print("Total audio: {}, total inf time {}, speedup: {}".format(
total_audio, total_inf, total_audio / total_inf))
for i in range(10):
start_time = time.time()
audio = self.waveflow.synthesize(mel, sigma=self.config.sigma)
print("audio.shape = ", audio.shape)
syn_time = time.time() - start_time
audio_time = audio.shape[1] * batch_size / self.config.sample_rate
print("audio time {:.4f}, synthesis time {:.4f}".format(
audio_time, syn_time))
print("{} X real-time".format(audio_time / syn_time))
def save(self, iteration):
utils.save_latest_parameters(self.checkpoint_dir, iteration,

View File

@ -23,7 +23,6 @@ def set_param_attr(layer, c_in=1):
def unfold(x, n_group):
    """Split the last axis of ``x`` into two axes via a reshape.

    The trailing axis of length ``L`` is replaced by the pair
    ``[L // n_group, n_group]``; all leading axes are kept as they are.
    Floor division is used and divisibility of ``L`` by ``n_group`` is
    not checked here.
    """
    leading_dims = x.shape[:-1]
    rows = x.shape[-1] // n_group
    return fluid.layers.reshape(x, leading_dims + [rows, n_group])
@ -192,13 +191,53 @@ class Flow(dg.Layer):
return self.end(output)
def infer(self, audio, mel, queues):
audio = self.start(audio)
def debug(x, msg):
y = x.numpy()
print(msg + " :\n", y)
print("shape: ", y.shape)
print("dtype: ", y.dtype)
for i in range(self.n_layers):
dilation_h = self.dilation_h_list[i]
dilation_w = 2 ** i
state_size = dilation_h * (self.kernel_h - 1)
queue = queues[i]
if len(queue) == 0:
for j in range(state_size):
state = queue[0:state_size]
state = fluid.layers.concat([*state, audio], axis=2)
# Pad height dim (n_group): causal convolution
# Pad width dim (time): dilated non-causal convolution
pad_top, pad_bottom = 0, 0
pad_left = int((self.kernel_w-1) * dilation_w / 2)
pad_right = int((self.kernel_w-1) * dilation_w / 2)
state = fluid.layers.pad2d(state,
paddings=[pad_top, pad_bottom, pad_left, pad_right])
hidden = self.in_layers[i](state)
cond_hidden = self.cond_layers[i](mel)
in_acts = hidden + cond_hidden
out_acts = fluid.layers.tanh(in_acts[:, :self.n_channels, :]) * \
fluid.layers.sigmoid(in_acts[:, self.n_channels:, :])
res_skip_acts = self.res_skip_layers[i](out_acts)
if i < self.n_layers - 1:
audio += res_skip_acts[:, :self.n_channels, :, :]
skip_acts = res_skip_acts[:, self.n_channels:, :, :]
skip_acts = res_skip_acts
if i == 0:
output = skip_acts
output += skip_acts
return self.end(output)
class WaveFlowModule(dg.Layer):
@ -206,7 +245,9 @@ class WaveFlowModule(dg.Layer):
super(WaveFlowModule, self).__init__(name_scope)
self.n_flows = config.n_flows
self.n_group = config.n_group
self.n_layers = config.n_layers
assert self.n_group % 2 == 0
assert self.n_flows % 2 == 0
self.conditioner = Conditioner(self.full_name())
self.flows = []
@ -215,14 +256,16 @@ class WaveFlowModule(dg.Layer):
self.add_sublayer("flow_{}".format(i), flow)
self.perms = [[15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0],
[15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0],
[15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0],
[15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0],
[7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8],
[7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8],
[7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8],
[7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8]]
self.perms = []
half = self.n_group // 2
for i in range(self.n_flows):
perm = list(range(self.n_group))
if i < self.n_flows // 2:
perm = perm[::-1]
perm[:half] = reversed(perm[:half])
perm[half:] = reversed(perm[half:])
def forward(self, audio, mel):
mel = self.conditioner(mel)
@ -266,19 +309,13 @@ class WaveFlowModule(dg.Layer):
return z, log_s_list
def synthesize(self, mel, sigma=1.0):
#debug(mel, "mel")
mel = self.conditioner.infer(mel)
#debug(mel, "mel after conditioner")
# From [bs, mel_bands, time] to [bs, mel_bands, n_group, time/n_group]
mel = fluid.layers.transpose(unfold(mel, self.n_group), [0, 1, 3, 2])
#debug(mel, "after group")
audio = fluid.layers.gaussian_random(
shape=[mel.shape[0], 1, mel.shape[2], mel.shape[3]], std=sigma)
#debug(audio, "audio")
for i in reversed(range(self.n_flows)):
# Permute over the height dimension.
audio_slices = [audio[:, :, j, :] for j in self.perms[i]]
@ -287,34 +324,28 @@ class WaveFlowModule(dg.Layer):
mel = fluid.layers.stack(mel_slices, axis=2)
audio_list = []
audio_0 = audio[:, :, :1, :]
audio_0 = audio[:, :, 0:1, :]
audio_h = audio_0
queues = [[] for _ in range(self.n_layers)]
for h in range(1, self.n_group):
# inputs: [bs, 1, h, time/n_group]
inputs = fluid.layers.concat(audio_list, axis=2)
conds = mel[:, :, 1:(h+1), :]
outputs = self.flows[i](inputs, conds)
inputs = audio_h
conds = mel[:, :, h:(h+1), :]
outputs = self.flows[i].infer(inputs, conds, queues)
log_s = outputs[:, :1, (h-1):h, :]
b = outputs[:, 1:, (h-1):h, :]
audio_h = (audio[:, :, h:(h+1), :] - b) / fluid.layers.exp(log_s)
log_s = outputs[:, 0:1, :, :]
b = outputs[:, 1:, :, :]
audio_h = (audio[:, :, h:(h+1), :] - b) / \
audio = fluid.layers.concat(audio_list, axis=2)
#print("audio.shape =", audio.shape)
# Assume batch size = 1
# audio: [n_group, time/n_group]
audio = fluid.layers.squeeze(audio, [0, 1])
# audio: [time]
# audio: [bs, n_group, time/n_group]
audio = fluid.layers.squeeze(audio, [1])
# audio: [bs, time]
audio = fluid.layers.reshape(
fluid.layers.transpose(audio, [1, 0]), [-1])
#print("audio.shape =", audio.shape)
fluid.layers.transpose(audio, [0, 2, 1]), [audio.shape[0], -1])
return audio
def start_new_sequence(self):
for layer in self.sublayers():
if isinstance(layer, conv.Conv1D):