add deepvoice3 model and example

Author: chenfeiyu, 2020-02-13 02:24:34 +00:00 (committed by liuyibing01)
parent 04d7f8b598
commit 155dfe633d
40 changed files with 3077 additions and 5712 deletions

examples/deepvoice3/data.py (new file)
@@ -0,0 +1,184 @@
import os
import csv
from pathlib import Path
import numpy as np
import pandas as pd
import librosa
from scipy import signal, io
from parakeet.data import DatasetMixin, TransformDataset, FilterDataset
from parakeet.g2p.en import text_to_sequence, sequence_to_text
class LJSpeechMetaData(DatasetMixin):
def __init__(self, root):
self.root = Path(root)
self._wav_dir = self.root.joinpath("wavs")
csv_path = self.root.joinpath("metadata.csv")
self._table = pd.read_csv(
csv_path,
sep="|",
header=None,
quoting=csv.QUOTE_NONE,
names=["fname", "raw_text", "normalized_text"])
def get_example(self, i):
fname, raw_text, normalized_text = self._table.iloc[i]
fname = str(self._wav_dir.joinpath(fname + ".wav"))
return fname, raw_text, normalized_text
def __len__(self):
return len(self._table)
class Transform(object):
def __init__(self,
replace_pronounciation_prob=0.,
sample_rate=22050,
preemphasis=.97,
n_fft=1024,
win_length=1024,
hop_length=256,
fmin=125,
fmax=7600,
n_mels=80,
min_level_db=-100,
ref_level_db=20,
max_norm=0.999,
clip_norm=True):
self.replace_pronounciation_prob = replace_pronounciation_prob
self.sample_rate = sample_rate
self.preemphasis = preemphasis
self.n_fft = n_fft
self.win_length = win_length
self.hop_length = hop_length
self.fmin = fmin
self.fmax = fmax
self.n_mels = n_mels
self.min_level_db = min_level_db
self.ref_level_db = ref_level_db
self.max_norm = max_norm
self.clip_norm = clip_norm
def __call__(self, in_data):
fname, _, normalized_text = in_data
# text processing
mix_grapheme_phonemes = text_to_sequence(
normalized_text, self.replace_pronounciation_prob)
text_length = len(mix_grapheme_phonemes)
# CAUTION: positions start from 1
speaker_id = None
# wave processing
wav, _ = librosa.load(fname, sr=self.sample_rate)
# preemphasis
y = signal.lfilter([1., -self.preemphasis], [1.], wav)
# STFT
D = librosa.stft(y=y,
n_fft=self.n_fft,
win_length=self.win_length,
hop_length=self.hop_length)
S = np.abs(D)
# to db and normalize to 0-1
amplitude_min = np.exp(self.min_level_db / 20 * np.log(10)) # 1e-5
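        # 20 * log10(max(S, amplitude_min)) converts amplitude to dB; the following
        # lines shift by ref_level_db and min_level_db and rescale into [0, max_norm]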
S_norm = 20 * np.log10(np.maximum(amplitude_min,
S)) - self.ref_level_db
S_norm = (S_norm - self.min_level_db) / (-self.min_level_db)
S_norm = self.max_norm * S_norm
if self.clip_norm:
S_norm = np.clip(S_norm, 0, self.max_norm)
# mel scale and to db and normalize to 0-1,
# CAUTION: pass linear scale S, not dbscaled S
S_mel = librosa.feature.melspectrogram(S=S,
n_mels=self.n_mels,
fmin=self.fmin,
fmax=self.fmax,
power=1.)
S_mel = 20 * np.log10(np.maximum(amplitude_min,
S_mel)) - self.ref_level_db
S_mel_norm = (S_mel - self.min_level_db) / (-self.min_level_db)
S_mel_norm = self.max_norm * S_mel_norm
if self.clip_norm:
S_mel_norm = np.clip(S_mel_norm, 0, self.max_norm)
# num_frames
n_frames = S_mel_norm.shape[-1] # CAUTION: original number of frames
return (mix_grapheme_phonemes, text_length, speaker_id, S_norm,
S_mel_norm, n_frames)
class DataCollector(object):
def __init__(self, downsample_factor=4, r=1):
self.downsample_factor = int(downsample_factor)
self.frames_per_step = int(r)
self._factor = int(downsample_factor * r)
# CAUTION: small diff here
self._pad_begin = int(downsample_factor * r)
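        # _pad_begin zero frames are prepended to every spectrogram below, so the
        # decoder starts from an all-zero "go" frame group before the first real frame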
def __call__(self, examples):
batch_size = len(examples)
# lengths
text_lengths = np.array([example[1]
for example in examples]).astype(np.int64)
frames = np.array([example[5]
for example in examples]).astype(np.int64)
max_text_length = int(np.max(text_lengths))
max_frames = int(np.max(frames))
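        # round the batch's time axis up to a multiple of downsample_factor * r and
        # prepend _pad_begin frames, so every example fits exactly max_decoder_length decoder steps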
if max_frames % self._factor != 0:
max_frames += (self._factor - max_frames % self._factor)
max_frames += self._pad_begin
max_decoder_length = max_frames // self._factor
# pad time sequence
text_sequences = []
lin_specs = []
mel_specs = []
done_flags = []
for example in examples:
(mix_grapheme_phonemes, text_length, speaker_id, S_norm,
S_mel_norm, num_frames) = example
text_sequences.append(
np.pad(mix_grapheme_phonemes,
(0, max_text_length - text_length)))
lin_specs.append(
np.pad(S_norm,
((0, 0), (self._pad_begin,
max_frames - self._pad_begin - num_frames))))
mel_specs.append(
np.pad(S_mel_norm,
((0, 0), (self._pad_begin,
max_frames - self._pad_begin - num_frames))))
done_flags.append(
np.pad(np.zeros((int(np.ceil(num_frames // self._factor)), )),
(0, max_decoder_length -
int(np.ceil(num_frames // self._factor))),
constant_values=1))
text_sequences = np.array(text_sequences).astype(np.int64)
lin_specs = np.transpose(np.array(lin_specs),
(0, 2, 1)).astype(np.float32)
mel_specs = np.transpose(np.array(mel_specs),
(0, 2, 1)).astype(np.float32)
done_flags = np.array(done_flags).astype(np.float32)
# text positions
text_mask = (np.arange(1, 1 + max_text_length) <= np.expand_dims(
text_lengths, -1)).astype(np.int64)
text_positions = np.arange(1, 1 + max_text_length) * text_mask
# decoder_positions
decoder_positions = np.tile(
np.expand_dims(np.arange(1, 1 + max_decoder_length), 0),
(batch_size, 1))
return (text_sequences, text_lengths, text_positions, mel_specs,
lin_specs, frames, decoder_positions, done_flags)

@@ -0,0 +1,122 @@
import os
import csv
from pathlib import Path
import numpy as np
import pandas as pd
import librosa
from scipy import signal, io
from parakeet.data import DatasetMixin, TransformDataset, FilterDataset
from parakeet.g2p.en import text_to_sequence, sequence_to_text
class LJSpeechMetaData(DatasetMixin):
def __init__(self, root):
self.root = Path(root)
csv_path = self.root.joinpath("train.txt")
self._table = pd.read_csv(
csv_path,
sep="|",
header=None,
quoting=csv.QUOTE_NONE,
names=["lin_spec", "mel_spec", "n_frames", "text"])
def get_example(self, i):
lin_spec, mel_spec, n_frames, text = self._table.iloc[i]
lin_spec = str(self.root.joinpath(lin_spec))
mel_spec = str(self.root.joinpath(mel_spec))
return lin_spec, mel_spec, n_frames, text + "\n"
def __len__(self):
return len(self._table)
class Transform(object):
def __init__(self, replace_pronounciation_prob=0.):
self.replace_pronounciation_prob = replace_pronounciation_prob
def __call__(self, in_data):
lin_spec, mel_spec, n_frames, text = in_data
# text processing
mix_grapheme_phonemes = text_to_sequence(
text, self.replace_pronounciation_prob)
text_length = len(mix_grapheme_phonemes)
# CAUTION: positions start from 1
speaker_id = None
S_norm = np.load(lin_spec).T.astype(np.float32)
S_mel_norm = np.load(mel_spec).T.astype(np.float32)
n_frames = S_mel_norm.shape[-1] # CAUTION: original number of frames
return (mix_grapheme_phonemes, text_length, speaker_id, S_norm,
S_mel_norm, n_frames)
class DataCollector(object):
def __init__(self, downsample_factor=4, r=1):
self.downsample_factor = int(downsample_factor)
self.frames_per_step = int(r)
self._factor = int(downsample_factor * r)
self._pad_begin = int(r) # int(downsample_factor * r)
def __call__(self, examples):
batch_size = len(examples)
# lengths
text_lengths = np.array([example[1]
for example in examples]).astype(np.int64)
frames = np.array([example[5]
for example in examples]).astype(np.int64)
max_text_length = int(np.max(text_lengths))
max_frames = int(np.max(frames))
if max_frames % self._factor != 0:
max_frames += (self._factor - max_frames % self._factor)
max_frames += self._factor
max_decoder_length = max_frames // self._factor
# pad time sequence
text_sequences = []
lin_specs = []
mel_specs = []
done_flags = []
for example in examples:
(mix_grapheme_phonemes, text_length, speaker_id, S_norm,
S_mel_norm, num_frames) = example
text_sequences.append(
np.pad(mix_grapheme_phonemes,
(0, max_text_length - text_length)))
lin_specs.append(
np.pad(S_norm,
((0, 0), (self._pad_begin,
max_frames - self._pad_begin - num_frames))))
mel_specs.append(
np.pad(S_mel_norm,
((0, 0), (self._pad_begin,
max_frames - self._pad_begin - num_frames))))
done_flags.append(
np.pad(np.zeros((int(np.ceil(num_frames // self._factor)), )),
(0, max_decoder_length -
int(np.ceil(num_frames // self._factor))),
constant_values=1))
text_sequences = np.array(text_sequences).astype(np.int64)
lin_specs = np.transpose(np.array(lin_specs),
(0, 2, 1)).astype(np.float32)
mel_specs = np.transpose(np.array(mel_specs),
(0, 2, 1)).astype(np.float32)
done_flags = np.array(done_flags).astype(np.float32)
# text positions
text_mask = (np.arange(1, 1 + max_text_length) <= np.expand_dims(
text_lengths, -1)).astype(np.int64)
text_positions = np.arange(1, 1 + max_text_length) * text_mask
# decoder_positions
decoder_positions = np.tile(
np.expand_dims(np.arange(1, 1 + max_decoder_length), 0),
(batch_size, 1))
return (text_sequences, text_lengths, text_positions, mel_specs,
lin_specs, frames, decoder_positions, done_flags)

@@ -0,0 +1,103 @@
meta_data:
min_text_length: 20
transform:
# text
replace_pronunciation_prob: 0.5
# spectrogram
sample_rate: 22050
max_norm: 0.999
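  # assumed default (data.Transform defaults to clip_norm=True); train.py reads this key
  clip_norm: true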
preemphasis: 0.97
n_fft: 1024
win_length: 1024
hop_length: 256
# mel
fmin: 125
fmax: 7600
n_mels: 80
# db scale
min_level_db: -100
ref_level_db: 20
loss:
masked_loss_weight: 0.5
priority_freq: 3000
priority_freq_weight: 0.0
binary_divergence_weight: 0.1
guided_attention_sigma: 0.2
synthesis:
max_steps: 512
power: 1.4
n_iter: 32
model:
# speaker_embedding
n_speakers: 1
speaker_embed_dim: 16
speaker_embedding_weight_std: 0.01
max_positions: 512
dropout: 0.050000000000000044
# encoder
text_embed_dim: 256
embedding_weight_std: 0.1
freeze_embedding: false
padding_idx: 0
encoder_channels: 256
# decoder
query_position_rate: 1.0
key_position_rate: 1.29
trainable_positional_encodings: false
kernel_size: 3
decoder_channels: 512
downsample_factor: 4
outputs_per_step: 1
# attention
  key_projection: true
  value_projection: true
force_monotonic_attention: true
  window_behind: -1
window_ahead: 3
use_memory_mask: true
# converter
use_decoder_state_for_postnet_input: true
converter_channels: 256
optimizer:
beta1: 0.5
beta2: 0.9
epsilon: 1e-6
lr_scheduler:
warmup_steps: 4000
peak_learning_rate: 5e-4
train:
batch_size: 16
epochs: 2000
report_interval: 100
snap_interval: 1000
eval_interval: 10000
save_interval: 10000

@@ -0,0 +1,6 @@
Scientists at the CERN laboratory say they have discovered a new particle.
There's a way to measure the acute emotional intelligence that has never gone out of style.
President Trump met with other leaders at the Group of 20 conference.
Generative adversarial network or variational auto-encoder.
Please call Stella.
Some have accepted this as a miracle without any physical explanation.

@@ -0,0 +1,121 @@
import os
import argparse
import ruamel.yaml
import numpy as np
import soundfile as sf
from paddle import fluid
import paddle.fluid.layers as F
import paddle.fluid.dygraph as dg
from tensorboardX import SummaryWriter
from parakeet.g2p import en
from parakeet.utils.layer_tools import summary
from parakeet.modules.weight_norm import WeightNormWrapper
from utils import make_model, eval_model, plot_alignment
if __name__ == "__main__":
parser = argparse.ArgumentParser(
description="Synthsize waveform with a checkpoint.")
parser.add_argument("-c", "--config", type=str, help="experiment config.")
parser.add_argument("checkpoint", type=str, help="checkpoint to load.")
parser.add_argument("text", type=str, help="text file to synthesize")
parser.add_argument("output_path", type=str, help="path to save results")
args = parser.parse_args()
with open(args.config, 'rt') as f:
config = ruamel.yaml.safe_load(f)
if args.device == -1:
place = fluid.CPUPlace()
else:
place = fluid.CUDAPlace(args.device)
with dg.guard(place):
# =========================model=========================
transform_config = config["transform"]
replace_pronounciation_prob = transform_config[
"replace_pronunciation_prob"]
sample_rate = transform_config["sample_rate"]
preemphasis = transform_config["preemphasis"]
n_fft = transform_config["n_fft"]
n_mels = transform_config["n_mels"]
model_config = config["model"]
downsample_factor = model_config["downsample_factor"]
r = model_config["outputs_per_step"]
n_speakers = model_config["n_speakers"]
speaker_dim = model_config["speaker_embed_dim"]
speaker_embed_std = model_config["speaker_embedding_weight_std"]
n_vocab = en.n_vocab
embed_dim = model_config["text_embed_dim"]
linear_dim = 1 + n_fft // 2
use_decoder_states = model_config[
"use_decoder_state_for_postnet_input"]
filter_size = model_config["kernel_size"]
encoder_channels = model_config["encoder_channels"]
decoder_channels = model_config["decoder_channels"]
converter_channels = model_config["converter_channels"]
dropout = model_config["dropout"]
padding_idx = model_config["padding_idx"]
embedding_std = model_config["embedding_weight_std"]
max_positions = model_config["max_positions"]
freeze_embedding = model_config["freeze_embedding"]
trainable_positional_encodings = model_config[
"trainable_positional_encodings"]
use_memory_mask = model_config["use_memory_mask"]
query_position_rate = model_config["query_position_rate"]
key_position_rate = model_config["key_position_rate"]
window_behind = model_config["window_behind"]
window_ahead = model_config["window_ahead"]
key_projection = model_config["key_projection"]
value_projection = model_config["value_projection"]
dv3 = make_model(n_speakers, speaker_dim, speaker_embed_std, embed_dim,
padding_idx, embedding_std, max_positions, n_vocab,
freeze_embedding, filter_size, encoder_channels,
n_mels, decoder_channels, r,
trainable_positional_encodings, use_memory_mask,
query_position_rate, key_position_rate, window_behind,
window_ahead, key_projection, value_projection,
downsample_factor, linear_dim, use_decoder_states,
converter_channels, dropout)
state, _ = dg.load_dygraph(args.checkpoint)
dv3.set_dict(state)
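        # strip WeightNormWrapper layers so inference runs on the fused weights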
for layer in dv3.sublayers():
if isinstance(layer, WeightNormWrapper):
layer.remove_weight_norm()
if not os.path.exists(args.output_path):
os.makedirs(args.output_path)
transform_config = config["transform"]
c = transform_config["replace_pronunciation_prob"]
sample_rate = transform_config["sample_rate"]
min_level_db = transform_config["min_level_db"]
ref_level_db = transform_config["ref_level_db"]
preemphasis = transform_config["preemphasis"]
win_length = transform_config["win_length"]
hop_length = transform_config["hop_length"]
synthesis_config = config["synthesis"]
power = synthesis_config["power"]
n_iter = synthesis_config["n_iter"]
with open(args.text, "rt", encoding="utf-8") as f:
lines = f.readlines()
for idx, line in enumerate(lines):
text = line[:-1]
dv3.eval()
wav, attn = eval_model(dv3, text, replace_pronounciation_prob,
min_level_db, ref_level_db, power,
n_iter, win_length, hop_length,
preemphasis)
plot_alignment(
attn,
os.path.join(args.output_path, "test_{}.png".format(idx)))
sf.write(
os.path.join(args.output_path, "test_{}.wav".format(idx)),
wav, sample_rate)

@@ -0,0 +1,314 @@
import os
import argparse
import ruamel.yaml
import numpy as np
from matplotlib import cm
import matplotlib.pyplot as plt
import tqdm
import librosa
from librosa import display
import soundfile as sf
from tensorboardX import SummaryWriter
from paddle import fluid
import paddle.fluid.layers as F
import paddle.fluid.dygraph as dg
from parakeet.g2p import en
from parakeet.models.deepvoice3.encoder import ConvSpec
from parakeet.data import FilterDataset, TransformDataset
from parakeet.data import DataCargo, PartialyRandomizedSimilarTimeLengthSampler, SequentialSampler
from parakeet.models.deepvoice3 import Encoder, Decoder, Converter, DeepVoice3
from parakeet.models.deepvoice3.loss import TTSLoss
from parakeet.utils.layer_tools import summary
from data import LJSpeechMetaData, DataCollector, Transform
from utils import make_model, eval_model, plot_alignment, plot_alignments, save_state, make_output_tree
if __name__ == "__main__":
parser = argparse.ArgumentParser(
description="Train a deepvoice 3 model with LJSpeech dataset.")
parser.add_argument("-c", "--config", type=str, help="experimrnt config")
parser.add_argument("-s",
"--data",
type=str,
default="/workspace/datasets/LJSpeech-1.1/",
help="The path of the LJSpeech dataset.")
parser.add_argument("-r", "--resume", type=str, help="checkpoint to load")
parser.add_argument("-o",
"--output",
type=str,
default="result",
help="The directory to save result.")
parser.add_argument("-g",
"--device",
type=int,
default=-1,
help="device to use")
args, _ = parser.parse_known_args()
with open(args.config, 'rt') as f:
config = ruamel.yaml.safe_load(f)
# =========================dataset=========================
# construct meta data
data_root = args.data
meta = LJSpeechMetaData(data_root)
# filter it!
min_text_length = config["meta_data"]["min_text_length"]
meta = FilterDataset(meta, lambda x: len(x[2]) >= min_text_length)
    # transform the meta data into training examples
transform_config = config["transform"]
replace_pronounciation_prob = transform_config[
"replace_pronunciation_prob"]
sample_rate = transform_config["sample_rate"]
preemphasis = transform_config["preemphasis"]
n_fft = transform_config["n_fft"]
win_length = transform_config["win_length"]
hop_length = transform_config["hop_length"]
fmin = transform_config["fmin"]
fmax = transform_config["fmax"]
n_mels = transform_config["n_mels"]
min_level_db = transform_config["min_level_db"]
ref_level_db = transform_config["ref_level_db"]
max_norm = transform_config["max_norm"]
clip_norm = transform_config["clip_norm"]
transform = Transform(replace_pronounciation_prob, sample_rate,
preemphasis, n_fft, win_length, hop_length, fmin,
fmax, n_mels, min_level_db, ref_level_db, max_norm,
clip_norm)
ljspeech = TransformDataset(meta, transform)
# =========================dataiterator=========================
# use meta data's text length as a sort key for the sampler
train_config = config["train"]
batch_size = train_config["batch_size"]
text_lengths = [len(example[2]) for example in meta]
sampler = PartialyRandomizedSimilarTimeLengthSampler(
text_lengths, batch_size)
# some hyperparameters affect how we process data, so create a data collector!
model_config = config["model"]
downsample_factor = model_config["downsample_factor"]
r = model_config["outputs_per_step"]
collector = DataCollector(downsample_factor=downsample_factor, r=r)
ljspeech_loader = DataCargo(ljspeech,
batch_fn=collector,
batch_size=batch_size,
sampler=sampler)
# =========================model=========================
if args.device == -1:
place = fluid.CPUPlace()
else:
place = fluid.CUDAPlace(args.device)
with dg.guard(place):
# =========================model=========================
n_speakers = model_config["n_speakers"]
speaker_dim = model_config["speaker_embed_dim"]
speaker_embed_std = model_config["speaker_embedding_weight_std"]
n_vocab = en.n_vocab
embed_dim = model_config["text_embed_dim"]
linear_dim = 1 + n_fft // 2
use_decoder_states = model_config[
"use_decoder_state_for_postnet_input"]
filter_size = model_config["kernel_size"]
encoder_channels = model_config["encoder_channels"]
decoder_channels = model_config["decoder_channels"]
converter_channels = model_config["converter_channels"]
dropout = model_config["dropout"]
padding_idx = model_config["padding_idx"]
embedding_std = model_config["embedding_weight_std"]
max_positions = model_config["max_positions"]
freeze_embedding = model_config["freeze_embedding"]
trainable_positional_encodings = model_config[
"trainable_positional_encodings"]
use_memory_mask = model_config["use_memory_mask"]
query_position_rate = model_config["query_position_rate"]
key_position_rate = model_config["key_position_rate"]
window_behind = model_config["window_behind"]
window_ahead = model_config["window_ahead"]
key_projection = model_config["key_projection"]
value_projection = model_config["value_projection"]
dv3 = make_model(n_speakers, speaker_dim, speaker_embed_std, embed_dim,
padding_idx, embedding_std, max_positions, n_vocab,
freeze_embedding, filter_size, encoder_channels,
n_mels, decoder_channels, r,
trainable_positional_encodings, use_memory_mask,
query_position_rate, key_position_rate, window_behind,
window_ahead, key_projection, value_projection,
downsample_factor, linear_dim, use_decoder_states,
converter_channels, dropout)
# =========================loss=========================
loss_config = config["loss"]
masked_weight = loss_config["masked_loss_weight"]
priority_freq = loss_config["priority_freq"] # Hz
priority_bin = int(priority_freq / (0.5 * sample_rate) * linear_dim)
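        # the first `priority_bin` linear-spectrogram bins cover frequencies below
        # priority_freq Hz and can be weighted extra in the linear loss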
priority_freq_weight = loss_config["priority_freq_weight"]
binary_divergence_weight = loss_config["binary_divergence_weight"]
guided_attention_sigma = loss_config["guided_attention_sigma"]
criterion = TTSLoss(masked_weight=masked_weight,
priority_bin=priority_bin,
priority_weight=priority_freq_weight,
binary_divergence_weight=binary_divergence_weight,
guided_attention_sigma=guided_attention_sigma,
downsample_factor=downsample_factor,
r=r)
# =========================lr_scheduler=========================
lr_config = config["lr_scheduler"]
warmup_steps = lr_config["warmup_steps"]
peak_learning_rate = lr_config["peak_learning_rate"]
lr_scheduler = dg.NoamDecay(
1 / (warmup_steps * (peak_learning_rate)**2), warmup_steps)
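        # Noam schedule: lr = d_model**-0.5 * min(step**-0.5, step * warmup**-1.5);
        # setting d_model = 1 / (warmup_steps * peak_lr**2) makes the peak lr equal peak_learning_rate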
# =========================optimizer=========================
optim_config = config["optimizer"]
beta1 = optim_config["beta1"]
beta2 = optim_config["beta2"]
epsilon = optim_config["epsilon"]
optim = fluid.optimizer.Adam(lr_scheduler,
beta1,
beta2,
epsilon=epsilon,
parameter_list=dv3.parameters())
gradient_clipper = fluid.dygraph_grad_clip.GradClipByGlobalNorm(0.1)
# =========================link(dataloader, paddle)=========================
# CAUTION: it does not return a DataLoader
loader = fluid.io.DataLoader.from_generator(capacity=10,
return_list=True)
loader.set_batch_generator(ljspeech_loader, places=place)
# tensorboard & checkpoint preparation
output_dir = args.output
ckpt_dir = os.path.join(output_dir, "checkpoints")
log_dir = os.path.join(output_dir, "log")
state_dir = os.path.join(output_dir, "states")
make_output_tree(output_dir)
writer = SummaryWriter(logdir=log_dir)
# load model parameters
resume_path = args.resume
if resume_path is not None:
state, _ = dg.load_dygraph(args.resume)
dv3.set_dict(state)
# =========================train=========================
epoch = train_config["epochs"]
report_interval = train_config["report_interval"]
snap_interval = train_config["snap_interval"]
save_interval = train_config["save_interval"]
eval_interval = train_config["eval_interval"]
global_step = 1
average_loss = {"mel": 0, "lin": 0, "done": 0, "attn": 0}
for j in range(1, 1 + epoch):
epoch_loss = {"mel": 0., "lin": 0., "done": 0., "attn": 0.}
for i, batch in tqdm.tqdm(enumerate(loader, 1)):
dv3.train() # CAUTION: don't forget to switch to train
(text_sequences, text_lengths, text_positions, mel_specs,
lin_specs, frames, decoder_positions, done_flags) = batch
downsampled_mel_specs = F.strided_slice(
mel_specs,
axes=[1],
starts=[0],
ends=[mel_specs.shape[1]],
strides=[downsample_factor])
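                # keep every downsample_factor-th frame: the decoder is trained on
                # time-downsampled mel targets, and the converter upsamples back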
mel_outputs, linear_outputs, alignments, done = dv3(
text_sequences, text_positions, text_lengths, None,
downsampled_mel_specs, decoder_positions)
losses = criterion(mel_outputs, linear_outputs, done,
alignments, downsampled_mel_specs,
lin_specs, done_flags, text_lengths, frames)
l = criterion.compose_loss(losses)
l.backward()
optim.minimize(l, grad_clip=gradient_clipper)
dv3.clear_gradients()
# ==================all kinds of tedious things=================
for k in epoch_loss.keys():
epoch_loss[k] += losses[k].numpy()[0]
average_loss[k] += losses[k].numpy()[0]
# record step loss into tensorboard
step_loss = {k: v.numpy()[0] for k, v in losses.items()}
for k, v in step_loss.items():
writer.add_scalar(k, v, global_step)
# TODO: clean code
# train state saving, the first sentence in the batch
if global_step % snap_interval == 0:
linear_outputs_np = linear_outputs.numpy()[0].T
                    denormalized = np.clip(linear_outputs_np, 0, 1) \
                        * (-min_level_db) \
                        + min_level_db
                    lin_scaled = np.exp(
                        (denormalized + ref_level_db) / 20 * np.log(10))
synthesis_config = config["synthesis"]
power = synthesis_config["power"]
n_iter = synthesis_config["n_iter"]
wav = librosa.griffinlim(lin_scaled**power,
n_iter=n_iter,
hop_length=hop_length,
win_length=win_length)
save_state(state_dir,
global_step,
mel_input=mel_specs.numpy()[0].T,
mel_output=mel_outputs.numpy()[0].T,
lin_input=lin_specs.numpy()[0].T,
lin_output=linear_outputs.numpy()[0].T,
alignments=alignments.numpy()[:, 0, :, :],
wav=wav)
# evaluation
if global_step % eval_interval == 0:
sentences = [
"Scientists at the CERN laboratory say they have discovered a new particle.",
"There's a way to measure the acute emotional intelligence that has never gone out of style.",
"President Trump met with other leaders at the Group of 20 conference.",
"Generative adversarial network or variational auto-encoder.",
"Please call Stella.",
"Some have accepted this as a miracle without any physical explanation.",
]
                    for idx, sent in enumerate(sentences):
wav, attn = eval_model(dv3, sent,
replace_pronounciation_prob,
min_level_db, ref_level_db,
power, n_iter, win_length,
hop_length, preemphasis)
wav_path = os.path.join(
state_dir, "waveform",
"eval_sample_{:09d}.wav".format(global_step))
sf.write(wav_path, wav, sample_rate)
attn_path = os.path.join(
state_dir, "alignments",
"eval_sample_attn_{:09d}.png".format(global_step))
plot_alignment(attn, attn_path)
# save checkpoint
if global_step % save_interval == 0:
dg.save_dygraph(dv3.state_dict(),
os.path.join(ckpt_dir, "dv3"))
dg.save_dygraph(optim.state_dict(),
os.path.join(ckpt_dir, "dv3"))
# report average loss
if global_step % report_interval == 0:
for k in epoch_loss.keys():
average_loss[k] /= report_interval
print("[average_loss] ",
"global_step: {}".format(global_step), average_loss)
average_loss = {"mel": 0, "lin": 0, "done": 0, "attn": 0}
global_step += 1
# epoch report
for k in epoch_loss.keys():
epoch_loss[k] /= i
print("[epoch_loss] ", "epoch: {}".format(j), epoch_loss)

@@ -0,0 +1,283 @@
import os
import argparse
import numpy as np
import pandas as pd
from matplotlib import cm
import matplotlib.pyplot as plt
import tqdm
import librosa
from scipy import signal
from librosa import display
import soundfile as sf
from tensorboardX import SummaryWriter
from paddle import fluid
import paddle.fluid.layers as F
import paddle.fluid.dygraph as dg
from parakeet.g2p import en
from parakeet.models.Rdeepvoice3.encoder import ConvSpec
from parakeet.data import FilterDataset, TransformDataset, FilterDataset, DatasetMixin
from parakeet.data import DataCargo, PartialyRandomizedSimilarTimeLengthSampler, SequentialSampler
from parakeet.models.Rdeepvoice3 import Encoder, Decoder, Converter, DeepVoice3
from parakeet.models.Rdeepvoice3.loss import TTSLoss
from parakeet.modules.weight_norm_wrapper import WeightNormWrapper
from parakeet.utils.layer_tools import summary
from data_validate import LJSpeechMetaData, DataCollector, Transform
from utils import make_model, eval_model, plot_alignment, plot_alignments, save_state
if __name__ == "__main__":
parser = argparse.ArgumentParser(
description="Train a deepvoice 3 model with LJSpeech")
parser.add_argument("-o",
"--output",
type=str,
default="result",
help="The directory to save result.")
parser.add_argument("-d",
"--data",
type=str,
default="/workspace/datasets/ljs_dv3",
help="The path of the LJSpeech dataset.")
parser.add_argument("-r", "--resume", type=str, help="checkpoint to load")
args, _ = parser.parse_known_args()
# =========================dataset=========================
data_root = args.data
meta = LJSpeechMetaData(data_root) # construct meta data
#meta = FilterDataset(meta, lambda x: len(x[3]) >= 20) # filter it!
transform = Transform()
ljspeech = TransformDataset(meta, transform)
# =========================dataiterator=========================
# use meta data's text length as a sort key
# which is used in sampler
text_lengths = [len(example[3]) for example in meta]
# some hyperparameters affect how we process data, so create a data collector!
collector = DataCollector(downsample_factor=4., r=1)
ljspeech_loader = DataCargo(ljspeech,
batch_fn=collector,
batch_size=16,
sampler=SequentialSampler(ljspeech))
# sampler=PartialyRandomizedSimilarTimeLengthSampler(text_lengths,
# batch_size=32))
# ljspeech_iterator = ljspeech_loader() # if you want to inspect it!
# for i in range(3):
# batch = next(ljspeech_iterator)
# print(batch)
# =========================model=========================
sample_rate = 22050
n_speakers = 1
speaker_dim = 16
n_vocab = en.n_vocab
embed_dim = 256
mel_dim = 80
downsample_factor = 4
r = 1
linear_dim = 1 + 1024 // 2
use_decoder_states = True
filter_size = 3
encoder_channels = 512
decoder_channels = 256
converter_channels = 256
dropout = 0. #0.050000000000000044
place = fluid.CPUPlace()
with dg.guard(place):
# =========================model=========================
dv3 = make_model(n_speakers, speaker_dim, n_vocab, embed_dim, mel_dim,
downsample_factor, r, linear_dim, use_decoder_states,
filter_size, encoder_channels, decoder_channels,
converter_channels, dropout)
# =========================loss=========================
priority_freq = 3000 # Hz
priority_bin = int(priority_freq / (0.5 * sample_rate) * linear_dim)
criterion = TTSLoss(masked_weight=.5,
priority_bin=priority_bin,
priority_weight=.0,
binary_divergence_weight=.1,
guided_attention_sigma=.2,
downsample_factor=downsample_factor,
r=r)
# summary(dv3)
# =========================lr_scheduler=========================
warmup_steps = 4000
peak_learning_rate = 5e-4
lr_scheduler = dg.NoamDecay(d_model=1 / (warmup_steps *
(peak_learning_rate)**2),
warmup_steps=warmup_steps)
# =========================optimizer=========================
beta1, beta2 = 0.5, 0.9
epsilon = 1e-6
optim = fluid.optimizer.Adam(lr_scheduler,
beta1,
beta2,
epsilon=1e-6,
parameter_list=dv3.parameters())
# =========================link(dataloader, paddle)=========================
# CAUTION: it does not return a DataLoader
loader = fluid.io.DataLoader.from_generator(capacity=10,
return_list=True)
loader.set_batch_generator(ljspeech_loader, places=place)
# tensorboard & checkpoint preparation
output_dir = args.output
ckpt_dir = os.path.join(output_dir, "checkpoints")
state_dir = os.path.join(output_dir, "states")
log_dir = os.path.join(output_dir, "log")
for x in [ckpt_dir, state_dir]:
if not os.path.exists(x):
os.makedirs(x)
for x in ["alignments", "waveform", "lin_spec", "mel_spec"]:
p = os.path.join(state_dir, x)
if not os.path.exists(p):
os.makedirs(p)
writer = SummaryWriter(logdir=log_dir)
# DEBUG
resume_path = args.resume
if resume_path is not None:
state, _ = dg.load_dygraph(args.resume)
dv3.set_dict(state)
# =========================train=========================
epoch = 3000
global_step = 1
average_loss = {"mel": 0, "lin": 0, "done": 0, "attn": 0}
epoch_loss = {"mel": 0, "lin": 0, "done": 0, "attn": 0}
for j in range(epoch):
for i, batch in tqdm.tqdm(enumerate(loader)):
dv3.train() # switch to train
(text_sequences, text_lengths, text_positions, mel_specs,
lin_specs, frames, decoder_positions, done_flags) = batch
downsampled_mel_specs = F.strided_slice(
mel_specs,
axes=[1],
starts=[0],
ends=[mel_specs.shape[1]],
strides=[downsample_factor])
mel_outputs, linear_outputs, alignments, done = dv3(
text_sequences, text_positions, text_lengths, None,
downsampled_mel_specs, decoder_positions)
# print("========")
# print("text lengths: {}".format(text_lengths.numpy()))
# print("n frames: {}".format(frames.numpy()))
# print("[mel] mel's shape: {}; "
# "downsampled mel's shape: {}; "
# "output's shape: {}".format(mel_specs.shape,
# downsampled_mel_specs.shape,
# mel_outputs.shape))
# print("[lin] lin's shape: {}; "
# "output's shape{}".format(lin_specs.shape,
# linear_outputs.shape))
# print("[attn]: alignments's shape: {}".format(alignments.shape))
# print("[done]: input done flag's shape: {}; "
# "output done flag's shape: {}".format(
# done_flags.shape, done.shape))
losses = criterion(mel_outputs, linear_outputs, done,
alignments, downsampled_mel_specs,
lin_specs, done_flags, text_lengths, frames)
for k in epoch_loss.keys():
epoch_loss[k] += losses[k].numpy()[0]
average_loss[k] += losses[k].numpy()[0]
global_step += 1
# train state saving, the first sentence in the batch
if global_step > 0 and global_step % 10 == 0:
linear_outputs_np = linear_outputs.numpy()[0].T
denoramlized = np.clip(linear_outputs_np, 0,
1) * 100. - 100.
lin_scaled = np.exp((denoramlized + 20) / 20 * np.log(10))
wav = librosa.griffinlim(lin_scaled**1.4,
n_iter=32,
hop_length=256,
win_length=1024)
save_state(state_dir,
global_step,
mel_input=mel_specs.numpy()[0].T,
mel_output=mel_outputs.numpy()[0].T,
lin_input=lin_specs.numpy()[0].T,
lin_output=linear_outputs.numpy()[0].T,
alignments=alignments.numpy()[:, 0, :, :],
wav=wav)
# evaluation
if global_step > 0 and global_step % 10 == 0:
wav, attn = eval_model(
dv3,
"Printing, in the only sense with which we are at present concerned, differs from most if not from all the arts and crafts represented in the Exhibition"
)
wav_path = os.path.join(
state_dir, "waveform",
"eval_sample_{}.wav".format(global_step))
sf.write(wav_path, wav, 22050)
attn_path = os.path.join(
state_dir, "alignments",
"eval_sample_attn_{}.png".format(global_step))
plot_alignment(attn, attn_path)
# for tensorboard writer, if you want more, write more
# cause you are in the process
step_loss = {k: v.numpy()[0] for k, v in losses.items()}
for k, v in step_loss.items():
writer.add_scalar(k, v, global_step)
# save checkpoint
if global_step % 1000 == 0:
for i, attn_layer in enumerate(
alignments.numpy()[:, 0, :, :]):
plt.figure()
plt.imshow(attn_layer)
plt.xlabel("encoder_timesteps")
plt.ylabel("decoder_timesteps")
plt.savefig("results3/step_{}_layer_{}.png".format(
global_step, i),
format="png")
plt.close()
# print(step_loss)
if global_step % 100 == 0:
for k in epoch_loss.keys():
average_loss[k] /= 100
print("[average_loss] ",
"global_step: {}".format(global_step), average_loss)
average_loss = {"mel": 0, "lin": 0, "done": 0, "attn": 0}
l = criterion.compose_loss(losses)
l.backward()
# print("loss: ", l.numpy()[0])
optim.minimize(
l,
grad_clip=fluid.dygraph_grad_clip.GradClipByGlobalNorm(
0.1))
dv3.clear_gradients()
if global_step % 10000 == 0:
dg.save_dygraph(dv3.state_dict(),
os.path.join(ckpt_dir, "dv3"))
dg.save_dygraph(optim.state_dict(),
os.path.join(ckpt_dir, "dv3"))
for k in epoch_loss.keys():
epoch_loss[k] /= (i + 1)
print("[epoch_loss] ", "epoch: {}".format(j + 1), epoch_loss)
epoch_loss = {"mel": 0, "lin": 0, "done": 0, "attn": 0}

@@ -0,0 +1,272 @@
import os
import numpy as np
import matplotlib.pyplot as plt
import librosa
from scipy import signal
from librosa import display
import soundfile as sf
from paddle import fluid
import paddle.fluid.dygraph as dg
import paddle.fluid.initializer as I
from parakeet.g2p import en
from parakeet.models.deepvoice3.encoder import ConvSpec
from parakeet.models.deepvoice3 import Encoder, Decoder, Converter, DeepVoice3, WindowRange
from parakeet.utils.layer_tools import freeze
@fluid.framework.dygraph_only
def make_model(n_speakers, speaker_dim, speaker_embed_std, embed_dim,
padding_idx, embedding_std, max_positions, n_vocab,
freeze_embedding, filter_size, encoder_channels, mel_dim,
decoder_channels, r, trainable_positional_encodings,
use_memory_mask, query_position_rate, key_position_rate,
window_behind, window_ahead, key_projection, value_projection,
downsample_factor, linear_dim, use_decoder_states,
converter_channels, dropout):
"""just a simple function to create a deepvoice 3 model"""
if n_speakers > 1:
spe = dg.Embedding((n_speakers, speaker_dim),
param_attr=I.Normal(scale=speaker_embed_std))
else:
spe = None
h = encoder_channels
k = filter_size
encoder_convolutions = (
ConvSpec(h, k, 1),
ConvSpec(h, k, 3),
ConvSpec(h, k, 9),
ConvSpec(h, k, 27),
ConvSpec(h, k, 1),
ConvSpec(h, k, 3),
ConvSpec(h, k, 9),
ConvSpec(h, k, 27),
ConvSpec(h, k, 1),
ConvSpec(h, k, 3),
)
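    # each ConvSpec is (output channels, kernel size, dilation); the repeated
    # 1, 3, 9, 27 dilation pattern grows the receptive field exponentially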
enc = Encoder(n_vocab,
embed_dim,
n_speakers,
speaker_dim,
padding_idx=padding_idx,
embedding_weight_std=embedding_std,
convolutions=encoder_convolutions,
max_positions=max_positions,
dropout=dropout)
if freeze_embedding:
freeze(enc.embed)
h = decoder_channels
prenet_convolutions = (ConvSpec(h, k, 1), ConvSpec(h, k, 3))
attentive_convolutions = (
ConvSpec(h, k, 1),
ConvSpec(h, k, 3),
ConvSpec(h, k, 9),
ConvSpec(h, k, 27),
ConvSpec(h, k, 1),
)
attention = [True, False, False, False, True]
force_monotonic_attention = [True, False, False, False, True]
dec = Decoder(n_speakers,
speaker_dim,
embed_dim,
mel_dim,
r=r,
max_positions=max_positions,
padding_idx=padding_idx,
preattention=prenet_convolutions,
convolutions=attentive_convolutions,
attention=attention,
dropout=dropout,
use_memory_mask=use_memory_mask,
force_monotonic_attention=force_monotonic_attention,
query_position_rate=query_position_rate,
key_position_rate=key_position_rate,
window_range=WindowRange(window_behind, window_ahead),
key_projection=key_projection,
value_projection=value_projection)
if not trainable_positional_encodings:
freeze(dec.embed_keys_positions)
freeze(dec.embed_query_positions)
h = converter_channels
postnet_convolutions = (
ConvSpec(h, k, 1),
ConvSpec(h, k, 3),
ConvSpec(2 * h, k, 1),
ConvSpec(2 * h, k, 3),
)
cvt = Converter(n_speakers,
speaker_dim,
dec.state_dim if use_decoder_states else mel_dim,
linear_dim,
time_upsampling=downsample_factor,
convolutions=postnet_convolutions,
dropout=dropout)
dv3 = DeepVoice3(enc, dec, cvt, spe, use_decoder_states)
return dv3
@fluid.framework.dygraph_only
def eval_model(model, text, replace_pronounciation_prob, min_level_db,
ref_level_db, power, n_iter, win_length, hop_length,
preemphasis):
"""generate waveform from text using a deepvoice 3 model"""
text = np.array(en.text_to_sequence(text, p=replace_pronounciation_prob),
dtype=np.int64)
length = len(text)
print("text sequence's length: {}".format(length))
text_positions = np.arange(1, 1 + length)
text = np.expand_dims(text, 0)
text_positions = np.expand_dims(text_positions, 0)
mel_outputs, linear_outputs, alignments, done = model.transduce(
dg.to_variable(text), dg.to_variable(text_positions))
linear_outputs_np = linear_outputs.numpy()[0].T # (C, T)
print("linear_outputs's shape: ", linear_outputs_np.shape)
    denormalized = np.clip(linear_outputs_np, 0,
                           1) * (-min_level_db) + min_level_db
    lin_scaled = np.exp((denormalized + ref_level_db) / 20 * np.log(10))
wav = librosa.griffinlim(lin_scaled**power,
n_iter=n_iter,
hop_length=hop_length,
win_length=win_length)
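    # de-emphasis: apply the inverse of the pre-emphasis filter used in preprocessing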
wav = signal.lfilter([1.], [1., -preemphasis], wav)
print("alignmnets' shape:", alignments.shape)
alignments_np = alignments.numpy()[0].T
return wav, alignments_np
def make_output_tree(output_dir):
print("creating output tree: {}".format(output_dir))
ckpt_dir = os.path.join(output_dir, "checkpoints")
state_dir = os.path.join(output_dir, "states")
log_dir = os.path.join(output_dir, "log")
for x in [ckpt_dir, state_dir]:
if not os.path.exists(x):
os.makedirs(x)
for x in ["alignments", "waveform", "lin_spec", "mel_spec"]:
p = os.path.join(state_dir, x)
if not os.path.exists(p):
os.makedirs(p)
def plot_alignment(alignment, path, info=None):
"""
Plot an attention layer's alignment for a sentence.
alignment: shape(T_enc, T_dec), and T_enc is flipped
"""
fig, ax = plt.subplots()
im = ax.imshow(alignment,
aspect='auto',
origin='lower',
interpolation='none')
fig.colorbar(im, ax=ax)
xlabel = 'Decoder timestep'
if info is not None:
xlabel += '\n\n' + info
plt.xlabel(xlabel)
plt.ylabel('Encoder timestep')
plt.tight_layout()
plt.savefig(path)
plt.close()
def plot_alignments(alignments, save_dir, global_step):
"""
    Plot alignments for a sentence during training; we just pick the first
    sentence in the batch. Each layer is plotted separately.
alignments: shape(N, T_dec, T_enc)
"""
n_layers = alignments.shape[0]
for i, alignment in enumerate(alignments):
alignment = alignment.T
path = os.path.join(save_dir, "layer_{}".format(i))
if not os.path.exists(path):
os.makedirs(path)
fname = os.path.join(path, "step_{:09d}".format(global_step))
plot_alignment(alignment, fname)
average_alignment = np.mean(alignments, axis=0).T
path = os.path.join(save_dir, "average")
if not os.path.exists(path):
os.makedirs(path)
fname = os.path.join(path, "step_{:09d}.png".format(global_step))
plot_alignment(average_alignment, fname)
def save_state(save_dir,
global_step,
mel_input=None,
mel_output=None,
lin_input=None,
lin_output=None,
alignments=None,
wav=None):
if mel_input is not None and mel_output is not None:
path = os.path.join(save_dir, "mel_spec")
if not os.path.exists(path):
os.makedirs(path)
plt.figure(figsize=(10, 3))
display.specshow(mel_input)
plt.colorbar()
plt.title("mel_input")
plt.savefig(
os.path.join(path,
"target_mel_spec_step{:09d}".format(global_step)))
plt.close()
plt.figure(figsize=(10, 3))
display.specshow(mel_output)
plt.colorbar()
plt.title("mel_input")
plt.savefig(
os.path.join(path,
"predicted_mel_spec_step{:09d}".format(global_step)))
plt.close()
if lin_input is not None and lin_output is not None:
path = os.path.join(save_dir, "lin_spec")
if not os.path.exists(path):
os.makedirs(path)
plt.figure(figsize=(10, 3))
display.specshow(lin_input)
plt.colorbar()
plt.title("mel_input")
plt.savefig(
os.path.join(path,
"target_lin_spec_step{:09d}".format(global_step)))
plt.close()
plt.figure(figsize=(10, 3))
display.specshow(lin_output)
plt.colorbar()
plt.title("mel_input")
plt.savefig(
os.path.join(path,
"predicted_lin_spec_step{:09d}".format(global_step)))
plt.close()
if alignments is not None and len(alignments.shape) == 3:
path = os.path.join(save_dir, "alignments")
if not os.path.exists(path):
os.makedirs(path)
plot_alignments(alignments, path, global_step)
if wav is not None:
path = os.path.join(save_dir, "waveform")
if not os.path.exists(path):
os.makedirs(path)
sf.write(
os.path.join(path, "sample_step_{:09d}.wav".format(global_step)),
wav, 22050)

@@ -1,208 +0,0 @@
# Deep Voice 3 with Paddle Fluid
[中文版](README_cn.md)
Paddle fluid implementation of DeepVoice 3, a convolutional network based text-to-speech synthesis model. The implementation is based on [Deep Voice 3: Scaling Text-to-Speech with Convolutional Sequence Learning](https://arxiv.org/abs/1710.07654).
We implement the DeepVoice3 model in Paddle Fluid with dynamic graph mode, which is convenient for building flexible network architectures.
## Installation
You additionally need to download punkt and cmudict for nltk, because we tokenize text with `punkt` and convert text into phonemes with `cmudict`.
```python
import nltk
nltk.download("punkt")
nltk.download("cmudict")
```
## Model Architecture
![DeepVoice3 model architecture](./_images/model_architecture.png)
The model consists of an encoder, a decoder and a converter (and a speaker embedding for multispeaker models). The encoder, together with the decoder forms the seq2seq part of the model, and the converter forms the postnet part.
## Project Structure
```text
├── audio.py # audio processing
├── compute_timestamp_ratio.py # script to compute position rate
├── conversion # parameter conversion from pytorch model
├── requirements.txt # requirements
├── hparams.py # HParam class for deepvoice3
├── hparam_tf # hyper parameter related stuffs
├── ljspeech.py # functions for ljspeech preprocessing
├── preprocess.py # preprocrssing script
├── presets # preset hyperparameters
├── deepvoice3_paddle # DeepVoice3 model implementation
├── eval_model.py # functions for model evaluation
├── synthesis.py # script for speech synthesis
├── train_model.py # functions for model training
└── train.py # script for model training
```
## Usage
There are many hyperparameters to tune, depending on the model specification and the dataset you are working on. Hyperparameters that are known to work well are provided in the repository; see the `presets` directory for details. Currently we only provide a preset for the LJSpeech dataset (`deepvoice3_ljspeech.json`). Support for more models and datasets is pending.
Note that `preprocess.py`, `train.py` and `synthesis.py` all accept a `--preset` parameter. To ensure consistency, you should use the same preset for preprocessing, training and synthesizing.
Note that you can override preset hyperparameters with the command line argument `--hparams`: pass key-value pairs in `${key}=${value}` format, separated by commas `,`. For example, `--hparams="batch_size=8, nepochs=500"` overrides the default values in the preset json file.
Some hyperparameters only affect training, such as `batch_size` and `checkpoint_interval`, so you can use different values for preprocessing and training. But hyperparameters related to data preprocessing, such as `num_mels` and `ref_level_db`, should be kept the same for preprocessing and training.
For more details about hyperparameters, see `hparams.py`, which contains the definition of `hparams`. Priority order of hyperparameters is command line option `--hparams` > `--preset` json configuration file > definition of hparams in `hparams.py`.
### Dataset
Download and unzip [LJSpeech](https://keithito.com/LJ-Speech-Dataset/).
```bash
wget https://data.keithito.com/data/speech/LJSpeech-1.1.tar.bz2
tar xjvf LJSpeech-1.1.tar.bz2
```
Preprocess the data with `preprocess.py`.
```bash
python preprocess.py \
--preset=${preset_json_path} \
--hparams="hyper parameters you want to overwrite" \
${name} ${in_dir} ${out_dir}
```
Currently, `${name}` only supports `ljspeech`. Support for other datasets is pending.
Assuming that you use `presets/deepvoice3_ljspeech.json` for LJSpeech and the path of the unzipped dataset is `./data/LJSpeech-1.1`, you can preprocess the data with the following command.
```bash
python preprocess.py \
--preset=presets/deepvoice3_ljspeech.json \
ljspeech ./data/LJSpeech-1.1/ ./data/ljspeech
```
When this is done, you will see extracted features in `./data/ljspeech`, including:
1. text and the corresponding file names of the extracted features in `train.txt`.
2. mel-spectrograms in `ljspeech-mel-*.npy`.
3. linear spectrograms in `ljspeech-spec-*.npy`.
### Train on single GPU
Training the whole model on one single GPU:
```bash
export CUDA_VISIBLE_DEVICES=0
python train.py --data-root=${data-root} --use-gpu \
--preset=${preset_json_path} \
--hparams="parameters you may want to override"
```
For more details about `train.py`, see `python train.py --help`.
#### load checkpoints
We provide a trained model ([dv3.single_frame](https://paddlespeech.bj.bcebos.com/Parakeet/dv3.single_frame.tar.gz)) for download, trained with the default preset. Unzip the downloaded file with `tar xzvf dv3.single_frame.tar.gz` and you will get `config.json`, `model.pdparams` and `README.md`. `config.json` is the preset json with which the model was trained, `model.pdparams` is the parameter file, and `README.md` is a brief introduction to the model.
You can load a saved checkpoint and resume training with `--checkpoint` (you only need to provide the base name of the parameter file, e.g. to load `model.pdparams`, just use `--checkpoint=model`). If there is a file with the same base name and the extension `.pdopt` in the same folder as the model file (i.e. `model.pdopt`, the optimizer file), it is loaded automatically as well. If you want to reset the optimizer states, pass `--reset-optimizer` in addition.
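For example, to resume training from the downloaded model (assuming it was extracted into a `dv3.single_frame` directory; adjust the paths to wherever the files actually live), the command would look something like this:
```bash
export CUDA_VISIBLE_DEVICES=0
python train.py --data-root=${data-root} --use-gpu \
    --preset=dv3.single_frame/config.json \
    --checkpoint=dv3.single_frame/model \
    --output=${directory_to_save_results}
```
Add `--reset-optimizer` if you want to discard the saved optimizer states instead of resuming them.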
#### train a part of the model
You can also train part of the model while freezing the other parts, by passing `--train-seq2seq-only` or `--train-postnet-only`. When training only part of the model, the other parts should be loaded from a saved checkpoint.
To train only the `seq2seq` part or the `postnet`, you should load a whole model with `--checkpoint` and keep the same configuration with which the checkpoint was trained. Note that when training only the `postnet`, you should set `use_decoder_state_for_postnet_input=false`, because in this case the postnet takes the ground truth mel-spectrogram as input (the default value of `use_decoder_state_for_postnet_input` is `True`).
example:
```bash
export CUDA_VISIBLE_DEVICES=0
python train.py --data-root=${data-root} --use-gpu \
--preset=${preset_json_path} \
--hparams="parameters you may want to override" \
--train-seq2seq-only \
--output=${directory_to_save_results}
```
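A sketch for the complementary case, training only the `postnet` from the same checkpoint (note the `use_decoder_state_for_postnet_input=false` override discussed above; `${path_of_the_saved_model}` stands for the base name of the checkpoint to load):
```bash
export CUDA_VISIBLE_DEVICES=0
python train.py --data-root=${data-root} --use-gpu \
    --preset=${preset_json_path} \
    --hparams="use_decoder_state_for_postnet_input=false" \
    --checkpoint=${path_of_the_saved_model} \
    --train-postnet-only \
    --output=${directory_to_save_results}
```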
### Training on multiple GPUs
Training on multiple GPUs with data parallelism is supported. You can run `train.py` with the `paddle.distributed.launch` module. Here is the command line usage.
```bash
python -m paddle.distributed.launch \
--started_port ${port_of_the_first_worker} \
--selected_gpus ${logical_gpu_ids_to_choose} \
--log_dir ${path_of_write_log} \
training_script ...
```
`paddle.distributed.launch` parallelizes training in multiprocessing mode. `--selected_gpus` gives the logical ids of the selected GPUs, and `--started_port` is the port used by the first worker. Outputs of each process are saved in `--log_dir`. Then append the command used for training on a single GPU, except that you should pass `--use-data-parallel` in addition.
```bash
export CUDA_VISIBLE_DEVICES=2,3,4,5 # The IDs of visible physical devices
python -m paddle.distributed.launch \
--selected_gpus=0,1,2,3 --log_dir ${multi_gpu_log_dir} \
train.py --data-root=${data-root} \
--use-gpu --use-data-parallel \
--preset=${preset_json_path} \
--hparams="parameters you may want to override"
```
In the example above, only GPUs `2, 3, 4, 5` are made visible. Then `--selected_gpus=0,1,2,3` gives the logical ids of the selected GPUs, which correspond to physical GPUs `2, 3, 4, 5`.
Model checkpoints (`*.pdparams` for the model and `*.pdopt` for the optimizer) are saved in `${directory_to_save_results}/checkpoints` every 10000 steps by default. Layer-wise averaged attention alignments (`.png`) are saved in `${directory_to_save_results}/checkpoints/alignment_ave`, and alignments for each attention layer are saved in `${directory_to_save_results}/checkpoints/alignment_layer{attention_layer_num}` every 10000 steps for inspection.
Synthesis results of 6 sentences (hardcoded in `eval_model.py`) are saved in `${directory_to_save_results}/checkpoints/eval`, including `step{step_num}_text{text_id}_single_alignment.png` for averaged alignments and `step{step_num}_text{text_id}_single_predicted.wav` for the predicted waveforms.
### Monitor with Tensorboard
Logs with tensorboard are saved in `${directory_to_save_results}/log/` directory by default. You can monitor logs by tensorboard.
```bash
tensorboard --logdir=${log_dir} --host=$HOSTNAME --port=8888
```
### Synthesize from a checkpoint
Given a list of sentences, `synthesis.py` synthesizes audio signals with a trained model.
```bash
python synthesis.py --use-gpu --preset=${preset_json_path} \
--hparams="parameters you may want to override" \
${checkpoint} ${text_list_file} ${dst_dir}
```
Example test_list.txt:
```text
Generative adversarial network or variational auto-encoder.
Once upon a time there was a dear little girl who was loved by every one who looked at her, but most of all by her grandmother, and there was nothing that she would not have given to the child.
A text-to-speech synthesis system typically consists of multiple stages, such as a text analysis frontend, an acoustic model and an audio synthesis module.
```
Generated waveform files and alignment files are saved in `${dst_dir}`.
### Compute position ratio
According to [Deep Voice 3: Scaling Text-to-Speech with Convolutional Sequence Learning](https://arxiv.org/abs/1710.07654), the position rate differs across datasets. There are two position rates, one for the query and the other for the key, which are referred to as $\omega_1$ and $\omega_2$ in the paper; the corresponding names in the preset json are `query_position_rate` and `key_position_rate`.
For example, the `query_position_rate` and `key_position_rate` for LJSpeech are `1.0` and `1.385`, respectively. With `query_position_rate` fixed at 1.0, the `key_position_rate` is computed with `compute_timestamp_ratio.py`. Run the command below, where `${data_root}` is the path of the preprocessed dataset.
```bash
python compute_timestamp_ratio.py --preset=${preset_json_path} \
--hparams="parameters you may want to override" ${data_root}
```
You will get outputs like this.
```text
100%|██████████████████████████████████████████████████████████| 13047/13047 [00:12<00:00, 1058.19it/s]
1345587 1863884.0 1.3851828235558161
```
Then set the `key_position_rate=1.385` and `query_position_rate=1.0` in the preset.
## Acknowledgement
We thankfully included and adapted some files from r9y9's [deepvoice3_pytorch](https://github.com/r9y9/deepvoice3_pytorch).

@@ -1,224 +0,0 @@
# Deep Voice 3 with Paddle Fluid
[English](README.md)
Paddle 实现的 Deepvoice3一个基于卷积神经网络的语音合成 (Text to Speech) 模型。本实现基于 [Deep Voice 3: Scaling Text-to-Speech with Convolutional Sequence Learning](https://arxiv.org/abs/1710.07654) 。
本 Deepvoice3 实现使用 Paddle 动态图模式,这对于灵活的网络结构更为方便。
## 安装
### 安装 paddlepaddle 框架
为了更快的训练速度和更好的支持,我们推荐使用最新的开发版 paddle。用户可以下载最新编译的开发版 whl 包,也可以选择从源码编译 Paddle。
1. 下载最新编译的开发版 whl 包。可以从 [**多版本 wheel 包列表-dev**](https://www.paddlepaddle.org.cn/documentation/docs/zh/beginners_guide/install/Tables.html#whl-dev) 页面中选择合适的版本。
2. 从源码编译 Paddle. 参考[**从源码编译**](https://www.paddlepaddle.org.cn/documentation/docs/zh/beginners_guide/install/compile/fromsource.html) 页面。注意,如果你需要使用多卡训练,那么编译前需要设置选项 `-DWITH_DISTRIBUTE=ON`
### 其他依赖
使用 pip 安装其他依赖。
```bash
pip install -r requirements.txt
```
另外需要下载 nltk 的两个库,因为使用了 `punkt` 对文本进行 tokenization并且使用了 `cmudict` 来将文本转为音位。
```python
import nltk
nltk.download("punkt")
nltk.download("cmudict")
```
## 模型结构
![DeepVoice3 模型结构](./_images/model_architecture.png)
模型包含 encoder, decoder, converter 几个部分,对于 multispeaker 数据集,还有一个 speaker embedding。其中 encoder 和 decoder 构成 seq2seq 部分converter 构成 postnet 部分。
## 项目结构
```text
├── audio.py # 用于处理处理音频的函数
├── compute_timestamp_ratio.py # 计算 position rate 的脚本
├── conversion # 用于转换 pytorch 实现的参数
├── requirements.txt # 项目依赖
├── hparams.py # DeepVoice3 运行超参数配置类的定义
├── hparam_tf # 超参数相关
├── ljspeech.py # ljspeech 数据集预处理
├── preprocess.py # 通用预处理脚本
├── presets # 预设超参数配置
├── deepvoice3_paddle # DeepVoice3 模型实现的主要文件
├── eval_model.py # 模型测评相关函数
├── synthesis.py # 用于语音合成的脚本
├── train_model.py # 模型训练相关函数
└── train.py # 用于模型训练的脚本
```
## 使用方法
根据所使用的模型配置和数据集的不同,有不少超参数需要进行调节。我们提供已知结果较好的超参数设置,详见 `presets` 文件夹。目前我们只提供 LJSpeech 的预设配置 `deepvoice3_ljspeech.json`)。后续将提供更多模型和数据集的预设配置。
`preprocess.py``train.py``synthesis.py` 都接受 `--preset` 参数。为了保持一致性,最好在数据预处理,模型训练和语音合成时使用相同的预设配置。
可以通过 `--hparams` 参数来覆盖预设的超参数配置,参数格式是逗号分隔的键值对 `${key}=${value}`,例如 `--hparams="batch_size=8, nepochs=500"`
部分参数只和训练有关,如 `batch_size`, `checkpoint_interval`, 用户在训练时可以使用不同的值。但部分参数和数据预处理相关,如 `num_mels``ref_level_db`, 这些参数在数据预处理和训练时候应该保持一致。
关于超参数设置更多细节可以参考 `hparams.py` ,其中定义了 hparams。超参数的优先级序列是通过命令行参数 `--hparams` 传入的参数优先级高于通过 `--preset` 参数传入的 json 配置文件,高于 `hparams.py` 中的定义。
### 数据集
下载并解压 [LJSpeech](https://keithito.com/LJ-Speech-Dataset/) 数据集。
```bash
wget https://data.keithito.com/data/speech/LJSpeech-1.1.tar.bz2
tar xjvf LJSpeech-1.1.tar.bz2
```
使用 `preprocess.py`进行预处理。
```bash
python preprocess.py \
--preset=${preset_json_path} \
--hparams="hyper parameters you want to overwrite" \
${name} ${in_dir} ${out_dir}
```
目前 `${name}` 只支持 `ljspeech`。未来将会支持更多数据集。
假设你使用 `presets/deepvoice3_ljspeech.json` 作为处理 LJSpeech 的预设配置文件,并且解压后的数据集位于 `./data/LJSpeech-1.1`, 那么使用如下的命令进行数据预处理。
```bash
python preprocess.py \
--preset=presets/deepvoice3_ljspeech.json \
ljspeech ./data/LJSpeech-1.1/ ./data/ljspeech
```
数据处理完成后,你会在 `./data/ljspeech` 看到提取的特征,包含如下文件。
1. `train.txt`,包含文本和对应的音频特征的文件名。
2. `ljspeech-mel-*.npy`,包含 mel 频谱。
3. `ljspeech-spec-*.npy`,包含线性频谱。
### 使用 GPU 单卡训练
在单个 GPU 上训练整个模型的使用方法如下。
```bash
export CUDA_VISIBLE_DEVICES=0
python train.py --data-root=${data-root} --use-gpu \
--preset=${preset_json_path} \
--hparams="parameters you may want to override"
```
用户可以通过 `python train.py --help` 查看 `train.py` 的详细使用方法。
#### 加载保存的模型
我们提供了使用默认的配置文件训练的模型 [dv3.single_frame](https://paddlespeech.bj.bcebos.com/Parakeet/dv3.single_frame.tar.gz) 供用户下载。使用 `tar xzvf dv3.single_frame.tar.gz` 解压下载的文件,会得到 `config.json`, `model.pdparams` and `README.md`。其中 `config.json` 是模型训练时使用的配置文件,`model.pdparams` 是参数文件,`README.md` 是模型的简要说明。
用户可以通过 `--checkpoint` 参数加载保存的模型并恢复训练(注意:只需要传基础文件名,不需要扩展名,例如需要加载 `model.pdparams` 那么,只需要使用 `--checkpoint=model`)。如果同一个文件夹内有一个和参数文件基础文件名相同,而后缀为 `.pdopt` 的文件,(如 `model.pdopt`,即优化器文件),那么该文件也会被自动加载。如果你想要重置优化器的状态,在训练脚本加入 `--reset-optimizer` 参数。
#### 训练模型的一部分
用户可以通过 `--train-seq2seq-only` 或者 `--train-postnet-only` 来实现固定模型的其他部分,只训练需要训练的部分。但当只训练模型的一部分时,其他的部分需要从保存的模型中加载。
当只训练模型的 `seq2seq` 部分或者 `postnet` 部分时,需要使用 `--checkpoint` 加载整个模型并保持相同的配置。注意,当只训练 `postnet` 的时候,需要保证配置中的`use_decoder_state_for_postnet_input=false`因为在这种情况下postnet 使用真实的 mel 频谱作为输入。注意,`use_decoder_state_for_postnet_input` 的默认值是 `True`
示例:
```bash
export CUDA_VISIBLE_DEVICES=0
python train.py --data-root=${data-root} --use-gpu \
--preset=${preset_json_path} \
--hparams="parameters you may want to override" \
--train-seq2seq-only \
--output=${directory_to_save_results}
```
### Training on Multiple GPUs
The model supports data-parallel training on multiple GPUs, by launching `train.py` with the `paddle.distributed.launch` module.
```bash
python -m paddle.distributed.launch \
--started_port ${port_of_the_first_worker} \
--selected_gpus ${logical_gpu_ids_to_choose} \
--log_dir ${path_to_write_log} \
training_script ...
```
`paddle.distributed.launch` runs parallel training with multiple processes. `--selected_gpus` gives the logical ids of the selected GPUs, `--started_port` is the port used by the first worker (rank 0), and `--log_dir` is the directory where the logs are saved; the output of each process is written to a separate file in that directory. After these options, append the training script and its arguments. They are the same as for single-GPU training, except that `--use-data-parallel` must be passed to enable data-parallel training. An example command follows.
```bash
export CUDA_VISIBLE_DEVICES=2,3,4,5 # The IDs of visible physical devices
python -m paddle.distributed.launch \
--selected_gpus=0,1,2,3 --log_dir ${multi_gpu_log_dir} \
train.py --data-root=${data-root} \
--use-gpu --use-data-parallel \
--preset=${preset_json_path} \
--hparams="parameters you may want to override" \
--output=${directory_to_save_results}
```
In the example above, GPUs `2, 3, 4, 5` are made visible. `--selected_gpus=0,1,2,3` then selects GPUs by their logical ids, which map to physical devices `2, 3, 4, 5` respectively.
The model (parameters saved as `*.pdparams` files, optimizers saved as `*.pdopt` files) is saved in the `${directory_to_save_results}/checkpoints` folder. The attention alignments averaged over layers are saved as `.png` images, by default in `${directory_to_save_results}/checkpoints/alignment_ave`, and the per-layer alignments are saved by default in the `${directory_to_save_results}/checkpoints/alignment_layer{attention_layer_num}` folders. By default they are saved every 10000 steps for inspection.
Synthesis results for 6 given sentences are saved in `${directory_to_save_results}/checkpoints/eval`, including the layer-averaged attention alignments, saved as images named `step{step_num}_text{text_id}_single_alignment.png`, and the synthesized audio, saved as files named `step{step_num}_text{text_id}_single_predicted.wav`.
### Monitoring Training with TensorBoard
The TensorBoard training logs are saved in the `${directory_to_save_results}/log/` folder and can be viewed with TensorBoard. Usage is as follows.
```bash
tensorboard --logdir=${log_dir} --host=$HOSTNAME --port=8888
```
### Synthesizing Speech from a Saved Model
Given a set of texts, use `synthesis.py` to synthesize speech from a trained model, as follows.
```bash
python synthesis.py --use-gpu --preset=${preset_json_path} \
--hparams="parameters you may want to override" \
${checkpoint} ${text_list_file} ${dst_dir}
```
A sample text file looks like this:
```text
Generative adversarial network or variational auto-encoder.
Once upon a time there was a dear little girl who was loved by every one who looked at her, but most of all by her grandmother, and there was nothing that she would not have given to the child.
A text-to-speech synthesis system typically consists of multiple stages, such as a text analysis frontend, an acoustic model and an audio synthesis module.
```
The synthesis results, including the attention alignments and the audio files, are saved in `${dst_dir}`.
### Computing the Position Rate
According to [Deep Voice 3: Scaling Text-to-Speech with Convolutional Sequence Learning](https://arxiv.org/abs/1710.07654), different datasets have different position rates. There are two position rates, one for the query and one for the key, called $\omega_1$ and $\omega_2$ in the paper; in the preset configuration files they are named `query_position_rate` and `key_position_rate` respectively.
For example, the `query_position_rate` and `key_position_rate` of the LJSpeech dataset are `1.0` and `1.385` respectively. With `query_position_rate` fixed at 1.0, `key_position_rate` can be computed with `compute_timestamp_ratio.py` using the command below, where `${data_root}` is the path of the preprocessed dataset.
```bash
python compute_timestamp_ratio.py --preset=${preset_json_path} \
--hparams="parameters you may want to override" ${data_root}
```
It produces output like the following.
```text
100%|██████████████████████████████████████████████████████████| 13047/13047 [00:12<00:00, 1058.19it/s]
1345587 1863884.0 1.3851828235558161
```
Then set `key_position_rate=1.385` and `query_position_rate=1.0` in the preset configuration file.
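The ratio printed above is essentially the total number of decoder timesteps divided by the total number of text timesteps, as computed in `compute_timestamp_ratio.py`. The sketch below condenses that calculation; the two lists of lengths stand in for what the script collects from its text and mel data sources.
```python
import numpy as np


def estimate_key_position_rate(text_lengths, mel_frame_counts, r, downsample_step):
    """text_lengths: phoneme/character sequence length per utterance;
    mel_frame_counts: number of mel-spectrogram frames per utterance."""
    input_timestamps = np.sum(text_lengths)
    output_timestamps = np.sum(mel_frame_counts) / r / downsample_step
    # for LJSpeech this comes out to roughly 1.385
    return output_timestamps / input_timestamps
```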
## Acknowledgements
This implementation includes and adapts some files from r9y9's [deepvoice3_pytorch](https://github.com/r9y9/deepvoice3_pytorch), for which we are grateful.

View File

@ -0,0 +1,4 @@
from parakeet.models.deepvoice3.encoder import Encoder
from parakeet.models.deepvoice3.decoder import Decoder
from parakeet.models.deepvoice3.converter import Converter
from parakeet.models.deepvoice3.model import DeepVoice3

View File

@ -1,62 +0,0 @@
# this file is only used for continuous evaluation test!
import os
import sys
sys.path.append(os.environ['ceroot'])
from kpi import CostKpi
from kpi import DurationKpi
from kpi import AccKpi
each_epoch_duration_frame1_card1 = DurationKpi("each_epoch_duration_frame1_card1", 0.02, actived=True)
train_cost_frame1_card1 = CostKpi("train_cost_frame1_card1", 0.02, actived=True)
each_epoch_duration_frame4_card1 = DurationKpi("each_epoch_duration_frame4_card1", 0.05, actived=True)
train_cost_frame4_card1 = CostKpi("train_cost_frame4_card1", 0.02, actived=True)
tracking_kpis = [
each_epoch_duration_frame1_card1,
train_cost_frame1_card1,
each_epoch_duration_frame4_card1,
train_cost_frame4_card1,
]
def parse_log(log):
'''
This method should be implemented by model developers.
The suggestion:
each line in the log should be key, value, for example:
"
train_cost\t1.0
test_cost\t1.0
train_cost\t1.0
train_cost\t1.0
train_acc\t1.2
"
'''
for line in log.split('\n'):
fs = line.strip().split('\t')
print(fs)
if len(fs) == 3 and fs[0] == 'kpis':
kpi_name = fs[1]
kpi_value = float(fs[2])
yield kpi_name, kpi_value
def log_to_ce(log):
kpi_tracker = {}
for kpi in tracking_kpis:
kpi_tracker[kpi.name] = kpi
for (kpi_name, kpi_value) in parse_log(log):
print(kpi_name, kpi_value)
kpi_tracker[kpi_name].add_record(kpi_value)
kpi_tracker[kpi_name].persist()
if __name__ == '__main__':
log = sys.stdin.read()
log_to_ce(log)

Binary file not shown.


View File

@ -0,0 +1,101 @@
import numpy as np
from collections import namedtuple
from paddle import fluid
import paddle.fluid.dygraph as dg
import paddle.fluid.layers as F
from parakeet.modules.weight_norm import Linear
WindowRange = namedtuple("WindowRange", ["backward", "ahead"])
class Attention(dg.Layer):
def __init__(self,
query_dim,
embed_dim,
dropout=0.0,
window_range=WindowRange(-1, 3),
key_projection=True,
value_projection=True):
super(Attention, self).__init__()
self.query_proj = Linear(query_dim, embed_dim)
if key_projection:
self.key_proj = Linear(embed_dim, embed_dim)
if value_projection:
self.value_proj = Linear(embed_dim, embed_dim)
self.out_proj = Linear(embed_dim, query_dim)
self.key_projection = key_projection
self.value_projection = value_projection
self.dropout = dropout
self.window_range = window_range
def forward(self, query, encoder_out, mask=None, last_attended=None):
"""
Compute pooled context representation and alignment scores.
Args:
query (Variable): shape(B, T_dec, C_q), the query tensor,
where C_q means the channel of query.
encoder_out (Tuple(Variable, Variable)):
keys (Variable): shape(B, T_enc, C_emb), the key
representation from an encoder, where C_emb means
text embedding size.
values (Variable): shape(B, T_enc, C_emb), the value
representation from an encoder, where C_emb means
text embedding size.
mask (Variable, optional): Shape(B, T_enc), mask generated with
valid text lengths.
last_attended (int, optional): The position that received most
attention at last timestep. This is only used at decoding.
Outputs:
x (Variable): Shape(B, T_dec, C_q), the context representation
pooled from attention mechanism.
attn_scores (Variable): shape(B, T_dec, T_enc), the alignment
tensor, where T_dec means the number of decoder time steps and
T_enc means the number of encoder time steps.
"""
keys, values = encoder_out
residual = query
if self.value_projection:
values = self.value_proj(values)
if self.key_projection:
keys = self.key_proj(keys)
x = self.query_proj(query)
# TODO: check the code
x = F.matmul(x, keys, transpose_y=True)
# mask generated by sentence length
neg_inf = -1.e30
if mask is not None:
neg_inf_mask = F.scale(F.unsqueeze(mask, [1]), neg_inf)
x += neg_inf_mask
# if last_attended is provided, focus only on a window range around it
# to enforce monotonic attention.
# TODO: if last attended is a shape(B,) array
if last_attended is not None:
locality_mask = np.ones(shape=x.shape, dtype=np.float32)
backward, ahead = self.window_range
backward = last_attended + backward
ahead = last_attended + ahead
backward = max(backward, 0)
ahead = min(ahead, x.shape[-1])
locality_mask[:, :, backward:ahead] = 0.
locality_mask = dg.to_variable(locality_mask)
neg_inf_mask = F.scale(locality_mask, neg_inf)
x += neg_inf_mask
x = F.softmax(x)
attn_scores = x
x = F.dropout(x,
self.dropout,
dropout_implementation="upscale_in_train")
x = F.matmul(x, values)
encoder_length = keys.shape[1]
# CAUTION: is it wrong? let it be now
x = F.scale(x, encoder_length * np.sqrt(1.0 / encoder_length))
x = self.out_proj(x)
x = F.scale((x + residual), np.sqrt(0.5))
return x, attn_scores

View File

@ -1,98 +0,0 @@
# This file was copied from https://github.com/r9y9/deepvoice3_pytorch/tree/master/audio.py
# Copyright (c) 2017: Ryuichi Yamamoto.
import librosa
import librosa.filters
import math
import numpy as np
from scipy import signal
from hparams import hparams
from scipy.io import wavfile
import lws
def load_wav(path):
return librosa.core.load(path, sr=hparams.sample_rate)[0]
def save_wav(wav, path):
wav = wav * 32767 / max(0.01, np.max(np.abs(wav)))
wavfile.write(path, hparams.sample_rate, wav.astype(np.int16))
def preemphasis(x):
from nnmnkwii.preprocessing import preemphasis
return preemphasis(x, hparams.preemphasis)
def inv_preemphasis(x):
from nnmnkwii.preprocessing import inv_preemphasis
return inv_preemphasis(x, hparams.preemphasis)
def spectrogram(y):
D = _lws_processor().stft(preemphasis(y)).T
S = _amp_to_db(np.abs(D)) - hparams.ref_level_db
return _normalize(S)
def inv_spectrogram(spectrogram):
'''Converts spectrogram to waveform using librosa'''
S = _db_to_amp(_denormalize(spectrogram) +
hparams.ref_level_db) # Convert back to linear
processor = _lws_processor()
D = processor.run_lws(S.astype(np.float64).T**hparams.power)
y = processor.istft(D).astype(np.float32)
return inv_preemphasis(y)
def melspectrogram(y):
D = _lws_processor().stft(preemphasis(y)).T
S = _amp_to_db(_linear_to_mel(np.abs(D))) - hparams.ref_level_db
if not hparams.allow_clipping_in_normalization:
assert S.max() <= 0 and S.min() - hparams.min_level_db >= 0
return _normalize(S)
def _lws_processor():
return lws.lws(hparams.fft_size, hparams.hop_size, mode="speech")
# Conversions:
_mel_basis = None
def _linear_to_mel(spectrogram):
global _mel_basis
if _mel_basis is None:
_mel_basis = _build_mel_basis()
return np.dot(_mel_basis, spectrogram)
def _build_mel_basis():
if hparams.fmax is not None:
assert hparams.fmax <= hparams.sample_rate // 2
return librosa.filters.mel(hparams.sample_rate,
hparams.fft_size,
fmin=hparams.fmin,
fmax=hparams.fmax,
n_mels=hparams.num_mels)
def _amp_to_db(x):
min_level = np.exp(hparams.min_level_db / 20 * np.log(10))
return 20 * np.log10(np.maximum(min_level, x))
def _db_to_amp(x):
return np.power(10.0, x * 0.05)
def _normalize(S):
return np.clip((S - hparams.min_level_db) / -hparams.min_level_db, 0, 1)
def _denormalize(S):
return (np.clip(S, 0, 1) * -hparams.min_level_db) + hparams.min_level_db

View File

@ -1,137 +0,0 @@
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from deepvoice3 import DeepVoiceTTS, ConvSpec, WindowRange
def deepvoice3(n_vocab,
embed_dim=256,
mel_dim=80,
linear_dim=513,
r=4,
downsample_step=1,
n_speakers=1,
speaker_dim=16,
padding_idx=0,
dropout=(1 - 0.96),
filter_size=5,
encoder_channels=128,
decoder_channels=256,
converter_channels=256,
query_position_rate=1.0,
key_position_rate=1.29,
use_memory_mask=False,
trainable_positional_encodings=False,
force_monotonic_attention=True,
use_decoder_state_for_postnet_input=True,
max_positions=512,
embedding_weight_std=0.1,
speaker_embedding_weight_std=0.01,
freeze_embedding=False,
window_range=WindowRange(-1, 3),
key_projection=False,
value_projection=False):
time_upsampling = max(downsample_step, 1)
h = encoder_channels
k = filter_size
encoder_convolutions = (ConvSpec(h, k, 1), ConvSpec(h, k, 3),
ConvSpec(h, k, 9), ConvSpec(h, k, 27),
ConvSpec(h, k, 1), ConvSpec(h, k, 3),
ConvSpec(h, k, 9), ConvSpec(h, k, 27),
ConvSpec(h, k, 1), ConvSpec(h, k, 3))
h = decoder_channels
prenet_convolutions = (ConvSpec(h, k, 1), ConvSpec(h, k, 3))
attentive_convolutions = (ConvSpec(h, k, 1), ConvSpec(h, k, 3),
ConvSpec(h, k, 9), ConvSpec(h, k, 27),
ConvSpec(h, k, 1))
attention = [True, False, False, False, True]
h = converter_channels
postnet_convolutions = (ConvSpec(h, k, 1), ConvSpec(h, k, 3),
ConvSpec(2 * h, k, 1), ConvSpec(2 * h, k, 3))
model = DeepVoiceTTS(
"dv3", n_speakers, speaker_dim, speaker_embedding_weight_std, n_vocab,
embed_dim, padding_idx, embedding_weight_std, freeze_embedding,
encoder_convolutions, max_positions, padding_idx,
trainable_positional_encodings, mel_dim, r, prenet_convolutions,
attentive_convolutions, attention, use_memory_mask,
force_monotonic_attention, query_position_rate, key_position_rate,
window_range, key_projection, value_projection, linear_dim,
postnet_convolutions, time_upsampling, dropout,
use_decoder_state_for_postnet_input, "float32")
return model
def deepvoice3_multispeaker(n_vocab,
embed_dim=256,
mel_dim=80,
linear_dim=513,
r=4,
downsample_step=1,
n_speakers=1,
speaker_dim=16,
padding_idx=0,
dropout=(1 - 0.96),
filter_size=5,
encoder_channels=128,
decoder_channels=256,
converter_channels=256,
query_position_rate=1.0,
key_position_rate=1.29,
use_memory_mask=False,
trainable_positional_encodings=False,
force_monotonic_attention=True,
use_decoder_state_for_postnet_input=True,
max_positions=512,
embedding_weight_std=0.1,
speaker_embedding_weight_std=0.01,
freeze_embedding=False,
window_range=WindowRange(-1, 3),
key_projection=False,
value_projection=False):
time_upsampling = max(downsample_step, 1)
h = encoder_channels
k = filter_size
encoder_convolutions = (ConvSpec(h, k, 1), ConvSpec(h, k, 3),
ConvSpec(h, k, 9), ConvSpec(h, k, 27),
ConvSpec(h, k, 1), ConvSpec(h, k, 3),
ConvSpec(h, k, 9), ConvSpec(h, k, 27),
ConvSpec(h, k, 1), ConvSpec(h, k, 3))
h = decoder_channels
prenet_convolutions = (ConvSpec(h, k, 1), )
attentive_convolutions = (ConvSpec(h, k, 1), ConvSpec(h, k, 3),
ConvSpec(h, k, 9), ConvSpec(h, k, 27),
ConvSpec(h, k, 1))
attention = [True, False, False, False, False]
h = converter_channels
postnet_convolutions = (ConvSpec(h, k, 1), ConvSpec(h, k, 3),
ConvSpec(2 * h, k, 1), ConvSpec(2 * h, k, 3))
model = DeepVoiceTTS(
"dv3", n_speakers, speaker_dim, speaker_embedding_weight_std, n_vocab,
embed_dim, padding_idx, embedding_weight_std, freeze_embedding,
encoder_convolutions, max_positions, padding_idx,
trainable_positional_encodings, mel_dim, r, prenet_convolutions,
attentive_convolutions, attention, use_memory_mask,
force_monotonic_attention, query_position_rate, key_position_rate,
window_range, key_projection, value_projection, linear_dim,
postnet_convolutions, time_upsampling, dropout,
use_decoder_state_for_postnet_input, "float32")
return model

View File

@ -1,71 +0,0 @@
# Part of code was adpated from https://github.com/r9y9/deepvoice3_pytorch/tree/master/compute_timestamp_ratio.py
# Copyright (c) 2017: Ryuichi Yamamoto.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import argparse
import sys
import io
import numpy as np
# sys.path.append("../")
from hparams import hparams, hparams_debug_string
from data import TextDataSource, MelSpecDataSource
from nnmnkwii.datasets import FileSourceDataset
from tqdm import trange
from parakeet import g2p as frontend
def build_parser():
parser = argparse.ArgumentParser(
description="Compute output/input timestamp ratio.")
parser.add_argument(
"--hparams", type=str, default="", help="Hyper parameters.")
parser.add_argument(
"--preset",
type=str,
required=True,
help="Path of preset parameters (json).")
parser.add_argument("data_root", type=str, help="path of the dataset.")
return parser
if __name__ == "__main__":
parser = build_parser()
args, _ = parser.parse_known_args()
data_root = args.data_root
preset = args.preset
# Load preset if specified
if preset is not None:
with io.open(preset) as f:
hparams.parse_json(f.read())
# Override hyper parameters
hparams.parse(args.hparams)
assert hparams.name == "deepvoice3"
# Code below
X = FileSourceDataset(TextDataSource(data_root))
Mel = FileSourceDataset(MelSpecDataSource(data_root))
in_sizes = []
out_sizes = []
for i in trange(len(X)):
x, m = X[i], Mel[i]
if X.file_data_source.multi_speaker:
x = x[0]
in_sizes.append(x.shape[0])
out_sizes.append(m.shape[0])
in_sizes = np.array(in_sizes)
out_sizes = np.array(out_sizes)
input_timestamps = np.sum(in_sizes)
output_timestamps = np.sum(
out_sizes) / hparams.outputs_per_step / hparams.downsample_step
print(input_timestamps, output_timestamps,
output_timestamps / input_timestamps)
sys.exit(0)

View File

@ -0,0 +1,137 @@
import numpy as np
from paddle import fluid
import paddle.fluid.dygraph as dg
import paddle.fluid.layers as F
import paddle.fluid.initializer as I
from parakeet.modules.weight_norm import Conv1D, Conv1DCell, Conv2D, Linear
class Conv1DGLU(dg.Layer):
"""
A Convolution 1D block with GLU activation. It also applies dropout to the
input x. It fuses speaker embeddings through a fully connected layer activated
by softsign. It has a residual connection from the input x, and scales the
output by np.sqrt(0.5).
"""
def __init__(self,
n_speakers,
speaker_dim,
in_channels,
num_filters,
filter_size=1,
dilation=1,
std_mul=4.0,
dropout=0.0,
causal=False,
residual=True):
super(Conv1DGLU, self).__init__()
# conv spec
self.in_channels = in_channels
self.n_speakers = n_speakers
self.speaker_dim = speaker_dim
self.num_filters = num_filters
self.filter_size = filter_size
self.dilation = dilation
# padding
self.causal = causal
# weight init and dropout
self.std_mul = std_mul
self.dropout = dropout
c_in = filter_size * in_channels
std = np.sqrt(std_mul * (1 - dropout) / c_in)
self.residual = residual
if residual:
assert (
in_channels == num_filters
), "this block uses residual connection"\
"the input_channes should equals num_filters"
self.conv = Conv1DCell(in_channels,
2 * num_filters,
filter_size,
dilation,
causal,
param_attr=I.Normal(scale=std))
if n_speakers > 1:
assert (speaker_dim is not None
), "speaker embed should not be null in multi-speaker case"
std = np.sqrt(1 / speaker_dim)
self.fc = Linear(speaker_dim,
num_filters,
param_attr=I.Normal(scale=std))
def forward(self, x, speaker_embed=None):
"""
Args:
x (Variable): Shape(B, C_in, T), the input of Conv1DGLU
layer, where B means batch_size, C_in means the input channels,
and T means the number of input time steps.
speaker_embed (Variable): Shape(B, C_sp), the
speaker embed, where C_sp means speaker embedding size. Note
that when using residual connection, the Conv1DGLU does not
change the number of channels, so out channels equals input
channels.
Returns:
x (Variable): Shape(B, C_out, T), the output of Conv1DGLU, where
C_out means the output channels of Conv1DGLU.
"""
residual = x
x = F.dropout(x,
self.dropout,
dropout_implementation="upscale_in_train")
x = self.conv(x)
content, gate = F.split(x, num_or_sections=2, dim=1)
if speaker_embed is not None:
sp = F.softsign(self.fc(speaker_embed))
content = F.elementwise_add(content, sp, axis=0)
# glu
x = F.sigmoid(gate) * content
if self.residual:
x = F.scale(x + residual, np.sqrt(0.5))
return x
def start_sequence(self):
self.conv.start_sequence()
def add_input(self, x_t, speaker_embed=None):
"""
Args:
x (Variable): Shape(B, C_in), the input of Conv1DGLU
layer, where B means batch_size, C_in means the input channels.
speaker_embed (Variable): Shape(B, C_sp), the
speaker embed, where C_sp means speaker embedding size. Note
that when using residual connection, the Conv1DGLU does not
change the number of channels, so out channels equals input
channels.
Returns:
x (Variable): Shape(B, C_out), the output of Conv1DGLU, where
C_out means the output channels of Conv1DGLU.
"""
residual = x_t
x_t = F.dropout(x_t,
self.dropout,
dropout_implementation="upscale_in_train")
x_t = self.conv.add_input(x_t)
content_t, gate_t = F.split(x_t, num_or_sections=2, dim=1)
if speaker_embed is not None:
sp = F.softsign(self.fc(speaker_embed))
content_t = F.elementwise_add(content_t, sp, axis=0)
# glu
x_t = F.sigmoid(gate_t) * content_t
if self.residual:
x_t = F.scale(x_t + residual, np.sqrt(0.5))
return x_t

View File

@ -0,0 +1,231 @@
import numpy as np
from itertools import chain
import paddle.fluid.layers as F
import paddle.fluid.initializer as I
import paddle.fluid.dygraph as dg
from parakeet.modules.weight_norm import Conv1D, Conv1DTranspose, Conv2D, Conv2DTranspose, Linear
from parakeet.models.deepvoice3.conv1dglu import Conv1DGLU
from parakeet.models.deepvoice3.encoder import ConvSpec
def upsampling_4x_blocks(n_speakers, speaker_dim, target_channels, dropout):
# upsampling convolitions
upsampling_convolutions = [
Conv1DTranspose(target_channels,
target_channels,
2,
stride=2,
param_attr=I.Normal(np.sqrt(1 / target_channels))),
Conv1DGLU(n_speakers,
speaker_dim,
target_channels,
target_channels,
3,
dilation=1,
std_mul=1.,
dropout=dropout),
Conv1DGLU(n_speakers,
speaker_dim,
target_channels,
target_channels,
3,
dilation=3,
std_mul=4.,
dropout=dropout),
Conv1DTranspose(target_channels,
target_channels,
2,
stride=2,
param_attr=I.Normal(scale=np.sqrt(4. /
target_channels))),
Conv1DGLU(n_speakers,
speaker_dim,
target_channels,
target_channels,
3,
dilation=1,
std_mul=1.,
dropout=dropout),
Conv1DGLU(n_speakers,
speaker_dim,
target_channels,
target_channels,
3,
dilation=3,
std_mul=4.,
dropout=dropout)
]
return upsampling_convolutions
def upsampling_2x_blocks(n_speakers, speaker_dim, target_channels, dropout):
upsampling_convolutions = [
Conv1DTranspose(target_channels,
target_channels,
2,
stride=2,
param_attr=I.Normal(scale=np.sqrt(1. /
target_channels))),
Conv1DGLU(n_speakers,
speaker_dim,
target_channels,
target_channels,
3,
dilation=1,
std_mul=1.,
dropout=dropout),
Conv1DGLU(n_speakers,
speaker_dim,
target_channels,
target_channels,
3,
dilation=3,
std_mul=4.,
dropout=dropout)
]
return upsampling_convolutions
def upsampling_1x_blocks(n_speakers, speaker_dim, target_channels, dropout):
upsampling_convolutions = [
Conv1DGLU(n_speakers,
speaker_dim,
target_channels,
target_channels,
3,
dilation=3,
std_mul=4.,
dropout=dropout)
]
return upsampling_convolutions
class Converter(dg.Layer):
"""
Converter (postnet) that transforms the mel spectrogram (or decoder hidden
states) to a linear-scale spectrogram.
"""
def __init__(self,
n_speakers,
speaker_dim,
in_channels,
linear_dim,
convolutions=(ConvSpec(256, 5, 1), ) * 4,
time_upsampling=1,
dropout=0.0):
super(Converter, self).__init__()
self.n_speakers = n_speakers
self.speaker_dim = speaker_dim
self.in_channels = in_channels
self.linear_dim = linear_dim
# CAUTION: this should equal the downsampling steps coefficient
self.time_upsampling = time_upsampling
self.dropout = dropout
target_channels = convolutions[0].out_channels
# conv proj to target channels
self.first_conv_proj = Conv1D(
in_channels,
target_channels,
1,
param_attr=I.Normal(scale=np.sqrt(1 / in_channels)))
# Idea from nyanko
if time_upsampling == 4:
self.upsampling_convolutions = dg.LayerList(
upsampling_4x_blocks(n_speakers, speaker_dim, target_channels,
dropout))
elif time_upsampling == 2:
self.upsampling_convolutions = dg.LayerList(
upsampling_2x_blocks(n_speakers, speaker_dim, target_channels,
dropout))
elif time_upsampling == 1:
self.upsampling_convolutions = dg.LayerList(
upsampling_1x_blocks(n_speakers, speaker_dim, target_channels,
dropout))
else:
raise ValueError(
"Upsampling factors other than {1, 2, 4} are Not supported.")
# post conv layers
std_mul = 4.0
in_channels = target_channels
self.convolutions = dg.LayerList()
for (out_channels, filter_size, dilation) in convolutions:
if in_channels != out_channels:
std = np.sqrt(std_mul / in_channels)
# CAUTION: relu
self.convolutions.append(
Conv1D(in_channels,
out_channels,
1,
act="relu",
param_attr=I.Normal(scale=std)))
in_channels = out_channels
std_mul = 2.0
self.convolutions.append(
Conv1DGLU(n_speakers,
speaker_dim,
in_channels,
out_channels,
filter_size,
dilation=dilation,
std_mul=std_mul,
dropout=dropout))
in_channels = out_channels
std_mul = 4.0
# final conv proj, channel transformed to linear dim
std = np.sqrt(std_mul * (1 - dropout) / in_channels)
# CAUTION: sigmoid
self.last_conv_proj = Conv1D(in_channels,
linear_dim,
1,
act="sigmoid",
param_attr=I.Normal(scale=std))
def forward(self, x, speaker_embed=None):
"""
Convert mel spectrogram or decoder hidden states to linear spectrogram.
Args:
x (Variable): Shape(B, T_mel, C_in), converter inputs, where
C_in means the input channel for the converter. Note that it
can be either C_mel (channel of mel spectrogram) or C_dec // r.
When use mel_spectrogram as the input of converter, C_in =
C_mel; and when use decoder states as the input of converter,
C_in = C_dec // r. In this scenario, decoder hidden states are
treated as if they were r outputs per decoder step and are
unpacked before passing to the converter.
speaker_embed (Variable, optional): shape(B, C_sp), speaker
embedding, where C_sp means the speaker embedding size.
Returns:
out (Variable): Shape(B, T_lin, C_lin), the output linear
spectrogram, where C_lin means the channel of linear
spectrogram and T_lin means the length (time steps) of the linear
spectrogram. T_lin = time_upsampling * T_mel, which depends
on the time_upsampling factor of the converter.
"""
x = F.transpose(x, [0, 2, 1])
x = self.first_conv_proj(x)
if speaker_embed is not None:
speaker_embed = F.dropout(
speaker_embed,
self.dropout,
dropout_implementation="upscale_in_train")
for layer in chain(self.upsampling_convolutions, self.convolutions):
if isinstance(layer, Conv1DGLU):
x = layer(x, speaker_embed)
else:
x = layer(x)
out = self.last_conv_proj(x)
out = F.transpose(out, [0, 2, 1])
return out

View File

@ -1,328 +0,0 @@
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
import random
import io
import platform
from os.path import dirname, join
from nnmnkwii.datasets import FileSourceDataset, FileDataSource
from os.path import join, expanduser
import random
# import global hyper parameters
from hparams import hparams
from parakeet import g2p as frontend
import builder
_frontend = getattr(frontend, hparams.frontend)
def _pad(seq, max_len, constant_values=0):
return np.pad(seq, (0, max_len - len(seq)),
mode="constant",
constant_values=constant_values)
def _pad_2d(x, max_len, b_pad=0):
x = np.pad(x, [(b_pad, max_len - len(x) - b_pad), (0, 0)],
mode="constant",
constant_values=0)
return x
class TextDataSource(FileDataSource):
def __init__(self, data_root, speaker_id=None):
self.data_root = data_root
self.speaker_ids = None
self.multi_speaker = False
# If not None, filter by speaker_id
self.speaker_id = speaker_id
def collect_files(self):
meta = join(self.data_root, "train.txt")
with io.open(meta, "rt", encoding="utf-8") as f:
lines = f.readlines()
l = lines[0].split("|")
assert len(l) == 4 or len(l) == 5
self.multi_speaker = len(l) == 5
texts = list(map(lambda l: l.split("|")[3], lines))
if self.multi_speaker:
speaker_ids = list(map(lambda l: int(l.split("|")[-1]), lines))
# Filter by speaker_id
# using multi-speaker dataset as a single speaker dataset
if self.speaker_id is not None:
indices = np.array(speaker_ids) == self.speaker_id
texts = list(np.array(texts)[indices])
self.multi_speaker = False
return texts
return texts, speaker_ids
else:
return texts
def collect_features(self, *args):
if self.multi_speaker:
text, speaker_id = args
else:
text = args[0]
global _frontend
if _frontend is None:
_frontend = getattr(frontend, hparams.frontend)
seq = _frontend.text_to_sequence(
text, p=hparams.replace_pronunciation_prob)
if platform.system() == "Windows":
if hasattr(hparams, "gc_probability"):
_frontend = None # memory leaking prevention in Windows
if np.random.rand() < hparams.gc_probability:
gc.collect() # garbage collection enforced
print("GC done")
if self.multi_speaker:
return np.asarray(seq, dtype=np.int32), int(speaker_id)
else:
return np.asarray(seq, dtype=np.int32)
class _NPYDataSource(FileDataSource):
def __init__(self, data_root, col, speaker_id=None):
self.data_root = data_root
self.col = col
self.frame_lengths = []
self.speaker_id = speaker_id
def collect_files(self):
meta = join(self.data_root, "train.txt")
with io.open(meta, "rt", encoding="utf-8") as f:
lines = f.readlines()
l = lines[0].split("|")
assert len(l) == 4 or len(l) == 5
multi_speaker = len(l) == 5
self.frame_lengths = list(map(lambda l: int(l.split("|")[2]), lines))
paths = list(map(lambda l: l.split("|")[self.col], lines))
paths = list(map(lambda f: join(self.data_root, f), paths))
if multi_speaker and self.speaker_id is not None:
speaker_ids = list(map(lambda l: int(l.split("|")[-1]), lines))
# Filter by speaker_id
# using multi-speaker dataset as a single speaker dataset
indices = np.array(speaker_ids) == self.speaker_id
paths = list(np.array(paths)[indices])
self.frame_lengths = list(np.array(self.frame_lengths)[indices])
# aha, need to cast numpy.int64 to int
self.frame_lengths = list(map(int, self.frame_lengths))
return paths
def collect_features(self, path):
return np.load(path)
class MelSpecDataSource(_NPYDataSource):
def __init__(self, data_root, speaker_id=None):
super(MelSpecDataSource, self).__init__(data_root, 1, speaker_id)
class LinearSpecDataSource(_NPYDataSource):
def __init__(self, data_root, speaker_id=None):
super(LinearSpecDataSource, self).__init__(data_root, 0, speaker_id)
class PartialyRandomizedSimilarTimeLengthSampler(object):
"""Partially randmoized sampler
1. Sort by lengths
2. Pick a small patch and randomize it
3. Permute mini-batches
"""
def __init__(self,
lengths,
batch_size=16,
batch_group_size=None,
permutate=True):
self.sorted_indices = np.argsort(lengths)
self.lengths = np.array(lengths)[self.sorted_indices]
self.batch_size = batch_size
if batch_group_size is None:
batch_group_size = min(batch_size * 32, len(self.lengths))
if batch_group_size % batch_size != 0:
batch_group_size -= batch_group_size % batch_size
self.batch_group_size = batch_group_size
assert batch_group_size % batch_size == 0
self.permutate = permutate
def __iter__(self):
indices = self.sorted_indices.copy()
batch_group_size = self.batch_group_size
s, e = 0, 0
for i in range(len(indices) // batch_group_size):
s = i * batch_group_size
e = s + batch_group_size
random.shuffle(indices[s:e])
# Permutate batches
if self.permutate:
perm = np.arange(len(indices[:e]) // self.batch_size)
random.shuffle(perm)
indices[:e] = indices[:e].reshape(
-1, self.batch_size)[perm, :].reshape(-1)
# Handle last elements
s += batch_group_size
if s < len(indices):
random.shuffle(indices[s:])
return iter(indices)
def __len__(self):
return len(self.sorted_indices)
class Dataset(object):
def __init__(self, X, Mel, Y):
self.X = X
self.Mel = Mel
self.Y = Y
# alias
self.multi_speaker = X.file_data_source.multi_speaker
def __getitem__(self, idx):
if self.multi_speaker:
text, speaker_id = self.X[idx]
return text, self.Mel[idx], self.Y[idx], speaker_id
else:
return self.X[idx], self.Mel[idx], self.Y[idx]
def __len__(self):
return len(self.X)
def make_loader(dataset, batch_size, shuffle, sampler, create_batch_fn,
trainer_count, local_rank):
assert not (
shuffle and
sampler), "shuffle and sampler should not be valid in the same time."
num_samples = len(dataset)
def wrapper():
if sampler is None:
ids = range(num_samples)
if shuffle:
random.shuffle(ids)
else:
ids = sampler
batch, batches = [], []
for idx in ids:
batch.append(dataset[idx])
if len(batch) >= batch_size:
batches.append(batch)
batch = []
if len(batches) >= trainer_count:
yield create_batch_fn(batches[local_rank])
batches = []
if len(batch) > 0:
batches.append(batch)
if len(batches) >= trainer_count:
yield create_batch_fn(batches[local_rank])
return wrapper
def create_batch(batch):
"""Create batch"""
r = hparams.outputs_per_step
downsample_step = hparams.downsample_step
multi_speaker = len(batch[0]) == 4
# Lengths
input_lengths = [len(x[0]) for x in batch]
max_input_len = max(input_lengths)
input_lengths = np.array(input_lengths, dtype=np.int64)
target_lengths = [len(x[1]) for x in batch]
max_target_len = max(target_lengths)
target_lengths = np.array(target_lengths, dtype=np.int64)
if max_target_len % (r * downsample_step) != 0:
max_target_len += (r * downsample_step) - max_target_len % (
r * downsample_step)
assert max_target_len % (r * downsample_step) == 0
# Set 0 for zero beginning padding
# imitates initial decoder states
b_pad = r
max_target_len += b_pad * downsample_step
x_batch = np.array(
[_pad(x[0], max_input_len) for x in batch], dtype=np.int64)
x_batch = np.expand_dims(x_batch, axis=-1)
mel_batch = np.array(
[_pad_2d(
x[1], max_target_len, b_pad=b_pad) for x in batch],
dtype=np.float32)
# down sampling is done here
if downsample_step > 1:
mel_batch = mel_batch[:, 0::downsample_step, :]
mel_batch = np.expand_dims(np.transpose(mel_batch, axes=[0, 2, 1]), axis=2)
y_batch = np.array(
[_pad_2d(
x[2], max_target_len, b_pad=b_pad) for x in batch],
dtype=np.float32)
y_batch = np.expand_dims(np.transpose(y_batch, axes=[0, 2, 1]), axis=2)
# text positions
text_positions = np.array(
[_pad(np.arange(1, len(x[0]) + 1), max_input_len) for x in batch],
dtype=np.int64)
text_positions = np.expand_dims(text_positions, axis=-1)
max_decoder_target_len = max_target_len // r // downsample_step
# frame positions
s, e = 1, max_decoder_target_len + 1
frame_positions = np.tile(
np.expand_dims(
np.arange(
s, e, dtype=np.int64), axis=0), (len(batch), 1))
frame_positions = np.expand_dims(frame_positions, axis=-1)
# done flags
done = np.array([
_pad(
np.zeros(
len(x[1]) // r // downsample_step - 1, dtype=np.float32),
max_decoder_target_len,
constant_values=1) for x in batch
])
done = np.expand_dims(np.expand_dims(done, axis=1), axis=1)
if multi_speaker:
speaker_ids = np.expand_dims(np.array([x[3] for x in batch]), axis=-1)
return (x_batch, input_lengths, mel_batch, y_batch, text_positions,
frame_positions, done, target_lengths, speaker_ids)
else:
speaker_ids = None
return (x_batch, input_lengths, mel_batch, y_batch, text_positions,
frame_positions, done, target_lengths)

View File

@ -0,0 +1,507 @@
import numpy as np
import paddle.fluid.layers as F
import paddle.fluid.initializer as I
import paddle.fluid.dygraph as dg
from parakeet.modules.weight_norm import Conv1D, Linear
from parakeet.models.deepvoice3.conv1dglu import Conv1DGLU
from parakeet.models.deepvoice3.encoder import ConvSpec
from parakeet.models.deepvoice3.attention import Attention, WindowRange
from parakeet.models.deepvoice3.position_embedding import PositionEmbedding
def gen_mask(valid_lengths, max_len, dtype="float32"):
"""
Generate a mask tensor from valid lengths. Note that it returns a *reverse*
mask. Indices within valid lengths correspond to 0, and those within
padding area correspond to 1.
Assume that valid_lengths = [2,5,7], and max_len = 7, the generated mask is
[[0, 0, 1, 1, 1, 1, 1],
[0, 0, 0, 0, 0, 1, 1],
[0, 0, 0, 0, 0, 0, 0]].
Args:
valid_lengths (Variable): Shape(B), dtype: int64. A 1D-Tensor containing
the valid lengths (timesteps) of each example, where B means
batch_size.
max_len (int): The length (number of timesteps) of the mask.
dtype (str, optional): A string that specifies the data type of the
returned mask.
Returns:
mask (Variable): A mask computed from valid lengths.
"""
mask = F.sequence_mask(valid_lengths, maxlen=max_len, dtype=dtype)
mask = 1 - mask
return mask
def fold_adjacent_frames(frames, r):
"""fold multiple adjacent frames.
Arguments:
frames {Variable} -- shape(batch_size, time_steps, channels), the spectrogram
r {int} -- frames per step.
Returns:
Variable -- shape(batch_size, time_steps // r, r * channels), the folded frames
"""
if r == 1:
return frames
batch_size, time_steps, channels = frames.shape
if time_steps % r != 0:
print(
"time_steps cannot be divided by r, you would lose {} tailing frames"
.format(time_steps % r))
frames = frames[:, :time_steps - time_steps % r, :]
frames = F.reshape(frames, (batch_size, -1, channels * r))
return frames
def unfold_adjacent_frames(folded_frames, r):
"""fold multiple adjacent frames.
Arguments:
folded_frames {Variable} -- shape(batch_size, time_steps // r, r * channels), the spectrogram
r {int} -- frames per step.
Returns:
Variable -- shape(batch_size, time_steps, channels), the unfolded frames
"""
if r == 1:
return folded_frames
batch_size, time_steps, channels = folded_frames.shape
folded_frames = F.reshape(folded_frames, (batch_size, -1, channels // r))
return folded_frames
class Decoder(dg.Layer):
def __init__(self,
n_speakers,
speaker_dim,
embed_dim,
mel_dim,
r=1,
max_positions=512,
padding_idx=None,
preattention=(ConvSpec(128, 5, 1), ) * 4,
convolutions=(ConvSpec(128, 5, 1), ) * 4,
attention=True,
dropout=0.0,
use_memory_mask=False,
force_monotonic_attention=False,
query_position_rate=1.0,
key_position_rate=1.0,
window_range=WindowRange(-1, 3),
key_projection=True,
value_projection=True):
super(Decoder, self).__init__()
self.dropout = dropout
self.mel_dim = mel_dim
self.r = r
self.query_position_rate = query_position_rate
self.key_position_rate = key_position_rate
self.window_range = window_range
self.n_speakers = n_speakers
conv_channels = convolutions[0].out_channels
self.embed_keys_positions = PositionEmbedding(max_positions,
embed_dim,
padding_idx=padding_idx)
self.embed_query_positions = PositionEmbedding(max_positions,
conv_channels,
padding_idx=padding_idx)
if n_speakers > 1:
# CAUTION: mind the sigmoid
std = np.sqrt((1 - dropout) / speaker_dim)
self.speaker_proj1 = Linear(speaker_dim,
1,
param_attr=I.Normal(scale=std))
self.speaker_proj2 = Linear(speaker_dim,
1,
param_attr=I.Normal(scale=std))
# prenet
self.prenet = dg.LayerList()
in_channels = mel_dim * r # multiframe
std_mul = 1.0
for (out_channels, filter_size, dilation) in preattention:
if in_channels != out_channels:
# conv1d & relu
std = np.sqrt(std_mul / in_channels)
self.prenet.append(
Conv1D(in_channels,
out_channels,
1,
act="relu",
param_attr=I.Normal(scale=std)))
in_channels = out_channels
std_mul = 2.0
self.prenet.append(
Conv1DGLU(n_speakers,
speaker_dim,
in_channels,
out_channels,
filter_size,
dilation,
std_mul,
dropout,
causal=True,
residual=True))
in_channels = out_channels
std_mul = 4.0
# attention
self.use_memory_mask = use_memory_mask
if isinstance(attention, bool):
self.attention = [attention] * len(convolutions)
else:
self.attention = attention
if isinstance(force_monotonic_attention, bool):
self.force_monotonic_attention = [force_monotonic_attention
] * len(convolutions)
else:
self.force_monotonic_attention = force_monotonic_attention
for x, y in zip(self.force_monotonic_attention, self.attention):
if x is True and y is False:
raise ValueError("When not using attention, there is no "
"monotonic attention at all")
# causual convolution & attention
self.conv_attn = []
for use_attention, (out_channels, filter_size,
dilation) in zip(self.attention, convolutions):
assert (
in_channels == out_channels
), "the stack of convolution & attention does not change channels"
conv_layer = Conv1DGLU(n_speakers,
speaker_dim,
in_channels,
out_channels,
filter_size,
dilation,
std_mul,
dropout,
causal=True,
residual=False)
attn_layer = Attention(
out_channels,
embed_dim,
dropout,
window_range,
key_projection=key_projection,
value_projection=value_projection) if use_attention else None
in_channels = out_channels
std_mul = 4.0
self.conv_attn.append((conv_layer, attn_layer))
for i, (conv_layer, attn_layer) in enumerate(self.conv_attn):
self.add_sublayer("conv_{}".format(i), conv_layer)
if attn_layer is not None:
self.add_sublayer("attn_{}".format(i), attn_layer)
# 1 * 1 conv to transform channels
std = np.sqrt(std_mul * (1 - dropout) / in_channels)
self.last_conv = Conv1D(in_channels,
mel_dim * r,
1,
param_attr=I.Normal(scale=std))
# mel (before sigmoid) to done hat
std = np.sqrt(1 / in_channels)
self.fc = Conv1D(mel_dim * r, 1, 1, param_attr=I.Normal(scale=std))
# decoding configs
self.max_decoder_steps = 200
self.min_decoder_steps = 10
assert convolutions[-1].out_channels % r == 0, \
"decoder_state dim must be divided by r"
self.state_dim = convolutions[-1].out_channels // self.r
def forward(self,
encoder_out,
lengths,
frames,
text_positions,
frame_positions,
speaker_embed=None):
"""
Compute decoder outputs with ground truth mel spectrogram.
Args:
encoder_out (Tuple(Variable, Variable)):
keys (Variable): shape(B, T_enc, C_emb), the key
representation from an encoder, where C_emb means
text embedding size.
values (Variable): shape(B, T_enc, C_emb), the value
representation from an encoder, where C_emb means
text embedding size.
lengths (Variable): shape(batch_size,), dtype: int64, valid lengths
of text inputs for each example.
inputs (Variable): shape(B, T_mel, C_mel), ground truth
mel-spectrogram, which is used as decoder inputs when training.
text_positions (Variable): shape(B, T_enc), dtype: int64.
Positions indices for text inputs for the encoder, where
T_enc means the encoder timesteps.
frame_positions (Variable): shape(B, T_dec // r), dtype:
int64. Positions indices for each decoder time steps.
speaker_embed: shape(batch_size, speaker_dim), speaker embedding,
only used for multispeaker model.
Returns:
outputs (Variable): Shape(B, T_mel // r, r * C_mel). Decoder
outputs, where C_mel means the channels of mel-spectrogram, r
means the outputs per decoder step, T_mel means the length(time
steps) of mel spectrogram. Note that, when r > 1, the decoder
outputs r frames of mel spectrogram per step.
alignments (Variable): Shape(N, B, T_mel // r, T_enc), the alignment
tensor between the decoder and the encoder, where N means number
of Attention Layers, T_mel means the length of mel spectrogram,
r means the outputs per decoder step, T_enc means the encoder
time steps.
done (Variable): Shape(B, T_mel // r), probability that the
outputs should stop.
decoder_states (Variable): Shape(B, T_mel // r, C_dec), decoder
hidden states, where C_dec means the channels of decoder states.
"""
if speaker_embed is not None:
speaker_embed = F.dropout(
speaker_embed,
self.dropout,
dropout_implementation="upscale_in_train")
keys, values = encoder_out
enc_time_steps = keys.shape[1]
if self.use_memory_mask and lengths is not None:
mask = gen_mask(lengths, enc_time_steps)
else:
mask = None
if text_positions is not None:
w = self.key_position_rate
if self.n_speakers > 1:
w = w * F.squeeze(F.sigmoid(self.speaker_proj1(speaker_embed)),
[-1])
text_pos_embed = self.embed_keys_positions(text_positions, w)
keys += text_pos_embed # (B, T, C)
if frame_positions is not None:
w = self.query_position_rate
if self.n_speakers > 1:
w = w * F.squeeze(F.sigmoid(self.speaker_proj2(speaker_embed)),
[-1])
frame_pos_embed = self.embed_query_positions(frame_positions, w)
else:
frame_pos_embed = None
# pack multiple frames if necessary
frames = fold_adjacent_frames(frames, self.r) # assume (B, T, C) input
# (B, C, T)
frames = F.transpose(frames, [0, 2, 1])
x = frames
x = F.dropout(x,
self.dropout,
dropout_implementation="upscale_in_train")
# Prenet
for layer in self.prenet:
if isinstance(layer, Conv1DGLU):
x = layer(x, speaker_embed)
else:
x = layer(x)
# Convolution & Multi-hop Attention
alignments = []
for (conv, attn) in self.conv_attn:
residual = x
x = conv(x, speaker_embed)
if attn is not None:
x = F.transpose(x, [0, 2, 1]) # (B, T, C)
if frame_pos_embed is not None:
x = x + frame_pos_embed
x, attn_scores = attn(x, (keys, values), mask)
alignments.append(attn_scores)
x = F.transpose(x, [0, 2, 1]) #(B, C, T)
x = F.scale(residual + x, np.sqrt(0.5))
alignments = F.stack(alignments)
decoder_states = x
x = self.last_conv(x)
outputs = F.sigmoid(x)
done = F.sigmoid(self.fc(x))
outputs = F.transpose(outputs, [0, 2, 1])
decoder_states = F.transpose(decoder_states, [0, 2, 1])
done = F.squeeze(done, [1])
outputs = unfold_adjacent_frames(outputs, self.r)
decoder_states = unfold_adjacent_frames(decoder_states, self.r)
return outputs, alignments, done, decoder_states
@property
def receptive_field(self):
"""Whole receptive field of the causally convolutional decoder."""
r = 1
for conv in self.prenet:
r += conv.dilation[1] * (conv.filter_size[1] - 1)
for (conv, _) in self.conv_attn:
r += conv.dilation[1] * (conv.filter_size[1] - 1)
return r
def start_sequence(self):
for layer in self.prenet:
if isinstance(layer, Conv1DGLU):
layer.start_sequence()
for conv, _ in self.conv_attn:
if isinstance(conv, Conv1DGLU):
conv.start_sequence()
def decode(self,
encoder_out,
text_positions,
speaker_embed=None,
test_inputs=None):
self.start_sequence()
keys, values = encoder_out
batch_size = keys.shape[0]
assert batch_size == 1, "now only supports single instance inference"
mask = None # no mask because we use single instance decoding
# no dropout in inference
if speaker_embed is not None:
speaker_embed = F.dropout(
speaker_embed,
self.dropout,
dropout_implementation="upscale_in_train")
# since we use single example inference, there is no text_mask
if text_positions is not None:
w = self.key_position_rate
if self.n_speakers > 1:
# shape (B, )
w = w * F.squeeze(F.sigmoid(self.speaker_proj1(speaker_embed)),
[-1])
text_pos_embed = self.embed_keys_positions(text_positions, w)
keys += text_pos_embed # (B, T, C)
# start decoding
decoder_states = [] # (B, C, 1) tensors
mel_outputs = [] # (B, C, 1) tensors
alignments = [] # (B, 1, T_enc) tensors
dones = [] # (B, 1, 1) tensors
last_attended = [None] * len(self.conv_attn)
for idx, monotonic_attn in enumerate(self.force_monotonic_attention):
if monotonic_attn:
last_attended[idx] = 0
if test_inputs is not None:
# pack multiple frames if necessary # assume (B, T, C) input
test_inputs = fold_adjacent_frames(test_inputs, self.r)
test_inputs = F.transpose(test_inputs, [0, 2, 1])
initial_input = F.zeros((batch_size, self.mel_dim * self.r, 1),
dtype=keys.dtype)
t = 0 # decoder time step
while True:
frame_pos = F.fill_constant((batch_size, 1),
value=t + 1,
dtype="int64")
w = self.query_position_rate
if self.n_speakers > 1:
w = w * F.squeeze(F.sigmoid(self.speaker_proj2(speaker_embed)),
[-1])
# (B, T=1, C)
frame_pos_embed = self.embed_query_positions(frame_pos, w)
if test_inputs is not None:
if t >= test_inputs.shape[-1]:
break
current_input = test_inputs[:, :, t:t + 1]
else:
if t > 0:
current_input = mel_outputs[-1] # auto-regressive
else:
current_input = initial_input
x_t = current_input
x_t = F.dropout(x_t,
self.dropout,
dropout_implementation="upscale_in_train")
# Prenet
for layer in self.prenet:
if isinstance(layer, Conv1DGLU):
x_t = layer.add_input(x_t, speaker_embed)
else:
x_t = layer(x_t) # (B, C, T=1)
step_attn_scores = []
# causal convolutions + multi-hop attentions
for i, (conv, attn) in enumerate(self.conv_attn):
residual = x_t #(B, C, T=1)
x_t = conv.add_input(x_t, speaker_embed)
if attn is not None:
x_t = F.transpose(x_t, [0, 2, 1])
if frame_pos_embed is not None:
x_t += frame_pos_embed
x_t, attn_scores = attn(
x_t, (keys, values), mask,
last_attended[i] if test_inputs is None else None)
x_t = F.transpose(x_t, [0, 2, 1])
step_attn_scores.append(attn_scores) #(B, T_dec=1, T_enc)
# update last attended when necessary
if self.force_monotonic_attention[i]:
last_attended[i] = np.argmax(attn_scores.numpy(),
axis=-1)[0][0]
x_t = F.scale(residual + x_t, np.sqrt(0.5))
if len(step_attn_scores):
# (B, 1, T_enc) again
average_attn_scores = F.reduce_mean(
F.stack(step_attn_scores, 0), 0)
else:
average_attn_scores = None
decoder_state_t = x_t
x_t = self.last_conv(x_t)
mel_output_t = F.sigmoid(x_t)
done_t = F.sigmoid(self.fc(x_t))
decoder_states.append(decoder_state_t)
mel_outputs.append(mel_output_t)
if average_attn_scores is not None:
alignments.append(average_attn_scores)
dones.append(done_t)
t += 1
if test_inputs is None:
if F.reduce_min(done_t).numpy(
)[0] > 0.5 and t > self.min_decoder_steps:
break
elif t > self.max_decoder_steps:
break
# concat results
mel_outputs = F.concat(mel_outputs, axis=-1)
decoder_states = F.concat(decoder_states, axis=-1)
dones = F.concat(dones, axis=-1)
alignments = F.concat(alignments, axis=1)
mel_outputs = F.transpose(mel_outputs, [0, 2, 1])
decoder_states = F.transpose(decoder_states, [0, 2, 1])
dones = F.squeeze(dones, [1])
mel_outputs = unfold_adjacent_frames(mel_outputs, self.r)
decoder_states = unfold_adjacent_frames(decoder_states, self.r)
return mel_outputs, alignments, dones, decoder_states

File diff suppressed because it is too large

View File

@ -1,113 +0,0 @@
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
from paddle import fluid
import paddle.fluid.dygraph as dg
from hparams import hparams, hparams_debug_string
from parakeet import g2p as frontend
from deepvoice3 import DeepVoiceTTS
def dry_run(model):
"""
Run the model once, just to get it initialized.
"""
model.train()
_frontend = getattr(frontend, hparams.frontend)
batch_size = 4
enc_length = 157
snd_sample_length = 500
r = hparams.outputs_per_step
downsample_step = hparams.downsample_step
n_speakers = hparams.n_speakers
# make sure snd_sample_length can be divided by r * downsample_step
linear_shift = r * downsample_step
snd_sample_length += linear_shift - snd_sample_length % linear_shift
decoder_length = snd_sample_length // downsample_step // r
mel_length = snd_sample_length // downsample_step
n_vocab = _frontend.n_vocab
max_pos = hparams.max_positions
spker_embed = hparams.speaker_embed_dim
linear_dim = model.linear_dim
mel_dim = hparams.num_mels
x = np.random.randint(
low=0, high=n_vocab, size=(batch_size, enc_length, 1), dtype="int64")
input_lengths = np.arange(
enc_length - batch_size + 1, enc_length + 1, dtype="int64")
mel = np.random.randn(batch_size, mel_dim, 1, mel_length).astype("float32")
y = np.random.randn(batch_size, linear_dim, 1,
snd_sample_length).astype("float32")
text_positions = np.tile(
np.arange(
0, enc_length, dtype="int64"), (batch_size, 1))
text_mask = text_positions > np.expand_dims(input_lengths, 1)
text_positions[text_mask] = 0
text_positions = np.expand_dims(text_positions, axis=-1)
frame_positions = np.tile(
np.arange(
1, decoder_length + 1, dtype="int64"), (batch_size, 1))
frame_positions = np.expand_dims(frame_positions, axis=-1)
done = np.zeros(shape=(batch_size, 1, 1, decoder_length), dtype="float32")
target_lengths = np.array([snd_sample_length] * batch_size).astype("int64")
speaker_ids = np.random.randint(
low=0, high=n_speakers, size=(batch_size, 1),
dtype="int64") if n_speakers > 1 else None
ismultispeaker = speaker_ids is not None
x = dg.to_variable(x)
input_lengths = dg.to_variable(input_lengths)
mel = dg.to_variable(mel)
y = dg.to_variable(y)
text_positions = dg.to_variable(text_positions)
frame_positions = dg.to_variable(frame_positions)
done = dg.to_variable(done)
target_lengths = dg.to_variable(target_lengths)
speaker_ids = dg.to_variable(
speaker_ids) if speaker_ids is not None else None
# these two fields are used as numpy ndarray
text_lengths = input_lengths.numpy()
decoder_lengths = target_lengths.numpy() // r // downsample_step
max_seq_len = max(text_lengths.max(), decoder_lengths.max())
if max_seq_len >= hparams.max_positions:
raise RuntimeError(
"max_seq_len ({}) >= max_posision ({})\n"
"Input text or decoder targget length exceeded the maximum length.\n"
"Please set a larger value for ``max_position`` in hyper parameters."
.format(max_seq_len, hparams.max_positions))
# cause paddle's embedding layer expect shape[-1] == 1
# first dry run runs the whole model
mel_outputs, linear_outputs, attn, done_hat = model(
x, input_lengths, mel, speaker_ids, text_positions, frame_positions)
num_parameters = 0
for k, v in model.state_dict().items():
print("{}|{}|{}".format(k, v.shape, np.prod(v.shape)))
num_parameters += np.prod(v.shape)
print("now model has {} parameters".format(len(model.state_dict())))
print("now model has {} elements".format(num_parameters))

View File

@ -0,0 +1,128 @@
import numpy as np
from collections import namedtuple
import paddle.fluid.layers as F
import paddle.fluid.initializer as I
import paddle.fluid.dygraph as dg
from parakeet.modules.weight_norm import Conv1D, Linear
from parakeet.models.deepvoice3.conv1dglu import Conv1DGLU
ConvSpec = namedtuple("ConvSpec", ["out_channels", "filter_size", "dilation"])
class Encoder(dg.Layer):
def __init__(self,
n_vocab,
embed_dim,
n_speakers,
speaker_dim,
padding_idx=None,
embedding_weight_std=0.1,
convolutions=(ConvSpec(64, 5, 1), ) * 7,
max_positions=512,
dropout=0.):
super(Encoder, self).__init__()
self.embedding_weight_std = embedding_weight_std
self.embed = dg.Embedding(
(n_vocab, embed_dim),
padding_idx=padding_idx,
param_attr=I.Normal(scale=embedding_weight_std))
self.dropout = dropout
if n_speakers > 1:
std = np.sqrt((1 - dropout) / speaker_dim) # CAUTION: keep_prob
self.sp_proj1 = Linear(speaker_dim,
embed_dim,
param_attr=I.Normal(scale=std))
self.sp_proj2 = Linear(speaker_dim,
embed_dim,
param_attr=I.Normal(scale=std))
self.n_speakers = n_speakers
self.convolutions = dg.LayerList()
in_channels = embed_dim
std_mul = 1.0
for (out_channels, filter_size, dilation) in convolutions:
# 1 * 1 convolution & relu
if in_channels != out_channels:
std = np.sqrt(std_mul / in_channels)
self.convolutions.append(
Conv1D(in_channels,
out_channels,
1,
act="relu",
param_attr=I.Normal(scale=std)))
in_channels = out_channels
std_mul = 2.0
self.convolutions.append(
Conv1DGLU(n_speakers,
speaker_dim,
in_channels,
out_channels,
filter_size,
dilation,
std_mul,
dropout,
causal=False,
residual=True))
in_channels = out_channels
std_mul = 4.0
std = np.sqrt(std_mul * (1 - dropout) / in_channels)
self.convolutions.append(
Conv1D(in_channels, embed_dim, 1, param_attr=I.Normal(scale=std)))
def forward(self, x, speaker_embed=None):
"""
Encode text sequence.
Args:
x (Variable): Shape(B, T_enc), dtype: int64. The input text
indices. T_enc means the timesteps of the encoder input x.
speaker_embed (Variable, optional): Shape(batch_size, speaker_dim),
dtype: float32. Speaker embeddings. This arg is not None only
when the model is a multispeaker model.
Returns:
keys (Variable), Shape(B, T_enc, C_emb), the encoded
representation for keys, where C_emb means the text embedding
size.
values (Variable), Shape(B, T_enc, C_emb), the encoded
representation for values.
"""
x = self.embed(x)
x = F.dropout(x,
self.dropout,
dropout_implementation="upscale_in_train")
x = F.transpose(x, [0, 2, 1])
if self.n_speakers > 1 and speaker_embed is not None:
speaker_embed = F.dropout(
speaker_embed,
self.dropout,
dropout_implementation="upscale_in_train")
x = F.elementwise_add(x,
F.softsign(self.sp_proj1(speaker_embed)),
axis=0)
input_embed = x
for layer in self.convolutions:
if isinstance(layer, Conv1DGLU):
x = layer(x, speaker_embed)
else:
# layer is a Conv1D with (1,) filter wrapped by WeightNormWrapper
x = layer(x)
if self.n_speakers > 1 and speaker_embed is not None:
x = F.elementwise_add(x,
F.softsign(self.sp_proj2(speaker_embed)),
axis=0)
keys = x # (B, C, T)
values = F.scale(input_embed + x, scale=np.sqrt(0.5))
keys = F.transpose(keys, [0, 2, 1])
values = F.transpose(values, [0, 2, 1])
return keys, values

View File

@ -1,321 +0,0 @@
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import sys
import os
from os.path import join, expanduser
from warnings import warn
from datetime import datetime
import matplotlib
# Force matplotlib not to use any Xwindows backend.
matplotlib.use("Agg")
from matplotlib import pyplot as plt
from matplotlib import cm
import audio
import numpy as np
from paddle import fluid
import paddle.fluid.dygraph as dg
import librosa.display
from tensorboardX import SummaryWriter
# import global hyper parameters
from hparams import hparams
from parakeet import g2p as frontend
_frontend = getattr(frontend, hparams.frontend)
def tts(model, text, p=0., speaker_id=None):
"""
Convert text to speech waveform given a deepvoice3 model.
Args:
model (DeepVoiceTTS): Model used to synthesize waveform.
text (str) : Input text to be synthesized
p (float) : Probability of replacing a word with its pronunciation. Default is 0.
Returns:
waveform (numpy.ndarray): Shape(T_wav, ), predicted waveform, where
T_wav means the length of the synthesized waveform.
alignment (numpy.ndarray): Shape(T_dec, T_enc), predicted alignment
matrix, where T_dec means the time steps of decoder outputs, T_enc
means the time steps of encoder outputs.
spectrogram (numpy.ndarray): Shape(T_lin, C_lin), predicted linear
spectrogram, where T_lin means the time steps of the linear
spectrogram and C_lin means the channels of the linear spectrogram.
mel (numpy.ndarray): Shape(T_mel, C_mel), predicted mel spectrogram,
where T_mel means the time steps of mel spectrogram and C_mel means
the channels of mel spectrogram.
"""
model.eval()
sequence = np.array(_frontend.text_to_sequence(text, p=p)).astype("int64")
sequence = np.reshape(sequence, (1, -1, 1))
text_positions = np.arange(1, sequence.shape[1] + 1, dtype="int64")
text_positions = np.reshape(text_positions, (1, -1, 1))
sequence = dg.to_variable(sequence)
text_positions = dg.to_variable(text_positions)
speaker_ids = None if speaker_id is None else fluid.layers.fill_constant(
shape=[1, 1], value=speaker_id)
# sequence: shape(1, input_length, 1)
# text_positions: shape(1, input_length, 1)
# Greedy decoding
mel_outputs, linear_outputs, alignments, done = model.transduce(
sequence, text_positions, speaker_ids)
# reshape to the desired shape
linear_output = linear_outputs.numpy().squeeze().T
spectrogram = audio._denormalize(linear_output)
alignment = alignments.numpy()[0]
mel = mel_outputs.numpy().squeeze().T
mel = audio._denormalize(mel)
# Predicted audio signal
waveform = audio.inv_spectrogram(linear_output.T)
return waveform, alignment, spectrogram, mel
def prepare_spec_image(spectrogram):
"""
Prepare an image from spectrogram to be written to tensorboardX
summary writer.
Args:
spectrogram (numpy.ndarray): Shape(T, C), spectrogram to be
visualized, where T means the time steps of the spectrogram,
and C means the channels of the spectrogram.
Return:
np.ndarray: Shape(C, T, 4), the generated image of the spectrogram,
where T means the time steps of the spectrogram and is treated as
the width of the image, C means the channels of the spectrogram
and is treated as the height of the image, and 4 means the image
is in 'RGBA' format.
"""
# [0, 1]
spectrogram = (spectrogram - np.min(spectrogram)) / (
np.max(spectrogram) - np.min(spectrogram))
spectrogram = np.flip(spectrogram, axis=1) # flip against freq axis
return np.uint8(cm.magma(spectrogram.T) * 255)
def plot_alignment(alignment, path, info=None):
fig, ax = plt.subplots()
im = ax.imshow(
alignment, aspect="auto", origin="lower", interpolation="none")
fig.colorbar(im, ax=ax)
xlabel = "Decoder timestep"
if info is not None:
xlabel += "\n\n" + info
plt.xlabel(xlabel)
plt.ylabel("Encoder timestep")
plt.tight_layout()
plt.savefig(path, format="png")
plt.close()
def time_string():
return datetime.now().strftime("%Y-%m-%d %H:%M")
def save_alignment(global_step, path, attn):
plot_alignment(
attn.T,
path,
info="{}, {}, step={}".format(hparams.builder,
time_string(), global_step))
def eval_model(global_step, writer, model, checkpoint_dir, ismultispeaker):
# hard coded text sequences
texts = [
"Scientists at the CERN laboratory say they have discovered a new particle.",
"There's a way to measure the acute emotional intelligence that has never gone out of style.",
"President Trump met with other leaders at the Group of 20 conference.",
"Generative adversarial network or variational auto-encoder.",
"Please call Stella.",
"Some have accepted this as a miracle without any physical explanation.",
]
eval_output_dir = join(checkpoint_dir, "eval")
if not os.path.exists(eval_output_dir):
os.makedirs(eval_output_dir)
print("[eval] Evaluating the model, results are saved in {}".format(
eval_output_dir))
model.eval()
# hard coded
speaker_ids = [0, 1, 10] if ismultispeaker else [None]
for speaker_id in speaker_ids:
speaker_str = ("multispeaker{}".format(speaker_id)
if speaker_id is not None else "single")
for idx, text in enumerate(texts):
signal, alignment, _, mel = tts(model,
text,
p=0,
speaker_id=speaker_id)
signal /= np.max(np.abs(signal))
# Alignment
path = join(eval_output_dir,
"step{:09d}_text{}_{}_alignment.png".format(
global_step, idx, speaker_str))
save_alignment(global_step, path, alignment)
tag = "eval_averaged_alignment_{}_{}".format(idx, speaker_str)
writer.add_image(
tag,
np.uint8(cm.viridis(np.flip(alignment, 1).T) * 255),
global_step,
dataformats='HWC')
# Mel
writer.add_image(
"(Eval) Predicted mel spectrogram text{}_{}".format(
idx, speaker_str),
prepare_spec_image(mel),
global_step,
dataformats='HWC')
# Audio
path = join(eval_output_dir,
"step{:09d}_text{}_{}_predicted.wav".format(
global_step, idx, speaker_str))
audio.save_wav(signal, path)
try:
writer.add_audio(
"(Eval) Predicted audio signal {}_{}".format(idx,
speaker_str),
signal,
global_step,
sample_rate=hparams.sample_rate)
except Exception as e:
warn(str(e))
pass
def save_states(global_step,
writer,
mel_outputs,
linear_outputs,
attn,
mel,
y,
input_lengths,
checkpoint_dir=None):
"""
Save states for the training process.
"""
print("[train] Saving intermediate states at step {}".format(global_step))
idx = min(1, len(input_lengths) - 1)
input_length = input_lengths[idx]
# Alignment, Multi-hop attention
if attn is not None and len(attn.shape) == 4:
attn = attn.numpy()
for i in range(attn.shape[0]):
alignment = attn[i]
alignment = alignment[idx]
tag = "alignment_layer{}".format(i + 1)
writer.add_image(
tag,
np.uint8(cm.viridis(np.flip(alignment, 1).T) * 255),
global_step,
dataformats='HWC')
alignment_dir = join(checkpoint_dir,
"alignment_layer{}".format(i + 1))
if not os.path.exists(alignment_dir):
os.makedirs(alignment_dir)
path = join(
alignment_dir,
"step{:09d}_layer_{}_alignment.png".format(global_step, i + 1))
save_alignment(global_step, path, alignment)
alignment_dir = join(checkpoint_dir, "alignment_ave")
if not os.path.exists(alignment_dir):
os.makedirs(alignment_dir)
path = join(alignment_dir,
"step{:09d}_alignment.png".format(global_step))
alignment = np.mean(attn, axis=0)[idx]
save_alignment(global_step, path, alignment)
tag = "averaged_alignment"
writer.add_image(
tag,
np.uint8(cm.viridis(np.flip(alignment, 1).T) * 255),
global_step,
dataformats="HWC")
if mel_outputs is not None:
mel_output = mel_outputs[idx].numpy().squeeze().T
mel_output = prepare_spec_image(audio._denormalize(mel_output))
writer.add_image(
"Predicted mel spectrogram",
mel_output,
global_step,
dataformats="HWC")
if linear_outputs is not None:
linear_output = linear_outputs[idx].numpy().squeeze().T
spectrogram = prepare_spec_image(audio._denormalize(linear_output))
writer.add_image(
"Predicted linear spectrogram",
spectrogram,
global_step,
dataformats="HWC")
signal = audio.inv_spectrogram(linear_output.T)
signal /= np.max(np.abs(signal))
path = join(checkpoint_dir,
"step{:09d}_predicted.wav".format(global_step))
try:
writer.add_audio(
"Predicted audio signal",
signal,
global_step,
sample_rate=hparams.sample_rate)
except Exception as e:
warn(str(e))
pass
audio.save_wav(signal, path)
if mel_outputs is not None:
mel_output = mel[idx].numpy().squeeze().T
mel_output = prepare_spec_image(audio._denormalize(mel_output))
writer.add_image(
"Target mel spectrogram",
mel_output,
global_step,
dataformats="HWC")
if linear_outputs is not None:
linear_output = y[idx].numpy().squeeze().T
spectrogram = prepare_spec_image(audio._denormalize(linear_output))
writer.add_image(
"Target linear spectrogram",
spectrogram,
global_step,
dataformats="HWC")

View File

@ -1,731 +0,0 @@
# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Hyperparameter values."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import json
import numbers
import re
import six
## from tensorflow.contrib.training.python.training import hparam_pb2
## from tensorflow.python.framework import ops
## from tensorflow.python.util import compat
## from tensorflow.python.util import deprecation
# Define the regular expression for parsing a single clause of the input
# (delimited by commas). A legal clause looks like:
# <variable name>[<index>]? = <rhs>
# where <rhs> is either a single token or [] enclosed list of tokens.
# For example: "var[1] = a" or "x = [1,2,3]"
PARAM_RE = re.compile(r"""
(?P<name>[a-zA-Z][\w\.]*) # variable name: "var" or "x"
(\[\s*(?P<index>\d+)\s*\])? # (optional) index: "1" or None
\s*=\s*
((?P<val>[^,\[]*) # single value: "a" or None
|
\[(?P<vals>[^\]]*)\]) # list of values: None or "1,2,3"
($|,\s*)""", re.VERBOSE)
def _parse_fail(name, var_type, value, values):
"""Helper function for raising a value error for bad assignment."""
raise ValueError(
'Could not parse hparam \'%s\' of type \'%s\' with value \'%s\' in %s' %
(name, var_type.__name__, value, values))
def _reuse_fail(name, values):
"""Helper function for raising a value error for reuse of name."""
raise ValueError('Multiple assignments to variable \'%s\' in %s' %
(name, values))
def _process_scalar_value(name, parse_fn, var_type, m_dict, values,
results_dictionary):
"""Update results_dictionary with a scalar value.
Used to update the results_dictionary to be returned by parse_values when
encountering a clause with a scalar RHS (e.g. "s=5" or "arr[0]=5".)
Mutates results_dictionary.
Args:
name: Name of variable in assignment ("s" or "arr").
parse_fn: Function for parsing the actual value.
var_type: Type of named variable.
m_dict: Dictionary constructed from regex parsing.
m_dict['val']: RHS value (scalar)
m_dict['index']: List index value (or None)
values: Full expression being parsed
results_dictionary: The dictionary being updated for return by the parsing
function.
Raises:
ValueError: If the name has already been used.
"""
try:
parsed_value = parse_fn(m_dict['val'])
except ValueError:
_parse_fail(name, var_type, m_dict['val'], values)
# If no index is provided
if not m_dict['index']:
if name in results_dictionary:
_reuse_fail(name, values)
results_dictionary[name] = parsed_value
else:
if name in results_dictionary:
# The name has already been used as a scalar, then it
# will be in this dictionary and map to a non-dictionary.
if not isinstance(results_dictionary.get(name), dict):
_reuse_fail(name, values)
else:
results_dictionary[name] = {}
index = int(m_dict['index'])
# Make sure the index position hasn't already been assigned a value.
if index in results_dictionary[name]:
_reuse_fail('{}[{}]'.format(name, index), values)
results_dictionary[name][index] = parsed_value
def _process_list_value(name, parse_fn, var_type, m_dict, values,
results_dictionary):
"""Update results_dictionary from a list of values.
Used to update results_dictionary to be returned by parse_values when
encountering a clause with a list RHS (e.g. "arr=[1,2,3]".)
Mutates results_dictionary.
Args:
name: Name of variable in assignment ("arr").
parse_fn: Function for parsing individual values.
var_type: Type of named variable.
m_dict: Dictionary constructed from regex parsing.
m_dict['val']: RHS value (scalar)
values: Full expression being parsed
results_dictionary: The dictionary being updated for return by the parsing
function.
Raises:
ValueError: If the name has an index or the values cannot be parsed.
"""
if m_dict['index'] is not None:
raise ValueError('Assignment of a list to a list index.')
elements = filter(None, re.split('[ ,]', m_dict['vals']))
# Make sure the name hasn't already been assigned a value
if name in results_dictionary:
raise _reuse_fail(name, values)
try:
results_dictionary[name] = [parse_fn(e) for e in elements]
except ValueError:
_parse_fail(name, var_type, m_dict['vals'], values)
def _cast_to_type_if_compatible(name, param_type, value):
"""Cast hparam to the provided type, if compatible.
Args:
name: Name of the hparam to be cast.
param_type: The type of the hparam.
value: The value to be cast, if compatible.
Returns:
The result of casting `value` to `param_type`.
Raises:
ValueError: If the type of `value` is not compatible with param_type.
* If `param_type` is a string type, but `value` is not.
* If `param_type` is a boolean, but `value` is not, or vice versa.
* If `param_type` is an integer type, but `value` is not.
* If `param_type` is a float type, but `value` is not a numeric type.
"""
fail_msg = ("Could not cast hparam '%s' of type '%s' from value %r" %
(name, param_type, value))
# Some callers use None, for which we can't do any casting/checking. :(
if issubclass(param_type, type(None)):
return value
# Avoid converting a non-string type to a string.
if (issubclass(param_type, (six.string_types, six.binary_type)) and
not isinstance(value, (six.string_types, six.binary_type))):
raise ValueError(fail_msg)
# Avoid converting a number or string type to a boolean or vice versa.
if issubclass(param_type, bool) != isinstance(value, bool):
raise ValueError(fail_msg)
# Avoid converting float to an integer (the reverse is fine).
if (issubclass(param_type, numbers.Integral) and
not isinstance(value, numbers.Integral)):
raise ValueError(fail_msg)
# Avoid converting a non-numeric type to a numeric type.
if (issubclass(param_type, numbers.Number) and
not isinstance(value, numbers.Number)):
raise ValueError(fail_msg)
return param_type(value)
def parse_values(values, type_map):
"""Parses hyperparameter values from a string into a python map.
`values` is a string containing comma-separated `name=value` pairs.
For each pair, the value of the hyperparameter named `name` is set to
`value`.
If a hyperparameter name appears multiple times in `values`, a ValueError
is raised (e.g. 'a=1,a=2', 'a[1]=1,a[1]=2').
If a hyperparameter name appears in both an index assignment and a scalar
assignment, a ValueError is raised (e.g. 'a=[1,2,3],a[0] = 1').
The hyperparameter name may contain '.' symbols, which will result in an
attribute name that is only accessible through the getattr and setattr
functions. (And must first be explicitly added through add_hparam.)
WARNING: Use of '.' in your variable names is allowed, but is not well
supported and not recommended.
The `value` in `name=value` must follow the syntax required by the
type of the parameter:
* Scalar integer: A Python-parsable integer value. E.g.: 1,
100, -12.
* Scalar float: A Python-parsable floating point value. E.g.: 1.0,
-.54e89.
* Boolean: Either true or false.
* Scalar string: A non-empty sequence of characters, excluding comma,
spaces, and square brackets. E.g.: foo, bar_1.
* List: A comma separated list of scalar values of the parameter type
enclosed in square brackets. E.g.: [1,2,3], [1.0,1e-12], [high,low].
When index assignment is used, the corresponding type_map key should be the
list name. E.g. for "arr[1]=0" the type_map must have the key "arr" (not
"arr[1]").
Args:
values: String. Comma separated list of `name=value` pairs where
'value' must follow the syntax described above.
type_map: A dictionary mapping hyperparameter names to types. Note every
parameter name in values must be a key in type_map. The values must
conform to the types indicated, where a value V is said to conform to a
type T if either V has type T, or V is a list of elements of type T.
Hence, for a multidimensional parameter 'x' taking float values,
'x=[0.1,0.2]' will parse successfully if type_map['x'] = float.
Returns:
A python map mapping each name to either:
* A scalar value.
* A list of scalar values.
* A dictionary mapping index numbers to scalar values.
(e.g. "x=5,L=[1,2],arr[1]=3" results in {'x':5,'L':[1,2],'arr':{1:3}}")
Raises:
ValueError: If there is a problem with input.
* If `values` cannot be parsed.
* If a list is assigned to a list index (e.g. 'a[1] = [1,2,3]').
* If the same rvalue is assigned two different values (e.g. 'a=1,a=2',
'a[1]=1,a[1]=2', or 'a=1,a=[1]')
"""
results_dictionary = {}
pos = 0
while pos < len(values):
m = PARAM_RE.match(values, pos)
if not m:
raise ValueError('Malformed hyperparameter value: %s' %
values[pos:])
# Check that there is a comma between parameters and move past it.
pos = m.end()
# Parse the values.
m_dict = m.groupdict()
name = m_dict['name']
if name not in type_map:
raise ValueError('Unknown hyperparameter type for %s' % name)
type_ = type_map[name]
# Set up correct parsing function (depending on whether type_ is a bool)
if type_ == bool:
def parse_bool(value):
if value in ['true', 'True']:
return True
elif value in ['false', 'False']:
return False
else:
try:
return bool(int(value))
except ValueError:
_parse_fail(name, type_, value, values)
parse = parse_bool
else:
parse = type_
# If a single value is provided
if m_dict['val'] is not None:
_process_scalar_value(name, parse, type_, m_dict, values,
results_dictionary)
# If the assigned value is a list:
elif m_dict['vals'] is not None:
_process_list_value(name, parse, type_, m_dict, values,
results_dictionary)
else: # Not assigned a list or value
_parse_fail(name, type_, '', values)
return results_dictionary
class HParams(object):
"""Class to hold a set of hyperparameters as name-value pairs.
A `HParams` object holds hyperparameters used to build and train a model,
such as the number of hidden units in a neural net layer or the learning rate
to use when training.
You first create a `HParams` object by specifying the names and values of the
hyperparameters.
To make them easily accessible the parameter names are added as direct
attributes of the class. A typical usage is as follows:
```python
# Create a HParams object specifying names and values of the model
# hyperparameters:
hparams = HParams(learning_rate=0.1, num_hidden_units=100)
# The hyperparameters are available as attributes of the HParams object:
hparams.learning_rate ==> 0.1
hparams.num_hidden_units ==> 100
```
Hyperparameters have a type, which is inferred from the type of their value
passed at construction time. The currently supported types are: integer,
float, boolean, string, and list of integer, float, boolean, or string.
You can override hyperparameter values by calling the
[`parse()`](#HParams.parse) method, passing a string of comma separated
`name=value` pairs. This is intended to make it possible to override
any hyperparameter values from a single command-line flag to which
the user passes 'hyper-param=value' pairs. It avoids having to define
one flag for each hyperparameter.
The syntax expected for each value depends on the type of the parameter.
See `parse()` for a description of the syntax.
Example:
```python
# Define a command line flag to pass name=value pairs.
# For example using argparse:
import argparse
parser = argparse.ArgumentParser(description='Train my model.')
parser.add_argument('--hparams', type=str,
help='Comma separated list of "name=value" pairs.')
args = parser.parse_args()
...
def my_program():
# Create a HParams object specifying the names and values of the
# model hyperparameters:
hparams = tf.HParams(learning_rate=0.1, num_hidden_units=100,
activations=['relu', 'tanh'])
# Override hyperparameters values by parsing the command line
hparams.parse(args.hparams)
# If the user passed `--hparams=learning_rate=0.3` on the command line
# then 'hparams' has the following attributes:
hparams.learning_rate ==> 0.3
hparams.num_hidden_units ==> 100
hparams.activations ==> ['relu', 'tanh']
# If the hyperparameters are in json format use parse_json:
hparams.parse_json('{"learning_rate": 0.3, "activations": "relu"}')
```
"""
_HAS_DYNAMIC_ATTRIBUTES = True # Required for pytype checks.
def __init__(self, hparam_def=None, model_structure=None, **kwargs):
"""Create an instance of `HParams` from keyword arguments.
The keyword arguments specify name-values pairs for the hyperparameters.
The parameter types are inferred from the type of the values passed.
The parameter names are added as attributes of `HParams` object, so they
can be accessed directly with the dot notation `hparams._name_`.
Example:
```python
# Define 3 hyperparameters: 'learning_rate' is a float parameter,
# 'num_hidden_units' an integer parameter, and 'activation' a string
# parameter.
hparams = tf.HParams(
learning_rate=0.1, num_hidden_units=100, activation='relu')
hparams.activation ==> 'relu'
```
Note that a few names are reserved and cannot be used as hyperparameter
names. If you use one of the reserved names, the constructor raises a
`ValueError`.
Args:
hparam_def: Serialized hyperparameters, encoded as a hparam_pb2.HParamDef
protocol buffer. If provided, this object is initialized by
deserializing hparam_def. Otherwise **kwargs is used.
model_structure: An instance of ModelStructure, defining the feature
crosses to be used in the Trial.
**kwargs: Key-value pairs where the key is the hyperparameter name and
the value is the value for the parameter.
Raises:
ValueError: If both `hparam_def` and initialization values are provided,
or if one of the arguments is invalid.
"""
# Register the hyperparameters and their type in _hparam_types.
# This simplifies the implementation of parse().
# _hparam_types maps the parameter name to a tuple (type, bool).
# The type value is the type of the parameter for scalar hyperparameters,
# or the type of the list elements for multidimensional hyperparameters.
# The bool value is True if the value is a list, False otherwise.
self._hparam_types = {}
self._model_structure = model_structure
if hparam_def:
## self._init_from_proto(hparam_def)
## if kwargs:
## raise ValueError('hparam_def and initialization values are '
## 'mutually exclusive')
raise ValueError('hparam_def has been disabled in this version')
else:
for name, value in six.iteritems(kwargs):
self.add_hparam(name, value)
## def _init_from_proto(self, hparam_def):
## """Creates a new HParams from `HParamDef` protocol buffer.
##
## Args:
## hparam_def: `HParamDef` protocol buffer.
## """
## assert isinstance(hparam_def, hparam_pb2.HParamDef)
## for name, value in hparam_def.hparam.items():
## kind = value.WhichOneof('kind')
## if kind.endswith('_value'):
## # Single value.
## if kind.startswith('int64'):
## # Setting attribute value to be 'int' to ensure the type is compatible
## # with both Python2 and Python3.
## self.add_hparam(name, int(getattr(value, kind)))
## elif kind.startswith('bytes'):
## # Setting attribute value to be 'str' to ensure the type is compatible
## # with both Python2 and Python3. UTF-8 encoding is assumed.
## self.add_hparam(name, compat.as_str(getattr(value, kind)))
## else:
## self.add_hparam(name, getattr(value, kind))
## else:
## # List of values.
## if kind.startswith('int64'):
## # Setting attribute value to be 'int' to ensure the type is compatible
## # with both Python2 and Python3.
## self.add_hparam(name, [int(v) for v in getattr(value, kind).value])
## elif kind.startswith('bytes'):
## # Setting attribute value to be 'str' to ensure the type is compatible
## # with both Python2 and Python3. UTF-8 encoding is assumed.
## self.add_hparam(
## name, [compat.as_str(v) for v in getattr(value, kind).value])
## else:
## self.add_hparam(name, [v for v in getattr(value, kind).value])
def add_hparam(self, name, value):
"""Adds {name, value} pair to hyperparameters.
Args:
name: Name of the hyperparameter.
value: Value of the hyperparameter. Can be one of the following types:
int, float, string, int list, float list, or string list.
Raises:
ValueError: if one of the arguments is invalid.
"""
# Keys in kwargs are unique, but 'name' could be the name of a pre-existing
# attribute of this object. In that case we refuse to use it as a
# hyperparameter name.
if getattr(self, name, None) is not None:
raise ValueError('Hyperparameter name is reserved: %s' % name)
if isinstance(value, (list, tuple)):
if not value:
raise ValueError(
'Multi-valued hyperparameters cannot be empty: %s' % name)
self._hparam_types[name] = (type(value[0]), True)
else:
self._hparam_types[name] = (type(value), False)
setattr(self, name, value)
def set_hparam(self, name, value):
"""Set the value of an existing hyperparameter.
This function verifies that the type of the value matches the type of the
existing hyperparameter.
Args:
name: Name of the hyperparameter.
value: New value of the hyperparameter.
Raises:
ValueError: If there is a type mismatch.
"""
param_type, is_list = self._hparam_types[name]
if isinstance(value, list):
if not is_list:
raise ValueError(
'Must not pass a list for single-valued parameter: %s' %
name)
setattr(self, name, [
_cast_to_type_if_compatible(name, param_type, v) for v in value
])
else:
if is_list:
raise ValueError(
'Must pass a list for multi-valued parameter: %s.' % name)
setattr(self, name,
_cast_to_type_if_compatible(name, param_type, value))
def del_hparam(self, name):
"""Removes the hyperparameter with key 'name'.
Args:
name: Name of the hyperparameter.
"""
if hasattr(self, name):
delattr(self, name)
del self._hparam_types[name]
def parse(self, values):
"""Override hyperparameter values, parsing new values from a string.
See parse_values for more detail on the allowed format for values.
Args:
values: String. Comma separated list of `name=value` pairs where
'value' must follow the syntax described above.
Returns:
The `HParams` instance.
Raises:
ValueError: If `values` cannot be parsed.
"""
type_map = dict()
for name, t in self._hparam_types.items():
param_type, _ = t
type_map[name] = param_type
values_map = parse_values(values, type_map)
return self.override_from_dict(values_map)
def override_from_dict(self, values_dict):
"""Override hyperparameter values, parsing new values from a dictionary.
Args:
values_dict: Dictionary of name:value pairs.
Returns:
The `HParams` instance.
Raises:
ValueError: If `values_dict` cannot be parsed.
"""
for name, value in values_dict.items():
self.set_hparam(name, value)
return self
## @deprecation.deprecated(None, 'Use `override_from_dict`.')
def set_from_map(self, values_map):
"""DEPRECATED. Use override_from_dict."""
return self.override_from_dict(values_dict=values_map)
def set_model_structure(self, model_structure):
self._model_structure = model_structure
def get_model_structure(self):
return self._model_structure
def to_json(self, indent=None, separators=None, sort_keys=False):
"""Serializes the hyperparameters into JSON.
Args:
indent: If a non-negative integer, JSON array elements and object members
will be pretty-printed with that indent level. An indent level of 0, or
negative, will only insert newlines. `None` (the default) selects the
most compact representation.
separators: Optional `(item_separator, key_separator)` tuple. Default is
`(', ', ': ')`.
sort_keys: If `True`, the output dictionaries will be sorted by key.
Returns:
A JSON string.
"""
return json.dumps(
self.values(),
indent=indent,
separators=separators,
sort_keys=sort_keys)
def parse_json(self, values_json):
"""Override hyperparameter values, parsing new values from a json object.
Args:
values_json: String containing a json object of name:value pairs.
Returns:
The `HParams` instance.
Raises:
ValueError: If `values_json` cannot be parsed.
"""
values_map = json.loads(values_json)
return self.override_from_dict(values_map)
def values(self):
"""Return the hyperparameter values as a Python dictionary.
Returns:
A dictionary with hyperparameter names as keys. The values are the
hyperparameter values.
"""
return {n: getattr(self, n) for n in self._hparam_types.keys()}
def get(self, key, default=None):
"""Returns the value of `key` if it exists, else `default`."""
if key in self._hparam_types:
# Ensure that default is compatible with the parameter type.
if default is not None:
param_type, is_param_list = self._hparam_types[key]
type_str = 'list<%s>' % param_type if is_param_list else str(
param_type)
fail_msg = ("Hparam '%s' of type '%s' is incompatible with "
'default=%s' % (key, type_str, default))
is_default_list = isinstance(default, list)
if is_param_list != is_default_list:
raise ValueError(fail_msg)
try:
if is_default_list:
for value in default:
_cast_to_type_if_compatible(key, param_type, value)
else:
_cast_to_type_if_compatible(key, param_type, default)
except ValueError as e:
raise ValueError('%s. %s' % (fail_msg, e))
return getattr(self, key)
return default
def __contains__(self, key):
return key in self._hparam_types
def __str__(self):
return str(sorted(self.values().items()))
def __repr__(self):
return '%s(%s)' % (type(self).__name__, self.__str__())
@staticmethod
def _get_kind_name(param_type, is_list):
"""Returns the field name given parameter type and is_list.
Args:
param_type: Data type of the hparam.
is_list: Whether this is a list.
Returns:
A string representation of the field name.
Raises:
ValueError: If parameter type is not recognized.
"""
if issubclass(param_type, bool):
# This check must happen before issubclass(param_type, six.integer_types),
# since Python considers bool to be a subclass of int.
typename = 'bool'
elif issubclass(param_type, six.integer_types):
# Setting 'int' and 'long' types to be 'int64' to ensure the type is
# compatible with both Python2 and Python3.
typename = 'int64'
elif issubclass(param_type, (six.string_types, six.binary_type)):
# Setting 'string' and 'bytes' types to be 'bytes' to ensure the type is
# compatible with both Python2 and Python3.
typename = 'bytes'
elif issubclass(param_type, float):
typename = 'float'
else:
raise ValueError('Unsupported parameter type: %s' % str(param_type))
suffix = 'list' if is_list else 'value'
return '_'.join([typename, suffix])
## def to_proto(self, export_scope=None): # pylint: disable=unused-argument
## """Converts a `HParams` object to a `HParamDef` protocol buffer.
##
## Args:
## export_scope: Optional `string`. Name scope to remove.
##
## Returns:
## A `HParamDef` protocol buffer.
## """
## hparam_proto = hparam_pb2.HParamDef()
## for name in self._hparam_types:
## # Parse the values.
## param_type, is_list = self._hparam_types.get(name, (None, None))
## kind = HParams._get_kind_name(param_type, is_list)
##
## if is_list:
## if kind.startswith('bytes'):
## v_list = [compat.as_bytes(v) for v in getattr(self, name)]
## else:
## v_list = [v for v in getattr(self, name)]
## getattr(hparam_proto.hparam[name], kind).value.extend(v_list)
## else:
## v = getattr(self, name)
## if kind.startswith('bytes'):
## v = compat.as_bytes(getattr(self, name))
## setattr(hparam_proto.hparam[name], kind, v)
##
## return hparam_proto
## @staticmethod
## def from_proto(hparam_def, import_scope=None): # pylint: disable=unused-argument
## return HParams(hparam_def=hparam_def)
## ops.register_proto_function(
## 'hparams',
## proto_type=hparam_pb2.HParamDef,
## to_proto=HParams.to_proto,
## from_proto=HParams.from_proto)

View File

@ -1,8 +0,0 @@
Source: hparam.py copied from tensorflow v1.12.0.
https://github.com/tensorflow/tensorflow/blob/v1.12.0/tensorflow/contrib/training/python/training/hparam.py
with the following:
wget https://github.com/tensorflow/tensorflow/raw/v1.12.0/tensorflow/contrib/training/python/training/hparam.py
Once all other tensorflow dependencies of this file are removed, the class keeps its purpose. Functions that are no longer available because of this process are not used in this project.

View File

@ -1,150 +0,0 @@
# Part of code was adapted from https://github.com/r9y9/deepvoice3_pytorch/tree/master/hparams.py
# Copyright (c) 2017: Ryuichi Yamamoto.
import hparam_tf.hparam
# NOTE: If you want full control for model architecture. please take a look
# at the code and change whatever you want. Some hyper parameters are hardcoded.
# Default hyperparameters:
hparams = hparam_tf.hparam.HParams(
name="deepvoice3",
# Text:
# [en, jp]
frontend='en',
# Replace words with their pronunciation with a fixed probability.
# e.g., 'hello' to 'HH AH0 L OW1'
# [en, jp]
# en: Word -> pronunciation using CMUDict
# jp: Word -> pronunciation using MeCab
# [0 ~ 1.0]: 0 means no replacement happens.
replace_pronunciation_prob=0.5,
# Convenient model builder
# [deepvoice3, deepvoice3_multispeaker, nyanko]
# Definitions can be found at deepvoice3_pytorch/builder.py
# deepvoice3: DeepVoice3 https://arxiv.org/abs/1710.07654
# deepvoice3_multispeaker: Multi-speaker version of DeepVoice3
# nyanko: https://arxiv.org/abs/1710.08969
builder="deepvoice3",
# Must be configured depending on the dataset and model you use
n_speakers=1,
speaker_embed_dim=16,
# Audio:
num_mels=80,
fmin=125,
fmax=7600,
fft_size=1024,
hop_size=256,
sample_rate=22050,
preemphasis=0.97,
min_level_db=-100,
ref_level_db=20,
# whether to rescale waveform or not.
# Let x be an input waveform; the rescaled waveform y is given by:
# y = x / np.abs(x).max() * rescaling_max
rescaling=False,
rescaling_max=0.999,
# mel-spectrogram is normalized to [0, 1] for each utterance and clipping may
# happen depending on min_level_db and ref_level_db, causing clipping noise.
# If False, an assertion is added to ensure no clipping happens.
allow_clipping_in_normalization=True,
# Model:
downsample_step=4, # must be 4 when builder="nyanko"
outputs_per_step=1, # must be 1 when builder="nyanko"
embedding_weight_std=0.1,
speaker_embedding_weight_std=0.01,
padding_idx=0,
# Maximum number of input text length
# try setting larger value if you want to give very long text input
max_positions=512,
dropout=1 - 0.95,
kernel_size=3,
text_embed_dim=128,
encoder_channels=256,
decoder_channels=256,
# Note: large converter channels requires significant computational cost
converter_channels=256,
query_position_rate=1.0,
# can be computed by `compute_timestamp_ratio.py`.
key_position_rate=1.385, # 2.37 for jsut
key_projection=False,
value_projection=False,
use_memory_mask=True,
trainable_positional_encodings=False,
freeze_embedding=False,
# If True, use decoder's internal representation for postnet inputs,
# otherwise use mel-spectrogram.
use_decoder_state_for_postnet_input=True,
# Data loader
random_seed=1234,
pin_memory=True,
# Set it to 1 when in Windows (MemoryError, THAllocator.c 0x5)
num_workers=2,
# Loss
masked_loss_weight=0.5, # (1-w)*loss + w * masked_loss
# heuristic: prioritize [0 ~ priority_freq] for linear loss
priority_freq=3000,
priority_freq_weight=0.0, # (1-w)*linear_loss + w*priority_linear_loss
# https://arxiv.org/pdf/1710.08969.pdf
# Adding the divergence to the loss stabilizes training, especially for
# very deep (> 10 layers) networks.
# Binary div loss seems to have roughly 10x the scale of the L1 loss, so I choose 0.1.
binary_divergence_weight=0.1, # set 0 to disable
use_guided_attention=True,
guided_attention_sigma=0.2,
# Training:
batch_size=16,
adam_beta1=0.5,
adam_beta2=0.9,
adam_eps=1e-6,
amsgrad=False,
initial_learning_rate=5e-4, # 0.001,
lr_schedule="noam_learning_rate_decay",
lr_schedule_kwargs={},
nepochs=2000,
weight_decay=0.0,
clip_thresh=0.1,
# Save
checkpoint_interval=10000,
eval_interval=10000,
save_optimizer_state=True,
# Eval:
# this can be a list for multiple layers of attention
# e.g., [True, False, False, False, True]
force_monotonic_attention=True,
# Attention constraint for incremental decoding
window_ahead=3,
# 0 tends to prevent word repetition, but sometimes causes skipped words
window_backward=1,
power=1.4, # Power to raise magnitudes to prior to phase retrieval
# GC:
# Forced garbage collection probability
# Use only when MemoryError continues in Windows (Disabled by default)
#gc_probability = 0.001,
# json_meta mode only
# 0: "use all",
# 1: "ignore only unmatched_alignment",
# 2: "fully ignore recognition",
ignore_recognition_level=2,
# when dealing with a non-dedicated speech dataset (e.g. movie excerpts), setting min_text above 15 is desirable. Can be adjusted per dataset.
min_text=20,
# if true, data without a phoneme alignment file (.lab) will be ignored
process_only_htk_aligned=False)
def hparams_debug_string():
values = hparams.values()
hp = [' %s: %s' % (name, values[name]) for name in sorted(values)]
return 'Hyperparameters:\n' + '\n'.join(hp)

View File

@ -1,89 +0,0 @@
# This file is copied from https://github.com/r9y9/deepvoice3_pytorch/tree/master/ljspeech.py
# Copyright (c) 2017: Ryuichi Yamamoto.
from concurrent.futures import ProcessPoolExecutor
from functools import partial
import numpy as np
import io
import os
import audio
from hparams import hparams
def build_from_path(in_dir, out_dir, num_workers=1, tqdm=lambda x: x):
'''Preprocesses the LJ Speech dataset from a given input path into a given output directory.
Args:
in_dir: The directory where you have downloaded the LJ Speech dataset
out_dir: The directory to write the output into
num_workers: Optional number of worker processes to parallelize across
tqdm: You can optionally pass tqdm to get a nice progress bar
Returns:
A list of tuples describing the training examples. This should be written to train.txt
'''
# We use ProcessPoolExecutor to parallelize across processes. This is just an optimization and you
# can omit it and just call _process_utterance on each input if you want.
executor = ProcessPoolExecutor(max_workers=num_workers)
futures = []
index = 1
with io.open(
os.path.join(in_dir, 'metadata.csv'), "rt", encoding='utf-8') as f:
for line in f:
parts = line.strip().split('|')
wav_path = os.path.join(in_dir, 'wavs', '%s.wav' % parts[0])
text = parts[2]
if len(text) < hparams.min_text:
continue
futures.append(
executor.submit(
partial(_process_utterance, out_dir, index, wav_path,
text)))
index += 1
return [future.result() for future in tqdm(futures)]
def _process_utterance(out_dir, index, wav_path, text):
'''Preprocesses a single utterance audio/text pair.
This writes the mel and linear scale spectrograms to disk and returns a tuple to write
to the train.txt file.
Args:
out_dir: The directory to write the spectrograms into
index: The numeric index to use in the spectrogram filenames.
wav_path: Path to the audio file containing the speech input
text: The text spoken in the input audio file
Returns:
A (spectrogram_filename, mel_filename, n_frames, text) tuple to write to train.txt
'''
# Load the audio to a numpy array:
wav = audio.load_wav(wav_path)
if hparams.rescaling:
wav = wav / np.abs(wav).max() * hparams.rescaling_max
# Compute the linear-scale spectrogram from the wav:
spectrogram = audio.spectrogram(wav).astype(np.float32)
n_frames = spectrogram.shape[1]
# Compute a mel-scale spectrogram from the wav:
mel_spectrogram = audio.melspectrogram(wav).astype(np.float32)
# Write the spectrograms to disk:
spectrogram_filename = 'ljspeech-spec-%05d.npy' % index
mel_filename = 'ljspeech-mel-%05d.npy' % index
np.save(
os.path.join(out_dir, spectrogram_filename),
spectrogram.T,
allow_pickle=False)
np.save(
os.path.join(out_dir, mel_filename),
mel_spectrogram.T,
allow_pickle=False)
# Return a tuple describing this training example:
return (spectrogram_filename, mel_filename, n_frames, text)

View File

@ -0,0 +1,218 @@
import numpy as np
from numba import jit
from paddle import fluid
import paddle.fluid.layers as F
import paddle.fluid.dygraph as dg
def masked_mean(inputs, mask):
"""
Args:
inputs (Variable): Shape(B, T, C), the input, where B means
batch size, C means channels of input, T means timesteps of
the input.
mask (Variable): Shape(B, T), a mask.
Returns:
loss (Variable): Shape(1, ), masked mean.
"""
channels = inputs.shape[-1]
masked_inputs = F.elementwise_mul(inputs, mask, axis=0)
loss = F.reduce_sum(masked_inputs) / (channels * F.reduce_sum(mask))
return loss
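# --- Editor's note (illustrative): with inputs of shape (B, T, C) and a 0/1
# mask of shape (B, T), the value above equals
# sum(inputs * mask[:, :, None]) / (C * sum(mask)), i.e. the mean over the
# unmasked time steps and all channels.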
@jit(nopython=True)
def guided_attention(N, max_N, T, max_T, g):
W = np.zeros((max_N, max_T), dtype=np.float32)
for n in range(N):
for t in range(T):
W[n, t] = 1 - np.exp(-(n / N - t / T)**2 / (2 * g * g))
return W
def guided_attentions(encoder_lengths,
decoder_lengths,
max_decoder_len,
g=0.2):
B = len(encoder_lengths)
max_input_len = encoder_lengths.max()
W = np.zeros((B, max_decoder_len, max_input_len), dtype=np.float32)
for b in range(B):
W[b] = guided_attention(encoder_lengths[b], max_input_len,
decoder_lengths[b], max_decoder_len, g).T
return W
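# --- Editor's note: a small illustrative sketch of the guide produced above
# (hypothetical lengths). Each W[b] is a (max_decoder_len, max_input_len) soft
# mask that is ~0 along the expected diagonal n / N == t / T and approaches 1
# far away from it, so reduce_mean(attention * W) penalizes non-diagonal
# (non-monotonic) alignments.
#
# enc_lens = np.array([10, 8], dtype=np.int64)
# dec_lens = np.array([40, 32], dtype=np.int64)
# W = guided_attentions(enc_lens, dec_lens, max_decoder_len=40, g=0.2)
# W.shape       # (2, 40, 10)
# W[0, 0, 0]    # 0.0, exactly on the diagonal
# W[0, 39, 0]   # close to 1.0, far from the diagonal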
class TTSLoss(object):
def __init__(self,
masked_weight=0.0,
priority_bin=None,
priority_weight=0.0,
binary_divergence_weight=0.0,
guided_attention_sigma=0.2,
downsample_factor=4,
r=1):
self.masked_weight = masked_weight
self.priority_bin = priority_bin # only used for lin-spec loss
self.priority_weight = priority_weight # only used for lin-spec loss
self.binary_divergence_weight = binary_divergence_weight
self.guided_attention_sigma = guided_attention_sigma
self.time_shift = r
self.r = r
self.downsample_factor = downsample_factor
def l1_loss(self, prediction, target, mask, priority_bin=None):
abs_diff = F.abs(prediction - target)
# basic mask-weighted l1 loss
w = self.masked_weight
if w > 0 and mask is not None:
base_l1_loss = w * masked_mean(abs_diff, mask) \
+ (1 - w) * F.reduce_mean(abs_diff)
else:
base_l1_loss = F.reduce_mean(abs_diff)
if self.priority_weight > 0 and priority_bin is not None:
# mask-weighted priority channels' l1-loss
priority_abs_diff = abs_diff[:, :, :priority_bin]
if w > 0 and mask is not None:
priority_loss = w * masked_mean(priority_abs_diff, mask) \
+ (1 - w) * F.reduce_mean(priority_abs_diff)
else:
priority_loss = F.reduce_mean(priority_abs_diff)
# priority weighted sum
p = self.priority_weight
loss = p * priority_loss + (1 - p) * base_l1_loss
else:
loss = base_l1_loss
return loss
def binary_divergence(self, prediction, target, mask):
flattened_prediction = F.reshape(prediction, [-1, 1])
flattened_target = F.reshape(target, [-1, 1])
flattened_loss = F.log_loss(flattened_prediction,
flattened_target,
epsilon=1e-8)
bin_div = fluid.layers.reshape(flattened_loss, prediction.shape)
w = self.masked_weight
if w > 0 and mask is not None:
loss = w * masked_mean(bin_div, mask) \
+ (1 - w) * F.reduce_mean(bin_div)
else:
loss = F.reduce_mean(bin_div)
return loss
@staticmethod
def done_loss(done_hat, done):
flat_done_hat = F.reshape(done_hat, [-1, 1])
flat_done = F.reshape(done, [-1, 1])
loss = F.log_loss(flat_done_hat, flat_done, epsilon=1e-8)
loss = F.reduce_mean(loss)
return loss
def attention_loss(self, predicted_attention, input_lengths,
target_lengths):
"""
Given valid encoder_lengths and decoder_lengths, compute a diagonal
guide, and compute loss from the predicted attention and the guide.
Args:
predicted_attention (Variable): Shape(*, B, T_dec, T_enc), the
alignment tensor, where B means batch size, T_dec means number
of time steps of the decoder, T_enc means the number of time
steps of the encoder, * means other possible dimensions.
input_lengths (numpy.ndarray): Shape(B,), dtype:int64, valid lengths
(time steps) of encoder outputs.
target_lengths (numpy.ndarray): Shape(batch_size,), dtype:int64,
valid lengths (time steps) of decoder outputs.
Returns:
loss (Variable): Shape(1, ) attention loss.
"""
n_attention, batch_size, max_target_len, max_input_len = (
predicted_attention.shape)
soft_mask = guided_attentions(input_lengths, target_lengths,
max_target_len,
self.guided_attention_sigma)
soft_mask_ = dg.to_variable(soft_mask)
loss = fluid.layers.reduce_mean(predicted_attention * soft_mask_)
return loss
def __call__(self,
mel_hyp,
lin_hyp,
done_hyp,
attn_hyp,
mel_ref,
lin_ref,
done_ref,
input_lengths,
n_frames,
compute_lin_loss=True,
compute_mel_loss=True,
compute_done_loss=True,
compute_attn_loss=True):
# n_frames # mel_lengths # decoder_lengths
# four losses in total: lin (l1 + bce), mel (l1 + bce), attn, done
max_frames = lin_hyp.shape[1]
max_mel_steps = max_frames // self.downsample_factor
max_decoder_steps = max_mel_steps // self.r
decoder_mask = F.sequence_mask(n_frames // self.downsample_factor //
self.r,
max_decoder_steps,
dtype="float32")
mel_mask = F.sequence_mask(n_frames // self.downsample_factor,
max_mel_steps,
dtype="float32")
lin_mask = F.sequence_mask(n_frames, max_frames, dtype="float32")
if compute_lin_loss:
lin_hyp = lin_hyp[:, :-self.time_shift, :]
lin_ref = lin_ref[:, self.time_shift:, :]
lin_mask = lin_mask[:, self.time_shift:, :]
lin_l1_loss = self.l1_loss(lin_hyp,
lin_ref,
lin_mask,
priority_bin=self.priority_bin)
lin_bce_loss = self.binary_divergence(lin_hyp, lin_ref, lin_mask)
lin_loss = self.binary_divergence_weight * lin_bce_loss \
+ (1 - self.binary_divergence_weight) * lin_l1_loss
if compute_mel_loss:
mel_hyp = mel_hyp[:, :-self.time_shift, :]
mel_ref = mel_ref[:, self.time_shift:, :]
mel_mask = mel_mask[:, self.time_shift:, :]
mel_l1_loss = self.l1_loss(mel_hyp, mel_ref, mel_mask)
mel_bce_loss = self.binary_divergence(mel_hyp, mel_ref, mel_mask)
# print("=====>", mel_l1_loss.numpy()[0], mel_bce_loss.numpy()[0])
mel_loss = self.binary_divergence_weight * mel_bce_loss \
+ (1 - self.binary_divergence_weight) * mel_l1_loss
if compute_attn_loss:
attn_loss = self.attention_loss(
attn_hyp, input_lengths.numpy(),
n_frames.numpy() // (self.downsample_factor * self.r))
if compute_done_loss:
done_loss = self.done_loss(done_hyp, done_ref)
result = {
"mel": mel_loss if compute_mel_loss else None,
"lin": lin_loss if compute_lin_loss else None,
"done": done_loss if compute_done_loss else None,
"attn": attn_loss if compute_attn_loss else None,
}
return result
@staticmethod
def compose_loss(result):
total_loss = 0.
for v in result.values():
if v is not None:
total_loss += v
return total_loss
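# --- Editor's note: a hedged sketch of how TTSLoss is meant to be driven in a
# training step. The tensors named below (mel_hyp, lin_hyp, done_hyp, attn_hyp,
# mel_ref, lin_ref, done_ref, input_lengths, n_frames) are hypothetical model
# outputs and ground-truth batch fields, not defined in this file.
#
# criterion = TTSLoss(masked_weight=0.5, binary_divergence_weight=0.1,
#                     guided_attention_sigma=0.2, downsample_factor=4, r=1)
# result = criterion(mel_hyp, lin_hyp, done_hyp, attn_hyp,
#                    mel_ref, lin_ref, done_ref, input_lengths, n_frames)
# loss = TTSLoss.compose_loss(result)  # sum of the non-None partial losses
# loss.backward()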

View File

@ -0,0 +1,49 @@
import numpy as np
import paddle.fluid.layers as F
import paddle.fluid.initializer as I
import paddle.fluid.dygraph as dg
class DeepVoice3(dg.Layer):
def __init__(self, encoder, decoder, converter, speaker_embedding,
use_decoder_states):
super(DeepVoice3, self).__init__()
if speaker_embedding is None:
self.n_speakers = 1
else:
self.speaker_embedding = speaker_embedding
self.encoder = encoder
self.decoder = decoder
self.converter = converter
self.use_decoder_states = use_decoder_states
def forward(self, text_sequences, text_positions, valid_lengths,
speaker_indices, mel_inputs, frame_positions):
if hasattr(self, "speaker_embedding"):
speaker_embed = self.speaker_embedding(speaker_indices)
else:
speaker_embed = None
keys, values = self.encoder(text_sequences, speaker_embed)
mel_outputs, alignments, done, decoder_states = self.decoder(
(keys, values), valid_lengths, mel_inputs, text_positions,
frame_positions, speaker_embed)
linear_outputs = self.converter(
decoder_states if self.use_decoder_states else mel_outputs,
speaker_embed)
return mel_outputs, linear_outputs, alignments, done
def transduce(self, text_sequences, text_positions, speaker_indices=None):
if hasattr(self, "speaker_embedding"):
speaker_embed = self.speaker_embedding(speaker_indices)
else:
speaker_embed = None
keys, values = self.encoder(text_sequences, speaker_embed)
mel_outputs, alignments, done, decoder_states = self.decoder.decode(
(keys, values), text_positions, speaker_embed)
linear_outputs = self.converter(
decoder_states if self.use_decoder_states else mel_outputs,
speaker_embed)
return mel_outputs, linear_outputs, alignments, done
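# --- Editor's note: forward() is the teacher-forced path (ground-truth mel
# inputs and frame positions are provided, used in training), while transduce()
# decodes autoregressively for synthesis. A hedged call sketch, assuming the
# sub-modules (encoder, decoder, converter) are built elsewhere and the input
# variables are hypothetical:
#
# model = DeepVoice3(encoder, decoder, converter,
#                    speaker_embedding=None, use_decoder_states=True)
# mel, lin, attn, done = model(text, text_pos, text_lens, None,
#                              mel_inputs, frame_pos)       # training
# mel, lin, attn, done = model.transduce(text, text_pos)    # synthesis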

View File

@ -0,0 +1,104 @@
import numpy as np
from paddle import fluid
import paddle.fluid.layers as F
import paddle.fluid.dygraph as dg
def compute_position_embedding(radians, speaker_position_rate):
"""compute sin/cos separately and scatter them to a zero.
Arguments:
radians {Variable} -- shape(n_vocab, embed_dim), the radians matrix.
speaker_position_rate {Variable} -- shape(batch_size, ), speaker positioning rate.
Returns:
Variable -- shape(batch_size, n_vocab, embed_dim), the sin, cos matrix.
"""
_, embed_dim = radians.shape
batch_size = speaker_position_rate.shape[0]
speaker_position_rate = F.unsqueeze(speaker_position_rate, [1, 2])
scaled_radians = speaker_position_rate * radians
odd_mask = (np.arange(embed_dim) % 2).astype(np.float32)
odd_mask = dg.to_variable(odd_mask)
out = odd_mask * F.cos(scaled_radians) \
+ (1 - odd_mask) * F.sin(scaled_radians)
out = F.concat(
[F.zeros((batch_size, 1, embed_dim), radians.dtype), out[:, 1:, :]],
axis=1)
return out
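# --- Editor's note: an illustrative trace of the interleaving above
# (hypothetical tiny sizes). odd_mask is 1 at odd embedding dimensions and 0 at
# even ones, so even dimensions receive sin(scaled_radians) and odd dimensions
# receive cos(scaled_radians); the row for position 0 is then replaced with a
# zero vector by the concat.
#
# with dg.guard():
#     radians = dg.to_variable(
#         position_encoding_init(n_position=8, d_pos_vec=4).astype("float32"))
#     rate = dg.to_variable(np.ones((2,), dtype="float32"))
#     pe = compute_position_embedding(radians, rate)  # shape (2, 8, 4)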
def position_encoding_init(n_position,
d_pos_vec,
position_rate=1.0,
padding_idx=None):
"""init the position encoding table"""
# keep idx 0 for padding token position encoding zero vector
# CAUTION: it is radians here, sin and cos are not applied
# CAUTION: difference here
indices_range = np.expand_dims(np.arange(n_position), -1)
embed_range = 2 * (np.arange(d_pos_vec) // 2)
radians = position_rate \
* indices_range \
* np.power(1e4, embed_range / d_pos_vec)
if padding_idx is not None:
radians[padding_idx] = 0.
return radians
class PositionEmbedding(dg.Layer):
def __init__(self,
n_position,
d_pos_vec,
position_rate=1.0,
param_attr=None,
max_norm=None,
padding_idx=None):
super(PositionEmbedding, self).__init__()
self.weight = self.create_parameter((n_position, d_pos_vec))
self.weight.set_value(
position_encoding_init(n_position, d_pos_vec, position_rate,
padding_idx).astype("float32"))
def forward(self, indices, speaker_position_rate=None):
"""
Args:
indices (Variable): Shape (B, T), dtype: int64, position
indices, where B means the batch size, T means the time steps.
speaker_position_rate (Variable | float, optional): position
rate. It can be a floating point number or a Variable with
shape (1,), in which case this speaker_position_rate is used for
every example. It can also be a Variable with shape (B, ), which
contains a position rate for each speaker in the batch.
Returns:
out (Variable): Shape(B, T, C_pos), position embedding, where C_pos
means position embedding size.
"""
batch_size, time_steps = indices.shape
# convert speaker_position_rate to a Variable with shape(B, )
if isinstance(speaker_position_rate, float):
speaker_position_rate = dg.to_variable(
np.array([speaker_position_rate]).astype("float32"))
speaker_position_rate = F.expand(speaker_position_rate,
[batch_size])
elif isinstance(speaker_position_rate, fluid.framework.Variable) \
and list(speaker_position_rate.shape) == [1]:
speaker_position_rate = F.expand(speaker_position_rate,
[batch_size])
assert len(speaker_position_rate.shape) == 1 and \
list(speaker_position_rate.shape) == [batch_size]
weight = compute_position_embedding(self.weight,
speaker_position_rate) # (B, V, C)
# make indices for gather_nd
batch_id = F.expand(
F.unsqueeze(F.range(0, batch_size, 1, dtype="int64"), [1]),
[1, time_steps])
# (B, T, 2)
gather_nd_id = F.stack([batch_id, indices], -1)
out = F.gather_nd(weight, gather_nd_id)
return out
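# --- Editor's note: a minimal usage sketch of PositionEmbedding above. The
# sizes are illustrative assumptions; dygraph mode and the imports at the top
# of this file are assumed.
#
# with dg.guard():
#     pos_emb = PositionEmbedding(n_position=512, d_pos_vec=256)
#     # CAUTION: positions start from 1, 0 is reserved for padding
#     positions = dg.to_variable(
#         np.tile(np.arange(1, 51, dtype="int64"), (4, 1)))
#     out = pos_emb(positions, speaker_position_rate=1.0)  # (4, 50, 256)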

View File

@ -1,89 +0,0 @@
# Part of code was adapted from https://github.com/r9y9/deepvoice3_pytorch/tree/master/preprocess.py
# Copyright (c) 2017: Ryuichi Yamamoto.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import argparse
import io
import six
import os
from multiprocessing import cpu_count
from tqdm import tqdm
import importlib
from hparams import hparams, hparams_debug_string
def build_parser():
parser = argparse.ArgumentParser(description="Data Preprocessing")
parser.add_argument("--num-workers", type=int, help="Num workers.")
parser.add_argument(
"--hparams",
type=str,
default="",
help="Hyper parameters to overwrite.")
parser.add_argument(
"--preset",
type=str,
required=True,
help="Path of preset parameters (json)")
parser.add_argument("name", type=str, help="Dataset name")
parser.add_argument("in_dir", type=str, help="Dataset path.")
parser.add_argument(
"out_dir", type=str, help="Path of preprocessed dataset.")
return parser
def preprocess(mod, in_dir, out_dir, num_workers):
if not os.path.exists(out_dir):
os.makedirs(out_dir)
metadata = mod.build_from_path(in_dir, out_dir, num_workers, tqdm=tqdm)
write_metadata(metadata, out_dir)
def write_metadata(metadata, out_dir):
if six.PY3:
string_type = str
elif six.PY2:
string_type = unicode
else:
raise ValueError("Not running on Python2 or Python 3?")
with io.open(
os.path.join(out_dir, 'train.txt'), 'wt', encoding='utf-8') as f:
for m in metadata:
f.write(u'|'.join([string_type(x) for x in m]) + '\n')
frames = sum([m[2] for m in metadata])
frame_shift_ms = hparams.hop_size / hparams.sample_rate * 1000
hours = frames * frame_shift_ms / (3600 * 1000)
print('Wrote %d utterances, %d frames (%.2f hours)' %
(len(metadata), frames, hours))
print('Max input length: %d' % max(len(m[3]) for m in metadata))
print('Max output length: %d' % max(m[2] for m in metadata))
if __name__ == "__main__":
parser = build_parser()
args, _ = parser.parse_known_args()
name = args.name
in_dir = args.in_dir
out_dir = args.out_dir
num_workers = args.num_workers
if num_workers is None:
num_workers = cpu_count()
preset = args.preset
# Load preset if specified
if preset is not None:
with io.open(preset) as f:
hparams.parse_json(f.read())
# Override hyper parameters
hparams.parse(args.hparams)
assert hparams.name == "deepvoice3"
print(hparams_debug_string())
assert name in ["ljspeech"], "now we only supports ljspeech"
mod = importlib.import_module(name)
preprocess(mod, in_dir, out_dir, num_workers)

View File

@ -1,65 +0,0 @@
{
"name": "deepvoice3",
"frontend": "en",
"replace_pronunciation_prob": 0.5,
"builder": "deepvoice3",
"n_speakers": 1,
"speaker_embed_dim": 16,
"num_mels": 80,
"fmin": 125,
"fmax": 7600,
"fft_size": 1024,
"hop_size": 256,
"sample_rate": 22050,
"preemphasis": 0.97,
"min_level_db": -100,
"ref_level_db": 20,
"rescaling": false,
"rescaling_max": 0.999,
"allow_clipping_in_normalization": true,
"downsample_step": 4,
"outputs_per_step": 1,
"embedding_weight_std": 0.1,
"speaker_embedding_weight_std": 0.01,
"padding_idx": 0,
"max_positions": 512,
"dropout": 0.050000000000000044,
"kernel_size": 3,
"text_embed_dim": 256,
"encoder_channels": 512,
"decoder_channels": 256,
"converter_channels": 256,
"query_position_rate": 1.0,
"key_position_rate": 1.385,
"key_projection": true,
"value_projection": true,
"use_memory_mask": true,
"trainable_positional_encodings": false,
"freeze_embedding": false,
"use_decoder_state_for_postnet_input": true,
"pin_memory": true,
"num_workers": 2,
"masked_loss_weight": 0.5,
"priority_freq": 3000,
"priority_freq_weight": 0.0,
"binary_divergence_weight": 0.1,
"use_guided_attention": true,
"guided_attention_sigma": 0.2,
"batch_size": 16,
"adam_beta1": 0.5,
"adam_beta2": 0.9,
"adam_eps": 1e-06,
"initial_learning_rate": 0.0005,
"lr_schedule": "noam_learning_rate_decay",
"lr_schedule_kwargs": {},
"nepochs": 2000,
"weight_decay": 0.0,
"clip_thresh": 0.1,
"checkpoint_interval": 10000,
"eval_interval": 10000,
"save_optimizer_state": true,
"force_monotonic_attention": true,
"window_ahead": 3,
"window_backward": 1,
"power": 1.4
}

View File

@ -1,167 +0,0 @@
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import argparse
import sys
import os
import io
from os.path import dirname, join, basename, splitext, exists
from tqdm import tqdm
import numpy as np
import nltk
from paddle import fluid
import paddle.fluid.dygraph as dg
# sys.path.append("../")
import audio
from parakeet import g2p as frontend
import dry_run
from hparams import hparams
from train import make_deepvoice3_from_hparams
from eval_model import tts, plot_alignment
def build_parser():
parser = argparse.ArgumentParser(
description="Synthesis waveform from trained model.")
parser.add_argument(
"--hparams", type=str, default="", help="Hyper parameters.")
parser.add_argument(
"--preset",
type=str,
required=True,
help="Path of preset parameters (json).")
parser.add_argument(
"--use-gpu",
action="store_true",
help="Whether to use gpu for generation.")
parser.add_argument(
"--file-name-suffix", type=str, default="", help="File name suffix.")
parser.add_argument(
"--max-decoder-steps", type=int, default=500, help="Max decoder steps.")
parser.add_argument(
"--replace_pronunciation_prob",
type=float,
default=0.,
help="Probility to replace text with pronunciation.")
parser.add_argument(
"--speaker-id", type=int, help="Speaker ID (for multi-speaker model).")
parser.add_argument(
"--output-html", action="store_true", help="Output html for blog post.")
parser.add_argument(
"checkpoint", type=str, help="The checkpoint used for synthesis")
parser.add_argument(
"text_list_file",
type=str,
help="Text file to synthesis, a sentence per line.")
parser.add_argument(
"dst_dir", type=str, help="Directory to save synthesis results.")
return parser
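# Hypothetical invocation matching the arguments defined above (the script name,
# checkpoint and paths are only illustrative; the checkpoint file name follows the
# checkpoint_{:09d} pattern used by the training loop):
#   python synthesis.py --use-gpu --preset=presets/deepvoice3_ljspeech.json \
#       checkpoint_single_1014/checkpoint_000100000 sentences.txt synthesis_output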
if __name__ == "__main__":
parser = build_parser()
args, _ = parser.parse_known_args()
checkpoint_path = args.checkpoint
text_list_file_path = args.text_list_file
dst_dir = args.dst_dir
use_gpu = args.use_gpu
max_decoder_steps = args.max_decoder_steps
file_name_suffix = args.file_name_suffix
replace_pronunciation_prob = args.replace_pronunciation_prob
output_html = args.output_html
speaker_id = args.speaker_id
preset = args.preset
print("Command Line Args:")
for k, v in vars(args).items():
print(" {}: {}".format(k, v))
# Load preset if specified
if preset is not None:
with io.open(preset) as f:
hparams.parse_json(f.read())
# Override hyper parameters
hparams.parse(args.hparams)
assert hparams.name == "deepvoice3"
place = fluid.CUDAPlace(0) if use_gpu else fluid.CPUPlace()
with dg.guard(place):
# Model
model = make_deepvoice3_from_hparams(hparams)
dry_run(model)
model_dict, _ = dg.load_dygraph(args.checkpoint)
model.set_dict(model_dict)
checkpoint_name = splitext(basename(checkpoint_path))[0]
model.seq2seq.decoder.max_decoder_steps = max_decoder_steps
if not os.path.exists(dst_dir):
os.makedirs(dst_dir)
with io.open(text_list_file_path, "rt", encoding="utf-8") as f:
lines = f.readlines()
for idx, line in enumerate(lines):
                text = line.rstrip("\n")
words = nltk.word_tokenize(text)
waveform, alignment, _, _ = tts(model,
text,
p=replace_pronunciation_prob,
speaker_id=speaker_id)
dst_wav_path = join(dst_dir, "{}_{}{}.wav".format(
idx, checkpoint_name, file_name_suffix))
dst_alignment_path = join(
dst_dir, "{}_{}{}_alignment.png".format(
idx, checkpoint_name, file_name_suffix))
plot_alignment(
alignment.T,
dst_alignment_path,
info="{}, {}".format(hparams.builder,
basename(checkpoint_path)))
audio.save_wav(waveform, dst_wav_path)
name = splitext(basename(text_list_file_path))[0]
if output_html:
print("""
{}
({} chars, {} words)
<audio controls="controls" >
<source src="/audio/{}/{}/{}" autoplay/>
Your browser does not support the audio element.
</audio>
<div align="center"><img src="/audio/{}/{}/{}" /></div>
""".format(text,
len(text),
len(words), hparams.builder, name,
basename(dst_wav_path), hparams.builder, name,
basename(dst_alignment_path)))
else:
print(idx, ": {}\n ({} chars, {} words)".format(text,
len(text),
len(words)))
print("Finished! Check out {} for generated audio samples.".format(
dst_dir))
sys.exit(0)
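The text_list_file consumed above is plain UTF-8 text with one sentence per line; a throwaway helper to produce one could look like this (the file name and sentences are arbitrary examples):

sentences = [
    "The quick brown fox jumps over the lazy dog.",
    "Deep Voice 3 is a fully convolutional text to speech model.",
]
with open("sentences.txt", "wt", encoding="utf-8") as f:
    for sentence in sentences:
        f.write(sentence + "\n")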

View File

@ -1,250 +0,0 @@
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
import io
from paddle import fluid
import paddle.fluid.dygraph as dg
import sys
# sys.path.append("../")
from argparse import ArgumentParser
from hparams import hparams, hparams_debug_string
from nnmnkwii.datasets import FileSourceDataset
from data import (TextDataSource, MelSpecDataSource,
LinearSpecDataSource,
PartialyRandomizedSimilarTimeLengthSampler,
Dataset, make_loader, create_batch)
from parakeet import g2p as frontend
from builder import deepvoice3, WindowRange
from dry_run import dry_run
from train_model import train_model
from parakeet.modules.loss import TTSLoss
from tensorboardX import SummaryWriter
def build_arg_parser():
    parser = ArgumentParser(description="Train a Deep Voice 3 model.")
parser.add_argument(
"--data-root",
type=str,
required=True,
help="Directory contains preprocessed features.")
parser.add_argument(
"--use-data-parallel",
action="store_true",
help="Whether to use data parallel training.")
parser.add_argument(
"--use-gpu", action="store_true", help="Whether to use gpu training.")
parser.add_argument(
"--output",
type=str,
default="result",
help="Directory to save results")
parser.add_argument(
"--preset",
type=str,
required=True,
help="Path of preset parameters in json format.")
parser.add_argument(
"--hparams",
type=str,
default="",
help="Hyper parameters to override preset.")
parser.add_argument(
"--checkpoint",
type=str,
help="Restore model from checkpoint path if given.")
parser.add_argument(
"--reset-optimizer", action="store_true", help="Reset optimizer.")
# mutually exclusive option
train_opt = parser.add_mutually_exclusive_group()
train_opt.add_argument(
"--train-seq2seq-only",
action="store_true",
help="Train only seq2seq model")
train_opt.add_argument(
"--train-postnet-only",
action="store_true",
help="Train only postnet model.")
parser.add_argument(
"--speaker-id",
type=int,
help="Use specific speaker of data in case for multi-speaker datasets.",
)
return parser
def make_deepvoice3_from_hparams(hparams):
n_vocab = getattr(frontend, hparams.frontend).n_vocab
model = deepvoice3(
n_vocab, hparams.text_embed_dim, hparams.num_mels,
hparams.fft_size // 2 + 1, hparams.outputs_per_step,
hparams.downsample_step, hparams.n_speakers, hparams.speaker_embed_dim,
hparams.padding_idx, hparams.dropout, hparams.kernel_size,
hparams.encoder_channels, hparams.decoder_channels,
hparams.converter_channels, hparams.query_position_rate,
hparams.key_position_rate, hparams.use_memory_mask,
hparams.trainable_positional_encodings,
hparams.force_monotonic_attention,
hparams.use_decoder_state_for_postnet_input, hparams.max_positions,
hparams.embedding_weight_std, hparams.speaker_embedding_weight_std,
hparams.freeze_embedding,
WindowRange(-hparams.window_backward, hparams.window_ahead),
hparams.key_projection, hparams.value_projection)
return model
def noam_learning_rate_decay(init_lr, warmup_steps=4000):
# Noam scheme from tensor2tensor:
warmup_steps = float(warmup_steps)
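    # fluid.dygraph.NoamDecay(d_model, warmup_steps) computes
    #     lr(step) = d_model**-0.5 * min(step**-0.5, step * warmup_steps**-1.5),
    # so passing d_model = 1 / (warmup_steps * init_lr**2) warms the rate up to
    # init_lr at step == warmup_steps and then decays it as 1 / sqrt(step).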
return dg.NoamDecay(1 / (warmup_steps * (init_lr**2)), warmup_steps)
def make_optimizer_from_hparams(hparams):
if hparams.lr_schedule is not None:
learning_rate = noam_learning_rate_decay(hparams.initial_learning_rate,
**hparams.lr_schedule_kwargs)
else:
learning_rate = hparams.initial_learning_rate
if hparams.weight_decay > 0.0:
regularization = fluid.regularizer.L2DecayRegularizer(
hparams.weight_decay)
else:
regularization = None
optim = fluid.optimizer.Adam(
learning_rate=learning_rate,
beta1=hparams.adam_beta1,
beta2=hparams.adam_beta2,
regularization=regularization)
if hparams.clip_thresh > 0.0:
clipper = fluid.dygraph_grad_clip.GradClipByGlobalNorm(
hparams.clip_thresh)
else:
clipper = None
return optim, clipper
def make_loss_from_hparams(hparams):
criterion = TTSLoss(
hparams.masked_loss_weight, hparams.priority_freq_weight,
hparams.binary_divergence_weight, hparams.guided_attention_sigma)
return criterion
class MyDataParallel(dg.parallel.DataParallel):
"""
A data parallel proxy for model.
"""
def __init__(self, layers, strategy):
super(MyDataParallel, self).__init__(layers, strategy)
def __getattr__(self, key):
if key in self.__dict__:
return object.__getattribute__(self, key)
        elif key == "_layers":
return object.__getattribute__(self, "_sub_layers")["_layers"]
else:
return getattr(
object.__getattribute__(self, "_sub_layers")["_layers"], key)
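# Note: the __getattr__ above forwards unknown attribute lookups to the wrapped model,
# so accesses such as model.seq2seq, model.converter and model.n_speakers in the
# training loop keep working unchanged under data-parallel training.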
if __name__ == "__main__":
parser = build_arg_parser()
args, _ = parser.parse_known_args()
print("Command Line Args:")
for k, v in vars(args).items():
print(" {}: {}".format(k, v))
# Load preset if specified
if args.preset is not None:
with io.open(args.preset) as f:
hparams.parse_json(f.read())
# Override hyper parameters
hparams.parse(args.hparams)
print(hparams_debug_string())
checkpoint_dir = os.path.join(args.output, "checkpoints")
tensorboard_dir = os.path.join(args.output, "log")
if not os.path.exists(checkpoint_dir):
os.makedirs(checkpoint_dir)
if not os.path.exists(tensorboard_dir):
os.makedirs(tensorboard_dir)
data_root = args.data_root
speaker_id = args.speaker_id
X = FileSourceDataset(TextDataSource(data_root, speaker_id))
Mel = FileSourceDataset(MelSpecDataSource(data_root, speaker_id))
Y = FileSourceDataset(LinearSpecDataSource(data_root, speaker_id))
frame_lengths = Mel.file_data_source.frame_lengths
sampler = PartialyRandomizedSimilarTimeLengthSampler(
frame_lengths, batch_size=hparams.batch_size)
dataset = Dataset(X, Mel, Y)
n_trainers = dg.parallel.Env().nranks
local_rank = dg.parallel.Env().local_rank
data_loader = make_loader(
dataset,
batch_size=hparams.batch_size,
shuffle=False,
sampler=sampler,
create_batch_fn=create_batch,
trainer_count=n_trainers,
local_rank=local_rank)
place = (fluid.CUDAPlace(dg.parallel.Env().dev_id)
if args.use_data_parallel else fluid.CUDAPlace(0)
if args.use_gpu else fluid.CPUPlace())
with dg.guard(place) as g:
pyreader = fluid.io.PyReader(capacity=10, return_list=True)
pyreader.decorate_batch_generator(data_loader, place)
model = make_deepvoice3_from_hparams(hparams)
optimizer, clipper = make_optimizer_from_hparams(hparams)
print("Log event path: {}".format(tensorboard_dir))
writer = SummaryWriter(tensorboard_dir) if local_rank == 0 else None
criterion = make_loss_from_hparams(hparams)
# loading saved model
if args.train_postnet_only or args.train_seq2seq_only:
assert args.checkpoint is not None, \
"you must train part of the model from a trained whole model"
if args.train_postnet_only:
assert hparams.use_decoder_state_for_postnet_input is False, \
"when training only the postnet, there is no decoder states"
if args.checkpoint is not None:
model_dict, optimizer_dict = dg.load_dygraph(args.checkpoint)
if args.use_data_parallel:
strategy = dg.parallel.prepare_context()
model = MyDataParallel(model, strategy)
train_model(model, pyreader, criterion, optimizer, clipper, writer,
args, hparams)
print("Done!")

View File

@ -1,13 +0,0 @@
export LD_LIBRARY_PATH=/fluid13_workspace/cuda-9.0/lib64/:/fluid13_workspace/cudnnv7.5_cuda9.0/lib64/:$LD_LIBRARY_PATH
#export PYTHONPATH=/dv3_workspace/paddle_for_dv3/build/python/
export PYTHONPATH=/fluid13_workspace/paddle_cherry_pick/build/python/:../
export CUDA_VISIBLE_DEVICES=7
GLOG_v=0 python -u train.py \
--use-gpu \
--reset-optimizer \
--preset=presets/deepvoice3_ljspeech.json \
--checkpoint-dir=checkpoint_single_1014 \
--data-root="/fluid13_workspace/dv3_workspace/deepvoice3_pytorch/data/ljspeech/" \
--hparams="batch_size=16"

View File

@ -1,258 +0,0 @@
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
import time
from itertools import chain
from paddle import fluid
import paddle.fluid.dygraph as dg
from tqdm import tqdm
from eval_model import eval_model, save_states
def train_model(model, loader, criterion, optimizer, clipper, writer, args,
hparams):
assert fluid.framework.in_dygraph_mode(
), "this function must be run within dygraph guard"
n_trainers = dg.parallel.Env().nranks
local_rank = dg.parallel.Env().local_rank
    # amount of shifting when computing losses
linear_shift = hparams.outputs_per_step
mel_shift = hparams.outputs_per_step
global_step = 0
global_epoch = 0
ismultispeaker = model.n_speakers > 1
checkpoint_dir = os.path.join(args.output, "checkpoints")
tensorboard_dir = os.path.join(args.output, "log")
ce_loss = 0
start_time = time.time()
for epoch in range(hparams.nepochs):
epoch_loss = 0.
for step, inputs in tqdm(enumerate(loader())):
if len(inputs) == 9:
(text, input_lengths, mel, linear, text_positions,
frame_positions, done, target_lengths, speaker_ids) = inputs
else:
(text, input_lengths, mel, linear, text_positions,
frame_positions, done, target_lengths) = inputs
speaker_ids = None
model.train()
if not (args.train_seq2seq_only or args.train_postnet_only):
results = model(text, input_lengths, mel, speaker_ids,
text_positions, frame_positions)
mel_outputs, linear_outputs, alignments, done_hat = results
elif args.train_seq2seq_only:
if speaker_ids is not None:
speaker_embed = model.speaker_embedding(speaker_ids)
else:
speaker_embed = None
results = model.seq2seq(text, input_lengths, mel, speaker_embed,
text_positions, frame_positions)
mel_outputs, alignments, done_hat, decoder_states = results
if model.r > 1:
mel_outputs = fluid.layers.transpose(mel_outputs,
[0, 3, 2, 1])
mel_outputs = fluid.layers.reshape(
mel_outputs,
[mel_outputs.shape[0], -1, 1, model.mel_dim])
mel_outputs = fluid.layers.transpose(mel_outputs,
[0, 3, 2, 1])
linear_outputs = None
else:
assert (
model.use_decoder_state_for_postnet_input is False
), "when train only the converter, you have no decoder states"
if speaker_ids is not None:
speaker_embed = model.speaker_embedding(speaker_ids)
else:
speaker_embed = None
linear_outputs = model.converter(mel, speaker_embed)
alignments = None
mel_outputs = None
done_hat = None
if not args.train_seq2seq_only:
n_priority_freq = int(hparams.priority_freq /
(hparams.sample_rate * 0.5) *
model.linear_dim)
linear_mask = fluid.layers.sequence_mask(
target_lengths, maxlen=linear.shape[-1], dtype="float32")
linear_mask = linear_mask[:, linear_shift:]
linear_predicted = linear_outputs[:, :, :, :-linear_shift]
linear_target = linear[:, :, :, linear_shift:]
lin_l1_loss = criterion.l1_loss(
linear_predicted,
linear_target,
linear_mask,
priority_bin=n_priority_freq)
lin_div = criterion.binary_divergence(
linear_predicted, linear_target, linear_mask)
lin_loss = criterion.binary_divergence_weight * lin_div \
+ (1 - criterion.binary_divergence_weight) * lin_l1_loss
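                # The linear-spectrogram loss is a convex mix of a binary divergence term
                # and a masked L1 term (which may up-weight the lowest n_priority_freq bins
                # via priority_freq_weight); binary_divergence_weight controls the mix.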
if writer is not None and local_rank == 0:
writer.add_scalar("linear_loss",
float(lin_loss.numpy()), global_step)
writer.add_scalar("linear_l1_loss",
float(lin_l1_loss.numpy()), global_step)
writer.add_scalar("linear_binary_div_loss",
float(lin_div.numpy()), global_step)
if not args.train_postnet_only:
mel_lengths = target_lengths // hparams.downsample_step
mel_mask = fluid.layers.sequence_mask(
mel_lengths, maxlen=mel.shape[-1], dtype="float32")
mel_mask = mel_mask[:, mel_shift:]
mel_predicted = mel_outputs[:, :, :, :-mel_shift]
mel_target = mel[:, :, :, mel_shift:]
mel_l1_loss = criterion.l1_loss(mel_predicted, mel_target,
mel_mask)
mel_div = criterion.binary_divergence(mel_predicted, mel_target,
mel_mask)
mel_loss = criterion.binary_divergence_weight * mel_div \
+ (1 - criterion.binary_divergence_weight) * mel_l1_loss
if writer is not None and local_rank == 0:
writer.add_scalar("mel_loss",
float(mel_loss.numpy()), global_step)
writer.add_scalar("mel_l1_loss",
float(mel_l1_loss.numpy()), global_step)
writer.add_scalar("mel_binary_div_loss",
float(mel_div.numpy()), global_step)
done_loss = criterion.done_loss(done_hat, done)
if writer is not None and local_rank == 0:
writer.add_scalar("done_loss",
float(done_loss.numpy()), global_step)
if hparams.use_guided_attention:
decoder_length = target_lengths.numpy() / (
hparams.outputs_per_step * hparams.downsample_step)
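                # attention_loss presumably applies the usual guided-attention penalty:
                # attention mass far from the text/decoder-time diagonal (slope roughly
                # decoder_length / input_length) is penalized, with the width of the
                # tolerated band set by hparams.guided_attention_sigma.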
attn_loss = criterion.attention_loss(alignments,
input_lengths.numpy(),
decoder_length)
if writer is not None and local_rank == 0:
writer.add_scalar("attention_loss",
float(attn_loss.numpy()), global_step)
if not (args.train_seq2seq_only or args.train_postnet_only):
if hparams.use_guided_attention:
loss = lin_loss + mel_loss + done_loss + attn_loss
else:
loss = lin_loss + mel_loss + done_loss
elif args.train_seq2seq_only:
if hparams.use_guided_attention:
loss = mel_loss + done_loss + attn_loss
else:
loss = mel_loss + done_loss
else:
loss = lin_loss
if writer is not None and local_rank == 0:
writer.add_scalar("loss", float(loss.numpy()), global_step)
if isinstance(optimizer._learning_rate,
fluid.optimizer.LearningRateDecay):
current_lr = optimizer._learning_rate.step().numpy()
else:
current_lr = optimizer._learning_rate
if writer is not None and local_rank == 0:
writer.add_scalar("learning_rate", current_lr, global_step)
epoch_loss += loss.numpy()[0]
if (local_rank == 0 and global_step > 0 and
global_step % hparams.checkpoint_interval == 0):
save_states(global_step, writer, mel_outputs, linear_outputs,
alignments, mel, linear,
input_lengths.numpy(), checkpoint_dir)
step_path = os.path.join(
checkpoint_dir, "checkpoint_{:09d}".format(global_step))
dg.save_dygraph(model.state_dict(), step_path)
dg.save_dygraph(optimizer.state_dict(), step_path)
if (local_rank == 0 and global_step > 0 and
global_step % hparams.eval_interval == 0):
eval_model(global_step, writer, model, checkpoint_dir,
ismultispeaker)
if args.use_data_parallel:
loss = model.scale_loss(loss)
loss.backward()
model.apply_collective_grads()
else:
loss.backward()
if not (args.train_seq2seq_only or args.train_postnet_only):
param_list = model.parameters()
elif args.train_seq2seq_only:
if ismultispeaker:
param_list = chain(model.speaker_embedding.parameters(),
model.seq2seq.parameters())
else:
param_list = model.seq2seq.parameters()
else:
if ismultispeaker:
param_list = chain(model.speaker_embedding.parameters(),
model.seq2seq.parameters())
else:
param_list = model.converter.parameters()
optimizer.minimize(
loss, grad_clip=clipper, parameter_list=param_list)
if not (args.train_seq2seq_only or args.train_postnet_only):
model.clear_gradients()
elif args.train_seq2seq_only:
if ismultispeaker:
model.speaker_embedding.clear_gradients()
model.seq2seq.clear_gradients()
else:
if ismultispeaker:
model.speaker_embedding.clear_gradients()
model.converter.clear_gradients()
global_step += 1
average_loss_in_epoch = epoch_loss / (step + 1)
print("Epoch loss: {}".format(average_loss_in_epoch))
if writer is not None and local_rank == 0:
writer.add_scalar("average_loss_in_epoch", average_loss_in_epoch,
global_epoch)
ce_loss = average_loss_in_epoch
global_epoch += 1
end_time = time.time()
epoch_time = (end_time - start_time) / global_epoch
print("kpis\teach_epoch_duration_frame%s_card%s\t%s" %
(hparams.outputs_per_step, n_trainers, epoch_time))
print("kpis\ttrain_cost_frame%s_card%s\t%f" %
(hparams.outputs_per_step, n_trainers, ce_loss))

File diff suppressed because it is too large