Transform config parsing from jsonargparse to argparse

This commit is contained in:
lifuchen 2020-02-13 06:48:21 +00:00 committed by chenfeiyu
parent f5ac04b1a3
commit 04d7f8b598
30 changed files with 365 additions and 500 deletions

View File

@ -1,46 +0,0 @@
audio:
num_mels: 80 #the number of mel bands when calculating mel spectrograms.
n_fft: 2048 #the number of fft components.
sr: 22050 #the sampling rate of audio data file.
preemphasis: 0.97 #the preemphasis coefficient.
hop_length: 256 #the number of samples to advance between frames.
win_length: 1024 #the length (width) of the window function.
power: 1.2 #the power to raise before griffin-lim.
min_level_db: -100 #the minimum level db.
ref_level_db: 20 #the reference level db.
outputs_per_step: 1 #the outputs per step.
encoder_n_layer: 6
encoder_head: 2
encoder_conv1d_filter_size: 1536
max_sep_len: 2048
decoder_n_layer: 6
decoder_head: 2
decoder_conv1d_filter_size: 1536
fs_hidden_size: 384
duration_predictor_output_size: 256
duration_predictor_filter_size: 3
fft_conv1d_filter: 3
fft_conv1d_padding: 1
dropout: 0.1
transformer_head: 4
embedding_size: 512
hidden_size: 256
warm_up_step: 4000
grad_clip_thresh: 0.1
batch_size: 32
epochs: 10000
lr: 0.001
save_step: 500
use_gpu: True
use_data_parallel: True
data_path: ../../dataset/LJSpeech-1.1
transtts_path: ../TransformerTTS/checkpoint/
transformer_step: 160000
save_path: ./checkpoint
log_dir: ./log
#checkpoint_path: ./checkpoint
#transformer_step: 97000

View File

@ -1,97 +0,0 @@
import jsonargparse
def add_config_options_to_parser(parser):
parser.add_argument('--audio.num_mels', type=int, default=80,
help="the number of mel bands when calculating mel spectrograms.")
parser.add_argument('--audio.n_fft', type=int, default=2048,
help="the number of fft components.")
parser.add_argument('--audio.sr', type=int, default=22050,
help="the sampling rate of audio data file.")
parser.add_argument('--audio.preemphasis', type=float, default=0.97,
help="the preemphasis coefficient.")
parser.add_argument('--audio.hop_length', type=int, default=128,
help="the number of samples to advance between frames.")
parser.add_argument('--audio.win_length', type=int, default=1024,
help="the length (width) of the window function.")
parser.add_argument('--audio.power', type=float, default=1.4,
help="the power to raise before griffin-lim.")
parser.add_argument('--audio.min_level_db', type=int, default=-100,
help="the minimum level db.")
parser.add_argument('--audio.ref_level_db', type=int, default=20,
help="the reference level db.")
parser.add_argument('--audio.outputs_per_step', type=int, default=1,
help="the outputs per step.")
parser.add_argument('--encoder_n_layer', type=int, default=6,
help="the number of FFT Block in encoder.")
parser.add_argument('--encoder_head', type=int, default=2,
help="the attention head number in encoder.")
parser.add_argument('--encoder_conv1d_filter_size', type=int, default=1024,
help="the filter size of conv1d in encoder.")
parser.add_argument('--max_sep_len', type=int, default=2048,
help="the max length of sequence.")
parser.add_argument('--decoder_n_layer', type=int, default=6,
help="the number of FFT Block in decoder.")
parser.add_argument('--decoder_head', type=int, default=2,
help="the attention head number in decoder.")
parser.add_argument('--decoder_conv1d_filter_size', type=int, default=1024,
help="the filter size of conv1d in decoder.")
parser.add_argument('--fs_hidden_size', type=int, default=256,
help="the hidden size in model of fastspeech.")
parser.add_argument('--duration_predictor_output_size', type=int, default=256,
help="the output size of duration predictior.")
parser.add_argument('--duration_predictor_filter_size', type=int, default=3,
help="the filter size of conv1d in duration prediction.")
parser.add_argument('--fft_conv1d_filter', type=int, default=3,
help="the filter size of conv1d in fft.")
parser.add_argument('--fft_conv1d_padding', type=int, default=1,
help="the padding size of conv1d in fft.")
parser.add_argument('--dropout', type=float, default=0.1,
help="the dropout in network.")
parser.add_argument('--transformer_head', type=int, default=4,
help="the attention head num of transformerTTS.")
parser.add_argument('--alpha', type=float, default=1.0,
help="the hyperparameter to determine the length of the expanded sequence\
mel, thereby controlling the voice speed.")
parser.add_argument('--hidden_size', type=int, default=256,
help="the hidden size in model of transformerTTS.")
parser.add_argument('--embedding_size', type=int, default=256,
help="the dim size of embedding of transformerTTS.")
parser.add_argument('--warm_up_step', type=int, default=4000,
help="the warm up step of learning rate.")
parser.add_argument('--grad_clip_thresh', type=float, default=1.0,
help="the threshold of grad clip.")
parser.add_argument('--batch_size', type=int, default=32,
help="batch size for training.")
parser.add_argument('--epochs', type=int, default=10000,
help="the number of epoch for training.")
parser.add_argument('--lr', type=float, default=0.001,
help="the learning rate for training.")
parser.add_argument('--save_step', type=int, default=500,
help="checkpointing interval during training.")
parser.add_argument('--fastspeech_step', type=int, default=160000,
help="Global step to restore checkpoint of fastspeech.")
parser.add_argument('--use_gpu', type=bool, default=True,
help="use gpu or not during training.")
parser.add_argument('--use_data_parallel', type=bool, default=False,
help="use data parallel or not during training.")
parser.add_argument('--data_path', type=str, default='./dataset/LJSpeech-1.1',
help="the path of dataset.")
parser.add_argument('--checkpoint_path', type=str, default=None,
help="the path to load checkpoint or pretrain model.")
parser.add_argument('--save_path', type=str, default='./checkpoint',
help="the path to save checkpoint.")
parser.add_argument('--log_dir', type=str, default='./log',
help="the directory to save tensorboard log.")
parser.add_argument('--sample_path', type=str, default='./sample',
help="the directory to save audio sample in synthesis.")
parser.add_argument('--transtts_path', type=str, default='./log',
help="the directory to load pretrain transformerTTS model.")
parser.add_argument('--transformer_step', type=int, default=70000,
help="the step to load transformerTTS model.")
parser.add_argument('-c', '--config', action=jsonargparse.ActionConfigFile)

View File

@ -1,35 +0,0 @@
audio:
num_mels: 80
n_fft: 2048
sr: 22050
preemphasis: 0.97
hop_length: 275
win_length: 1102
power: 1.2
min_level_db: -100
ref_level_db: 20
outputs_per_step: 1
hidden_size: 256
embedding_size: 512
warm_up_step: 4000
grad_clip_thresh: 1.0
batch_size: 32
epochs: 10000
lr: 0.001
save_step: 1000
image_step: 2000
use_gpu: True
use_data_parallel: False
stop_token: False
data_path: ../../dataset/LJSpeech-1.1
save_path: ./checkpoint
log_dir: ./log
#checkpoint_path: ./checkpoint
#ransformer_step: 97000

View File

@ -1,29 +0,0 @@
audio:
num_mels: 80
n_fft: 2048
sr: 22050
preemphasis: 0.97
hop_length: 275
win_length: 1102
power: 1.2
min_level_db: -100
ref_level_db: 20
outputs_per_step: 1
hidden_size: 256
embedding_size: 512
warm_up_step: 4000
grad_clip_thresh: 1.0
batch_size: 32
epochs: 10000
lr: 0.001
save_step: 10
use_gpu: True
use_data_parallel: True
data_path: ../../dataset/LJSpeech-1.1
save_path: ./checkpoint
log_dir: ./log
#checkpoint_path: ./checkpoint
#transformer_step: 27000

View File

@ -1,69 +0,0 @@
import jsonargparse
def add_config_options_to_parser(parser):
parser.add_argument('--audio.num_mels', type=int, default=80,
help="the number of mel bands when calculating mel spectrograms.")
parser.add_argument('--audio.n_fft', type=int, default=2048,
help="the number of fft components.")
parser.add_argument('--audio.sr', type=int, default=22050,
help="the sampling rate of audio data file.")
parser.add_argument('--audio.preemphasis', type=float, default=0.97,
help="the preemphasis coefficient.")
parser.add_argument('--audio.hop_length', type=int, default=128,
help="the number of samples to advance between frames.")
parser.add_argument('--audio.win_length', type=int, default=1024,
help="the length (width) of the window function.")
parser.add_argument('--audio.power', type=float, default=1.4,
help="the power to raise before griffin-lim.")
parser.add_argument('--audio.min_level_db', type=int, default=-100,
help="the minimum level db.")
parser.add_argument('--audio.ref_level_db', type=int, default=20,
help="the reference level db.")
parser.add_argument('--audio.outputs_per_step', type=int, default=1,
help="the outputs per step.")
parser.add_argument('--hidden_size', type=int, default=256,
help="the hidden size in network.")
parser.add_argument('--embedding_size', type=int, default=512,
help="the embedding vector size.")
parser.add_argument('--warm_up_step', type=int, default=4000,
help="the warm up step of learning rate.")
parser.add_argument('--grad_clip_thresh', type=float, default=1.0,
help="the threshold of grad clip.")
parser.add_argument('--batch_size', type=int, default=32,
help="batch size for training.")
parser.add_argument('--epochs', type=int, default=10000,
help="the number of epoch for training.")
parser.add_argument('--lr', type=float, default=0.001,
help="the learning rate for training.")
parser.add_argument('--save_step', type=int, default=500,
help="checkpointing interval during training.")
parser.add_argument('--image_step', type=int, default=2000,
help="attention image interval during training.")
parser.add_argument('--max_len', type=int, default=400,
help="The max length of audio when synthsis.")
parser.add_argument('--transformer_step', type=int, default=160000,
help="Global step to restore checkpoint of transformer.")
parser.add_argument('--postnet_step', type=int, default=90000,
help="Global step to restore checkpoint of postnet.")
parser.add_argument('--use_gpu', type=bool, default=True,
help="use gpu or not during training.")
parser.add_argument('--use_data_parallel', type=bool, default=False,
help="use data parallel or not during training.")
parser.add_argument('--stop_token', type=bool, default=False,
help="use stop token loss in network or not.")
parser.add_argument('--data_path', type=str, default='./dataset/LJSpeech-1.1',
help="the path of dataset.")
parser.add_argument('--checkpoint_path', type=str, default=None,
help="the path to load checkpoint or pretrain model.")
parser.add_argument('--save_path', type=str, default='./checkpoint',
help="the path to save checkpoint.")
parser.add_argument('--log_dir', type=str, default='./log',
help="the directory to save tensorboard log.")
parser.add_argument('--sample_path', type=str, default='./log',
help="the directory to save audio sample in synthesis.")
parser.add_argument('-c', '--config', action=jsonargparse.ActionConfigFile)

View File

@ -0,0 +1,32 @@
audio:
num_mels: 80 #the number of mel bands when calculating mel spectrograms.
n_fft: 2048 #the number of fft components.
sr: 22050 #the sampling rate of audio data file.
preemphasis: 0.97 #the preemphasis coefficient.
hop_length: 256 #the number of samples to advance between frames.
win_length: 1024 #the length (width) of the window function.
power: 1.2 #the power to raise before griffin-lim.
min_level_db: -100 #the minimum level db.
ref_level_db: 20 #the reference level db.
outputs_per_step: 1 #the outputs per step.
encoder_n_layer: 6 #the number of FFT Block in encoder.
encoder_head: 2 #the attention head number in encoder.
encoder_conv1d_filter_size: 1536 #the filter size of conv1d in encoder.
max_seq_len: 2048 #the max length of sequence.
decoder_n_layer: 6 #the number of FFT Block in decoder.
decoder_head: 2 #the attention head number in decoder.
decoder_conv1d_filter_size: 1536 #the filter size of conv1d in decoder.
fs_hidden_size: 384 #the hidden size in model of fastspeech.
duration_predictor_output_size: 256 #the output size of duration predictior.
duration_predictor_filter_size: 3 #the filter size of conv1d in duration prediction.
fft_conv1d_filter: 3 #the filter size of conv1d in fft.
fft_conv1d_padding: 1 #the padding size of conv1d in fft.
dropout: 0.1 #the dropout in network.
transformer_head: 4 #the attention head num of transformerTTS.
embedding_size: 512 #the dim size of embedding of transformerTTS.
hidden_size: 256 #the hidden size in model of transformerTTS.
warm_up_step: 4000 #the warm up step of learning rate.
grad_clip_thresh: 0.1 #the threshold of grad clip.

View File

@ -13,7 +13,7 @@ audio:
encoder_n_layer: 6
encoder_head: 2
encoder_conv1d_filter_size: 1536
max_sep_len: 2048
max_seq_len: 2048
decoder_n_layer: 6
decoder_head: 2
decoder_conv1d_filter_size: 1536
@ -23,11 +23,4 @@ duration_predictor_filter_size: 3
fft_conv1d_filter: 3
fft_conv1d_padding: 1
dropout: 0.1
transformer_head: 4
use_gpu: True
alpha: 1.0
checkpoint_path: checkpoint/
fastspeech_step: 71000
log_dir: ./log
transformer_head: 4

View File

@ -0,0 +1,36 @@
import argparse
def add_config_options_to_parser(parser):
    """Attach the FastSpeech command-line options to *parser*.

    All options are plain argparse flags; integer 0/1 flags stand in for
    booleans so values can be set reliably from the command line.
    """
    # (flag, value type, default, help) — registered in this exact order so
    # the generated --help output is unchanged.
    option_specs = [
        ('--config_path', str, 'config/fastspeech.yaml',
         "the yaml config file path."),
        ('--batch_size', int, 32,
         "batch size for training."),
        ('--epochs', int, 10000,
         "the number of epoch for training."),
        ('--lr', float, 0.001,
         "the learning rate for training."),
        ('--save_step', int, 500,
         "checkpointing interval during training."),
        ('--fastspeech_step', int, 70000,
         "Global step to restore checkpoint of fastspeech."),
        ('--use_gpu', int, 1,
         "use gpu or not during training."),
        ('--use_data_parallel', int, 0,
         "use data parallel or not during training."),
        ('--data_path', str, './dataset/LJSpeech-1.1',
         "the path of dataset."),
        ('--checkpoint_path', str, None,
         "the path to load checkpoint or pretrain model."),
        ('--save_path', str, './checkpoint',
         "the path to save checkpoint."),
        ('--log_dir', str, './log',
         "the directory to save tensorboard log."),
        ('--sample_path', str, './sample',
         "the directory to save audio sample in synthesis."),
        ('--transtts_path', str, './log',
         "the directory to load pretrain transformerTTS model."),
        ('--transformer_step', int, 160000,
         "the step to load transformerTTS model."),
    ]
    for flag, value_type, default, help_text in option_specs:
        parser.add_argument(flag, type=value_type, default=default,
                            help=help_text)

View File

@ -1,15 +1,16 @@
import os
from tensorboardX import SummaryWriter
from collections import OrderedDict
import jsonargparse
import argparse
from parse import add_config_options_to_parser
from pprint import pprint
from ruamel import yaml
import numpy as np
import paddle.fluid as fluid
import paddle.fluid.dygraph as dg
from parakeet.g2p.en import text_to_sequence
from parakeet import audio
from network import FastSpeech
from parakeet.models.fastspeech.fastspeech import FastSpeech
def load_checkpoint(step, model_path):
model_dict, _ = fluid.dygraph.load_dygraph(os.path.join(model_path, step))
@ -21,19 +22,22 @@ def load_checkpoint(step, model_path):
new_state_dict[param] = model_dict[param]
return new_state_dict
def synthesis(text_input, cfg):
place = (fluid.CUDAPlace(0) if cfg.use_gpu else fluid.CPUPlace())
def synthesis(text_input, args):
place = (fluid.CUDAPlace(0) if args.use_gpu else fluid.CPUPlace())
# tensorboard
if not os.path.exists(cfg.log_dir):
os.mkdir(cfg.log_dir)
path = os.path.join(cfg.log_dir,'synthesis')
if not os.path.exists(args.log_dir):
os.mkdir(args.log_dir)
path = os.path.join(args.log_dir,'synthesis')
with open(args.config_path) as f:
cfg = yaml.load(f, Loader=yaml.Loader)
writer = SummaryWriter(path)
with dg.guard(place):
model = FastSpeech(cfg)
model.set_dict(load_checkpoint(str(cfg.fastspeech_step), os.path.join(cfg.checkpoint_path, "fastspeech")))
model.set_dict(load_checkpoint(str(args.fastspeech_step), os.path.join(args.checkpoint_path, "fastspeech")))
model.eval()
text = np.asarray(text_to_sequence(text_input))
@ -41,18 +45,18 @@ def synthesis(text_input, cfg):
pos_text = np.arange(1, text.shape[1]+1)
pos_text = fluid.layers.unsqueeze(dg.to_variable(pos_text),[0])
mel_output, mel_output_postnet = model(text, pos_text, alpha=cfg.alpha)
mel_output, mel_output_postnet = model(text, pos_text, alpha=args.alpha)
_ljspeech_processor = audio.AudioProcessor(
sample_rate=cfg.audio.sr,
num_mels=cfg.audio.num_mels,
min_level_db=cfg.audio.min_level_db,
ref_level_db=cfg.audio.ref_level_db,
n_fft=cfg.audio.n_fft,
win_length= cfg.audio.win_length,
hop_length= cfg.audio.hop_length,
power=cfg.audio.power,
preemphasis=cfg.audio.preemphasis,
sample_rate=cfg['audio']['sr'],
num_mels=cfg['audio']['num_mels'],
min_level_db=cfg['audio']['min_level_db'],
ref_level_db=cfg['audio']['ref_level_db'],
n_fft=cfg['audio']['n_fft'],
win_length= cfg['audio']['win_length'],
hop_length= cfg['audio']['hop_length'],
power=cfg['audio']['power'],
preemphasis=cfg['audio']['preemphasis'],
signal_norm=True,
symmetric_norm=False,
max_norm=1.,
@ -65,12 +69,12 @@ def synthesis(text_input, cfg):
mel_output_postnet = fluid.layers.transpose(fluid.layers.squeeze(mel_output_postnet,[0]), [1,0])
wav = _ljspeech_processor.inv_melspectrogram(mel_output_postnet.numpy())
writer.add_audio(text_input, wav, 0, cfg.audio.sr)
writer.add_audio(text_input, wav, 0, cfg['audio']['sr'])
print("Synthesis completed !!!")
writer.close()
if __name__ == '__main__':
parser = jsonargparse.ArgumentParser(description="Synthesis model", formatter_class='default_argparse')
parser = argparse.ArgumentParser(description="Train Fastspeech model")
add_config_options_to_parser(parser)
cfg = parser.parse_args('-c ./config/synthesis.yaml'.split())
synthesis("Transformer model is so fast!", cfg)
args = parser.parse_args()
synthesis("Transformer model is so fast!", args)

View File

@ -3,10 +3,10 @@ import argparse
import os
import time
import math
import jsonargparse
from pathlib import Path
from parse import add_config_options_to_parser
from pprint import pprint
from ruamel import yaml
from tqdm import tqdm
from collections import OrderedDict
from tensorboardX import SummaryWriter
@ -14,7 +14,7 @@ import paddle.fluid.dygraph as dg
import paddle.fluid.layers as layers
import paddle.fluid as fluid
from parakeet.models.dataloader.ljspeech import LJSpeechLoader
from parakeet.models.transformerTTS.transformerTTS import TransformerTTS
from parakeet.models.transformer_tts.transformerTTS import TransformerTTS
from parakeet.models.fastspeech.fastspeech import FastSpeech
from parakeet.models.fastspeech.utils import get_alignment
@ -28,50 +28,49 @@ def load_checkpoint(step, model_path):
new_state_dict[param] = model_dict[param]
return new_state_dict, opti_dict
def main(cfg):
local_rank = dg.parallel.Env().local_rank if cfg.use_data_parallel else 0
nranks = dg.parallel.Env().nranks if cfg.use_data_parallel else 1
def main(args):
local_rank = dg.parallel.Env().local_rank if args.use_data_parallel else 0
nranks = dg.parallel.Env().nranks if args.use_data_parallel else 1
if local_rank == 0:
# Print the whole config setting.
pprint(jsonargparse.namespace_to_dict(cfg))
with open(args.config_path) as f:
cfg = yaml.load(f, Loader=yaml.Loader)
global_step = 0
place = (fluid.CUDAPlace(dg.parallel.Env().dev_id)
if cfg.use_data_parallel else fluid.CUDAPlace(0)
if cfg.use_gpu else fluid.CPUPlace())
if args.use_data_parallel else fluid.CUDAPlace(0)
if args.use_gpu else fluid.CPUPlace())
if not os.path.exists(cfg.log_dir):
os.mkdir(cfg.log_dir)
path = os.path.join(cfg.log_dir,'fastspeech')
if not os.path.exists(args.log_dir):
os.mkdir(args.log_dir)
path = os.path.join(args.log_dir,'fastspeech')
writer = SummaryWriter(path) if local_rank == 0 else None
with dg.guard(place):
with fluid.unique_name.guard():
transformerTTS = TransformerTTS(cfg)
model_dict, _ = load_checkpoint(str(cfg.transformer_step), os.path.join(cfg.transtts_path, "transformer"))
model_dict, _ = load_checkpoint(str(args.transformer_step), os.path.join(args.transtts_path, "transformer"))
transformerTTS.set_dict(model_dict)
transformerTTS.eval()
model = FastSpeech(cfg)
model.train()
optimizer = fluid.optimizer.AdamOptimizer(learning_rate=dg.NoamDecay(1/(cfg.warm_up_step *( cfg.lr ** 2)), cfg.warm_up_step),
optimizer = fluid.optimizer.AdamOptimizer(learning_rate=dg.NoamDecay(1/(cfg['warm_up_step'] *( args.lr ** 2)), cfg['warm_up_step']),
parameter_list=model.parameters())
reader = LJSpeechLoader(cfg, nranks, local_rank, shuffle=True).reader()
reader = LJSpeechLoader(cfg, args, nranks, local_rank, shuffle=True).reader()
if cfg.checkpoint_path is not None:
model_dict, opti_dict = load_checkpoint(str(cfg.fastspeech_step), os.path.join(cfg.checkpoint_path, "fastspeech"))
if args.checkpoint_path is not None:
model_dict, opti_dict = load_checkpoint(str(args.fastspeech_step), os.path.join(args.checkpoint_path, "fastspeech"))
model.set_dict(model_dict)
optimizer.set_dict(opti_dict)
global_step = cfg.fastspeech_step
global_step = args.fastspeech_step
print("load checkpoint!!!")
if cfg.use_data_parallel:
if args.use_data_parallel:
strategy = dg.parallel.prepare_context()
model = fluid.dygraph.parallel.DataParallel(model, strategy)
for epoch in range(cfg.epochs):
for epoch in range(args.epochs):
pbar = tqdm(reader)
for i, data in enumerate(pbar):
@ -79,7 +78,7 @@ def main(cfg):
character, mel, mel_input, pos_text, pos_mel, text_length, mel_lens = data
_, _, attn_probs, _, _, _ = transformerTTS(character, mel_input, pos_text, pos_mel)
alignment = dg.to_variable(get_alignment(attn_probs, mel_lens, cfg.transformer_head)).astype(np.float32)
alignment = dg.to_variable(get_alignment(attn_probs, mel_lens, cfg['transformer_head'])).astype(np.float32)
global_step += 1
@ -101,20 +100,20 @@ def main(cfg):
writer.add_scalar('learning_rate', optimizer._learning_rate.step().numpy(), global_step)
if cfg.use_data_parallel:
if args.use_data_parallel:
total_loss = model.scale_loss(total_loss)
total_loss.backward()
model.apply_collective_grads()
else:
total_loss.backward()
optimizer.minimize(total_loss, grad_clip = fluid.dygraph_grad_clip.GradClipByGlobalNorm(cfg.grad_clip_thresh))
optimizer.minimize(total_loss, grad_clip = fluid.dygraph_grad_clip.GradClipByGlobalNorm(cfg['grad_clip_thresh']))
model.clear_gradients()
# save checkpoint
if local_rank==0 and global_step % cfg.save_step == 0:
if not os.path.exists(cfg.save_path):
os.mkdir(cfg.save_path)
save_path = os.path.join(cfg.save_path,'fastspeech/%d' % global_step)
if local_rank==0 and global_step % args.save_step == 0:
if not os.path.exists(args.save_path):
os.mkdir(args.save_path)
save_path = os.path.join(args.save_path,'fastspeech/%d' % global_step)
dg.save_dygraph(model.state_dict(), save_path)
dg.save_dygraph(optimizer.state_dict(), save_path)
if local_rank==0:
@ -122,7 +121,9 @@ def main(cfg):
if __name__ =='__main__':
parser = jsonargparse.ArgumentParser(description="Train Fastspeech model", formatter_class='default_argparse')
parser = argparse.ArgumentParser(description="Train Fastspeech model")
add_config_options_to_parser(parser)
cfg = parser.parse_args('-c config/fastspeech.yaml'.split())
main(cfg)
args = parser.parse_args()
# Print the whole config setting.
pprint(args)
main(args)

View File

@ -0,0 +1,11 @@
audio:
num_mels: 80
n_fft: 2048
sr: 22050
preemphasis: 0.97
hop_length: 275
win_length: 1102
power: 1.2
min_level_db: -100
ref_level_db: 20
outputs_per_step: 1

View File

@ -10,11 +10,11 @@ audio:
ref_level_db: 20
outputs_per_step: 1
max_len: 50
transformer_step: 10
postnet_step: 10
use_gpu: True
checkpoint_path: ./checkpoint
log_dir: ./log
sample_path: ./sample
hidden_size: 256
embedding_size: 512
warm_up_step: 4000
grad_clip_thresh: 1.0

View File

@ -0,0 +1,16 @@
audio:
num_mels: 80
n_fft: 2048
sr: 22050
preemphasis: 0.97
hop_length: 275
win_length: 1102
power: 1.2
min_level_db: -100
ref_level_db: 20
outputs_per_step: 1
hidden_size: 256
embedding_size: 512
warm_up_step: 4000
grad_clip_thresh: 1.0

View File

@ -0,0 +1,38 @@
import argparse
def add_config_options_to_parser(parser):
    """Attach the TransformerTTS command-line options to *parser*.

    Covers training and synthesis settings; integer 0/1 flags stand in for
    booleans so values can be set reliably from the command line.
    """
    parser.add_argument('--config_path', type=str, default='config/train_transformer.yaml',
                        help="the yaml config file path.")
    # Training settings.
    parser.add_argument('--batch_size', type=int, default=32,
                        help="batch size for training.")
    parser.add_argument('--epochs', type=int, default=10000,
                        help="the number of epoch for training.")
    parser.add_argument('--lr', type=float, default=0.001,
                        help="the learning rate for training.")
    parser.add_argument('--save_step', type=int, default=500,
                        help="checkpointing interval during training.")
    parser.add_argument('--image_step', type=int, default=2000,
                        help="attention image interval during training.")
    # Fixed help-text typo: "synthsis" -> "synthesizing".
    parser.add_argument('--max_len', type=int, default=400,
                        help="The max length of audio when synthesizing.")
    # Checkpoint restore steps.
    parser.add_argument('--transformer_step', type=int, default=160000,
                        help="Global step to restore checkpoint of transformer.")
    # NOTE(review): the flag is named vocoder_step but the help still says
    # "postnet" — the model class was renamed from ModelPostNet to Vocoder.
    parser.add_argument('--vocoder_step', type=int, default=90000,
                        help="Global step to restore checkpoint of postnet.")
    parser.add_argument('--use_gpu', type=int, default=1,
                        help="use gpu or not during training.")
    parser.add_argument('--use_data_parallel', type=int, default=0,
                        help="use data parallel or not during training.")
    parser.add_argument('--stop_token', type=int, default=0,
                        help="use stop token loss in network or not.")
    # Paths.
    parser.add_argument('--data_path', type=str, default='./dataset/LJSpeech-1.1',
                        help="the path of dataset.")
    parser.add_argument('--checkpoint_path', type=str, default=None,
                        help="the path to load checkpoint or pretrain model.")
    parser.add_argument('--save_path', type=str, default='./checkpoint',
                        help="the path to save checkpoint.")
    parser.add_argument('--log_dir', type=str, default='./log',
                        help="the directory to save tensorboard log.")
    parser.add_argument('--sample_path', type=str, default='./sample',
                        help="the directory to save audio sample in synthesis.")

View File

@ -2,17 +2,19 @@ import os
from scipy.io.wavfile import write
from parakeet.g2p.en import text_to_sequence
import numpy as np
from network import TransformerTTS, ModelPostNet
from tqdm import tqdm
from tensorboardX import SummaryWriter
from ruamel import yaml
import paddle.fluid as fluid
import paddle.fluid.dygraph as dg
from pathlib import Path
import jsonargparse
import argparse
from parse import add_config_options_to_parser
from pprint import pprint
from collections import OrderedDict
from parakeet import audio
from parakeet.models.transformer_tts.vocoder import Vocoder
from parakeet.models.transformer_tts.transformerTTS import TransformerTTS
def load_checkpoint(step, model_path):
model_dict, _ = fluid.dygraph.load_dygraph(os.path.join(model_path, step))
@ -24,25 +26,28 @@ def load_checkpoint(step, model_path):
new_state_dict[param] = model_dict[param]
return new_state_dict
def synthesis(text_input, cfg):
place = (fluid.CUDAPlace(0) if cfg.use_gpu else fluid.CPUPlace())
def synthesis(text_input, args):
place = (fluid.CUDAPlace(0) if args.use_gpu else fluid.CPUPlace())
with open(args.config_path) as f:
cfg = yaml.load(f, Loader=yaml.Loader)
# tensorboard
if not os.path.exists(cfg.log_dir):
os.mkdir(cfg.log_dir)
path = os.path.join(cfg.log_dir,'synthesis')
if not os.path.exists(args.log_dir):
os.mkdir(args.log_dir)
path = os.path.join(args.log_dir,'synthesis')
writer = SummaryWriter(path)
with dg.guard(place):
with fluid.unique_name.guard():
model = TransformerTTS(cfg)
model.set_dict(load_checkpoint(str(cfg.transformer_step), os.path.join(cfg.checkpoint_path, "nostop_token/transformer")))
model.set_dict(load_checkpoint(str(args.transformer_step), os.path.join(args.checkpoint_path, "nostop_token/transformer")))
model.eval()
with fluid.unique_name.guard():
model_postnet = ModelPostNet(cfg)
model_postnet.set_dict(load_checkpoint(str(cfg.postnet_step), os.path.join(cfg.checkpoint_path, "postnet")))
model_postnet = Vocoder(cfg, args.batch_size)
model_postnet.set_dict(load_checkpoint(str(args.postnet_step), os.path.join(args.checkpoint_path, "postnet")))
model_postnet.eval()
# init input
text = np.asarray(text_to_sequence(text_input))
@ -52,7 +57,7 @@ def synthesis(text_input, cfg):
pos_text = fluid.layers.unsqueeze(dg.to_variable(pos_text),[0])
pbar = tqdm(range(cfg.max_len))
pbar = tqdm(range(args.max_len))
for i in pbar:
pos_mel = np.arange(1, mel_input.shape[1]+1)
@ -62,15 +67,15 @@ def synthesis(text_input, cfg):
mag_pred = model_postnet(postnet_pred)
_ljspeech_processor = audio.AudioProcessor(
sample_rate=cfg.audio.sr,
num_mels=cfg.audio.num_mels,
min_level_db=cfg.audio.min_level_db,
ref_level_db=cfg.audio.ref_level_db,
n_fft=cfg.audio.n_fft,
win_length= cfg.audio.win_length,
hop_length= cfg.audio.hop_length,
power=cfg.audio.power,
preemphasis=cfg.audio.preemphasis,
sample_rate=cfg['audio']['sr'],
num_mels=cfg['audio']['num_mels'],
min_level_db=cfg['audio']['min_level_db'],
ref_level_db=cfg['audio']['ref_level_db'],
n_fft=cfg['audio']['n_fft'],
win_length= cfg['audio']['win_length'],
hop_length= cfg['audio']['hop_length'],
power=cfg['audio']['power'],
preemphasis=cfg['audio']['preemphasis'],
signal_norm=True,
symmetric_norm=False,
max_norm=1.,
@ -82,14 +87,14 @@ def synthesis(text_input, cfg):
sound_norm=False)
wav = _ljspeech_processor.inv_spectrogram(fluid.layers.transpose(fluid.layers.squeeze(mag_pred,[0]), [1,0]).numpy())
writer.add_audio(text_input, wav, 0, cfg.audio.sr)
if not os.path.exists(cfg.sample_path):
os.mkdir(cfg.sample_path)
write(os.path.join(cfg.sample_path,'test.wav'), cfg.audio.sr, wav)
writer.add_audio(text_input, wav, 0, cfg['audio']['sr'])
if not os.path.exists(args.sample_path):
os.mkdir(args.sample_path)
write(os.path.join(args.sample_path,'test.wav'), cfg['audio']['sr'], wav)
writer.close()
if __name__ == '__main__':
parser = jsonargparse.ArgumentParser(description="Synthesis model", formatter_class='default_argparse')
parser = argparse.ArgumentParser(description="Synthesis model")
add_config_options_to_parser(parser)
cfg = parser.parse_args('-c ./config/synthesis.yaml'.split())
synthesis("Transformer model is so fast!", cfg)
args = parser.parse_args()
synthesis("Transformer model is so fast!", args)

View File

@ -3,9 +3,10 @@ from tqdm import tqdm
from tensorboardX import SummaryWriter
from pathlib import Path
from collections import OrderedDict
import jsonargparse
import argparse
from parse import add_config_options_to_parser
from pprint import pprint
from ruamel import yaml
from matplotlib import cm
import numpy as np
import paddle.fluid as fluid
@ -13,7 +14,7 @@ import paddle.fluid.dygraph as dg
import paddle.fluid.layers as layers
from parakeet.modules.utils import cross_entropy
from parakeet.models.dataloader.ljspeech import LJSpeechLoader
from parakeet.models.transformerTTS.transformerTTS import TransformerTTS
from parakeet.models.transformer_tts.transformerTTS import TransformerTTS
def load_checkpoint(step, model_path):
model_dict, opti_dict = fluid.dygraph.load_dygraph(os.path.join(model_path, step))
@ -26,22 +27,21 @@ def load_checkpoint(step, model_path):
return new_state_dict, opti_dict
def main(cfg):
local_rank = dg.parallel.Env().local_rank if cfg.use_data_parallel else 0
nranks = dg.parallel.Env().nranks if cfg.use_data_parallel else 1
def main(args):
local_rank = dg.parallel.Env().local_rank if args.use_data_parallel else 0
nranks = dg.parallel.Env().nranks if args.use_data_parallel else 1
if local_rank == 0:
# Print the whole config setting.
pprint(jsonargparse.namespace_to_dict(cfg))
with open(args.config_path) as f:
cfg = yaml.load(f, Loader=yaml.Loader)
global_step = 0
place = (fluid.CUDAPlace(dg.parallel.Env().dev_id)
if cfg.use_data_parallel else fluid.CUDAPlace(0)
if cfg.use_gpu else fluid.CPUPlace())
if args.use_data_parallel else fluid.CUDAPlace(0)
if args.use_gpu else fluid.CPUPlace())
if not os.path.exists(cfg.log_dir):
os.mkdir(cfg.log_dir)
path = os.path.join(cfg.log_dir,'transformer')
if not os.path.exists(args.log_dir):
os.mkdir(args.log_dir)
path = os.path.join(args.log_dir,'transformer')
writer = SummaryWriter(path) if local_rank == 0 else None
@ -49,23 +49,23 @@ def main(cfg):
model = TransformerTTS(cfg)
model.train()
optimizer = fluid.optimizer.AdamOptimizer(learning_rate=dg.NoamDecay(1/(cfg.warm_up_step *( cfg.lr ** 2)), cfg.warm_up_step),
optimizer = fluid.optimizer.AdamOptimizer(learning_rate=dg.NoamDecay(1/(cfg['warm_up_step'] *( args.lr ** 2)), cfg['warm_up_step']),
parameter_list=model.parameters())
reader = LJSpeechLoader(cfg, nranks, local_rank, shuffle=True).reader()
if cfg.checkpoint_path is not None:
model_dict, opti_dict = load_checkpoint(str(cfg.transformer_step), os.path.join(cfg.checkpoint_path, "transformer"))
reader = LJSpeechLoader(cfg, args, nranks, local_rank, shuffle=True).reader()
if args.checkpoint_path is not None:
model_dict, opti_dict = load_checkpoint(str(args.transformer_step), os.path.join(args.checkpoint_path, "transformer"))
model.set_dict(model_dict)
optimizer.set_dict(opti_dict)
global_step = cfg.transformer_step
global_step = args.transformer_step
print("load checkpoint!!!")
if cfg.use_data_parallel:
if args.use_data_parallel:
strategy = dg.parallel.prepare_context()
model = fluid.dygraph.parallel.DataParallel(model, strategy)
for epoch in range(cfg.epochs):
for epoch in range(args.epochs):
pbar = tqdm(reader)
for i, data in enumerate(pbar):
pbar.set_description('Processing at epoch %d'%epoch)
@ -81,7 +81,7 @@ def main(cfg):
post_mel_loss = layers.mean(layers.abs(layers.elementwise_sub(postnet_pred, mel)))
loss = mel_loss + post_mel_loss
# Note: When used stop token loss the learning did not work.
if cfg.stop_token:
if args.stop_token:
stop_loss = cross_entropy(stop_preds, label)
loss = loss + stop_loss
@ -91,7 +91,7 @@ def main(cfg):
'post_mel_loss':post_mel_loss.numpy()
}, global_step)
if cfg.stop_token:
if args.stop_token:
writer.add_scalar('stop_loss', stop_loss.numpy(), global_step)
writer.add_scalars('alphas', {
@ -101,7 +101,7 @@ def main(cfg):
writer.add_scalar('learning_rate', optimizer._learning_rate.step().numpy(), global_step)
if global_step % cfg.image_step == 1:
if global_step % args.image_step == 1:
for i, prob in enumerate(attn_probs):
for j in range(4):
x = np.uint8(cm.viridis(prob.numpy()[j*16]) * 255)
@ -117,20 +117,20 @@ def main(cfg):
x = np.uint8(cm.viridis(prob.numpy()[j*16]) * 255)
writer.add_image('Attention_dec_%d_0'%global_step, x, i*4+j, dataformats="HWC")
if cfg.use_data_parallel:
if args.use_data_parallel:
loss = model.scale_loss(loss)
loss.backward()
model.apply_collective_grads()
else:
loss.backward()
optimizer.minimize(loss, grad_clip = fluid.dygraph_grad_clip.GradClipByGlobalNorm(cfg.grad_clip_thresh))
optimizer.minimize(loss, grad_clip = fluid.dygraph_grad_clip.GradClipByGlobalNorm(cfg['grad_clip_thresh']))
model.clear_gradients()
# save checkpoint
if local_rank==0 and global_step % cfg.save_step == 0:
if not os.path.exists(cfg.save_path):
os.mkdir(cfg.save_path)
save_path = os.path.join(cfg.save_path,'transformer/%d' % global_step)
if local_rank==0 and global_step % args.save_step == 0:
if not os.path.exists(args.save_path):
os.mkdir(args.save_path)
save_path = os.path.join(args.save_path,'transformer/%d' % global_step)
dg.save_dygraph(model.state_dict(), save_path)
dg.save_dygraph(optimizer.state_dict(), save_path)
if local_rank==0:
@ -138,7 +138,10 @@ def main(cfg):
if __name__ =='__main__':
parser = jsonargparse.ArgumentParser(description="Train TransformerTTS model", formatter_class='default_argparse')
parser = argparse.ArgumentParser(description="Train TransformerTTS model")
add_config_options_to_parser(parser)
cfg = parser.parse_args('-c ./config/train_transformer.yaml'.split())
main(cfg)
args = parser.parse_args()
# Print the whole config setting.
pprint(args)
main(args)

View File

@ -3,14 +3,15 @@ import os
from tqdm import tqdm
from pathlib import Path
from collections import OrderedDict
import jsonargparse
import argparse
from ruamel import yaml
from parse import add_config_options_to_parser
from pprint import pprint
import paddle.fluid as fluid
import paddle.fluid.dygraph as dg
import paddle.fluid.layers as layers
from parakeet.models.dataloader.ljspeech import LJSpeechLoader
from parakeet.models.transformerTTS.vocoder import Vocoder
from parakeet.models.transformer_tts.vocoder import Vocoder
def load_checkpoint(step, model_path):
model_dict, opti_dict = dg.load_dygraph(os.path.join(model_path, step))
@ -22,48 +23,47 @@ def load_checkpoint(step, model_path):
new_state_dict[param] = model_dict[param]
return new_state_dict, opti_dict
def main(cfg):
def main(args):
local_rank = dg.parallel.Env().local_rank if cfg.use_data_parallel else 0
nranks = dg.parallel.Env().nranks if cfg.use_data_parallel else 1
if local_rank == 0:
# Print the whole config setting.
pprint(jsonargparse.namespace_to_dict(cfg))
local_rank = dg.parallel.Env().local_rank if args.use_data_parallel else 0
nranks = dg.parallel.Env().nranks if args.use_data_parallel else 1
with open(args.config_path) as f:
cfg = yaml.load(f, Loader=yaml.Loader)
global_step = 0
place = (fluid.CUDAPlace(dg.parallel.Env().dev_id)
if cfg.use_data_parallel else fluid.CUDAPlace(0)
if cfg.use_gpu else fluid.CPUPlace())
if args.use_data_parallel else fluid.CUDAPlace(0)
if args.use_gpu else fluid.CPUPlace())
if not os.path.exists(cfg.log_dir):
os.mkdir(cfg.log_dir)
path = os.path.join(cfg.log_dir,'postnet')
if not os.path.exists(args.log_dir):
os.mkdir(args.log_dir)
path = os.path.join(args.log_dir,'postnet')
writer = SummaryWriter(path) if local_rank == 0 else None
with dg.guard(place):
model = Vocoder(cfg)
model = Vocoder(cfg, args.batch_size)
model.train()
optimizer = fluid.optimizer.AdamOptimizer(learning_rate=dg.NoamDecay(1/(cfg.warm_up_step *( cfg.lr ** 2)), cfg.warm_up_step),
optimizer = fluid.optimizer.AdamOptimizer(learning_rate=dg.NoamDecay(1/(cfg['warm_up_step'] *( args.lr ** 2)), cfg['warm_up_step']),
parameter_list=model.parameters())
if cfg.checkpoint_path is not None:
model_dict, opti_dict = load_checkpoint(str(cfg.postnet_step), os.path.join(cfg.checkpoint_path, "postnet"))
if args.checkpoint_path is not None:
model_dict, opti_dict = load_checkpoint(str(args.vocoder_step), os.path.join(args.checkpoint_path, "postnet"))
model.set_dict(model_dict)
optimizer.set_dict(opti_dict)
global_step = cfg.postnet_step
global_step = args.vocoder_step
print("load checkpoint!!!")
if cfg.use_data_parallel:
if args.use_data_parallel:
strategy = dg.parallel.prepare_context()
model = fluid.dygraph.parallel.DataParallel(model, strategy)
reader = LJSpeechLoader(cfg, nranks, local_rank, is_vocoder=True).reader()
reader = LJSpeechLoader(cfg, args, nranks, local_rank, is_vocoder=True).reader()
for epoch in range(cfg.epochs):
for epoch in range(args.epochs):
pbar = tqdm(reader)
for i, data in enumerate(pbar):
pbar.set_description('Processing at epoch %d'%epoch)
@ -75,13 +75,13 @@ def main(cfg):
mag_pred = model(mel)
loss = layers.mean(layers.abs(layers.elementwise_sub(mag_pred, mag)))
if cfg.use_data_parallel:
if args.use_data_parallel:
loss = model.scale_loss(loss)
loss.backward()
model.apply_collective_grads()
else:
loss.backward()
optimizer.minimize(loss, grad_clip = fluid.dygraph_grad_clip.GradClipByGlobalNorm(cfg.grad_clip_thresh))
optimizer.minimize(loss, grad_clip = fluid.dygraph_grad_clip.GradClipByGlobalNorm(cfg['grad_clip_thresh']))
model.clear_gradients()
if local_rank==0:
@ -89,10 +89,10 @@ def main(cfg):
'loss':loss.numpy(),
}, global_step)
if global_step % cfg.save_step == 0:
if not os.path.exists(cfg.save_path):
os.mkdir(cfg.save_path)
save_path = os.path.join(cfg.save_path,'postnet/%d' % global_step)
if global_step % args.save_step == 0:
if not os.path.exists(args.save_path):
os.mkdir(args.save_path)
save_path = os.path.join(args.save_path,'postnet/%d' % global_step)
dg.save_dygraph(model.state_dict(), save_path)
dg.save_dygraph(optimizer.state_dict(), save_path)
@ -100,7 +100,9 @@ def main(cfg):
writer.close()
if __name__ == '__main__':
parser = jsonargparse.ArgumentParser(description="Train postnet model", formatter_class='default_argparse')
parser = argparse.ArgumentParser(description="Train postnet model")
add_config_options_to_parser(parser)
cfg = parser.parse_args('-c ./config/train_vocoder.yaml'.split())
main(cfg)
args = parser.parse_args()
# Print the whole config setting.
pprint(args)
main(args)

View File

@ -13,17 +13,17 @@ from parakeet.data.batch import TextIDBatcher, SpecBatcher
from parakeet.data.dataset import DatasetMixin, TransformDataset
class LJSpeechLoader:
def __init__(self, config, nranks, rank, is_vocoder=False, shuffle=True):
place = fluid.CUDAPlace(rank) if config.use_gpu else fluid.CPUPlace()
def __init__(self, config, args, nranks, rank, is_vocoder=False, shuffle=True):
place = fluid.CUDAPlace(rank) if args.use_gpu else fluid.CPUPlace()
LJSPEECH_ROOT = Path(config.data_path)
LJSPEECH_ROOT = Path(args.data_path)
metadata = LJSpeechMetaData(LJSPEECH_ROOT)
transformer = LJSpeech(config)
dataset = TransformDataset(metadata, transformer)
sampler = DistributedSampler(len(metadata), nranks, rank, shuffle=shuffle)
assert config.batch_size % nranks == 0
each_bs = config.batch_size // nranks
assert args.batch_size % nranks == 0
each_bs = args.batch_size // nranks
if is_vocoder:
dataloader = DataCargo(dataset, sampler=sampler, batch_size=each_bs, shuffle=shuffle, batch_fn=batch_examples_vocoder, drop_last=True)
else:
@ -63,15 +63,15 @@ class LJSpeech(object):
super(LJSpeech, self).__init__()
self.config = config
self._ljspeech_processor = audio.AudioProcessor(
sample_rate=config.audio.sr,
num_mels=config.audio.num_mels,
min_level_db=config.audio.min_level_db,
ref_level_db=config.audio.ref_level_db,
n_fft=config.audio.n_fft,
win_length= config.audio.win_length,
hop_length= config.audio.hop_length,
power=config.audio.power,
preemphasis=config.audio.preemphasis,
sample_rate=config['audio']['sr'],
num_mels=config['audio']['num_mels'],
min_level_db=config['audio']['min_level_db'],
ref_level_db=config['audio']['ref_level_db'],
n_fft=config['audio']['n_fft'],
win_length= config['audio']['win_length'],
hop_length= config['audio']['hop_length'],
power=config['audio']['power'],
preemphasis=config['audio']['preemphasis'],
signal_norm=True,
symmetric_norm=False,
max_norm=1.,

View File

@ -2,7 +2,7 @@ import math
import paddle.fluid.dygraph as dg
import paddle.fluid as fluid
from parakeet.g2p.text.symbols import symbols
from parakeet.models.transformerTTS.post_convnet import PostConvNet
from parakeet.models.transformer_tts.post_convnet import PostConvNet
from parakeet.models.fastspeech.LengthRegulator import LengthRegulator
from parakeet.models.fastspeech.encoder import Encoder
from parakeet.models.fastspeech.decoder import Decoder
@ -13,43 +13,43 @@ class FastSpeech(dg.Layer):
super(FastSpeech, self).__init__()
self.encoder = Encoder(n_src_vocab=len(symbols)+1,
len_max_seq=cfg.max_sep_len,
n_layers=cfg.encoder_n_layer,
n_head=cfg.encoder_head,
d_k=cfg.fs_hidden_size // cfg.encoder_head,
d_v=cfg.fs_hidden_size // cfg.encoder_head,
d_model=cfg.fs_hidden_size,
d_inner=cfg.encoder_conv1d_filter_size,
fft_conv1d_kernel=cfg.fft_conv1d_filter,
fft_conv1d_padding=cfg.fft_conv1d_padding,
len_max_seq=cfg['max_seq_len'],
n_layers=cfg['encoder_n_layer'],
n_head=cfg['encoder_head'],
d_k=cfg['fs_hidden_size'] // cfg['encoder_head'],
d_v=cfg['fs_hidden_size'] // cfg['encoder_head'],
d_model=cfg['fs_hidden_size'],
d_inner=cfg['encoder_conv1d_filter_size'],
fft_conv1d_kernel=cfg['fft_conv1d_filter'],
fft_conv1d_padding=cfg['fft_conv1d_padding'],
dropout=0.1)
self.length_regulator = LengthRegulator(input_size=cfg.fs_hidden_size,
out_channels=cfg.duration_predictor_output_size,
filter_size=cfg.duration_predictor_filter_size,
dropout=cfg.dropout)
self.decoder = Decoder(len_max_seq=cfg.max_sep_len,
n_layers=cfg.decoder_n_layer,
n_head=cfg.decoder_head,
d_k=cfg.fs_hidden_size // cfg.decoder_head,
d_v=cfg.fs_hidden_size // cfg.decoder_head,
d_model=cfg.fs_hidden_size,
d_inner=cfg.decoder_conv1d_filter_size,
fft_conv1d_kernel=cfg.fft_conv1d_filter,
fft_conv1d_padding=cfg.fft_conv1d_padding,
self.length_regulator = LengthRegulator(input_size=cfg['fs_hidden_size'],
out_channels=cfg['duration_predictor_output_size'],
filter_size=cfg['duration_predictor_filter_size'],
dropout=cfg['dropout'])
self.decoder = Decoder(len_max_seq=cfg['max_seq_len'],
n_layers=cfg['decoder_n_layer'],
n_head=cfg['decoder_head'],
d_k=cfg['fs_hidden_size'] // cfg['decoder_head'],
d_v=cfg['fs_hidden_size'] // cfg['decoder_head'],
d_model=cfg['fs_hidden_size'],
d_inner=cfg['decoder_conv1d_filter_size'],
fft_conv1d_kernel=cfg['fft_conv1d_filter'],
fft_conv1d_padding=cfg['fft_conv1d_padding'],
dropout=0.1)
self.weight = fluid.ParamAttr(initializer = fluid.initializer.XavierInitializer())
k = math.sqrt(1 / cfg.fs_hidden_size)
k = math.sqrt(1 / cfg['fs_hidden_size'])
self.bias = fluid.ParamAttr(initializer = fluid.initializer.Uniform(low=-k, high=k))
self.mel_linear = dg.Linear(cfg.fs_hidden_size,
cfg.audio.num_mels * cfg.audio.outputs_per_step,
self.mel_linear = dg.Linear(cfg['fs_hidden_size'],
cfg['audio']['num_mels']* cfg['audio']['outputs_per_step'],
param_attr = self.weight,
bias_attr = self.bias,)
self.postnet = PostConvNet(n_mels=cfg.audio.num_mels,
self.postnet = PostConvNet(n_mels=cfg['audio']['num_mels'],
num_hidden=512,
filter_size=5,
padding=int(5 / 2),
num_conv=5,
outputs_per_step=cfg.audio.outputs_per_step,
outputs_per_step=cfg['audio']['outputs_per_step'],
use_cudnn=True,
dropout=0.1,
batchnorm_last=True)

View File

@ -4,8 +4,8 @@ import paddle.fluid as fluid
from parakeet.modules.utils import *
from parakeet.modules.multihead_attention import MultiheadAttention
from parakeet.modules.ffn import PositionwiseFeedForward
from parakeet.models.transformerTTS.prenet import PreNet
from parakeet.models.transformerTTS.post_convnet import PostConvNet
from parakeet.models.transformer_tts.prenet import PreNet
from parakeet.models.transformer_tts.post_convnet import PostConvNet
class Decoder(dg.Layer):
def __init__(self, num_hidden, config, num_head=4):
@ -20,7 +20,7 @@ class Decoder(dg.Layer):
param_attr=fluid.ParamAttr(
initializer=fluid.initializer.NumpyArrayInitializer(self.pos_inp),
trainable=False))
self.decoder_prenet = PreNet(input_size = config.audio.num_mels,
self.decoder_prenet = PreNet(input_size = config['audio']['num_mels'],
hidden_size = num_hidden * 2,
output_size = num_hidden,
dropout_rate=0.2)
@ -38,17 +38,17 @@ class Decoder(dg.Layer):
self.ffns = [PositionwiseFeedForward(num_hidden, num_hidden*num_head, filter_size=1) for _ in range(3)]
for i, layer in enumerate(self.ffns):
self.add_sublayer("ffns_{}".format(i), layer)
self.mel_linear = dg.Linear(num_hidden, config.audio.num_mels * config.audio.outputs_per_step,
self.mel_linear = dg.Linear(num_hidden, config['audio']['num_mels'] * config['audio']['outputs_per_step'],
param_attr=fluid.ParamAttr(initializer = fluid.initializer.XavierInitializer()),
bias_attr=fluid.ParamAttr(initializer = fluid.initializer.Uniform(low=-k, high=k)))
self.stop_linear = dg.Linear(num_hidden, 1,
param_attr=fluid.ParamAttr(initializer = fluid.initializer.XavierInitializer()),
bias_attr=fluid.ParamAttr(initializer = fluid.initializer.Uniform(low=-k, high=k)))
self.postconvnet = PostConvNet(config.audio.num_mels, config.hidden_size,
self.postconvnet = PostConvNet(config['audio']['num_mels'], config['hidden_size'],
filter_size = 5, padding = 4, num_conv=5,
outputs_per_step=config.audio.outputs_per_step,
use_cudnn = config.use_gpu)
outputs_per_step=config['audio']['outputs_per_step'],
use_cudnn = True)
def forward(self, key, value, query, c_mask, positional):

View File

@ -3,10 +3,10 @@ import paddle.fluid as fluid
from parakeet.modules.utils import *
from parakeet.modules.multihead_attention import MultiheadAttention
from parakeet.modules.ffn import PositionwiseFeedForward
from parakeet.models.transformerTTS.encoderprenet import EncoderPrenet
from parakeet.models.transformer_tts.encoderprenet import EncoderPrenet
class Encoder(dg.Layer):
def __init__(self, embedding_size, num_hidden, config, num_head=4):
def __init__(self, embedding_size, num_hidden, num_head=4):
super(Encoder, self).__init__()
self.num_hidden = num_hidden
param = fluid.ParamAttr(initializer=fluid.initializer.Constant(value=1.0))
@ -19,11 +19,11 @@ class Encoder(dg.Layer):
trainable=False))
self.encoder_prenet = EncoderPrenet(embedding_size = embedding_size,
num_hidden = num_hidden,
use_cudnn=config.use_gpu)
use_cudnn=True)
self.layers = [MultiheadAttention(num_hidden, num_hidden//num_head, num_hidden//num_head) for _ in range(3)]
for i, layer in enumerate(self.layers):
self.add_sublayer("self_attn_{}".format(i), layer)
self.ffns = [PositionwiseFeedForward(num_hidden, num_hidden*num_head, filter_size=1, use_cudnn = config.use_gpu) for _ in range(3)]
self.ffns = [PositionwiseFeedForward(num_hidden, num_hidden*num_head, filter_size=1, use_cudnn = True) for _ in range(3)]
for i, layer in enumerate(self.ffns):
self.add_sublayer("ffns_{}".format(i), layer)

View File

@ -1,13 +1,13 @@
import paddle.fluid.dygraph as dg
import paddle.fluid as fluid
from parakeet.models.transformerTTS.encoder import Encoder
from parakeet.models.transformerTTS.decoder import Decoder
from parakeet.models.transformer_tts.encoder import Encoder
from parakeet.models.transformer_tts.decoder import Decoder
class TransformerTTS(dg.Layer):
def __init__(self, config):
super(TransformerTTS, self).__init__()
self.encoder = Encoder(config.embedding_size, config.hidden_size, config)
self.decoder = Decoder(config.hidden_size, config)
self.encoder = Encoder(config['embedding_size'], config['hidden_size'])
self.decoder = Decoder(config['hidden_size'], config)
self.config = config
def forward(self, characters, mel_input, pos_text, pos_mel):

View File

@ -2,20 +2,20 @@ import paddle.fluid.dygraph as dg
import paddle.fluid as fluid
from parakeet.modules.customized import Conv1D
from parakeet.modules.utils import *
from parakeet.models.transformerTTS.CBHG import CBHG
from parakeet.models.transformer_tts.CBHG import CBHG
class Vocoder(dg.Layer):
"""
CBHG Network (mel -> linear)
"""
def __init__(self, config):
def __init__(self, config, batch_size):
super(Vocoder, self).__init__()
self.pre_proj = Conv1D(num_channels = config.audio.num_mels,
num_filters = config.hidden_size,
self.pre_proj = Conv1D(num_channels = config['audio']['num_mels'],
num_filters = config['hidden_size'],
filter_size=1)
self.cbhg = CBHG(config.hidden_size, config.batch_size)
self.post_proj = Conv1D(num_channels = config.hidden_size,
num_filters = (config.audio.n_fft // 2) + 1,
self.cbhg = CBHG(config['hidden_size'], batch_size)
self.post_proj = Conv1D(num_channels = config['hidden_size'],
num_filters = (config['audio']['n_fft'] // 2) + 1,
filter_size=1)
def forward(self, mel):