add transformerTTS and fastspeech
This commit is contained in:
parent
d0015239db
commit
185e25fedf
|
@ -0,0 +1,4 @@
|
||||||
|
# Fastspeech
|
||||||
|
Paddle fluid implementation of Fastspeech, a feed-forward network based on Transformer. The implementation is based on [FastSpeech: Fast, Robust and Controllable Text to Speech](https://arxiv.org/abs/1905.09263).
|
||||||
|
|
||||||
|
We implement Fastspeech model in paddle fluid with dynamic graph, which is convenient for flexible network architectures.
|
|
@ -35,10 +35,10 @@ epochs: 10000
|
||||||
lr: 0.001
|
lr: 0.001
|
||||||
save_step: 500
|
save_step: 500
|
||||||
use_gpu: True
|
use_gpu: True
|
||||||
use_data_parallel: False
|
use_data_parallel: True
|
||||||
|
|
||||||
data_path: ../../../dataset/LJSpeech-1.1
|
data_path: ../../dataset/LJSpeech-1.1
|
||||||
transtts_path: ../transformerTTS/checkpoint/
|
transtts_path: ../TransformerTTS/checkpoint/
|
||||||
transformer_step: 200000
|
transformer_step: 200000
|
||||||
save_path: ./checkpoint
|
save_path: ./checkpoint
|
||||||
log_dir: ./log
|
log_dir: ./log
|
|
@ -14,9 +14,9 @@ import paddle.fluid.dygraph as dg
|
||||||
import paddle.fluid.layers as layers
|
import paddle.fluid.layers as layers
|
||||||
import paddle.fluid as fluid
|
import paddle.fluid as fluid
|
||||||
from parakeet.models.dataloader.ljspeech import LJSpeechLoader
|
from parakeet.models.dataloader.ljspeech import LJSpeechLoader
|
||||||
from parakeet.models.transformerTTS.network import TransformerTTS
|
from parakeet.models.transformerTTS.transformerTTS import TransformerTTS
|
||||||
from network import FastSpeech
|
from parakeet.models.fastspeech.fastspeech import FastSpeech
|
||||||
from utils import get_alignment
|
from parakeet.models.fastspeech.utils import get_alignment
|
||||||
|
|
||||||
def load_checkpoint(step, model_path):
|
def load_checkpoint(step, model_path):
|
||||||
model_dict, opti_dict = fluid.dygraph.load_dygraph(os.path.join(model_path, step))
|
model_dict, opti_dict = fluid.dygraph.load_dygraph(os.path.join(model_path, step))
|
|
@ -0,0 +1,4 @@
|
||||||
|
# TransformerTTS
|
||||||
|
Paddle fluid implementation of TransformerTTS, a neural TTS with Transformer. The implementation is based on [Neural Speech Synthesis with Transformer Network](https://arxiv.org/abs/1809.08895).
|
||||||
|
|
||||||
|
We implement TransformerTTS model in paddle fluid with dynamic graph, which is convenient for flexible network architectures.
|
|
@ -23,10 +23,10 @@ lr: 0.001
|
||||||
save_step: 1000
|
save_step: 1000
|
||||||
image_step: 2000
|
image_step: 2000
|
||||||
use_gpu: True
|
use_gpu: True
|
||||||
use_data_parallel: False
|
use_data_parallel: True
|
||||||
stop_token: False
|
stop_token: False
|
||||||
|
|
||||||
data_path: ../../../dataset/LJSpeech-1.1
|
data_path: ../../dataset/LJSpeech-1.1
|
||||||
save_path: ./checkpoint
|
save_path: ./checkpoint
|
||||||
log_dir: ./log
|
log_dir: ./log
|
||||||
#checkpoint_path: ./checkpoint
|
#checkpoint_path: ./checkpoint
|
|
@ -20,9 +20,9 @@ epochs: 10000
|
||||||
lr: 0.001
|
lr: 0.001
|
||||||
save_step: 10
|
save_step: 10
|
||||||
use_gpu: True
|
use_gpu: True
|
||||||
use_data_parallel: False
|
use_data_parallel: True
|
||||||
|
|
||||||
data_path: ../../../dataset/LJSpeech-1.1
|
data_path: ../../dataset/LJSpeech-1.1
|
||||||
save_path: ./checkpoint
|
save_path: ./checkpoint
|
||||||
log_dir: ./log
|
log_dir: ./log
|
||||||
#checkpoint_path: ./checkpoint
|
#checkpoint_path: ./checkpoint
|
|
@ -7,11 +7,13 @@ import jsonargparse
|
||||||
from parse import add_config_options_to_parser
|
from parse import add_config_options_to_parser
|
||||||
from pprint import pprint
|
from pprint import pprint
|
||||||
from matplotlib import cm
|
from matplotlib import cm
|
||||||
|
import numpy as np
|
||||||
|
import paddle.fluid as fluid
|
||||||
import paddle.fluid.dygraph as dg
|
import paddle.fluid.dygraph as dg
|
||||||
import paddle.fluid.layers as layers
|
import paddle.fluid.layers as layers
|
||||||
from parakeet.modules.utils import cross_entropy
|
from parakeet.modules.utils import cross_entropy
|
||||||
from parakeet.models.dataloader.ljspeech import LJSpeechLoader
|
from parakeet.models.dataloader.ljspeech import LJSpeechLoader
|
||||||
from network import *
|
from parakeet.models.transformerTTS.transformerTTS import TransformerTTS
|
||||||
|
|
||||||
def load_checkpoint(step, model_path):
|
def load_checkpoint(step, model_path):
|
||||||
model_dict, opti_dict = fluid.dygraph.load_dygraph(os.path.join(model_path, step))
|
model_dict, opti_dict = fluid.dygraph.load_dygraph(os.path.join(model_path, step))
|
||||||
|
@ -86,10 +88,12 @@ def main(cfg):
|
||||||
if local_rank==0:
|
if local_rank==0:
|
||||||
writer.add_scalars('training_loss', {
|
writer.add_scalars('training_loss', {
|
||||||
'mel_loss':mel_loss.numpy(),
|
'mel_loss':mel_loss.numpy(),
|
||||||
'post_mel_loss':post_mel_loss.numpy(),
|
'post_mel_loss':post_mel_loss.numpy()
|
||||||
'stop_loss':stop_loss.numpy()
|
|
||||||
}, global_step)
|
}, global_step)
|
||||||
|
|
||||||
|
if cfg.stop_token:
|
||||||
|
writer.add_scalar('stop_loss', stop_loss.numpy(), global_step)
|
||||||
|
|
||||||
writer.add_scalars('alphas', {
|
writer.add_scalars('alphas', {
|
||||||
'encoder_alpha':model.encoder.alpha.numpy(),
|
'encoder_alpha':model.encoder.alpha.numpy(),
|
||||||
'decoder_alpha':model.decoder.alpha.numpy(),
|
'decoder_alpha':model.decoder.alpha.numpy(),
|
|
@ -6,11 +6,14 @@ from collections import OrderedDict
|
||||||
import jsonargparse
|
import jsonargparse
|
||||||
from parse import add_config_options_to_parser
|
from parse import add_config_options_to_parser
|
||||||
from pprint import pprint
|
from pprint import pprint
|
||||||
|
import paddle.fluid as fluid
|
||||||
|
import paddle.fluid.dygraph as dg
|
||||||
|
import paddle.fluid.layers as layers
|
||||||
from parakeet.models.dataloader.ljspeech import LJSpeechLoader
|
from parakeet.models.dataloader.ljspeech import LJSpeechLoader
|
||||||
from network import *
|
from parakeet.models.transformerTTS.vocoder import Vocoder
|
||||||
|
|
||||||
def load_checkpoint(step, model_path):
|
def load_checkpoint(step, model_path):
|
||||||
model_dict, opti_dict = fluid.dygraph.load_dygraph(os.path.join(model_path, step))
|
model_dict, opti_dict = dg.load_dygraph(os.path.join(model_path, step))
|
||||||
new_state_dict = OrderedDict()
|
new_state_dict = OrderedDict()
|
||||||
for param in model_dict:
|
for param in model_dict:
|
||||||
if param.startswith('_layers.'):
|
if param.startswith('_layers.'):
|
||||||
|
@ -40,7 +43,7 @@ def main(cfg):
|
||||||
writer = SummaryWriter(path) if local_rank == 0 else None
|
writer = SummaryWriter(path) if local_rank == 0 else None
|
||||||
|
|
||||||
with dg.guard(place):
|
with dg.guard(place):
|
||||||
model = ModelPostNet(cfg)
|
model = Vocoder(cfg)
|
||||||
|
|
||||||
model.train()
|
model.train()
|
||||||
optimizer = fluid.optimizer.AdamOptimizer(learning_rate=dg.NoamDecay(1/(cfg.warm_up_step *( cfg.lr ** 2)), cfg.warm_up_step),
|
optimizer = fluid.optimizer.AdamOptimizer(learning_rate=dg.NoamDecay(1/(cfg.warm_up_step *( cfg.lr ** 2)), cfg.warm_up_step),
|
||||||
|
@ -99,5 +102,5 @@ def main(cfg):
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
parser = jsonargparse.ArgumentParser(description="Train postnet model", formatter_class='default_argparse')
|
parser = jsonargparse.ArgumentParser(description="Train postnet model", formatter_class='default_argparse')
|
||||||
add_config_options_to_parser(parser)
|
add_config_options_to_parser(parser)
|
||||||
cfg = parser.parse_args('-c ./config/train_postnet.yaml'.split())
|
cfg = parser.parse_args('-c ./config/train_vocoder.yaml'.split())
|
||||||
main(cfg)
|
main(cfg)
|
|
@ -1,3 +1,3 @@
|
||||||
__version__ = "0.0.0"
|
__version__ = "0.0.0"
|
||||||
|
|
||||||
from . import data, g2p, models, modules, utils
|
from . import data, g2p, models, modules
|
||||||
|
|
|
@ -0,0 +1,152 @@
|
||||||
|
from pathlib import Path
|
||||||
|
import numpy as np
|
||||||
|
import pandas as pd
|
||||||
|
import librosa
|
||||||
|
import csv
|
||||||
|
|
||||||
|
from paddle import fluid
|
||||||
|
from parakeet import g2p
|
||||||
|
from parakeet import audio
|
||||||
|
from parakeet.data.sampler import *
|
||||||
|
from parakeet.data.datacargo import DataCargo
|
||||||
|
from parakeet.data.batch import TextIDBatcher, SpecBatcher
|
||||||
|
from parakeet.data.dataset import DatasetMixin, TransformDataset
|
||||||
|
|
||||||
|
class LJSpeechLoader:
|
||||||
|
def __init__(self, config, nranks, rank, is_vocoder=False, shuffle=True):
|
||||||
|
place = fluid.CUDAPlace(rank) if config.use_gpu else fluid.CPUPlace()
|
||||||
|
|
||||||
|
LJSPEECH_ROOT = Path(config.data_path)
|
||||||
|
metadata = LJSpeechMetaData(LJSPEECH_ROOT)
|
||||||
|
transformer = LJSpeech(config)
|
||||||
|
dataset = TransformDataset(metadata, transformer)
|
||||||
|
sampler = DistributedSampler(len(metadata), nranks, rank, shuffle=shuffle)
|
||||||
|
|
||||||
|
assert config.batch_size % nranks == 0
|
||||||
|
each_bs = config.batch_size // nranks
|
||||||
|
if is_vocoder:
|
||||||
|
dataloader = DataCargo(dataset, sampler=sampler, batch_size=each_bs, shuffle=shuffle, batch_fn=batch_examples_vocoder, drop_last=True)
|
||||||
|
else:
|
||||||
|
dataloader = DataCargo(dataset, sampler=sampler, batch_size=each_bs, shuffle=shuffle, batch_fn=batch_examples, drop_last=True)
|
||||||
|
|
||||||
|
self.reader = fluid.io.DataLoader.from_generator(
|
||||||
|
capacity=32,
|
||||||
|
iterable=True,
|
||||||
|
use_double_buffer=True,
|
||||||
|
return_list=True)
|
||||||
|
self.reader.set_batch_generator(dataloader, place)
|
||||||
|
|
||||||
|
|
||||||
|
class LJSpeechMetaData(DatasetMixin):
|
||||||
|
def __init__(self, root):
|
||||||
|
self.root = Path(root)
|
||||||
|
self._wav_dir = self.root.joinpath("wavs")
|
||||||
|
csv_path = self.root.joinpath("metadata.csv")
|
||||||
|
self._table = pd.read_csv(
|
||||||
|
csv_path,
|
||||||
|
sep="|",
|
||||||
|
header=None,
|
||||||
|
quoting=csv.QUOTE_NONE,
|
||||||
|
names=["fname", "raw_text", "normalized_text"])
|
||||||
|
|
||||||
|
def get_example(self, i):
|
||||||
|
fname, raw_text, normalized_text = self._table.iloc[i]
|
||||||
|
fname = str(self._wav_dir.joinpath(fname + ".wav"))
|
||||||
|
return fname, raw_text, normalized_text
|
||||||
|
|
||||||
|
def __len__(self):
|
||||||
|
return len(self._table)
|
||||||
|
|
||||||
|
|
||||||
|
class LJSpeech(object):
|
||||||
|
def __init__(self, config):
|
||||||
|
super(LJSpeech, self).__init__()
|
||||||
|
self.config = config
|
||||||
|
self._ljspeech_processor = audio.AudioProcessor(
|
||||||
|
sample_rate=config.audio.sr,
|
||||||
|
num_mels=config.audio.num_mels,
|
||||||
|
min_level_db=config.audio.min_level_db,
|
||||||
|
ref_level_db=config.audio.ref_level_db,
|
||||||
|
n_fft=config.audio.n_fft,
|
||||||
|
win_length= config.audio.win_length,
|
||||||
|
hop_length= config.audio.hop_length,
|
||||||
|
power=config.audio.power,
|
||||||
|
preemphasis=config.audio.preemphasis,
|
||||||
|
signal_norm=True,
|
||||||
|
symmetric_norm=False,
|
||||||
|
max_norm=1.,
|
||||||
|
mel_fmin=0,
|
||||||
|
mel_fmax=None,
|
||||||
|
clip_norm=True,
|
||||||
|
griffin_lim_iters=60,
|
||||||
|
do_trim_silence=False,
|
||||||
|
sound_norm=False)
|
||||||
|
|
||||||
|
def __call__(self, metadatum):
|
||||||
|
"""All the code for generating an Example from a metadatum. If you want a
|
||||||
|
different preprocessing pipeline, you can override this method.
|
||||||
|
This method may require several processor, each of which has a lot of options.
|
||||||
|
In this case, you'd better pass a composed transform and pass it to the init
|
||||||
|
method.
|
||||||
|
"""
|
||||||
|
fname, raw_text, normalized_text = metadatum
|
||||||
|
|
||||||
|
# load -> trim -> preemphasis -> stft -> magnitude -> mel_scale -> logscale -> normalize
|
||||||
|
wav = self._ljspeech_processor.load_wav(str(fname))
|
||||||
|
mag = self._ljspeech_processor.spectrogram(wav).astype(np.float32)
|
||||||
|
mel = self._ljspeech_processor.melspectrogram(wav).astype(np.float32)
|
||||||
|
phonemes = np.array(g2p.en.text_to_sequence(normalized_text), dtype=np.int64)
|
||||||
|
return (mag, mel, phonemes) # maybe we need to implement it as a map in the future
|
||||||
|
|
||||||
|
|
||||||
|
def batch_examples(batch):
|
||||||
|
texts = []
|
||||||
|
mels = []
|
||||||
|
mel_inputs = []
|
||||||
|
mel_lens = []
|
||||||
|
text_lens = []
|
||||||
|
pos_texts = []
|
||||||
|
pos_mels = []
|
||||||
|
for data in batch:
|
||||||
|
_, mel, text = data
|
||||||
|
mel_inputs.append(np.concatenate([np.zeros([mel.shape[0], 1], np.float32), mel[:,:-1]], axis=-1))
|
||||||
|
mel_lens.append(mel.shape[1])
|
||||||
|
text_lens.append(len(text))
|
||||||
|
pos_texts.append(np.arange(1, len(text) + 1))
|
||||||
|
pos_mels.append(np.arange(1, mel.shape[1] + 1))
|
||||||
|
mels.append(mel)
|
||||||
|
texts.append(text)
|
||||||
|
|
||||||
|
# Sort by text_len in descending order
|
||||||
|
texts = [i for i,_ in sorted(zip(texts, text_lens), key=lambda x: x[1], reverse=True)]
|
||||||
|
mels = [i for i,_ in sorted(zip(mels, text_lens), key=lambda x: x[1], reverse=True)]
|
||||||
|
mel_inputs = [i for i,_ in sorted(zip(mel_inputs, text_lens), key=lambda x: x[1], reverse=True)]
|
||||||
|
mel_lens = [i for i,_ in sorted(zip(mel_lens, text_lens), key=lambda x: x[1], reverse=True)]
|
||||||
|
pos_texts = [i for i,_ in sorted(zip(pos_texts, text_lens), key=lambda x: x[1], reverse=True)]
|
||||||
|
pos_mels = [i for i,_ in sorted(zip(pos_mels, text_lens), key=lambda x: x[1], reverse=True)]
|
||||||
|
text_lens = sorted(text_lens, reverse=True)
|
||||||
|
|
||||||
|
# Pad sequence with largest len of the batch
|
||||||
|
texts = TextIDBatcher(pad_id=0)(texts) #(B, T)
|
||||||
|
pos_texts = TextIDBatcher(pad_id=0)(pos_texts) #(B,T)
|
||||||
|
pos_mels = TextIDBatcher(pad_id=0)(pos_mels) #(B,T)
|
||||||
|
mels = np.transpose(SpecBatcher(pad_value=0.)(mels), axes=(0,2,1)) #(B,T,num_mels)
|
||||||
|
mel_inputs = np.transpose(SpecBatcher(pad_value=0.)(mel_inputs), axes=(0,2,1))#(B,T,num_mels)
|
||||||
|
return (texts, mels, mel_inputs, pos_texts, pos_mels, np.array(text_lens), np.array(mel_lens))
|
||||||
|
|
||||||
|
def batch_examples_vocoder(batch):
|
||||||
|
mels=[]
|
||||||
|
mags=[]
|
||||||
|
for data in batch:
|
||||||
|
mag, mel, _ = data
|
||||||
|
mels.append(mel)
|
||||||
|
mags.append(mag)
|
||||||
|
|
||||||
|
mels = np.transpose(SpecBatcher(pad_value=0.)(mels), axes=(0,2,1))
|
||||||
|
mags = np.transpose(SpecBatcher(pad_value=0.)(mags), axes=(0,2,1))
|
||||||
|
|
||||||
|
return (mels, mags)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -0,0 +1,36 @@
|
||||||
|
import numpy as np
|
||||||
|
import math
|
||||||
|
import paddle.fluid.dygraph as dg
|
||||||
|
import paddle.fluid.layers as layers
|
||||||
|
import paddle.fluid as fluid
|
||||||
|
from parakeet.modules.multihead_attention import MultiheadAttention
|
||||||
|
from parakeet.modules.feed_forward import PositionwiseFeedForward
|
||||||
|
|
||||||
|
class FFTBlock(dg.Layer):
|
||||||
|
def __init__(self, d_model, d_inner, n_head, d_k, d_v, filter_size, padding, dropout=0.2):
|
||||||
|
super(FFTBlock, self).__init__()
|
||||||
|
self.slf_attn = MultiheadAttention(d_model, d_k, d_v, num_head=n_head, is_bias=True, dropout=dropout, is_concat=False)
|
||||||
|
self.pos_ffn = PositionwiseFeedForward(d_model, d_inner, filter_size =filter_size, padding =padding, dropout=dropout)
|
||||||
|
|
||||||
|
def forward(self, enc_input, non_pad_mask=None, slf_attn_mask=None):
|
||||||
|
"""
|
||||||
|
Feed Forward Transformer block in FastSpeech.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
enc_input (Variable): Shape(B, T, C), dtype: float32. The embedding characters input.
|
||||||
|
T means the timesteps of input.
|
||||||
|
non_pad_mask (Variable): Shape(B, T, 1), dtype: int64. The mask of sequence.
|
||||||
|
slf_attn_mask (Variable): Shape(B, len_q, len_k), dtype: int64. The mask of self attention.
|
||||||
|
len_q means the sequence length of query, len_k means the sequence length of key.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
output (Variable), Shape(B, T, C), the output after self-attention & ffn.
|
||||||
|
slf_attn (Variable), Shape(B * n_head, T, T), the self attention.
|
||||||
|
"""
|
||||||
|
output, slf_attn = self.slf_attn(enc_input, enc_input, enc_input, mask=slf_attn_mask)
|
||||||
|
output *= non_pad_mask
|
||||||
|
|
||||||
|
output = self.pos_ffn(output)
|
||||||
|
output *= non_pad_mask
|
||||||
|
|
||||||
|
return output, slf_attn
|
|
@ -1,42 +1,10 @@
|
||||||
import numpy as np
|
import numpy as np
|
||||||
import math
|
import math
|
||||||
import utils
|
import parakeet.models.fastspeech.utils
|
||||||
import paddle.fluid.dygraph as dg
|
import paddle.fluid.dygraph as dg
|
||||||
import paddle.fluid.layers as layers
|
import paddle.fluid.layers as layers
|
||||||
import paddle.fluid as fluid
|
import paddle.fluid as fluid
|
||||||
from parakeet.modules.layers import Conv, Linear
|
from parakeet.modules.layers import Conv, Linear
|
||||||
from parakeet.modules.multihead_attention import MultiheadAttention
|
|
||||||
from parakeet.modules.feed_forward import PositionwiseFeedForward
|
|
||||||
|
|
||||||
class FFTBlock(dg.Layer):
|
|
||||||
def __init__(self, d_model, d_inner, n_head, d_k, d_v, filter_size, padding, dropout=0.2):
|
|
||||||
super(FFTBlock, self).__init__()
|
|
||||||
self.slf_attn = MultiheadAttention(d_model, d_k, d_v, num_head=n_head, is_bias=True, dropout=dropout, is_concat=False)
|
|
||||||
self.pos_ffn = PositionwiseFeedForward(d_model, d_inner, filter_size =filter_size, padding =padding, dropout=dropout)
|
|
||||||
|
|
||||||
def forward(self, enc_input, non_pad_mask=None, slf_attn_mask=None):
|
|
||||||
"""
|
|
||||||
Feed Forward Transformer block in FastSpeech.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
enc_input (Variable): Shape(B, T, C), dtype: float32. The embedding characters input.
|
|
||||||
T means the timesteps of input.
|
|
||||||
non_pad_mask (Variable): Shape(B, T, 1), dtype: int64. The mask of sequence.
|
|
||||||
slf_attn_mask (Variable): Shape(B, len_q, len_k), dtype: int64. The mask of self attention.
|
|
||||||
len_q means the sequence length of query, len_k means the sequence length of key.
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
output (Variable), Shape(B, T, C), the output after self-attention & ffn.
|
|
||||||
slf_attn (Variable), Shape(B * n_head, T, T), the self attention.
|
|
||||||
"""
|
|
||||||
output, slf_attn = self.slf_attn(enc_input, enc_input, enc_input, mask=slf_attn_mask)
|
|
||||||
output *= non_pad_mask
|
|
||||||
|
|
||||||
output = self.pos_ffn(output)
|
|
||||||
output *= non_pad_mask
|
|
||||||
|
|
||||||
return output, slf_attn
|
|
||||||
|
|
||||||
|
|
||||||
class LengthRegulator(dg.Layer):
|
class LengthRegulator(dg.Layer):
|
||||||
def __init__(self, input_size, out_channels, filter_size, dropout=0.1):
|
def __init__(self, input_size, out_channels, filter_size, dropout=0.1):
|
|
@ -0,0 +1,63 @@
|
||||||
|
import paddle.fluid.dygraph as dg
|
||||||
|
import paddle.fluid as fluid
|
||||||
|
from parakeet.g2p.text.symbols import symbols
|
||||||
|
from parakeet.modules.utils import *
|
||||||
|
from parakeet.modules.post_convnet import PostConvNet
|
||||||
|
from parakeet.modules.layers import Linear
|
||||||
|
from parakeet.models.fastspeech.FFTBlock import FFTBlock
|
||||||
|
|
||||||
|
class Decoder(dg.Layer):
|
||||||
|
def __init__(self,
|
||||||
|
len_max_seq,
|
||||||
|
n_layers,
|
||||||
|
n_head,
|
||||||
|
d_k,
|
||||||
|
d_v,
|
||||||
|
d_model,
|
||||||
|
d_inner,
|
||||||
|
fft_conv1d_kernel,
|
||||||
|
fft_conv1d_padding,
|
||||||
|
dropout=0.1):
|
||||||
|
super(Decoder, self).__init__()
|
||||||
|
|
||||||
|
n_position = len_max_seq + 1
|
||||||
|
self.pos_inp = get_sinusoid_encoding_table(n_position, d_model, padding_idx=0)
|
||||||
|
self.position_enc = dg.Embedding(size=[n_position, d_model],
|
||||||
|
padding_idx=0,
|
||||||
|
param_attr=fluid.ParamAttr(
|
||||||
|
initializer=fluid.initializer.NumpyArrayInitializer(self.pos_inp),
|
||||||
|
trainable=False))
|
||||||
|
self.layer_stack = [FFTBlock(d_model, d_inner, n_head, d_k, d_v, fft_conv1d_kernel, fft_conv1d_padding, dropout=dropout) for _ in range(n_layers)]
|
||||||
|
for i, layer in enumerate(self.layer_stack):
|
||||||
|
self.add_sublayer('fft_{}'.format(i), layer)
|
||||||
|
|
||||||
|
def forward(self, enc_seq, enc_pos):
|
||||||
|
"""
|
||||||
|
Decoder layer of FastSpeech.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
enc_seq (Variable), Shape(B, text_T, C), dtype: float32.
|
||||||
|
The output of length regulator.
|
||||||
|
enc_pos (Variable, optional): Shape(B, T_mel),
|
||||||
|
dtype: int64. The spectrum position. T_mel means the timesteps of input spectrum.
|
||||||
|
Returns:
|
||||||
|
dec_output (Variable), Shape(B, mel_T, C), the decoder output.
|
||||||
|
dec_slf_attn_list (Variable), Shape(B, mel_T, mel_T), the decoder self attention list.
|
||||||
|
"""
|
||||||
|
dec_slf_attn_list = []
|
||||||
|
|
||||||
|
# -- Prepare masks
|
||||||
|
slf_attn_mask = get_attn_key_pad_mask(seq_k=enc_pos, seq_q=enc_pos)
|
||||||
|
non_pad_mask = get_non_pad_mask(enc_pos)
|
||||||
|
|
||||||
|
# -- Forward
|
||||||
|
dec_output = enc_seq + self.position_enc(enc_pos)
|
||||||
|
|
||||||
|
for dec_layer in self.layer_stack:
|
||||||
|
dec_output, dec_slf_attn = dec_layer(
|
||||||
|
dec_output,
|
||||||
|
non_pad_mask=non_pad_mask,
|
||||||
|
slf_attn_mask=slf_attn_mask)
|
||||||
|
dec_slf_attn_list += [dec_slf_attn]
|
||||||
|
|
||||||
|
return dec_output, dec_slf_attn_list
|
|
@ -0,0 +1,67 @@
|
||||||
|
import paddle.fluid.dygraph as dg
|
||||||
|
import paddle.fluid as fluid
|
||||||
|
from parakeet.g2p.text.symbols import symbols
|
||||||
|
from parakeet.modules.utils import *
|
||||||
|
from parakeet.modules.post_convnet import PostConvNet
|
||||||
|
from parakeet.modules.layers import Linear
|
||||||
|
from parakeet.models.fastspeech.FFTBlock import FFTBlock
|
||||||
|
|
||||||
|
class Encoder(dg.Layer):
|
||||||
|
def __init__(self,
|
||||||
|
n_src_vocab,
|
||||||
|
len_max_seq,
|
||||||
|
n_layers,
|
||||||
|
n_head,
|
||||||
|
d_k,
|
||||||
|
d_v,
|
||||||
|
d_model,
|
||||||
|
d_inner,
|
||||||
|
fft_conv1d_kernel,
|
||||||
|
fft_conv1d_padding,
|
||||||
|
dropout=0.1):
|
||||||
|
super(Encoder, self).__init__()
|
||||||
|
n_position = len_max_seq + 1
|
||||||
|
|
||||||
|
self.src_word_emb = dg.Embedding(size=[n_src_vocab, d_model], padding_idx=0)
|
||||||
|
self.pos_inp = get_sinusoid_encoding_table(n_position, d_model, padding_idx=0)
|
||||||
|
self.position_enc = dg.Embedding(size=[n_position, d_model],
|
||||||
|
padding_idx=0,
|
||||||
|
param_attr=fluid.ParamAttr(
|
||||||
|
initializer=fluid.initializer.NumpyArrayInitializer(self.pos_inp),
|
||||||
|
trainable=False))
|
||||||
|
self.layer_stack = [FFTBlock(d_model, d_inner, n_head, d_k, d_v, fft_conv1d_kernel, fft_conv1d_padding, dropout=dropout) for _ in range(n_layers)]
|
||||||
|
for i, layer in enumerate(self.layer_stack):
|
||||||
|
self.add_sublayer('fft_{}'.format(i), layer)
|
||||||
|
|
||||||
|
def forward(self, character, text_pos):
|
||||||
|
"""
|
||||||
|
Encoder layer of FastSpeech.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
character (Variable): Shape(B, T_text), dtype: float32. The input text
|
||||||
|
characters. T_text means the timesteps of input characters.
|
||||||
|
text_pos (Variable): Shape(B, T_text), dtype: int64. The input text
|
||||||
|
position. T_text means the timesteps of input characters.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
enc_output (Variable), Shape(B, text_T, C), the encoder output.
|
||||||
|
non_pad_mask (Variable), Shape(B, T_text, 1), the mask with non pad.
|
||||||
|
enc_slf_attn_list (list<Variable>), Len(n_layers), Shape(B * n_head, text_T, text_T), the encoder self attention list.
|
||||||
|
"""
|
||||||
|
enc_slf_attn_list = []
|
||||||
|
# -- prepare masks
|
||||||
|
# shape character (N, T)
|
||||||
|
slf_attn_mask = get_attn_key_pad_mask(seq_k=character, seq_q=character)
|
||||||
|
non_pad_mask = get_non_pad_mask(character)
|
||||||
|
|
||||||
|
# -- Forward
|
||||||
|
enc_output = self.src_word_emb(character) + self.position_enc(text_pos) #(N, T, C)
|
||||||
|
|
||||||
|
for enc_layer in self.layer_stack:
|
||||||
|
enc_output, enc_slf_attn = enc_layer(
|
||||||
|
enc_output,
|
||||||
|
non_pad_mask=non_pad_mask,
|
||||||
|
slf_attn_mask=slf_attn_mask)
|
||||||
|
enc_slf_attn_list += [enc_slf_attn]
|
||||||
|
|
||||||
|
return enc_output, non_pad_mask, enc_slf_attn_list
|
|
@ -4,124 +4,10 @@ from parakeet.g2p.text.symbols import symbols
|
||||||
from parakeet.modules.utils import *
|
from parakeet.modules.utils import *
|
||||||
from parakeet.modules.post_convnet import PostConvNet
|
from parakeet.modules.post_convnet import PostConvNet
|
||||||
from parakeet.modules.layers import Linear
|
from parakeet.modules.layers import Linear
|
||||||
from utils import *
|
from parakeet.models.fastspeech.utils import *
|
||||||
from modules import FFTBlock, LengthRegulator
|
from parakeet.models.fastspeech.LengthRegulator import LengthRegulator
|
||||||
|
from parakeet.models.fastspeech.encoder import Encoder
|
||||||
class Encoder(dg.Layer):
|
from parakeet.models.fastspeech.decoder import Decoder
|
||||||
def __init__(self,
|
|
||||||
n_src_vocab,
|
|
||||||
len_max_seq,
|
|
||||||
n_layers,
|
|
||||||
n_head,
|
|
||||||
d_k,
|
|
||||||
d_v,
|
|
||||||
d_model,
|
|
||||||
d_inner,
|
|
||||||
fft_conv1d_kernel,
|
|
||||||
fft_conv1d_padding,
|
|
||||||
dropout=0.1):
|
|
||||||
super(Encoder, self).__init__()
|
|
||||||
n_position = len_max_seq + 1
|
|
||||||
|
|
||||||
self.src_word_emb = dg.Embedding(size=[n_src_vocab, d_model], padding_idx=0)
|
|
||||||
self.pos_inp = get_sinusoid_encoding_table(n_position, d_model, padding_idx=0)
|
|
||||||
self.position_enc = dg.Embedding(size=[n_position, d_model],
|
|
||||||
padding_idx=0,
|
|
||||||
param_attr=fluid.ParamAttr(
|
|
||||||
initializer=fluid.initializer.NumpyArrayInitializer(self.pos_inp),
|
|
||||||
trainable=False))
|
|
||||||
self.layer_stack = [FFTBlock(d_model, d_inner, n_head, d_k, d_v, fft_conv1d_kernel, fft_conv1d_padding, dropout=dropout) for _ in range(n_layers)]
|
|
||||||
for i, layer in enumerate(self.layer_stack):
|
|
||||||
self.add_sublayer('fft_{}'.format(i), layer)
|
|
||||||
|
|
||||||
def forward(self, character, text_pos):
|
|
||||||
"""
|
|
||||||
Encoder layer of FastSpeech.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
character (Variable): Shape(B, T_text), dtype: float32. The input text
|
|
||||||
characters. T_text means the timesteps of input characters.
|
|
||||||
text_pos (Variable): Shape(B, T_text), dtype: int64. The input text
|
|
||||||
position. T_text means the timesteps of input characters.
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
enc_output (Variable), Shape(B, text_T, C), the encoder output.
|
|
||||||
non_pad_mask (Variable), Shape(B, T_text, 1), the mask with non pad.
|
|
||||||
enc_slf_attn_list (list<Variable>), Len(n_layers), Shape(B * n_head, text_T, text_T), the encoder self attention list.
|
|
||||||
"""
|
|
||||||
enc_slf_attn_list = []
|
|
||||||
# -- prepare masks
|
|
||||||
# shape character (N, T)
|
|
||||||
slf_attn_mask = get_attn_key_pad_mask(seq_k=character, seq_q=character)
|
|
||||||
non_pad_mask = get_non_pad_mask(character)
|
|
||||||
|
|
||||||
# -- Forward
|
|
||||||
enc_output = self.src_word_emb(character) + self.position_enc(text_pos) #(N, T, C)
|
|
||||||
|
|
||||||
for enc_layer in self.layer_stack:
|
|
||||||
enc_output, enc_slf_attn = enc_layer(
|
|
||||||
enc_output,
|
|
||||||
non_pad_mask=non_pad_mask,
|
|
||||||
slf_attn_mask=slf_attn_mask)
|
|
||||||
enc_slf_attn_list += [enc_slf_attn]
|
|
||||||
|
|
||||||
return enc_output, non_pad_mask, enc_slf_attn_list
|
|
||||||
|
|
||||||
class Decoder(dg.Layer):
|
|
||||||
def __init__(self,
|
|
||||||
len_max_seq,
|
|
||||||
n_layers,
|
|
||||||
n_head,
|
|
||||||
d_k,
|
|
||||||
d_v,
|
|
||||||
d_model,
|
|
||||||
d_inner,
|
|
||||||
fft_conv1d_kernel,
|
|
||||||
fft_conv1d_padding,
|
|
||||||
dropout=0.1):
|
|
||||||
super(Decoder, self).__init__()
|
|
||||||
|
|
||||||
n_position = len_max_seq + 1
|
|
||||||
self.pos_inp = get_sinusoid_encoding_table(n_position, d_model, padding_idx=0)
|
|
||||||
self.position_enc = dg.Embedding(size=[n_position, d_model],
|
|
||||||
padding_idx=0,
|
|
||||||
param_attr=fluid.ParamAttr(
|
|
||||||
initializer=fluid.initializer.NumpyArrayInitializer(self.pos_inp),
|
|
||||||
trainable=False))
|
|
||||||
self.layer_stack = [FFTBlock(d_model, d_inner, n_head, d_k, d_v, fft_conv1d_kernel, fft_conv1d_padding, dropout=dropout) for _ in range(n_layers)]
|
|
||||||
for i, layer in enumerate(self.layer_stack):
|
|
||||||
self.add_sublayer('fft_{}'.format(i), layer)
|
|
||||||
|
|
||||||
def forward(self, enc_seq, enc_pos):
|
|
||||||
"""
|
|
||||||
Decoder layer of FastSpeech.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
enc_seq (Variable), Shape(B, text_T, C), dtype: float32.
|
|
||||||
The output of length regulator.
|
|
||||||
enc_pos (Variable, optional): Shape(B, T_mel),
|
|
||||||
dtype: int64. The spectrum position. T_mel means the timesteps of input spectrum.
|
|
||||||
Returns:
|
|
||||||
dec_output (Variable), Shape(B, mel_T, C), the decoder output.
|
|
||||||
dec_slf_attn_list (Variable), Shape(B, mel_T, mel_T), the decoder self attention list.
|
|
||||||
"""
|
|
||||||
dec_slf_attn_list = []
|
|
||||||
|
|
||||||
# -- Prepare masks
|
|
||||||
slf_attn_mask = get_attn_key_pad_mask(seq_k=enc_pos, seq_q=enc_pos)
|
|
||||||
non_pad_mask = get_non_pad_mask(enc_pos)
|
|
||||||
|
|
||||||
# -- Forward
|
|
||||||
dec_output = enc_seq + self.position_enc(enc_pos)
|
|
||||||
|
|
||||||
for dec_layer in self.layer_stack:
|
|
||||||
dec_output, dec_slf_attn = dec_layer(
|
|
||||||
dec_output,
|
|
||||||
non_pad_mask=non_pad_mask,
|
|
||||||
slf_attn_mask=slf_attn_mask)
|
|
||||||
dec_slf_attn_list += [dec_slf_attn]
|
|
||||||
|
|
||||||
return dec_output, dec_slf_attn_list
|
|
||||||
|
|
||||||
class FastSpeech(dg.Layer):
|
class FastSpeech(dg.Layer):
|
||||||
def __init__(self, cfg):
|
def __init__(self, cfg):
|
|
@ -1,166 +0,0 @@
|
||||||
import math
|
|
||||||
import numpy as np
|
|
||||||
|
|
||||||
import paddle
|
|
||||||
from paddle import fluid
|
|
||||||
import paddle.fluid.dygraph as dg
|
|
||||||
|
|
||||||
|
|
||||||
class Conv1D(dg.Layer):
    """
    A convolution 1D block implemented with Conv2D. For simplicity and to
    ensure the output has the same length as the input, it does not allow
    stride > 1.
    """

    def __init__(self,
                 in_channels,
                 num_filters,
                 filter_size=3,
                 padding=0,
                 dilation=1,
                 stride=1,
                 groups=None,
                 param_attr=None,
                 bias_attr=None,
                 use_cudnn=True,
                 act=None,
                 data_format='NCT',
                 dtype="float32"):
        super(Conv1D, self).__init__(dtype=dtype)

        # Fixed: the original assigned self.padding twice; keep one.
        self.padding = padding
        self.in_channels = in_channels
        self.num_filters = num_filters
        self.filter_size = filter_size
        self.stride = stride
        self.dilation = dilation
        self.act = act
        self.data_format = data_format

        # Emulate 1D convolution with a 2D convolution whose height
        # dimension (and the corresponding stride/dilation/padding) is 1.
        self.conv = dg.Conv2D(
            in_channels=in_channels,
            num_filters=num_filters,
            filter_size=(1, filter_size),
            stride=(1, stride),
            dilation=(1, dilation),
            padding=(0, padding),
            groups=groups,
            param_attr=param_attr,
            bias_attr=bias_attr,
            use_cudnn=use_cudnn,
            act=act,
            dtype=dtype)

    def forward(self, x):
        """
        Args:
            x (Variable): Shape(B, C_in, T) when data_format is 'NCT',
                or Shape(B, T, C_in) when data_format is 'NTC', where
                C_in means input channels.
        Returns:
            x (Variable): Shape(B, C_out, T) (respectively Shape(B, T,
                C_out) for 'NTC'), where C_out means output channels
                (num_filters).
        """
        if self.data_format == 'NTC':
            x = fluid.layers.transpose(x, [0, 2, 1])
        # Insert a dummy height axis, run the 2D conv, then drop it again.
        x = fluid.layers.unsqueeze(x, [2])
        x = self.conv(x)
        x = fluid.layers.squeeze(x, [2])
        if self.data_format == 'NTC':
            x = fluid.layers.transpose(x, [0, 2, 1])
        return x
|
|
||||||
|
|
||||||
class Pool1D(dg.Layer):
    """
    A Pool 1D block implemented with Pool2D.

    Pooling is applied along the time axis only (the 2D kernel's height is
    fixed to 1), so the channel count is unchanged.
    """
    def __init__(self,
                 pool_size=-1,
                 pool_type='max',
                 pool_stride=1,
                 pool_padding=0,
                 global_pooling=False,
                 use_cudnn=True,
                 ceil_mode=False,
                 exclusive=True,
                 data_format='NCT',
                 dtype='float32'):
        super(Pool1D, self).__init__(dtype=dtype)
        # Configuration is stored verbatim; only pool2d below actually
        # consumes it at forward time.
        self.pool_size = pool_size
        self.pool_type = pool_type
        self.pool_stride = pool_stride
        self.pool_padding = pool_padding
        self.global_pooling = global_pooling
        self.use_cudnn = use_cudnn
        self.ceil_mode = ceil_mode
        self.exclusive = exclusive
        self.data_format = data_format
        self.dtype = dtype

        # 1D pooling emulated with a 2D pool whose height dimension is 1.
        self.pool2d = dg.Pool2D([1,pool_size], pool_type = pool_type,
                                pool_stride = [1,pool_stride], pool_padding = [0, pool_padding],
                                global_pooling = global_pooling, use_cudnn = use_cudnn,
                                ceil_mode = ceil_mode, exclusive = exclusive, dtype = dtype)

    def forward(self, x):
        """
        Args:
            x (Variable): Shape(B, C, T) when data_format is 'NCT', or
                Shape(B, T, C) when data_format is 'NTC', where C means
                channels.
        Returns:
            x (Variable): Same layout as the input; pooling changes only
                the time dimension, channels are preserved.
        """
        if self.data_format == 'NTC':
            x = fluid.layers.transpose(x, [0, 2, 1])
        # Add a dummy height axis, pool in 2D, then remove it again.
        x = fluid.layers.unsqueeze(x, [2])
        x = self.pool2d(x)
        x = fluid.layers.squeeze(x, [2])
        if self.data_format == 'NTC':
            x = fluid.layers.transpose(x, [0, 2, 1])
        return x
|
|
||||||
|
|
||||||
class DynamicGRU(dg.Layer):
    """A GRU layer that manually unrolls ``dg.GRUUnit`` over the time axis.

    Supports reverse-direction scanning via ``is_reverse``; the outputs are
    re-ordered so the returned sequence is always in natural time order.
    """

    def __init__(self,
                 size,
                 param_attr=None,
                 bias_attr=None,
                 is_reverse=False,
                 gate_activation='sigmoid',
                 candidate_activation='tanh',
                 h_0=None,
                 origin_mode=False,
                 init_size=None):
        super(DynamicGRU, self).__init__()
        self.size = size
        self.h_0 = h_0  # initial hidden state; may be None
        self.is_reverse = is_reverse
        # GRUUnit expects an input of width 3 * size (reset/update/candidate).
        self.gru_unit = dg.GRUUnit(
            size * 3,
            param_attr=param_attr,
            bias_attr=bias_attr,
            activation=candidate_activation,
            gate_activation=gate_activation,
            origin_mode=origin_mode)

    def forward(self, inputs):
        """Scan ``inputs`` (B, T, 3*size) step by step; return (B, T, size)."""
        state = self.h_0
        collected = []
        seq_len = inputs.shape[1]
        for step in range(seq_len):
            # Walk the sequence backwards when reversed.
            t = seq_len - 1 - step if self.is_reverse else step
            step_input = fluid.layers.reshape(
                inputs[:, t:t + 1, :], [-1, inputs.shape[2]], inplace=False)
            # GRUUnit also returns reset/gate values; only the state is kept.
            state, _reset, _gate = self.gru_unit(step_input, state)
            collected.append(fluid.layers.reshape(
                state, [-1, 1, state.shape[1]], inplace=False))
        if self.is_reverse:
            # Restore natural time order before concatenating.
            collected = collected[::-1]
        return fluid.layers.concat(collected, axis=1)
|
|
||||||
|
|
|
@ -1,218 +0,0 @@
|
||||||
import math
|
|
||||||
from parakeet.g2p.text.symbols import symbols
|
|
||||||
import paddle.fluid.dygraph as dg
|
|
||||||
import paddle.fluid as fluid
|
|
||||||
import paddle.fluid.layers as layers
|
|
||||||
from parakeet.modules.layers import Conv, Pool1D, Linear
|
|
||||||
from parakeet.modules.dynamicGRU import DynamicGRU
|
|
||||||
import numpy as np
|
|
||||||
|
|
||||||
|
|
||||||
class EncoderPrenet(dg.Layer):
    """Encoder pre-net: character embedding followed by a stack of three
    conv + batch-norm + ReLU + dropout layers and a linear projection."""

    def __init__(self, embedding_size, num_hidden, use_cudnn=True):
        super(EncoderPrenet, self).__init__()
        self.embedding_size = embedding_size
        self.num_hidden = num_hidden
        self.use_cudnn = use_cudnn

        # Character-id -> embedding lookup table.
        self.embedding = dg.Embedding(size=[len(symbols), embedding_size],
                                      padding_idx=None)

        # Three same-shaped convolutions; only the first maps from the
        # embedding width, the rest stay at num_hidden channels.
        in_widths = [embedding_size, num_hidden, num_hidden]
        self.conv_list = [Conv(in_channels=width,
                               out_channels=num_hidden,
                               filter_size=5,
                               padding=int(np.floor(5 / 2)),
                               use_cudnn=use_cudnn,
                               data_format="NCT")
                          for width in in_widths]
        for i, layer in enumerate(self.conv_list):
            self.add_sublayer("conv_list_{}".format(i), layer)

        # One batch norm per convolution.
        self.batch_norm_list = [dg.BatchNorm(num_hidden,
                                             data_layout='NCHW')
                                for _ in range(3)]
        for i, layer in enumerate(self.batch_norm_list):
            self.add_sublayer("batch_norm_list_{}".format(i), layer)

        self.projection = Linear(num_hidden, num_hidden)

    def forward(self, x):
        """Map character ids (B, T) to prenet features (B, T, num_hidden)."""
        x = self.embedding(x)  # (batch_size, seq_len, embedding_size)
        x = layers.transpose(x, [0, 2, 1])  # conv stack works in NCT layout
        for bn, conv in zip(self.batch_norm_list, self.conv_list):
            x = layers.dropout(layers.relu(bn(conv(x))), 0.2)
        x = layers.transpose(x, [0, 2, 1])  # back to (N, T, C)
        return self.projection(x)
|
|
||||||
|
|
||||||
class CBHG(dg.Layer):
    """CBHG module: Conv Bank + Highway network + bidirectional GRU.

    Pipeline in forward(): K-width convolution bank -> max pooling ->
    two projection convolutions with a residual connection -> highway
    network -> two stacked bidirectional GRU layers (built from forward
    and reverse DynamicGRU pairs).
    """
    def __init__(self, hidden_size, batch_size, K=16, projection_size = 256, num_gru_layers=2,
                 max_pool_kernel_size=2, is_post=False):
        super(CBHG, self).__init__()
        """
        :param hidden_size: dimension of hidden unit
        :param batch_size: batch size
        :param K: # of convolution banks
        :param projection_size: dimension of projection unit
        :param num_gru_layers: # of layers of GRUcell
        :param max_pool_kernel_size: max pooling kernel size
        :param is_post: whether post processing or not
        """
        self.hidden_size = hidden_size
        self.projection_size = projection_size

        # Convolution bank: filter sizes 1..K, all producing hidden_size
        # channels; only the first maps from projection_size channels.
        self.conv_list = []
        self.conv_list.append(Conv(in_channels = projection_size,
                                   out_channels = hidden_size,
                                   filter_size = 1,
                                   padding = int(np.floor(1/2)),
                                   data_format = "NCT"))
        for i in range(2,K+1):
            self.conv_list.append(Conv(in_channels = hidden_size,
                                       out_channels = hidden_size,
                                       filter_size = i,
                                       padding = int(np.floor(i/2)),
                                       data_format = "NCT"))

        for i, layer in enumerate(self.conv_list):
            self.add_sublayer("conv_list_{}".format(i), layer)

        # One batch norm per bank convolution.
        self.batchnorm_list = []
        for i in range(K):
            self.batchnorm_list.append(dg.BatchNorm(hidden_size,
                                                    data_layout='NCHW'))

        for i, layer in enumerate(self.batchnorm_list):
            self.add_sublayer("batchnorm_list_{}".format(i), layer)

        # The bank outputs are concatenated along channels before projection.
        conv_outdim = hidden_size * K

        self.conv_projection_1 = Conv(in_channels = conv_outdim,
                                      out_channels = hidden_size,
                                      filter_size = 3,
                                      padding = int(np.floor(3/2)),
                                      data_format = "NCT")

        # Second projection returns to projection_size so the residual
        # connection with the input is shape-compatible.
        self.conv_projection_2 = Conv(in_channels = hidden_size,
                                      out_channels = projection_size,
                                      filter_size = 3,
                                      padding = int(np.floor(3/2)),
                                      data_format = "NCT")

        self.batchnorm_proj_1 = dg.BatchNorm(hidden_size,
                                             data_layout='NCHW')
        self.batchnorm_proj_2 = dg.BatchNorm(projection_size,
                                             data_layout='NCHW')
        self.max_pool = Pool1D(pool_size = max_pool_kernel_size,
                               pool_type='max',
                               pool_stride=1,
                               pool_padding=1,
                               data_format = "NCT")
        self.highway = Highwaynet(self.projection_size)

        # Shared zero initial state for all GRU directions/layers.
        # NOTE(review): baking batch_size into h_0 ties the module to a
        # fixed batch size — confirm callers never change it.
        h_0 = np.zeros((batch_size, hidden_size // 2), dtype="float32")
        h_0 = dg.to_variable(h_0)
        # Each Linear maps to 3 * (hidden_size // 2), the input width
        # DynamicGRU's GRUUnit expects.
        self.fc_forward1 = Linear(hidden_size, hidden_size // 2 * 3)
        self.fc_reverse1 = Linear(hidden_size, hidden_size // 2 * 3)
        self.gru_forward1 = DynamicGRU(size = self.hidden_size // 2,
                                       is_reverse = False,
                                       origin_mode = True,
                                       h_0 = h_0)
        self.gru_reverse1 = DynamicGRU(size = self.hidden_size // 2,
                                       is_reverse=True,
                                       origin_mode=True,
                                       h_0 = h_0)

        self.fc_forward2 = Linear(hidden_size, hidden_size // 2 * 3)
        self.fc_reverse2 = Linear(hidden_size, hidden_size // 2 * 3)
        self.gru_forward2 = DynamicGRU(size = self.hidden_size // 2,
                                       is_reverse = False,
                                       origin_mode = True,
                                       h_0 = h_0)
        self.gru_reverse2 = DynamicGRU(size = self.hidden_size // 2,
                                       is_reverse=True,
                                       origin_mode=True,
                                       h_0 = h_0)

    def _conv_fit_dim(self, x, filter_size=3):
        # Even filter sizes with floor(k/2) padding yield one extra time
        # step; trim it so the output length matches the input length.
        if filter_size % 2 == 0:
            return x[:,:,:-1]
        else:
            return x

    def forward(self, input_):
        # input_.shape = [N, C, T]

        conv_list = []
        conv_input = input_

        # Run every bank convolution on the ORIGINAL input's running value;
        # note conv_input is fed forward through the bank (each conv sees
        # the previous conv's output, not the raw input).
        for i, (conv, batchnorm) in enumerate(zip(self.conv_list, self.batchnorm_list)):
            conv_input = self._conv_fit_dim(conv(conv_input), i+1)
            conv_input = layers.relu(batchnorm(conv_input))
            conv_list.append(conv_input)

        # Stack all bank outputs along channels: [N, K*hidden, T].
        conv_cat = layers.concat(conv_list, axis=1)
        # Max pooling pads by 1, so drop the trailing extra frame.
        conv_pool = self.max_pool(conv_cat)[:,:,:-1]

        conv_proj = layers.relu(self.batchnorm_proj_1(self._conv_fit_dim(self.conv_projection_1(conv_pool))))
        # Residual connection back to the module input.
        conv_proj = self.batchnorm_proj_2(self._conv_fit_dim(self.conv_projection_2(conv_proj))) + input_

        # conv_proj.shape = [N, C, T]; highway and GRUs expect [N, T, C].
        highway = layers.transpose(conv_proj, [0,2,1])
        highway = self.highway(highway)

        # highway.shape = [N, T, C]
        # First bidirectional GRU layer: forward and reverse halves are
        # computed separately and concatenated on the feature axis.
        fc_forward = self.fc_forward1(highway)
        fc_reverse = self.fc_reverse1(highway)
        out_forward = self.gru_forward1(fc_forward)
        out_reverse = self.gru_reverse1(fc_reverse)
        out = layers.concat([out_forward, out_reverse], axis=-1)
        # Second bidirectional GRU layer on the first layer's output.
        fc_forward = self.fc_forward2(out)
        fc_reverse = self.fc_reverse2(out)
        out_forward = self.gru_forward2(fc_forward)
        out_reverse = self.gru_reverse2(fc_reverse)
        out = layers.concat([out_forward, out_reverse], axis=-1)
        # Return in [N, C, T] layout.
        out = layers.transpose(out, [0,2,1])
        return out
|
|
||||||
|
|
||||||
class Highwaynet(dg.Layer):
    """Highway network: a stack of layers computing
    ``out = relu(W1 x) * sigmoid(W2 x) + x * (1 - sigmoid(W2 x))``."""

    def __init__(self, num_units, num_layers=4):
        super(Highwaynet, self).__init__()
        self.num_units = num_units
        self.num_layers = num_layers

        self.gates = []
        self.linears = []
        # Build transform/gate pairs and register them as sublayers
        # in the same pass.
        for idx in range(num_layers):
            transform = Linear(num_units, num_units)
            gate = Linear(num_units, num_units)
            self.linears.append(transform)
            self.gates.append(gate)
            self.add_sublayer("linears_{}".format(idx), transform)
            self.add_sublayer("gates_{}".format(idx), gate)

    def forward(self, input_):
        """Apply all highway layers in sequence; shape is preserved."""
        out = input_
        for transform, gate in zip(self.linears, self.gates):
            h = fluid.layers.relu(transform(out))
            t = fluid.layers.sigmoid(gate(out))
            # Gated mix of the transformed features and the carry path.
            out = h * t + out * (1 - t)
        return out
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -1,203 +0,0 @@
|
||||||
from parakeet.models.transformerTTS.module import *
|
|
||||||
import paddle.fluid.dygraph as dg
|
|
||||||
import paddle.fluid as fluid
|
|
||||||
from parakeet.modules.layers import Conv1D, Linear
|
|
||||||
from parakeet.modules.utils import *
|
|
||||||
from parakeet.modules.multihead_attention import MultiheadAttention
|
|
||||||
from parakeet.modules.feed_forward import PositionwiseFeedForward
|
|
||||||
from parakeet.modules.prenet import PreNet
|
|
||||||
from parakeet.modules.post_convnet import PostConvNet
|
|
||||||
|
|
||||||
|
|
||||||
class Encoder(dg.Layer):
    """TransformerTTS encoder: prenet + scaled sinusoid positional encoding
    followed by 3 blocks of (multi-head self-attention, position-wise FFN)."""

    def __init__(self, embedding_size, num_hidden, config, num_head=4):
        super(Encoder, self).__init__()
        self.num_hidden = num_hidden
        # Learnable scalar weight for the positional encoding, init 1.0.
        param = fluid.ParamAttr(initializer=fluid.initializer.Constant(value=1.0))
        self.alpha = self.create_parameter(shape=(1, ), attr=param, dtype='float32')
        # Fixed sinusoid table (max length 1024), loaded as a frozen embedding.
        self.pos_inp = get_sinusoid_encoding_table(1024, self.num_hidden, padding_idx=0)
        self.pos_emb = dg.Embedding(size=[1024, num_hidden],
                                    padding_idx=0,
                                    param_attr=fluid.ParamAttr(
                                        initializer=fluid.initializer.NumpyArrayInitializer(self.pos_inp),
                                        trainable=False))
        self.encoder_prenet = EncoderPrenet(embedding_size = embedding_size,
                                            num_hidden = num_hidden,
                                            use_cudnn=config.use_gpu)
        # Three self-attention layers and three matching FFN layers.
        self.layers = [MultiheadAttention(num_hidden, num_hidden//num_head, num_hidden//num_head) for _ in range(3)]
        for i, layer in enumerate(self.layers):
            self.add_sublayer("self_attn_{}".format(i), layer)
        self.ffns = [PositionwiseFeedForward(num_hidden, num_hidden*num_head, filter_size=1, use_cudnn = config.use_gpu) for _ in range(3)]
        for i, layer in enumerate(self.ffns):
            self.add_sublayer("ffns_{}".format(i), layer)

    def forward(self, x, positional):
        """Encode character ids ``x`` with position indices ``positional``.

        Returns (encoded features, query_mask, per-layer attention maps);
        both masks are None in inference mode.
        """
        # Padding masks are only built in training mode; inference relies on
        # unpadded inputs. NOTE(review): reads dygraph tracer internals
        # (_train_mode) — fragile across Paddle versions.
        if fluid.framework._dygraph_tracer()._train_mode:
            query_mask = get_non_pad_mask(positional)
            mask = get_attn_key_pad_mask(positional, x)
        else:
            query_mask, mask = None, None

        # Encoder pre_network
        x = self.encoder_prenet(x) #(N,T,C)

        # Get positional encoding
        positional = self.pos_emb(positional)

        # Scaled positional encoding added to the prenet output.
        x = positional * self.alpha + x #(N, T, C)

        # Positional dropout
        x = layers.dropout(x, 0.1)

        # Self attention encoder
        attentions = list()
        for layer, ffn in zip(self.layers, self.ffns):
            x, attention = layer(x, x, x, mask = mask, query_mask = query_mask)
            x = ffn(x)
            attentions.append(attention)

        return x, query_mask, attentions
|
|
||||||
|
|
||||||
class Decoder(dg.Layer):
    """TransformerTTS decoder: mel prenet + scaled positional encoding,
    3 blocks of (masked self-attention, encoder-decoder attention, FFN),
    then mel projection, convolutional postnet and stop-token prediction."""

    def __init__(self, num_hidden, config, num_head=4):
        super(Decoder, self).__init__()
        self.num_hidden = num_hidden
        param = fluid.ParamAttr()
        # Learnable scalar weight for the positional encoding, init 1.0.
        self.alpha = self.create_parameter(shape=(1,), attr=param, dtype='float32',
                      default_initializer = fluid.initializer.ConstantInitializer(value=1.0))
        # Fixed sinusoid table (max length 1024), loaded as a frozen embedding.
        self.pos_inp = get_sinusoid_encoding_table(1024, self.num_hidden, padding_idx=0)
        self.pos_emb = dg.Embedding(size=[1024, num_hidden],
                                    padding_idx=0,
                                    param_attr=fluid.ParamAttr(
                                        initializer=fluid.initializer.NumpyArrayInitializer(self.pos_inp),
                                        trainable=False))
        # Mel-frame prenet: num_mels -> 2*num_hidden -> num_hidden.
        self.decoder_prenet = PreNet(input_size = config.audio.num_mels,
                                     hidden_size = num_hidden * 2,
                                     output_size = num_hidden,
                                     dropout_rate=0.2)
        self.linear = Linear(num_hidden, num_hidden)

        # Per-block layers: masked self-attention, cross-attention, FFN.
        self.selfattn_layers = [MultiheadAttention(num_hidden, num_hidden//num_head, num_hidden//num_head) for _ in range(3)]
        for i, layer in enumerate(self.selfattn_layers):
            self.add_sublayer("self_attn_{}".format(i), layer)
        self.attn_layers = [MultiheadAttention(num_hidden, num_hidden//num_head, num_hidden//num_head) for _ in range(3)]
        for i, layer in enumerate(self.attn_layers):
            self.add_sublayer("attn_{}".format(i), layer)
        self.ffns = [PositionwiseFeedForward(num_hidden, num_hidden*num_head, filter_size=1) for _ in range(3)]
        for i, layer in enumerate(self.ffns):
            self.add_sublayer("ffns_{}".format(i), layer)
        # Predict outputs_per_step mel frames per decoder step.
        self.mel_linear = Linear(num_hidden, config.audio.num_mels * config.audio.outputs_per_step)
        self.stop_linear = Linear(num_hidden, 1)

        self.postconvnet = PostConvNet(config.audio.num_mels, config.hidden_size,
                                       filter_size = 5, padding = 4, num_conv=5,
                                       outputs_per_step=config.audio.outputs_per_step,
                                       use_cudnn = config.use_gpu)

    def forward(self, key, value, query, c_mask, positional):
        """Decode mel frames.

        Args:
            key, value: encoder outputs used by the cross-attention.
            query: input mel frames (teacher forcing).
            c_mask: encoder mask from Encoder.forward (None at inference).
            positional: decoder position indices.
        Returns:
            (mel_out, postnet-refined out, cross-attention maps,
             stop-token probabilities, self-attention maps).
        """
        # get decoder mask with triangular matrix

        # NOTE(review): reads dygraph tracer internals (_train_mode) —
        # fragile across Paddle versions.
        if fluid.framework._dygraph_tracer()._train_mode:
            m_mask = get_non_pad_mask(positional)
            mask = get_attn_key_pad_mask((positional==0).astype(np.float32), query)
            # Causal (upper-triangular) mask so a step cannot attend ahead.
            triu_tensor = dg.to_variable(get_triu_tensor(query.numpy(), query.numpy())).astype(np.float32)
            mask = mask + triu_tensor
            mask = fluid.layers.cast(mask == 0, np.float32)

            # (batch_size, decoder_len, encoder_len)
            zero_mask = get_attn_key_pad_mask(layers.squeeze(c_mask,[-1]), query)
        else:
            # Inference: only the causal mask applies, no padding masks.
            mask = get_triu_tensor(query.numpy(), query.numpy()).astype(np.float32)
            mask = fluid.layers.cast(dg.to_variable(mask == 0), np.float32)
            m_mask, zero_mask = None, None

        # Decoder pre-network
        query = self.decoder_prenet(query)

        # Centered position
        query = self.linear(query)

        # Get position embedding
        positional = self.pos_emb(positional)
        # Scaled positional encoding added to the prenet output.
        query = positional * self.alpha + query

        #positional dropout
        query = fluid.layers.dropout(query, 0.1)

        # Attention decoder-decoder, encoder-decoder
        selfattn_list = list()
        attn_list = list()

        for selfattn, attn, ffn in zip(self.selfattn_layers, self.attn_layers, self.ffns):
            query, attn_dec = selfattn(query, query, query, mask = mask, query_mask = m_mask)
            query, attn_dot = attn(key, value, query, mask = zero_mask, query_mask = m_mask)
            query = ffn(query)
            selfattn_list.append(attn_dec)
            attn_list.append(attn_dot)
        # Mel linear projection
        mel_out = self.mel_linear(query)
        # Post Mel Network: residual refinement of the mel prediction.
        out = self.postconvnet(mel_out)
        out = mel_out + out

        # Stop tokens
        stop_tokens = self.stop_linear(query)
        stop_tokens = layers.squeeze(stop_tokens, [-1])
        stop_tokens = layers.sigmoid(stop_tokens)

        return mel_out, out, attn_list, stop_tokens, selfattn_list
|
|
||||||
|
|
||||||
class TransformerTTS(dg.Layer):
    """End-to-end TransformerTTS: text encoder plus mel decoder."""

    def __init__(self, config):
        super(TransformerTTS, self).__init__()
        self.encoder = Encoder(config.embedding_size, config.hidden_size, config)
        self.decoder = Decoder(config.hidden_size, config)
        self.config = config

    def forward(self, characters, mel_input, pos_text, pos_mel):
        """Run text-to-mel synthesis.

        Returns mel_output, postnet_output, attn_probs, stop_preds,
        attns_enc, attns_dec.
        """
        # Encode the text:
        #   memory    (batch_size, seq_len, channel)
        #   text_mask (batch_size, seq_len)
        #   enc_attns (channel / 2, seq_len, seq_len)
        memory, text_mask, enc_attns = self.encoder(characters, pos_text)

        # Decode mels; the encoder output serves as both key and value.
        #   mel_output/postnet_output (batch_size, mel_len, n_mel)
        #   attn_probs (128, mel_len, seq_len)
        #   stop_preds (batch_size, mel_len, 1)
        #   dec_attns  (128, mel_len, mel_len)
        (mel_output, postnet_output, attn_probs,
         stop_preds, dec_attns) = self.decoder(memory, memory, mel_input,
                                               text_mask, pos_mel)

        return (mel_output, postnet_output, attn_probs, stop_preds,
                enc_attns, dec_attns)
|
|
||||||
|
|
||||||
class ModelPostNet(dg.Layer):
    """
    CBHG Network (mel -> linear)
    """

    def __init__(self, config):
        super(ModelPostNet, self).__init__()
        # 1x1 conv lifting num_mels channels up to the hidden size.
        self.pre_proj = Conv1D(in_channels=config.audio.num_mels,
                               out_channels=config.hidden_size,
                               filter_size=1,
                               data_format="NCT")
        self.cbhg = CBHG(config.hidden_size, config.batch_size)
        # 1x1 conv projecting down to the linear-spectrogram bin count.
        self.post_proj = Conv1D(in_channels=config.hidden_size,
                                out_channels=(config.audio.n_fft // 2) + 1,
                                filter_size=1,
                                data_format="NCT")

    def forward(self, mel):
        """Map mel spectrograms (B, T, n_mel) to magnitude predictions."""
        features = layers.transpose(mel, [0, 2, 1])  # to (B, C, T) for convs
        features = self.pre_proj(features)
        features = self.cbhg(features)
        mag_pred = self.post_proj(features)
        mag_pred = layers.transpose(mag_pred, [0, 2, 1])  # back to (B, T, C)
        return mag_pred
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue