add transformerTTS and fastspeech
This commit is contained in:
parent
d0015239db
commit
185e25fedf
|
@ -0,0 +1,4 @@
|
|||
# Fastspeech
|
||||
Paddle fluid implementation of Fastspeech, a feed-forward network based on Transformer. The implementation is based on [FastSpeech: Fast, Robust and Controllable Text to Speech](https://arxiv.org/abs/1905.09263).
|
||||
|
||||
We implement Fastspeech model in paddle fluid with dynamic graph, which is convenient for flexible network architectures.
|
|
@ -35,10 +35,10 @@ epochs: 10000
|
|||
lr: 0.001
|
||||
save_step: 500
|
||||
use_gpu: True
|
||||
use_data_parallel: False
|
||||
use_data_parallel: True
|
||||
|
||||
data_path: ../../../dataset/LJSpeech-1.1
|
||||
transtts_path: ../transformerTTS/checkpoint/
|
||||
data_path: ../../dataset/LJSpeech-1.1
|
||||
transtts_path: ../TransformerTTS/checkpoint/
|
||||
transformer_step: 200000
|
||||
save_path: ./checkpoint
|
||||
log_dir: ./log
|
|
@ -14,9 +14,9 @@ import paddle.fluid.dygraph as dg
|
|||
import paddle.fluid.layers as layers
|
||||
import paddle.fluid as fluid
|
||||
from parakeet.models.dataloader.ljspeech import LJSpeechLoader
|
||||
from parakeet.models.transformerTTS.network import TransformerTTS
|
||||
from network import FastSpeech
|
||||
from utils import get_alignment
|
||||
from parakeet.models.transformerTTS.transformerTTS import TransformerTTS
|
||||
from parakeet.models.fastspeech.fastspeech import FastSpeech
|
||||
from parakeet.models.fastspeech.utils import get_alignment
|
||||
|
||||
def load_checkpoint(step, model_path):
|
||||
model_dict, opti_dict = fluid.dygraph.load_dygraph(os.path.join(model_path, step))
|
|
@ -0,0 +1,4 @@
|
|||
# TransformerTTS
|
||||
Paddle fluid implementation of TransformerTTS, a neural TTS with Transformer. The implementation is based on [Neural Speech Synthesis with Transformer Network](https://arxiv.org/abs/1809.08895).
|
||||
|
||||
We implement TransformerTTS model in paddle fluid with dynamic graph, which is convenient for flexible network architectures.
|
|
@ -23,10 +23,10 @@ lr: 0.001
|
|||
save_step: 1000
|
||||
image_step: 2000
|
||||
use_gpu: True
|
||||
use_data_parallel: False
|
||||
use_data_parallel: True
|
||||
stop_token: False
|
||||
|
||||
data_path: ../../../dataset/LJSpeech-1.1
|
||||
data_path: ../../dataset/LJSpeech-1.1
|
||||
save_path: ./checkpoint
|
||||
log_dir: ./log
|
||||
#checkpoint_path: ./checkpoint
|
|
@ -20,9 +20,9 @@ epochs: 10000
|
|||
lr: 0.001
|
||||
save_step: 10
|
||||
use_gpu: True
|
||||
use_data_parallel: False
|
||||
use_data_parallel: True
|
||||
|
||||
data_path: ../../../dataset/LJSpeech-1.1
|
||||
data_path: ../../dataset/LJSpeech-1.1
|
||||
save_path: ./checkpoint
|
||||
log_dir: ./log
|
||||
#checkpoint_path: ./checkpoint
|
|
@ -7,11 +7,13 @@ import jsonargparse
|
|||
from parse import add_config_options_to_parser
|
||||
from pprint import pprint
|
||||
from matplotlib import cm
|
||||
import numpy as np
|
||||
import paddle.fluid as fluid
|
||||
import paddle.fluid.dygraph as dg
|
||||
import paddle.fluid.layers as layers
|
||||
from parakeet.modules.utils import cross_entropy
|
||||
from parakeet.models.dataloader.ljspeech import LJSpeechLoader
|
||||
from network import *
|
||||
from parakeet.models.transformerTTS.transformerTTS import TransformerTTS
|
||||
|
||||
def load_checkpoint(step, model_path):
|
||||
model_dict, opti_dict = fluid.dygraph.load_dygraph(os.path.join(model_path, step))
|
||||
|
@ -86,10 +88,12 @@ def main(cfg):
|
|||
if local_rank==0:
|
||||
writer.add_scalars('training_loss', {
|
||||
'mel_loss':mel_loss.numpy(),
|
||||
'post_mel_loss':post_mel_loss.numpy(),
|
||||
'stop_loss':stop_loss.numpy()
|
||||
'post_mel_loss':post_mel_loss.numpy()
|
||||
}, global_step)
|
||||
|
||||
if cfg.stop_token:
|
||||
writer.add_scalar('stop_loss', stop_loss.numpy(), global_step)
|
||||
|
||||
writer.add_scalars('alphas', {
|
||||
'encoder_alpha':model.encoder.alpha.numpy(),
|
||||
'decoder_alpha':model.decoder.alpha.numpy(),
|
|
@ -6,11 +6,14 @@ from collections import OrderedDict
|
|||
import jsonargparse
|
||||
from parse import add_config_options_to_parser
|
||||
from pprint import pprint
|
||||
import paddle.fluid as fluid
|
||||
import paddle.fluid.dygraph as dg
|
||||
import paddle.fluid.layers as layers
|
||||
from parakeet.models.dataloader.ljspeech import LJSpeechLoader
|
||||
from network import *
|
||||
from parakeet.models.transformerTTS.vocoder import Vocoder
|
||||
|
||||
def load_checkpoint(step, model_path):
|
||||
model_dict, opti_dict = fluid.dygraph.load_dygraph(os.path.join(model_path, step))
|
||||
model_dict, opti_dict = dg.load_dygraph(os.path.join(model_path, step))
|
||||
new_state_dict = OrderedDict()
|
||||
for param in model_dict:
|
||||
if param.startswith('_layers.'):
|
||||
|
@ -40,7 +43,7 @@ def main(cfg):
|
|||
writer = SummaryWriter(path) if local_rank == 0 else None
|
||||
|
||||
with dg.guard(place):
|
||||
model = ModelPostNet(cfg)
|
||||
model = Vocoder(cfg)
|
||||
|
||||
model.train()
|
||||
optimizer = fluid.optimizer.AdamOptimizer(learning_rate=dg.NoamDecay(1/(cfg.warm_up_step *( cfg.lr ** 2)), cfg.warm_up_step),
|
||||
|
@ -99,5 +102,5 @@ def main(cfg):
|
|||
if __name__ == '__main__':
|
||||
parser = jsonargparse.ArgumentParser(description="Train postnet model", formatter_class='default_argparse')
|
||||
add_config_options_to_parser(parser)
|
||||
cfg = parser.parse_args('-c ./config/train_postnet.yaml'.split())
|
||||
cfg = parser.parse_args('-c ./config/train_vocoder.yaml'.split())
|
||||
main(cfg)
|
|
@ -1,3 +1,3 @@
|
|||
__version__ = "0.0.0"
|
||||
|
||||
from . import data, g2p, models, modules, utils
|
||||
from . import data, g2p, models, modules
|
||||
|
|
|
@ -0,0 +1,152 @@
|
|||
from pathlib import Path
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
import librosa
|
||||
import csv
|
||||
|
||||
from paddle import fluid
|
||||
from parakeet import g2p
|
||||
from parakeet import audio
|
||||
from parakeet.data.sampler import *
|
||||
from parakeet.data.datacargo import DataCargo
|
||||
from parakeet.data.batch import TextIDBatcher, SpecBatcher
|
||||
from parakeet.data.dataset import DatasetMixin, TransformDataset
|
||||
|
||||
class LJSpeechLoader:
|
||||
def __init__(self, config, nranks, rank, is_vocoder=False, shuffle=True):
|
||||
place = fluid.CUDAPlace(rank) if config.use_gpu else fluid.CPUPlace()
|
||||
|
||||
LJSPEECH_ROOT = Path(config.data_path)
|
||||
metadata = LJSpeechMetaData(LJSPEECH_ROOT)
|
||||
transformer = LJSpeech(config)
|
||||
dataset = TransformDataset(metadata, transformer)
|
||||
sampler = DistributedSampler(len(metadata), nranks, rank, shuffle=shuffle)
|
||||
|
||||
assert config.batch_size % nranks == 0
|
||||
each_bs = config.batch_size // nranks
|
||||
if is_vocoder:
|
||||
dataloader = DataCargo(dataset, sampler=sampler, batch_size=each_bs, shuffle=shuffle, batch_fn=batch_examples_vocoder, drop_last=True)
|
||||
else:
|
||||
dataloader = DataCargo(dataset, sampler=sampler, batch_size=each_bs, shuffle=shuffle, batch_fn=batch_examples, drop_last=True)
|
||||
|
||||
self.reader = fluid.io.DataLoader.from_generator(
|
||||
capacity=32,
|
||||
iterable=True,
|
||||
use_double_buffer=True,
|
||||
return_list=True)
|
||||
self.reader.set_batch_generator(dataloader, place)
|
||||
|
||||
|
||||
class LJSpeechMetaData(DatasetMixin):
|
||||
def __init__(self, root):
|
||||
self.root = Path(root)
|
||||
self._wav_dir = self.root.joinpath("wavs")
|
||||
csv_path = self.root.joinpath("metadata.csv")
|
||||
self._table = pd.read_csv(
|
||||
csv_path,
|
||||
sep="|",
|
||||
header=None,
|
||||
quoting=csv.QUOTE_NONE,
|
||||
names=["fname", "raw_text", "normalized_text"])
|
||||
|
||||
def get_example(self, i):
|
||||
fname, raw_text, normalized_text = self._table.iloc[i]
|
||||
fname = str(self._wav_dir.joinpath(fname + ".wav"))
|
||||
return fname, raw_text, normalized_text
|
||||
|
||||
def __len__(self):
|
||||
return len(self._table)
|
||||
|
||||
|
||||
class LJSpeech(object):
|
||||
def __init__(self, config):
|
||||
super(LJSpeech, self).__init__()
|
||||
self.config = config
|
||||
self._ljspeech_processor = audio.AudioProcessor(
|
||||
sample_rate=config.audio.sr,
|
||||
num_mels=config.audio.num_mels,
|
||||
min_level_db=config.audio.min_level_db,
|
||||
ref_level_db=config.audio.ref_level_db,
|
||||
n_fft=config.audio.n_fft,
|
||||
win_length= config.audio.win_length,
|
||||
hop_length= config.audio.hop_length,
|
||||
power=config.audio.power,
|
||||
preemphasis=config.audio.preemphasis,
|
||||
signal_norm=True,
|
||||
symmetric_norm=False,
|
||||
max_norm=1.,
|
||||
mel_fmin=0,
|
||||
mel_fmax=None,
|
||||
clip_norm=True,
|
||||
griffin_lim_iters=60,
|
||||
do_trim_silence=False,
|
||||
sound_norm=False)
|
||||
|
||||
def __call__(self, metadatum):
|
||||
"""All the code for generating an Example from a metadatum. If you want a
|
||||
different preprocessing pipeline, you can override this method.
|
||||
This method may require several processor, each of which has a lot of options.
|
||||
In this case, you'd better pass a composed transform and pass it to the init
|
||||
method.
|
||||
"""
|
||||
fname, raw_text, normalized_text = metadatum
|
||||
|
||||
# load -> trim -> preemphasis -> stft -> magnitude -> mel_scale -> logscale -> normalize
|
||||
wav = self._ljspeech_processor.load_wav(str(fname))
|
||||
mag = self._ljspeech_processor.spectrogram(wav).astype(np.float32)
|
||||
mel = self._ljspeech_processor.melspectrogram(wav).astype(np.float32)
|
||||
phonemes = np.array(g2p.en.text_to_sequence(normalized_text), dtype=np.int64)
|
||||
return (mag, mel, phonemes) # maybe we need to implement it as a map in the future
|
||||
|
||||
|
||||
def batch_examples(batch):
|
||||
texts = []
|
||||
mels = []
|
||||
mel_inputs = []
|
||||
mel_lens = []
|
||||
text_lens = []
|
||||
pos_texts = []
|
||||
pos_mels = []
|
||||
for data in batch:
|
||||
_, mel, text = data
|
||||
mel_inputs.append(np.concatenate([np.zeros([mel.shape[0], 1], np.float32), mel[:,:-1]], axis=-1))
|
||||
mel_lens.append(mel.shape[1])
|
||||
text_lens.append(len(text))
|
||||
pos_texts.append(np.arange(1, len(text) + 1))
|
||||
pos_mels.append(np.arange(1, mel.shape[1] + 1))
|
||||
mels.append(mel)
|
||||
texts.append(text)
|
||||
|
||||
# Sort by text_len in descending order
|
||||
texts = [i for i,_ in sorted(zip(texts, text_lens), key=lambda x: x[1], reverse=True)]
|
||||
mels = [i for i,_ in sorted(zip(mels, text_lens), key=lambda x: x[1], reverse=True)]
|
||||
mel_inputs = [i for i,_ in sorted(zip(mel_inputs, text_lens), key=lambda x: x[1], reverse=True)]
|
||||
mel_lens = [i for i,_ in sorted(zip(mel_lens, text_lens), key=lambda x: x[1], reverse=True)]
|
||||
pos_texts = [i for i,_ in sorted(zip(pos_texts, text_lens), key=lambda x: x[1], reverse=True)]
|
||||
pos_mels = [i for i,_ in sorted(zip(pos_mels, text_lens), key=lambda x: x[1], reverse=True)]
|
||||
text_lens = sorted(text_lens, reverse=True)
|
||||
|
||||
# Pad sequence with largest len of the batch
|
||||
texts = TextIDBatcher(pad_id=0)(texts) #(B, T)
|
||||
pos_texts = TextIDBatcher(pad_id=0)(pos_texts) #(B,T)
|
||||
pos_mels = TextIDBatcher(pad_id=0)(pos_mels) #(B,T)
|
||||
mels = np.transpose(SpecBatcher(pad_value=0.)(mels), axes=(0,2,1)) #(B,T,num_mels)
|
||||
mel_inputs = np.transpose(SpecBatcher(pad_value=0.)(mel_inputs), axes=(0,2,1))#(B,T,num_mels)
|
||||
return (texts, mels, mel_inputs, pos_texts, pos_mels, np.array(text_lens), np.array(mel_lens))
|
||||
|
||||
def batch_examples_vocoder(batch):
|
||||
mels=[]
|
||||
mags=[]
|
||||
for data in batch:
|
||||
mag, mel, _ = data
|
||||
mels.append(mel)
|
||||
mags.append(mag)
|
||||
|
||||
mels = np.transpose(SpecBatcher(pad_value=0.)(mels), axes=(0,2,1))
|
||||
mags = np.transpose(SpecBatcher(pad_value=0.)(mags), axes=(0,2,1))
|
||||
|
||||
return (mels, mags)
|
||||
|
||||
|
||||
|
||||
|
|
@ -0,0 +1,36 @@
|
|||
import numpy as np
|
||||
import math
|
||||
import paddle.fluid.dygraph as dg
|
||||
import paddle.fluid.layers as layers
|
||||
import paddle.fluid as fluid
|
||||
from parakeet.modules.multihead_attention import MultiheadAttention
|
||||
from parakeet.modules.feed_forward import PositionwiseFeedForward
|
||||
|
||||
class FFTBlock(dg.Layer):
|
||||
def __init__(self, d_model, d_inner, n_head, d_k, d_v, filter_size, padding, dropout=0.2):
|
||||
super(FFTBlock, self).__init__()
|
||||
self.slf_attn = MultiheadAttention(d_model, d_k, d_v, num_head=n_head, is_bias=True, dropout=dropout, is_concat=False)
|
||||
self.pos_ffn = PositionwiseFeedForward(d_model, d_inner, filter_size =filter_size, padding =padding, dropout=dropout)
|
||||
|
||||
def forward(self, enc_input, non_pad_mask=None, slf_attn_mask=None):
|
||||
"""
|
||||
Feed Forward Transformer block in FastSpeech.
|
||||
|
||||
Args:
|
||||
enc_input (Variable): Shape(B, T, C), dtype: float32. The embedding characters input.
|
||||
T means the timesteps of input.
|
||||
non_pad_mask (Variable): Shape(B, T, 1), dtype: int64. The mask of sequence.
|
||||
slf_attn_mask (Variable): Shape(B, len_q, len_k), dtype: int64. The mask of self attention.
|
||||
len_q means the sequence length of query, len_k means the sequence length of key.
|
||||
|
||||
Returns:
|
||||
output (Variable), Shape(B, T, C), the output after self-attention & ffn.
|
||||
slf_attn (Variable), Shape(B * n_head, T, T), the self attention.
|
||||
"""
|
||||
output, slf_attn = self.slf_attn(enc_input, enc_input, enc_input, mask=slf_attn_mask)
|
||||
output *= non_pad_mask
|
||||
|
||||
output = self.pos_ffn(output)
|
||||
output *= non_pad_mask
|
||||
|
||||
return output, slf_attn
|
|
@ -1,42 +1,10 @@
|
|||
import numpy as np
|
||||
import math
|
||||
import utils
|
||||
import parakeet.models.fastspeech.utils
|
||||
import paddle.fluid.dygraph as dg
|
||||
import paddle.fluid.layers as layers
|
||||
import paddle.fluid as fluid
|
||||
from parakeet.modules.layers import Conv, Linear
|
||||
from parakeet.modules.multihead_attention import MultiheadAttention
|
||||
from parakeet.modules.feed_forward import PositionwiseFeedForward
|
||||
|
||||
class FFTBlock(dg.Layer):
|
||||
def __init__(self, d_model, d_inner, n_head, d_k, d_v, filter_size, padding, dropout=0.2):
|
||||
super(FFTBlock, self).__init__()
|
||||
self.slf_attn = MultiheadAttention(d_model, d_k, d_v, num_head=n_head, is_bias=True, dropout=dropout, is_concat=False)
|
||||
self.pos_ffn = PositionwiseFeedForward(d_model, d_inner, filter_size =filter_size, padding =padding, dropout=dropout)
|
||||
|
||||
def forward(self, enc_input, non_pad_mask=None, slf_attn_mask=None):
|
||||
"""
|
||||
Feed Forward Transformer block in FastSpeech.
|
||||
|
||||
Args:
|
||||
enc_input (Variable): Shape(B, T, C), dtype: float32. The embedding characters input.
|
||||
T means the timesteps of input.
|
||||
non_pad_mask (Variable): Shape(B, T, 1), dtype: int64. The mask of sequence.
|
||||
slf_attn_mask (Variable): Shape(B, len_q, len_k), dtype: int64. The mask of self attention.
|
||||
len_q means the sequence length of query, len_k means the sequence length of key.
|
||||
|
||||
Returns:
|
||||
output (Variable), Shape(B, T, C), the output after self-attention & ffn.
|
||||
slf_attn (Variable), Shape(B * n_head, T, T), the self attention.
|
||||
"""
|
||||
output, slf_attn = self.slf_attn(enc_input, enc_input, enc_input, mask=slf_attn_mask)
|
||||
output *= non_pad_mask
|
||||
|
||||
output = self.pos_ffn(output)
|
||||
output *= non_pad_mask
|
||||
|
||||
return output, slf_attn
|
||||
|
||||
|
||||
class LengthRegulator(dg.Layer):
|
||||
def __init__(self, input_size, out_channels, filter_size, dropout=0.1):
|
|
@ -0,0 +1,63 @@
|
|||
import paddle.fluid.dygraph as dg
|
||||
import paddle.fluid as fluid
|
||||
from parakeet.g2p.text.symbols import symbols
|
||||
from parakeet.modules.utils import *
|
||||
from parakeet.modules.post_convnet import PostConvNet
|
||||
from parakeet.modules.layers import Linear
|
||||
from parakeet.models.fastspeech.FFTBlock import FFTBlock
|
||||
|
||||
class Decoder(dg.Layer):
|
||||
def __init__(self,
|
||||
len_max_seq,
|
||||
n_layers,
|
||||
n_head,
|
||||
d_k,
|
||||
d_v,
|
||||
d_model,
|
||||
d_inner,
|
||||
fft_conv1d_kernel,
|
||||
fft_conv1d_padding,
|
||||
dropout=0.1):
|
||||
super(Decoder, self).__init__()
|
||||
|
||||
n_position = len_max_seq + 1
|
||||
self.pos_inp = get_sinusoid_encoding_table(n_position, d_model, padding_idx=0)
|
||||
self.position_enc = dg.Embedding(size=[n_position, d_model],
|
||||
padding_idx=0,
|
||||
param_attr=fluid.ParamAttr(
|
||||
initializer=fluid.initializer.NumpyArrayInitializer(self.pos_inp),
|
||||
trainable=False))
|
||||
self.layer_stack = [FFTBlock(d_model, d_inner, n_head, d_k, d_v, fft_conv1d_kernel, fft_conv1d_padding, dropout=dropout) for _ in range(n_layers)]
|
||||
for i, layer in enumerate(self.layer_stack):
|
||||
self.add_sublayer('fft_{}'.format(i), layer)
|
||||
|
||||
def forward(self, enc_seq, enc_pos):
|
||||
"""
|
||||
Decoder layer of FastSpeech.
|
||||
|
||||
Args:
|
||||
enc_seq (Variable), Shape(B, text_T, C), dtype: float32.
|
||||
The output of length regulator.
|
||||
enc_pos (Variable, optional): Shape(B, T_mel),
|
||||
dtype: int64. The spectrum position. T_mel means the timesteps of input spectrum.
|
||||
Returns:
|
||||
dec_output (Variable), Shape(B, mel_T, C), the decoder output.
|
||||
dec_slf_attn_list (Variable), Shape(B, mel_T, mel_T), the decoder self attention list.
|
||||
"""
|
||||
dec_slf_attn_list = []
|
||||
|
||||
# -- Prepare masks
|
||||
slf_attn_mask = get_attn_key_pad_mask(seq_k=enc_pos, seq_q=enc_pos)
|
||||
non_pad_mask = get_non_pad_mask(enc_pos)
|
||||
|
||||
# -- Forward
|
||||
dec_output = enc_seq + self.position_enc(enc_pos)
|
||||
|
||||
for dec_layer in self.layer_stack:
|
||||
dec_output, dec_slf_attn = dec_layer(
|
||||
dec_output,
|
||||
non_pad_mask=non_pad_mask,
|
||||
slf_attn_mask=slf_attn_mask)
|
||||
dec_slf_attn_list += [dec_slf_attn]
|
||||
|
||||
return dec_output, dec_slf_attn_list
|
|
@ -0,0 +1,67 @@
|
|||
import paddle.fluid.dygraph as dg
|
||||
import paddle.fluid as fluid
|
||||
from parakeet.g2p.text.symbols import symbols
|
||||
from parakeet.modules.utils import *
|
||||
from parakeet.modules.post_convnet import PostConvNet
|
||||
from parakeet.modules.layers import Linear
|
||||
from parakeet.models.fastspeech.FFTBlock import FFTBlock
|
||||
|
||||
class Encoder(dg.Layer):
|
||||
def __init__(self,
|
||||
n_src_vocab,
|
||||
len_max_seq,
|
||||
n_layers,
|
||||
n_head,
|
||||
d_k,
|
||||
d_v,
|
||||
d_model,
|
||||
d_inner,
|
||||
fft_conv1d_kernel,
|
||||
fft_conv1d_padding,
|
||||
dropout=0.1):
|
||||
super(Encoder, self).__init__()
|
||||
n_position = len_max_seq + 1
|
||||
|
||||
self.src_word_emb = dg.Embedding(size=[n_src_vocab, d_model], padding_idx=0)
|
||||
self.pos_inp = get_sinusoid_encoding_table(n_position, d_model, padding_idx=0)
|
||||
self.position_enc = dg.Embedding(size=[n_position, d_model],
|
||||
padding_idx=0,
|
||||
param_attr=fluid.ParamAttr(
|
||||
initializer=fluid.initializer.NumpyArrayInitializer(self.pos_inp),
|
||||
trainable=False))
|
||||
self.layer_stack = [FFTBlock(d_model, d_inner, n_head, d_k, d_v, fft_conv1d_kernel, fft_conv1d_padding, dropout=dropout) for _ in range(n_layers)]
|
||||
for i, layer in enumerate(self.layer_stack):
|
||||
self.add_sublayer('fft_{}'.format(i), layer)
|
||||
|
||||
def forward(self, character, text_pos):
|
||||
"""
|
||||
Encoder layer of FastSpeech.
|
||||
|
||||
Args:
|
||||
character (Variable): Shape(B, T_text), dtype: float32. The input text
|
||||
characters. T_text means the timesteps of input characters.
|
||||
text_pos (Variable): Shape(B, T_text), dtype: int64. The input text
|
||||
position. T_text means the timesteps of input characters.
|
||||
|
||||
Returns:
|
||||
enc_output (Variable), Shape(B, text_T, C), the encoder output.
|
||||
non_pad_mask (Variable), Shape(B, T_text, 1), the mask with non pad.
|
||||
enc_slf_attn_list (list<Variable>), Len(n_layers), Shape(B * n_head, text_T, text_T), the encoder self attention list.
|
||||
"""
|
||||
enc_slf_attn_list = []
|
||||
# -- prepare masks
|
||||
# shape character (N, T)
|
||||
slf_attn_mask = get_attn_key_pad_mask(seq_k=character, seq_q=character)
|
||||
non_pad_mask = get_non_pad_mask(character)
|
||||
|
||||
# -- Forward
|
||||
enc_output = self.src_word_emb(character) + self.position_enc(text_pos) #(N, T, C)
|
||||
|
||||
for enc_layer in self.layer_stack:
|
||||
enc_output, enc_slf_attn = enc_layer(
|
||||
enc_output,
|
||||
non_pad_mask=non_pad_mask,
|
||||
slf_attn_mask=slf_attn_mask)
|
||||
enc_slf_attn_list += [enc_slf_attn]
|
||||
|
||||
return enc_output, non_pad_mask, enc_slf_attn_list
|
|
@ -4,124 +4,10 @@ from parakeet.g2p.text.symbols import symbols
|
|||
from parakeet.modules.utils import *
|
||||
from parakeet.modules.post_convnet import PostConvNet
|
||||
from parakeet.modules.layers import Linear
|
||||
from utils import *
|
||||
from modules import FFTBlock, LengthRegulator
|
||||
|
||||
class Encoder(dg.Layer):
|
||||
def __init__(self,
|
||||
n_src_vocab,
|
||||
len_max_seq,
|
||||
n_layers,
|
||||
n_head,
|
||||
d_k,
|
||||
d_v,
|
||||
d_model,
|
||||
d_inner,
|
||||
fft_conv1d_kernel,
|
||||
fft_conv1d_padding,
|
||||
dropout=0.1):
|
||||
super(Encoder, self).__init__()
|
||||
n_position = len_max_seq + 1
|
||||
|
||||
self.src_word_emb = dg.Embedding(size=[n_src_vocab, d_model], padding_idx=0)
|
||||
self.pos_inp = get_sinusoid_encoding_table(n_position, d_model, padding_idx=0)
|
||||
self.position_enc = dg.Embedding(size=[n_position, d_model],
|
||||
padding_idx=0,
|
||||
param_attr=fluid.ParamAttr(
|
||||
initializer=fluid.initializer.NumpyArrayInitializer(self.pos_inp),
|
||||
trainable=False))
|
||||
self.layer_stack = [FFTBlock(d_model, d_inner, n_head, d_k, d_v, fft_conv1d_kernel, fft_conv1d_padding, dropout=dropout) for _ in range(n_layers)]
|
||||
for i, layer in enumerate(self.layer_stack):
|
||||
self.add_sublayer('fft_{}'.format(i), layer)
|
||||
|
||||
def forward(self, character, text_pos):
|
||||
"""
|
||||
Encoder layer of FastSpeech.
|
||||
|
||||
Args:
|
||||
character (Variable): Shape(B, T_text), dtype: float32. The input text
|
||||
characters. T_text means the timesteps of input characters.
|
||||
text_pos (Variable): Shape(B, T_text), dtype: int64. The input text
|
||||
position. T_text means the timesteps of input characters.
|
||||
|
||||
Returns:
|
||||
enc_output (Variable), Shape(B, text_T, C), the encoder output.
|
||||
non_pad_mask (Variable), Shape(B, T_text, 1), the mask with non pad.
|
||||
enc_slf_attn_list (list<Variable>), Len(n_layers), Shape(B * n_head, text_T, text_T), the encoder self attention list.
|
||||
"""
|
||||
enc_slf_attn_list = []
|
||||
# -- prepare masks
|
||||
# shape character (N, T)
|
||||
slf_attn_mask = get_attn_key_pad_mask(seq_k=character, seq_q=character)
|
||||
non_pad_mask = get_non_pad_mask(character)
|
||||
|
||||
# -- Forward
|
||||
enc_output = self.src_word_emb(character) + self.position_enc(text_pos) #(N, T, C)
|
||||
|
||||
for enc_layer in self.layer_stack:
|
||||
enc_output, enc_slf_attn = enc_layer(
|
||||
enc_output,
|
||||
non_pad_mask=non_pad_mask,
|
||||
slf_attn_mask=slf_attn_mask)
|
||||
enc_slf_attn_list += [enc_slf_attn]
|
||||
|
||||
return enc_output, non_pad_mask, enc_slf_attn_list
|
||||
|
||||
class Decoder(dg.Layer):
|
||||
def __init__(self,
|
||||
len_max_seq,
|
||||
n_layers,
|
||||
n_head,
|
||||
d_k,
|
||||
d_v,
|
||||
d_model,
|
||||
d_inner,
|
||||
fft_conv1d_kernel,
|
||||
fft_conv1d_padding,
|
||||
dropout=0.1):
|
||||
super(Decoder, self).__init__()
|
||||
|
||||
n_position = len_max_seq + 1
|
||||
self.pos_inp = get_sinusoid_encoding_table(n_position, d_model, padding_idx=0)
|
||||
self.position_enc = dg.Embedding(size=[n_position, d_model],
|
||||
padding_idx=0,
|
||||
param_attr=fluid.ParamAttr(
|
||||
initializer=fluid.initializer.NumpyArrayInitializer(self.pos_inp),
|
||||
trainable=False))
|
||||
self.layer_stack = [FFTBlock(d_model, d_inner, n_head, d_k, d_v, fft_conv1d_kernel, fft_conv1d_padding, dropout=dropout) for _ in range(n_layers)]
|
||||
for i, layer in enumerate(self.layer_stack):
|
||||
self.add_sublayer('fft_{}'.format(i), layer)
|
||||
|
||||
def forward(self, enc_seq, enc_pos):
|
||||
"""
|
||||
Decoder layer of FastSpeech.
|
||||
|
||||
Args:
|
||||
enc_seq (Variable), Shape(B, text_T, C), dtype: float32.
|
||||
The output of length regulator.
|
||||
enc_pos (Variable, optional): Shape(B, T_mel),
|
||||
dtype: int64. The spectrum position. T_mel means the timesteps of input spectrum.
|
||||
Returns:
|
||||
dec_output (Variable), Shape(B, mel_T, C), the decoder output.
|
||||
dec_slf_attn_list (Variable), Shape(B, mel_T, mel_T), the decoder self attention list.
|
||||
"""
|
||||
dec_slf_attn_list = []
|
||||
|
||||
# -- Prepare masks
|
||||
slf_attn_mask = get_attn_key_pad_mask(seq_k=enc_pos, seq_q=enc_pos)
|
||||
non_pad_mask = get_non_pad_mask(enc_pos)
|
||||
|
||||
# -- Forward
|
||||
dec_output = enc_seq + self.position_enc(enc_pos)
|
||||
|
||||
for dec_layer in self.layer_stack:
|
||||
dec_output, dec_slf_attn = dec_layer(
|
||||
dec_output,
|
||||
non_pad_mask=non_pad_mask,
|
||||
slf_attn_mask=slf_attn_mask)
|
||||
dec_slf_attn_list += [dec_slf_attn]
|
||||
|
||||
return dec_output, dec_slf_attn_list
|
||||
from parakeet.models.fastspeech.utils import *
|
||||
from parakeet.models.fastspeech.LengthRegulator import LengthRegulator
|
||||
from parakeet.models.fastspeech.encoder import Encoder
|
||||
from parakeet.models.fastspeech.decoder import Decoder
|
||||
|
||||
class FastSpeech(dg.Layer):
|
||||
def __init__(self, cfg):
|
|
@ -1,166 +0,0 @@
|
|||
import math
|
||||
import numpy as np
|
||||
|
||||
import paddle
|
||||
from paddle import fluid
|
||||
import paddle.fluid.dygraph as dg
|
||||
|
||||
|
||||
class Conv1D(dg.Layer):
|
||||
"""
|
||||
A convolution 1D block implemented with Conv2D. Form simplicity and
|
||||
ensuring the output has the same length as the input, it does not allow
|
||||
stride > 1.
|
||||
"""
|
||||
|
||||
def __init__(self,
|
||||
in_channels,
|
||||
num_filters,
|
||||
filter_size=3,
|
||||
padding=0,
|
||||
dilation=1,
|
||||
stride=1,
|
||||
groups=None,
|
||||
param_attr=None,
|
||||
bias_attr=None,
|
||||
use_cudnn=True,
|
||||
act=None,
|
||||
data_format='NCT',
|
||||
dtype="float32"):
|
||||
super(Conv1D, self).__init__(dtype=dtype)
|
||||
|
||||
self.padding = padding
|
||||
self.in_channels = in_channels
|
||||
self.num_filters = num_filters
|
||||
self.filter_size = filter_size
|
||||
self.stride = stride
|
||||
self.dilation = dilation
|
||||
self.padding = padding
|
||||
self.act = act
|
||||
self.data_format = data_format
|
||||
|
||||
self.conv = dg.Conv2D(
|
||||
in_channels=in_channels,
|
||||
num_filters=num_filters,
|
||||
filter_size=(1, filter_size),
|
||||
stride=(1, stride),
|
||||
dilation=(1, dilation),
|
||||
padding=(0, padding),
|
||||
groups=groups,
|
||||
param_attr=param_attr,
|
||||
bias_attr=bias_attr,
|
||||
use_cudnn=use_cudnn,
|
||||
act=act,
|
||||
dtype=dtype)
|
||||
|
||||
def forward(self, x):
|
||||
"""
|
||||
Args:
|
||||
x (Variable): Shape(B, C_in, 1, T), the input, where C_in means
|
||||
input channels.
|
||||
Returns:
|
||||
x (Variable): Shape(B, C_out, 1, T), the outputs, where C_out means
|
||||
output channels (num_filters).
|
||||
"""
|
||||
if self.data_format == 'NTC':
|
||||
x = fluid.layers.transpose(x, [0, 2, 1])
|
||||
x = fluid.layers.unsqueeze(x, [2])
|
||||
x = self.conv(x)
|
||||
x = fluid.layers.squeeze(x, [2])
|
||||
if self.data_format == 'NTC':
|
||||
x = fluid.layers.transpose(x, [0, 2, 1])
|
||||
return x
|
||||
|
||||
class Pool1D(dg.Layer):
|
||||
"""
|
||||
A Pool 1D block implemented with Pool2D.
|
||||
"""
|
||||
def __init__(self,
|
||||
pool_size=-1,
|
||||
pool_type='max',
|
||||
pool_stride=1,
|
||||
pool_padding=0,
|
||||
global_pooling=False,
|
||||
use_cudnn=True,
|
||||
ceil_mode=False,
|
||||
exclusive=True,
|
||||
data_format='NCT',
|
||||
dtype='float32'):
|
||||
super(Pool1D, self).__init__(dtype=dtype)
|
||||
self.pool_size = pool_size
|
||||
self.pool_type = pool_type
|
||||
self.pool_stride = pool_stride
|
||||
self.pool_padding = pool_padding
|
||||
self.global_pooling = global_pooling
|
||||
self.use_cudnn = use_cudnn
|
||||
self.ceil_mode = ceil_mode
|
||||
self.exclusive = exclusive
|
||||
self.data_format = data_format
|
||||
self.dtype = dtype
|
||||
|
||||
|
||||
self.pool2d = dg.Pool2D([1,pool_size], pool_type = pool_type,
|
||||
pool_stride = [1,pool_stride], pool_padding = [0, pool_padding],
|
||||
global_pooling = global_pooling, use_cudnn = use_cudnn,
|
||||
ceil_mode = ceil_mode, exclusive = exclusive, dtype = dtype)
|
||||
|
||||
|
||||
def forward(self, x):
|
||||
"""
|
||||
Args:
|
||||
x (Variable): Shape(B, C_in, 1, T), the input, where C_in means
|
||||
input channels.
|
||||
Returns:
|
||||
x (Variable): Shape(B, C_out, 1, T), the outputs, where C_out means
|
||||
output channels (num_filters).
|
||||
"""
|
||||
if self.data_format == 'NTC':
|
||||
x = fluid.layers.transpose(x, [0, 2, 1])
|
||||
x = fluid.layers.unsqueeze(x, [2])
|
||||
x = self.pool2d(x)
|
||||
x = fluid.layers.squeeze(x, [2])
|
||||
if self.data_format == 'NTC':
|
||||
x = fluid.layers.transpose(x, [0, 2, 1])
|
||||
return x
|
||||
|
||||
class DynamicGRU(dg.Layer):
|
||||
def __init__(self,
|
||||
size,
|
||||
param_attr=None,
|
||||
bias_attr=None,
|
||||
is_reverse=False,
|
||||
gate_activation='sigmoid',
|
||||
candidate_activation='tanh',
|
||||
h_0=None,
|
||||
origin_mode=False,
|
||||
init_size=None):
|
||||
super(DynamicGRU, self).__init__()
|
||||
self.gru_unit = dg.GRUUnit(
|
||||
size * 3,
|
||||
param_attr=param_attr,
|
||||
bias_attr=bias_attr,
|
||||
activation=candidate_activation,
|
||||
gate_activation=gate_activation,
|
||||
origin_mode=origin_mode)
|
||||
self.size = size
|
||||
self.h_0 = h_0
|
||||
self.is_reverse = is_reverse
|
||||
|
||||
def forward(self, inputs):
|
||||
hidden = self.h_0
|
||||
res = []
|
||||
for i in range(inputs.shape[1]):
|
||||
if self.is_reverse:
|
||||
i = inputs.shape[1] - 1 - i
|
||||
input_ = inputs[:, i:i + 1, :]
|
||||
input_ = fluid.layers.reshape(
|
||||
input_, [-1, input_.shape[2]], inplace=False)
|
||||
hidden, reset, gate = self.gru_unit(input_, hidden)
|
||||
hidden_ = fluid.layers.reshape(
|
||||
hidden, [-1, 1, hidden.shape[1]], inplace=False)
|
||||
res.append(hidden_)
|
||||
if self.is_reverse:
|
||||
res = res[::-1]
|
||||
res = fluid.layers.concat(res, axis=1)
|
||||
return res
|
||||
|
|
@ -1,218 +0,0 @@
|
|||
import math
|
||||
from parakeet.g2p.text.symbols import symbols
|
||||
import paddle.fluid.dygraph as dg
|
||||
import paddle.fluid as fluid
|
||||
import paddle.fluid.layers as layers
|
||||
from parakeet.modules.layers import Conv, Pool1D, Linear
|
||||
from parakeet.modules.dynamicGRU import DynamicGRU
|
||||
import numpy as np
|
||||
|
||||
|
||||
class EncoderPrenet(dg.Layer):
    """Encoder pre-net for TransformerTTS.

    Embeds character ids, runs them through a stack of three 1-D
    convolutions (each followed by batch-norm, ReLU and dropout), then
    applies a final linear projection.
    """

    def __init__(self, embedding_size, num_hidden, use_cudnn=True):
        super(EncoderPrenet, self).__init__()
        self.embedding_size = embedding_size
        self.num_hidden = num_hidden
        self.use_cudnn = use_cudnn

        # Character embedding over the g2p symbol inventory.
        self.embedding = dg.Embedding(size=[len(symbols), embedding_size],
                                      padding_idx=None)

        # Three "same"-padded 1-D convolutions; the first maps the embedding
        # width to num_hidden, the remaining two keep num_hidden channels.
        self.conv_list = []
        in_channels = embedding_size
        for _ in range(3):
            self.conv_list.append(Conv(in_channels=in_channels,
                                       out_channels=num_hidden,
                                       filter_size=5,
                                       padding=int(np.floor(5 / 2)),
                                       use_cudnn=use_cudnn,
                                       data_format="NCT"))
            in_channels = num_hidden
        for i, conv in enumerate(self.conv_list):
            self.add_sublayer("conv_list_{}".format(i), conv)

        # One batch-norm per convolution.
        self.batch_norm_list = [dg.BatchNorm(num_hidden, data_layout='NCHW')
                                for _ in range(3)]
        for i, bn in enumerate(self.batch_norm_list):
            self.add_sublayer("batch_norm_list_{}".format(i), bn)

        self.projection = Linear(num_hidden, num_hidden)

    def forward(self, x):
        """Map character ids (N, T) to features (N, T, num_hidden)."""
        x = self.embedding(x)  # (N, T, embedding_size)
        # Convolutions expect channel-first layout (N, C, T).
        x = layers.transpose(x, [0, 2, 1])
        for batch_norm, conv in zip(self.batch_norm_list, self.conv_list):
            x = layers.dropout(layers.relu(batch_norm(conv(x))), 0.2)
        # Back to (N, T, C) for the final linear projection.
        x = layers.transpose(x, [0, 2, 1])
        return self.projection(x)
|
||||
class CBHG(dg.Layer):
    """CBHG module (Convolution Bank + Highway network + bidirectional GRU).

    Consumes a channel-first sequence of shape (N, C, T) and produces a
    feature sequence in the same layout; used here as the mel -> linear
    post-processing network.
    """
    def __init__(self, hidden_size, batch_size, K=16, projection_size = 256, num_gru_layers=2,
                 max_pool_kernel_size=2, is_post=False):
        super(CBHG, self).__init__()
        """
        :param hidden_size: dimension of hidden unit
        :param batch_size: batch size
        :param K: # of convolution banks
        :param projection_size: dimension of projection unit
        :param num_gru_layers: # of layers of GRUcell
        :param max_pool_kernel_size: max pooling kernel size
        :param is_post: whether post processing or not
        """
        self.hidden_size = hidden_size
        self.projection_size = projection_size
        # Convolution bank: K 1-D convolutions with filter sizes 1..K.
        # NOTE(review): the in_channels wiring (first conv: projection_size,
        # later convs: hidden_size) means the bank is applied as a *chain* in
        # forward(), not in parallel on the same input as in canonical CBHG —
        # presumably intentional; confirm against the training recipe.
        self.conv_list = []
        self.conv_list.append(Conv(in_channels = projection_size,
                                   out_channels = hidden_size,
                                   filter_size = 1,
                                   padding = int(np.floor(1/2)),
                                   data_format = "NCT"))
        for i in range(2,K+1):
            self.conv_list.append(Conv(in_channels = hidden_size,
                                       out_channels = hidden_size,
                                       filter_size = i,
                                       padding = int(np.floor(i/2)),
                                       data_format = "NCT"))

        for i, layer in enumerate(self.conv_list):
            self.add_sublayer("conv_list_{}".format(i), layer)

        # One batch-norm per bank convolution.
        self.batchnorm_list = []
        for i in range(K):
            self.batchnorm_list.append(dg.BatchNorm(hidden_size,
                                                    data_layout='NCHW'))

        for i, layer in enumerate(self.batchnorm_list):
            self.add_sublayer("batchnorm_list_{}".format(i), layer)

        # Bank outputs are concatenated along the channel axis before the
        # two projection convolutions.
        conv_outdim = hidden_size * K

        self.conv_projection_1 = Conv(in_channels = conv_outdim,
                                      out_channels = hidden_size,
                                      filter_size = 3,
                                      padding = int(np.floor(3/2)),
                                      data_format = "NCT")

        self.conv_projection_2 = Conv(in_channels = hidden_size,
                                      out_channels = projection_size,
                                      filter_size = 3,
                                      padding = int(np.floor(3/2)),
                                      data_format = "NCT")

        self.batchnorm_proj_1 = dg.BatchNorm(hidden_size,
                                             data_layout='NCHW')
        self.batchnorm_proj_2 = dg.BatchNorm(projection_size,
                                             data_layout='NCHW')
        # Stride-1 max pooling; its padding adds one extra frame that
        # forward() trims with [:,:,:-1].
        self.max_pool = Pool1D(pool_size = max_pool_kernel_size,
                               pool_type='max',
                               pool_stride=1,
                               pool_padding=1,
                               data_format = "NCT")
        self.highway = Highwaynet(self.projection_size)

        # Zero initial state shared by all four GRU directions/layers.
        # NOTE(review): h_0 is sized with the configured batch_size, so
        # presumably inference batches must match it — verify with callers.
        h_0 = np.zeros((batch_size, hidden_size // 2), dtype="float32")
        h_0 = dg.to_variable(h_0)
        # Each DynamicGRU consumes a 3 * (hidden_size // 2) projection of its
        # input (update/reset/candidate gates fused into one matmul).
        self.fc_forward1 = Linear(hidden_size, hidden_size // 2 * 3)
        self.fc_reverse1 = Linear(hidden_size, hidden_size // 2 * 3)
        self.gru_forward1 = DynamicGRU(size = self.hidden_size // 2,
                                       is_reverse = False,
                                       origin_mode = True,
                                       h_0 = h_0)
        self.gru_reverse1 = DynamicGRU(size = self.hidden_size // 2,
                                       is_reverse=True,
                                       origin_mode=True,
                                       h_0 = h_0)

        # Second bidirectional GRU layer, fed by the concatenated outputs of
        # the first layer.
        self.fc_forward2 = Linear(hidden_size, hidden_size // 2 * 3)
        self.fc_reverse2 = Linear(hidden_size, hidden_size // 2 * 3)
        self.gru_forward2 = DynamicGRU(size = self.hidden_size // 2,
                                       is_reverse = False,
                                       origin_mode = True,
                                       h_0 = h_0)
        self.gru_reverse2 = DynamicGRU(size = self.hidden_size // 2,
                                       is_reverse=True,
                                       origin_mode=True,
                                       h_0 = h_0)

    def _conv_fit_dim(self, x, filter_size=3):
        # Even filter sizes with floor(k/2) padding produce T+1 frames;
        # drop the trailing frame so all bank outputs share length T.
        if filter_size % 2 == 0:
            return x[:,:,:-1]
        else:
            return x

    def forward(self, input_):
        """Run the CBHG stack; input_ and output are both (N, C, T)."""
        # input_.shape = [N, C, T]

        conv_list = []
        conv_input = input_

        # Convolution bank: conv i+1 consumes the output of conv i (chained,
        # see the constructor note); collect every conv+BN+ReLU output.
        for i, (conv, batchnorm) in enumerate(zip(self.conv_list, self.batchnorm_list)):
            conv_input = self._conv_fit_dim(conv(conv_input), i+1)
            conv_input = layers.relu(batchnorm(conv_input))
            conv_list.append(conv_input)

        # Stack bank outputs along channels, stride-1 max-pool, trim the
        # extra padded frame.
        conv_cat = layers.concat(conv_list, axis=1)
        conv_pool = self.max_pool(conv_cat)[:,:,:-1]

        # Two projection convolutions with a residual connection back to the
        # original input.
        conv_proj = layers.relu(self.batchnorm_proj_1(self._conv_fit_dim(self.conv_projection_1(conv_pool))))
        conv_proj = self.batchnorm_proj_2(self._conv_fit_dim(self.conv_projection_2(conv_proj))) + input_

        # conv_proj.shape = [N, C, T]; highway net works time-major (N, T, C).
        highway = layers.transpose(conv_proj, [0,2,1])
        highway = self.highway(highway)

        # highway.shape = [N, T, C]
        # Two stacked bidirectional GRU layers; forward and reverse outputs
        # are concatenated along the feature axis after each layer.
        fc_forward = self.fc_forward1(highway)
        fc_reverse = self.fc_reverse1(highway)
        out_forward = self.gru_forward1(fc_forward)
        out_reverse = self.gru_reverse1(fc_reverse)
        out = layers.concat([out_forward, out_reverse], axis=-1)
        fc_forward = self.fc_forward2(out)
        fc_reverse = self.fc_reverse2(out)
        out_forward = self.gru_forward2(fc_forward)
        out_reverse = self.gru_reverse2(fc_reverse)
        out = layers.concat([out_forward, out_reverse], axis=-1)
        # Back to channel-first layout [N, C, T].
        out = layers.transpose(out, [0,2,1])
        return out
|
||||
class Highwaynet(dg.Layer):
    """Highway network: each layer computes H(x) * T(x) + x * (1 - T(x)),
    where H is a ReLU transform and T a sigmoid gate."""

    def __init__(self, num_units, num_layers=4):
        super(Highwaynet, self).__init__()
        self.num_units = num_units
        self.num_layers = num_layers

        self.gates = []
        self.linears = []
        # Build and register the transform/gate pair for every layer.
        for i in range(num_layers):
            linear = Linear(num_units, num_units)
            gate = Linear(num_units, num_units)
            self.linears.append(linear)
            self.gates.append(gate)
            self.add_sublayer("linears_{}".format(i), linear)
            self.add_sublayer("gates_{}".format(i), gate)

    def forward(self, input_):
        """Apply all highway layers to input_ (feature dim = num_units)."""
        out = input_
        for linear, gate in zip(self.linears, self.gates):
            transform = fluid.layers.relu(linear(out))
            carry_gate = fluid.layers.sigmoid(gate(out))
            # Gated mix of transformed features and the carried-through input.
            out = transform * carry_gate + out * (1 - carry_gate)
        return out
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
|
@ -1,203 +0,0 @@
|
|||
from parakeet.models.transformerTTS.module import *
|
||||
import paddle.fluid.dygraph as dg
|
||||
import paddle.fluid as fluid
|
||||
from parakeet.modules.layers import Conv1D, Linear
|
||||
from parakeet.modules.utils import *
|
||||
from parakeet.modules.multihead_attention import MultiheadAttention
|
||||
from parakeet.modules.feed_forward import PositionwiseFeedForward
|
||||
from parakeet.modules.prenet import PreNet
|
||||
from parakeet.modules.post_convnet import PostConvNet
|
||||
|
||||
|
||||
class Encoder(dg.Layer):
    """TransformerTTS text encoder.

    Runs the character prenet, adds a learnable-scaled sinusoidal
    positional encoding, then applies three self-attention blocks each
    followed by a position-wise feed-forward layer.
    """

    def __init__(self, embedding_size, num_hidden, config, num_head=4):
        super(Encoder, self).__init__()
        self.num_hidden = num_hidden

        # Learnable scalar scale on the positional encoding, initialized to 1.
        alpha_attr = fluid.ParamAttr(initializer=fluid.initializer.Constant(value=1.0))
        self.alpha = self.create_parameter(shape=(1, ), attr=alpha_attr, dtype='float32')

        # Fixed (non-trainable) sinusoidal position table for up to 1024 steps.
        self.pos_inp = get_sinusoid_encoding_table(1024, self.num_hidden, padding_idx=0)
        self.pos_emb = dg.Embedding(
            size=[1024, num_hidden],
            padding_idx=0,
            param_attr=fluid.ParamAttr(
                initializer=fluid.initializer.NumpyArrayInitializer(self.pos_inp),
                trainable=False))

        self.encoder_prenet = EncoderPrenet(embedding_size=embedding_size,
                                            num_hidden=num_hidden,
                                            use_cudnn=config.use_gpu)

        # Three self-attention blocks, each paired with a position-wise FFN.
        self.layers = [MultiheadAttention(num_hidden,
                                          num_hidden // num_head,
                                          num_hidden // num_head)
                       for _ in range(3)]
        for i, attn in enumerate(self.layers):
            self.add_sublayer("self_attn_{}".format(i), attn)
        self.ffns = [PositionwiseFeedForward(num_hidden,
                                             num_hidden * num_head,
                                             filter_size=1,
                                             use_cudnn=config.use_gpu)
                     for _ in range(3)]
        for i, ffn in enumerate(self.ffns):
            self.add_sublayer("ffns_{}".format(i), ffn)

    def forward(self, x, positional):
        """Encode character ids; returns (features, query_mask, attentions)."""
        # Padding masks are only built in train mode; in eval mode the
        # attention layers receive no masks.
        if fluid.framework._dygraph_tracer()._train_mode:
            query_mask = get_non_pad_mask(positional)
            mask = get_attn_key_pad_mask(positional, x)
        else:
            query_mask, mask = None, None

        # Prenet -> (N, T, C), then add the scaled positional encoding and
        # apply positional dropout.
        x = self.encoder_prenet(x)
        x = self.pos_emb(positional) * self.alpha + x
        x = layers.dropout(x, 0.1)

        # Self-attention stack; collect the per-layer attention maps.
        attentions = []
        for attn, ffn in zip(self.layers, self.ffns):
            x, attention = attn(x, x, x, mask=mask, query_mask=query_mask)
            x = ffn(x)
            attentions.append(attention)

        return x, query_mask, attentions
|
||||
class Decoder(dg.Layer):
    """TransformerTTS mel decoder.

    Applies a mel prenet and scaled positional encoding, then three
    (self-attention, encoder-decoder attention, FFN) blocks, and finally
    projects to mel frames, a post-net residual, and stop tokens.
    """
    def __init__(self, num_hidden, config, num_head=4):
        super(Decoder, self).__init__()
        self.num_hidden = num_hidden
        # Learnable scalar scale on the positional encoding, initialized to 1.
        param = fluid.ParamAttr()
        self.alpha = self.create_parameter(shape=(1,), attr=param, dtype='float32',
                                           default_initializer = fluid.initializer.ConstantInitializer(value=1.0))
        # Fixed (non-trainable) sinusoidal position table for up to 1024 steps.
        self.pos_inp = get_sinusoid_encoding_table(1024, self.num_hidden, padding_idx=0)
        self.pos_emb = dg.Embedding(size=[1024, num_hidden],
                                    padding_idx=0,
                                    param_attr=fluid.ParamAttr(
                                        initializer=fluid.initializer.NumpyArrayInitializer(self.pos_inp),
                                        trainable=False))
        # Mel prenet with dropout, followed by a "centering" linear layer.
        self.decoder_prenet = PreNet(input_size = config.audio.num_mels,
                                     hidden_size = num_hidden * 2,
                                     output_size = num_hidden,
                                     dropout_rate=0.2)
        self.linear = Linear(num_hidden, num_hidden)

        # Three decoder blocks: masked self-attention, encoder-decoder
        # attention, and a position-wise feed-forward layer.
        self.selfattn_layers = [MultiheadAttention(num_hidden, num_hidden//num_head, num_hidden//num_head) for _ in range(3)]
        for i, layer in enumerate(self.selfattn_layers):
            self.add_sublayer("self_attn_{}".format(i), layer)
        self.attn_layers = [MultiheadAttention(num_hidden, num_hidden//num_head, num_hidden//num_head) for _ in range(3)]
        for i, layer in enumerate(self.attn_layers):
            self.add_sublayer("attn_{}".format(i), layer)
        self.ffns = [PositionwiseFeedForward(num_hidden, num_hidden*num_head, filter_size=1) for _ in range(3)]
        for i, layer in enumerate(self.ffns):
            self.add_sublayer("ffns_{}".format(i), layer)
        # Output heads: mel frames (num_mels * r per step) and stop token.
        self.mel_linear = Linear(num_hidden, config.audio.num_mels * config.audio.outputs_per_step)
        self.stop_linear = Linear(num_hidden, 1)

        # Convolutional post-net producing a residual refinement of the mel.
        self.postconvnet = PostConvNet(config.audio.num_mels, config.hidden_size,
                                       filter_size = 5, padding = 4, num_conv=5,
                                       outputs_per_step=config.audio.outputs_per_step,
                                       use_cudnn = config.use_gpu)

    def forward(self, key, value, query, c_mask, positional):
        """Decode mel frames.

        :param key: encoder output used as attention keys (N, T_enc, C)
        :param value: encoder output used as attention values (N, T_enc, C)
        :param query: mel input frames (N, T_dec, num_mels)
        :param c_mask: encoder query mask (None in eval mode)
        :param positional: decoder position ids
        :return: (mel_out, postnet-refined out, encoder-decoder attention
                  maps, stop tokens, self-attention maps)
        """

        # Build the causal decoder mask (upper-triangular) combined, in train
        # mode, with the padding mask; in eval mode only the causal part is
        # used and the query/encoder masks are disabled.
        if fluid.framework._dygraph_tracer()._train_mode:
            m_mask = get_non_pad_mask(positional)
            mask = get_attn_key_pad_mask((positional==0).astype(np.float32), query)
            triu_tensor = dg.to_variable(get_triu_tensor(query.numpy(), query.numpy())).astype(np.float32)
            mask = mask + triu_tensor
            # Positions that are neither padded nor future become 1.
            mask = fluid.layers.cast(mask == 0, np.float32)

            # Encoder padding mask for encoder-decoder attention,
            # shape (batch_size, decoder_len, encoder_len).
            zero_mask = get_attn_key_pad_mask(layers.squeeze(c_mask,[-1]), query)
        else:
            mask = get_triu_tensor(query.numpy(), query.numpy()).astype(np.float32)
            mask = fluid.layers.cast(dg.to_variable(mask == 0), np.float32)
            m_mask, zero_mask = None, None

        # Decoder pre-network (mel prenet with dropout).
        query = self.decoder_prenet(query)

        # "Centered position" linear projection into the hidden space.
        query = self.linear(query)

        # Add the scaled positional embedding.
        positional = self.pos_emb(positional)
        query = positional * self.alpha + query

        # Positional dropout.
        query = fluid.layers.dropout(query, 0.1)

        # Attention stack: decoder self-attention followed by
        # encoder-decoder attention and an FFN, three times.
        selfattn_list = list()
        attn_list = list()

        for selfattn, attn, ffn in zip(self.selfattn_layers, self.attn_layers, self.ffns):
            query, attn_dec = selfattn(query, query, query, mask = mask, query_mask = m_mask)
            query, attn_dot = attn(key, value, query, mask = zero_mask, query_mask = m_mask)
            query = ffn(query)
            selfattn_list.append(attn_dec)
            attn_list.append(attn_dot)
        # Mel linear projection.
        mel_out = self.mel_linear(query)
        # Post-net residual refinement of the mel output.
        out = self.postconvnet(mel_out)
        out = mel_out + out

        # Stop tokens: per-frame sigmoid probability of ending generation.
        stop_tokens = self.stop_linear(query)
        stop_tokens = layers.squeeze(stop_tokens, [-1])
        stop_tokens = layers.sigmoid(stop_tokens)

        return mel_out, out, attn_list, stop_tokens, selfattn_list
|
||||
class TransformerTTS(dg.Layer):
    """Full TransformerTTS model: text Encoder feeding the mel Decoder."""

    def __init__(self, config):
        super(TransformerTTS, self).__init__()
        self.encoder = Encoder(config.embedding_size, config.hidden_size, config)
        self.decoder = Decoder(config.hidden_size, config)
        self.config = config

    def forward(self, characters, mel_input, pos_text, pos_mel):
        """Run the encoder and decoder end to end.

        Returns (mel_output, postnet_output, attn_probs, stop_preds,
        attns_enc, attns_dec).
        """
        # Encoder: key (batch_size, seq_len, channel),
        # c_mask (batch_size, seq_len), attns_enc (channel / 2, seq_len, seq_len).
        key, c_mask, attns_enc = self.encoder(characters, pos_text)

        # Decoder: mel_output/postnet_output (batch_size, mel_len, n_mel),
        # attn_probs (128, mel_len, seq_len), stop_preds (batch_size, mel_len, 1),
        # attns_dec (128, mel_len, mel_len). The encoder output serves as
        # both keys and values.
        decoded = self.decoder(key, key, mel_input, c_mask, pos_mel)
        mel_output, postnet_output, attn_probs, stop_preds, attns_dec = decoded

        return mel_output, postnet_output, attn_probs, stop_preds, attns_enc, attns_dec
|
||||
class ModelPostNet(dg.Layer):
    """CBHG-based post-net mapping mel spectrograms to linear magnitudes
    (mel -> linear)."""

    def __init__(self, config):
        super(ModelPostNet, self).__init__()
        # 1x1 conv lifting mel channels into the CBHG hidden space.
        self.pre_proj = Conv1D(in_channels=config.audio.num_mels,
                               out_channels=config.hidden_size,
                               filter_size=1,
                               data_format="NCT")
        self.cbhg = CBHG(config.hidden_size, config.batch_size)
        # 1x1 conv projecting to the linear-spectrogram bin count.
        self.post_proj = Conv1D(in_channels=config.hidden_size,
                                out_channels=(config.audio.n_fft // 2) + 1,
                                filter_size=1,
                                data_format="NCT")

    def forward(self, mel):
        """Predict linear magnitudes (N, T, n_fft//2 + 1) from mel (N, T, n_mels)."""
        # Convolutions and CBHG operate channel-first (N, C, T).
        hidden = layers.transpose(mel, [0, 2, 1])
        hidden = self.pre_proj(hidden)
        hidden = self.cbhg(hidden)
        mag_pred = self.post_proj(hidden)
        # Back to time-major layout (N, T, C).
        return layers.transpose(mag_pred, [0, 2, 1])
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
Loading…
Reference in New Issue