add FastSpeech
This commit is contained in:
parent
9fe6ad11f0
commit
2179d6d5b0
|
@ -0,0 +1,148 @@
|
||||||
|
from pathlib import Path
|
||||||
|
import numpy as np
|
||||||
|
import pandas as pd
|
||||||
|
import librosa
|
||||||
|
|
||||||
|
from paddle import fluid
|
||||||
|
from parakeet import g2p
|
||||||
|
from parakeet import audio
|
||||||
|
from parakeet.data.sampler import *
|
||||||
|
from parakeet.data.datacargo import DataCargo
|
||||||
|
from parakeet.data.dataset import Dataset
|
||||||
|
from parakeet.data.batch import TextIDBatcher, SpecBatcher
|
||||||
|
|
||||||
|
class LJSpeechLoader:
    """Build a Paddle ``DataLoader`` over the LJSpeech dataset.

    After construction the batch stream is available as ``self.reader``.
    """

    def __init__(self, config, nranks, rank, is_vocoder=False):
        """Create the dataset, sampler and reader.

        Args:
            config: Settings object. ``use_gpu``, ``data_path`` and
                ``batch_size`` are read here; the whole object is also
                forwarded to ``LJSpeech``.
            nranks: Number of ranks; ``config.batch_size`` must be divisible
                by it.
            rank: Index of this rank. It is also used as the CUDA device id
                when ``config.use_gpu`` is true.
            is_vocoder: When true, batches are collated with
                ``batch_examples_vocoder`` instead of ``batch_examples``.
        """
        if config.use_gpu:
            place = fluid.CUDAPlace(rank)
        else:
            place = fluid.CPUPlace()

        dataset = LJSpeech(Path(config.data_path), config)
        sampler = DistributedSampler(len(dataset), nranks, rank)

        # The global batch is split evenly between ranks.
        assert config.batch_size % nranks == 0
        each_bs = config.batch_size // nranks

        # Only the collate function differs between the two modes.
        collate = batch_examples_vocoder if is_vocoder else batch_examples
        dataloader = DataCargo(
            dataset,
            sampler=sampler,
            batch_size=each_bs,
            shuffle=True,
            collate_fn=collate,
            drop_last=True)

        self.reader = fluid.io.DataLoader.from_generator(
            capacity=32,
            iterable=True,
            use_double_buffer=True,
            return_list=True)
        self.reader.set_batch_generator(dataloader, place)
|
||||||
|
|
||||||
|
|
||||||
|
class LJSpeech(Dataset):
    """LJSpeech dataset yielding ``(mag, mel, phonemes)`` examples.

    The metadata table is read from ``<root>/metadata.csv`` and the audio
    from ``<root>/wavs/<fname>.wav``.
    """

    def __init__(self, root, config):
        """Load the metadata table.

        Args:
            root: Dataset directory, given as ``str`` or ``Path``.
            config: Settings object; stored on the instance as ``config``.
        """
        super(LJSpeech, self).__init__()
        assert isinstance(root, (str, Path)), "root should be a string or Path object"
        if isinstance(root, Path):
            self.root = root
        else:
            self.root = Path(root)
        self.metadata = self._prepare_metadata()
        self.config = config

    def _prepare_metadata(self):
        """Read ``metadata.csv`` into a DataFrame with three named columns."""
        # quoting=3 is csv.QUOTE_NONE: quote characters are kept as plain text.
        return pd.read_csv(
            self.root.joinpath("metadata.csv"),
            sep="|",
            header=None,
            quoting=3,
            names=["fname", "raw_text", "normalized_text"])

    def _get_example(self, metadatum):
        """Turn one metadata row into an example.

        Override this method for a different preprocessing pipeline.

        Args:
            metadatum: A row that unpacks into
                ``(fname, raw_text, normalized_text)``.

        Returns:
            Tuple ``(mag, mel, phonemes)``: the spectrogram and mel
            spectrogram as ``float32`` arrays, and the phoneme ids as an
            ``int64`` array.
        """
        fname, _raw_text, normalized_text = metadatum
        wav_path = self.root.joinpath("wavs", fname + ".wav")

        sample_rate = 22050
        processor = audio.AudioProcessor(
            sample_rate=sample_rate,
            num_mels=80,
            min_level_db=-100,
            ref_level_db=20,
            n_fft=2048,
            win_length=int(sample_rate * 0.05),
            hop_length=int(sample_rate * 0.0125),
            power=1.2,
            preemphasis=0.97,
            signal_norm=True,
            symmetric_norm=False,
            max_norm=1.,
            mel_fmin=0,
            mel_fmax=None,
            clip_norm=True,
            griffin_lim_iters=60,
            do_trim_silence=False,
            sound_norm=False)

        # Original author's note on the processor pipeline:
        # load -> trim -> preemphasis -> stft -> magnitude -> mel_scale -> logscale -> normalize
        wav = processor.load_wav(str(wav_path))
        mag = processor.spectrogram(wav).astype(np.float32)
        mel = processor.melspectrogram(wav).astype(np.float32)
        phonemes = np.array(
            g2p.en.text_to_sequence(normalized_text), dtype=np.int64)
        # maybe we need to implement it as a map in the future
        return (mag, mel, phonemes)

    def __getitem__(self, index):
        """Return the example built from metadata row ``index``."""
        return self._get_example(self.metadata.iloc[index])

    def __iter__(self):
        """Yield every example in row order."""
        for index in range(len(self)):
            yield self[index]

    def __len__(self):
        """Return the number of metadata rows."""
        return len(self.metadata)
|
||||||
|
|
||||||
|
|
||||||
|
def batch_examples(batch):
    """Collate ``(mag, mel, text)`` examples into padded batch arrays.

    Examples are ordered by text length, longest first; examples with equal
    length keep their incoming order. The magnitude spectrogram is ignored.

    Args:
        batch: Iterable of 3-tuples ``(mag, mel, text)``.
            NOTE(review): ``mel`` is indexed as a 2-D array whose last axis
            is shifted and counted for positions - presumably
            (n_mels, frames); confirm against the dataset.

    Returns:
        Tuple ``(texts, mels, mel_inputs, pos_texts, pos_mels, text_lens)``.
        ``mel_inputs`` is each ``mel`` with a zero column prepended and its
        last column dropped. ``pos_*`` are 1-based positions padded with 0.
        ``mels`` and ``mel_inputs`` are transposed with ``axes=(0, 2, 1)``
        after padding.
    """
    records = []
    for _, mel, text in batch:
        shifted = np.concatenate(
            [np.zeros([mel.shape[0], 1], np.float32), mel[:, :-1]], axis=-1)
        records.append((
            len(text),
            text,
            mel,
            shifted,
            np.arange(1, len(text) + 1),
            np.arange(1, mel.shape[1] + 1),
        ))

    # One stable descending sort on text length keeps every field aligned.
    records.sort(key=lambda record: record[0], reverse=True)

    text_lens = [record[0] for record in records]
    texts = [record[1] for record in records]
    mels = [record[2] for record in records]
    mel_inputs = [record[3] for record in records]
    pos_texts = [record[4] for record in records]
    pos_mels = [record[5] for record in records]

    # Pad every sequence to the longest one in the batch.
    texts = TextIDBatcher(pad_id=0)(texts)
    pos_texts = TextIDBatcher(pad_id=0)(pos_texts)
    pos_mels = TextIDBatcher(pad_id=0)(pos_mels)
    mels = np.transpose(SpecBatcher(pad_value=0.)(mels), axes=(0, 2, 1))
    mel_inputs = np.transpose(
        SpecBatcher(pad_value=0.)(mel_inputs), axes=(0, 2, 1))
    return (texts, mels, mel_inputs, pos_texts, pos_mels, np.array(text_lens))
|
||||||
|
|
||||||
|
def batch_examples_vocoder(batch):
    """Collate ``(mag, mel, text)`` examples for vocoder training.

    The text entry of every example is ignored.

    Args:
        batch: Iterable of 3-tuples ``(mag, mel, text)``.

    Returns:
        Tuple ``(mels, mags)``; each is padded with ``SpecBatcher`` (pad
        value 0) and then transposed with ``axes=(0, 2, 1)``.
    """
    mel_list = []
    mag_list = []
    for mag, mel, _ in batch:
        mel_list.append(mel)
        mag_list.append(mag)

    padded_mels = SpecBatcher(pad_value=0.)(mel_list)
    mels = np.transpose(padded_mels, axes=(0, 2, 1))
    padded_mags = SpecBatcher(pad_value=0.)(mag_list)
    mags = np.transpose(padded_mags, axes=(0, 2, 1))

    return (mels, mags)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -0,0 +1,41 @@
|
||||||
|
audio:
|
||||||
|
num_mels: 80
|
||||||
|
n_fft: 2048
|
||||||
|
sr: 22050
|
||||||
|
preemphasis: 0.97
|
||||||
|
hop_length: 275
|
||||||
|
win_length: 1102
|
||||||
|
power: 1.2
|
||||||
|
min_level_db: -100
|
||||||
|
ref_level_db: 20
|
||||||
|
outputs_per_step: 1
|
||||||
|
|
||||||
|
encoder_n_layer: 6
|
||||||
|
encoder_head: 2
|
||||||
|
encoder_conv1d_filter_size: 1536
|
||||||
|
max_sep_len: 2048
|
||||||
|
encoder_output_size: 384
|
||||||
|
word_vec_dim: 384
|
||||||
|
decoder_n_layer: 6
|
||||||
|
decoder_head: 2
|
||||||
|
decoder_conv1d_filter_size: 1536
|
||||||
|
decoder_output_size: 384
|
||||||
|
d_model: 384
|
||||||
|
duration_predictor_output_size: 256
|
||||||
|
duration_predictor_filter_size: 3
|
||||||
|
fft_conv1d_filter: 3
|
||||||
|
fft_conv1d_padding: 1
|
||||||
|
|
||||||
|
|
||||||
|
batch_size: 32
|
||||||
|
epochs: 10000
|
||||||
|
lr: 0.001
|
||||||
|
save_step: 500
|
||||||
|
image_step: 2000
|
||||||
|
use_gpu: False
|
||||||
|
use_data_parallel: False
|
||||||
|
|
||||||
|
data_path: ../../../dataset/LJSpeech-1.1
|
||||||
|
transtts_path: ./checkpoint
|
||||||
|
transformer_step: 70000
|
||||||
|
log_dir: ./log
|
|
@ -0,0 +1,43 @@
|
||||||
|
audio:
|
||||||
|
num_mels: 80
|
||||||
|
n_fft: 2048
|
||||||
|
sr: 22050
|
||||||
|
preemphasis: 0.97
|
||||||
|
hop_length: 275
|
||||||
|
win_length: 1102
|
||||||
|
power: 1.2
|
||||||
|
min_level_db: -100
|
||||||
|
ref_level_db: 20
|
||||||
|
outputs_per_step: 1
|
||||||
|
|
||||||
|
encoder_n_layer: 6
|
||||||
|
encoder_head: 2
|
||||||
|
encoder_conv1d_filter_size: 1536
|
||||||
|
max_sep_len: 2048
|
||||||
|
encoder_output_size: 384
|
||||||
|
embedding_size: 384
|
||||||
|
decoder_n_layer: 6
|
||||||
|
decoder_head: 2
|
||||||
|
decoder_conv1d_filter_size: 1536
|
||||||
|
decoder_output_size: 384
|
||||||
|
hidden_size: 384
|
||||||
|
duration_predictor_output_size: 256
|
||||||
|
duration_predictor_filter_size: 3
|
||||||
|
fft_conv1d_filter: 3
|
||||||
|
fft_conv1d_padding: 1
|
||||||
|
dropout: 0.1
|
||||||
|
transformer_head: 4
|
||||||
|
|
||||||
|
warm_up_step: 4000
|
||||||
|
grad_clip_thresh: 0.1
|
||||||
|
batch_size: 32
|
||||||
|
epochs: 10000
|
||||||
|
lr: 0.001
|
||||||
|
save_step: 500
|
||||||
|
use_gpu: True
|
||||||
|
use_data_parallel: False
|
||||||
|
|
||||||
|
data_path: ../../../dataset/LJSpeech-1.1
|
||||||
|
transtts_path: ../transformerTTS/checkpoint
|
||||||
|
transformer_step: 20
|
||||||
|
log_dir: ./log
|
|
@ -0,0 +1,124 @@
|
||||||
|
import torch
|
||||||
|
from torch.nn import functional as F
|
||||||
|
from torch.utils.data import Dataset, DataLoader
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
import math
|
||||||
|
import os
|
||||||
|
|
||||||
|
import hparams
|
||||||
|
import Audio
|
||||||
|
from text import text_to_sequence
|
||||||
|
from utils import process_text, pad_1D, pad_2D
|
||||||
|
|
||||||
|
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
|
||||||
|
|
||||||
|
|
||||||
|
class FastSpeechDataset(Dataset):
    """LJSpeech samples made of text ids, a mel target and durations."""

    def __init__(self):
        """Load the text entries from ``data/train.txt`` via ``process_text``."""
        self.text = process_text(os.path.join("data", "train.txt"))

    def __len__(self):
        """Return the number of text entries."""
        return len(self.text)

    def __getitem__(self, idx):
        """Build the sample for entry ``idx``.

        Returns:
            dict with keys ``"text"`` (array of ids from
            ``text_to_sequence``), ``"mel_target"`` (array loaded from the
            mel ``.npy`` file) and ``"D"`` (array loaded from the alignment
            ``.npy`` file).
        """
        # Mel files are numbered from 1, alignment files from 0.
        mel_file = "ljspeech-mel-%05d.npy" % (idx + 1)
        mel_gt_target = np.load(
            os.path.join(hparams.mel_ground_truth, mel_file))
        D = np.load(os.path.join(hparams.alignment_path, str(idx) + ".npy"))

        # Drop the last character of the entry before converting it to ids.
        entry = self.text[idx]
        character = np.array(
            text_to_sequence(entry[:-1], hparams.text_cleaners))

        return {"text": character,
                "mel_target": mel_gt_target,
                "D": D}
|
||||||
|
|
||||||
|
|
||||||
|
def reprocess(batch, cut_list):
    """Assemble one padded mini-batch from selected samples.

    Args:
        batch: Sequence of sample dicts with keys ``"text"``,
            ``"mel_target"`` and ``"D"``.
        cut_list: Indices into ``batch`` selecting the samples to combine.

    Returns:
        dict with keys ``"text"``, ``"mel_target"``, ``"D"`` (padded with
        ``pad_1D`` / ``pad_2D``), ``"mel_pos"`` and ``"src_pos"`` (1-based
        positions, zero-padded to the longest row) and ``"mel_max_len"``
        (the largest ``mel.shape[0]`` among the selected samples).
    """
    def _position_rows(lengths):
        # One row of 1..n per length, right-padded with zeros to the longest.
        longest = int(max(lengths))
        rows = [
            np.pad(list(range(1, int(n) + 1)),
                   (0, longest - int(n)), 'constant')
            for n in lengths
        ]
        return np.array(rows), longest

    texts = [batch[ind]["text"] for ind in cut_list]
    mel_targets = [batch[ind]["mel_target"] for ind in cut_list]
    Ds = [batch[ind]["D"] for ind in cut_list]

    src_pos, _ = _position_rows([text.shape[0] for text in texts])
    mel_pos, max_mel_len = _position_rows(
        [mel.shape[0] for mel in mel_targets])

    texts = pad_1D(texts)
    Ds = pad_1D(Ds)
    mel_targets = pad_2D(mel_targets)

    return {"text": texts,
            "mel_target": mel_targets,
            "D": Ds,
            "mel_pos": mel_pos,
            "src_pos": src_pos,
            "mel_max_len": max_mel_len}
|
||||||
|
|
||||||
|
|
||||||
|
def collate_fn(batch):
    """Split a loaded batch into ``sqrt(len(batch))`` length-sorted groups.

    Samples are ordered by text length, longest first, then cut into
    ``k = int(sqrt(len(batch)))`` consecutive groups of ``k`` samples. Any
    samples beyond the first ``k * k`` are not used.

    Args:
        batch: List of sample dicts with a ``"text"`` array.

    Returns:
        List of ``k`` dicts, each produced by ``reprocess``.
    """
    text_lengths = np.array([sample["text"].shape[0] for sample in batch])
    order = np.argsort(-text_lengths)
    group_size = int(math.sqrt(len(batch)))

    groups = [
        order[k * group_size:(k + 1) * group_size]
        for k in range(group_size)
    ]
    return [reprocess(batch, group) for group in groups]
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Smoke test: count the mini-batches whose mel length (dimension 1)
    # equals the sum of the duration tensor.
    dataset = FastSpeechDataset()
    training_loader = DataLoader(
        dataset,
        batch_size=1,
        shuffle=False,
        collate_fn=collate_fn,
        drop_last=True,
        num_workers=0)
    total_step = hparams.epochs * len(training_loader) * hparams.batch_size

    cnt = 0
    for batchs in training_loader:
        for data_of_batch in batchs:
            mel_target = torch.from_numpy(
                data_of_batch["mel_target"]).float().to(device)
            D = torch.from_numpy(data_of_batch["D"]).int().to(device)
            # Running count, printed before this mini-batch is checked.
            print(cnt)
            if mel_target.size(1) == D.sum().item():
                cnt += 1

    print(cnt)
|
|
@ -0,0 +1,117 @@
|
||||||
|
import numpy as np
|
||||||
|
import math
|
||||||
|
import utils
|
||||||
|
import paddle.fluid.dygraph as dg
|
||||||
|
import paddle.fluid.layers as layers
|
||||||
|
import paddle.fluid as fluid
|
||||||
|
from parakeet.modules.layers import Conv1D
|
||||||
|
from parakeet.modules.multihead_attention import MultiheadAttention
|
||||||
|
from parakeet.modules.feed_forward import PositionwiseFeedForward
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
class FFTBlock(dg.Layer):
    """FFT Block: multi-head self-attention followed by a position-wise
    feed-forward network."""

    def __init__(self, d_model, d_inner, n_head, d_k, d_v, filter_size, padding, dropout=0.2):
        """Create the attention and feed-forward sub-layers.

        Args:
            d_model: Passed as the first argument of ``MultiheadAttention``
                and ``PositionwiseFeedForward``.
            d_inner: Second argument of ``PositionwiseFeedForward``.
            n_head: Number of attention heads.
            d_k: Second argument of ``MultiheadAttention``.
            d_v: Third argument of ``MultiheadAttention``.
            filter_size: ``filter_size`` of the feed-forward sub-layer.
            padding: ``padding`` of the feed-forward sub-layer.
            dropout: Dropout value handed to both sub-layers.
        """
        super(FFTBlock, self).__init__()
        self.slf_attn = MultiheadAttention(d_model, d_k, d_v, num_head=n_head, dropout=dropout)
        self.pos_ffn = PositionwiseFeedForward(d_model, d_inner, filter_size=filter_size, padding=padding, dropout=dropout)

    def forward(self, enc_input, non_pad_mask=None, slf_attn_mask=None):
        """Apply self-attention and the feed-forward network.

        Args:
            enc_input: Input used as query, key and value of the attention.
            non_pad_mask: Optional mask multiplied into the output of each
                sub-layer. When ``None`` no masking is applied.
            slf_attn_mask: Optional mask forwarded to the attention layer.

        Returns:
            Tuple ``(enc_output, enc_slf_attn)``.
        """
        enc_output, enc_slf_attn = self.slf_attn(enc_input, enc_input, enc_input, mask=slf_attn_mask)
        # The signature advertises None as the default, so it must not be
        # multiplied into the activations.
        if non_pad_mask is not None:
            enc_output *= non_pad_mask

        enc_output = self.pos_ffn(enc_output)
        if non_pad_mask is not None:
            enc_output *= non_pad_mask

        return enc_output, enc_slf_attn
|
||||||
|
|
||||||
|
|
||||||
|
class LengthRegulator(dg.Layer):
    """Repeat each input step according to a per-step duration.

    Holds a ``DurationPredictor``. In training mode the caller-supplied
    target durations drive the expansion; otherwise the rounded predictions
    do, scaled by ``alpha``.
    """

    def __init__(self, input_size, out_channels, filter_size, dropout=0.1):
        """Create the duration predictor; all arguments are forwarded to it."""
        super(LengthRegulator, self).__init__()
        self.duration_predictor = DurationPredictor(input_size=input_size,
                                                    out_channels=out_channels,
                                                    filter_size=filter_size,
                                                    dropout=dropout)

    def LR(self, x, duration_predictor_output, alpha=1.0):
        """Expand every batch item and stack the zero-padded results.

        Args:
            x: Batch of sequences; items are taken along dimension 0.
            duration_predictor_output: Durations, sliced per batch item the
                same way as ``x``.
            alpha: Factor applied to every duration before expansion.

        Returns:
            The stacked, padded expansions.
        """
        output = []
        batch_size = x.shape[0]
        for i in range(batch_size):
            output.append(self.expand(x[i:i + 1], duration_predictor_output[i:i + 1], alpha))
        output = self.pad(output)
        return output

    def pad(self, input_ele):
        """Zero-pad each element along dimension 0 to the longest, then stack."""
        max_len = max(ele.shape[0] for ele in input_ele)
        out_list = []
        for ele in input_ele:
            pad_len = max_len - ele.shape[0]
            one_batch_padded = layers.pad(
                ele, [0, pad_len, 0, 0], pad_value=0.0)
            out_list.append(one_batch_padded)
        out_padded = layers.stack(out_list)
        return out_padded

    def expand(self, batch, predicted, alpha):
        """Repeat every step of one sequence by its scaled duration.

        Args:
            batch: One sequence with a leading dimension of size 1, which is
                squeezed away here.
            predicted: Durations for that sequence, indexed as ``[0, i]``.
            alpha: Factor applied to each duration; the product is truncated
                to an integer repeat count.

        Returns:
            The repeated steps concatenated along dimension 0.
        """
        out = []
        time_steps = batch.shape[1]
        fertilities = predicted.numpy()
        batch = layers.squeeze(batch, [0])

        for i in range(time_steps):
            # alpha used to be accepted but silently ignored; apply it here.
            # With alpha == 1.0 the repeat count matches the previous
            # int(fertilities[0, i]).
            repeat = int(fertilities[0, i] * alpha)
            # Skip steps that would be repeated zero times.
            if repeat <= 0:
                continue
            out.append(layers.expand(batch[i: i + 1, :], [repeat, 1]))
        # NOTE(review): if every duration is zero, `out` is empty here and
        # the concat below will presumably fail - confirm the desired
        # behaviour for that case.
        out = layers.concat(out, axis=0)
        return out

    def forward(self, x, alpha=1.0, target=None):
        """Predict durations and expand ``x``.

        Args:
            x: Input sequences.
            alpha: Duration scale; only used outside training mode.
            target: Ground-truth durations; used in training mode.

        Returns:
            In training mode ``(output, duration_predictor_output)``;
            otherwise ``(output, mel_pos)``, where ``mel_pos`` holds the
            positions ``1..output.shape[1]``.
        """
        duration_predictor_output = self.duration_predictor(x)
        if fluid.framework._dygraph_tracer()._train_mode:
            output = self.LR(x, target)
            return output, duration_predictor_output
        else:
            duration_predictor_output = layers.round(duration_predictor_output)
            output = self.LR(x, duration_predictor_output, alpha)
            # to_variable is given an ndarray instead of a Python list.
            mel_pos = dg.to_variable(np.arange(1, output.shape[1] + 1))
            return output, mel_pos
|
||||||
|
|
||||||
|
class DurationPredictor(dg.Layer):
    """Duration predictor: two Conv1D + LayerNorm + ReLU + dropout stages
    followed by a linear layer with a single output."""

    def __init__(self, input_size, out_channels, filter_size, dropout=0.1):
        """Create the sub-layers.

        Args:
            input_size: Input channels of the first convolution.
            out_channels: Output channels of both convolutions and size of
                both layer norms.
            filter_size: Filter size of both convolutions.
            dropout: Dropout value used in ``forward``.
        """
        super(DurationPredictor, self).__init__()
        self.input_size = input_size
        self.out_channels = out_channels
        self.filter_size = filter_size
        self.dropout = dropout

        # Both convolutions share every setting except the input width.
        conv_settings = dict(out_channels=self.out_channels,
                             filter_size=self.filter_size,
                             padding=1,
                             data_format='NTC')
        self.conv1 = Conv1D(in_channels=self.input_size, **conv_settings)
        self.conv2 = Conv1D(in_channels=self.out_channels, **conv_settings)
        self.layer_norm1 = dg.LayerNorm(self.out_channels)
        self.layer_norm2 = dg.LayerNorm(self.out_channels)

        self.linear = dg.Linear(self.out_channels, 1)

    def forward(self, encoder_output):
        """Predict one non-negative value per input step.

        Args:
            encoder_output: Input of shape (N, T, C), per the original
                author's note.

        Returns:
            The ReLU-activated linear output with its last axis squeezed.
        """
        hidden = self.layer_norm1(self.conv1(encoder_output))
        hidden = layers.dropout(layers.relu(hidden), self.dropout)

        hidden = self.layer_norm2(self.conv2(hidden))
        hidden = layers.dropout(layers.relu(hidden), self.dropout)

        projected = layers.relu(self.linear(hidden))
        return layers.squeeze(projected, axes=[-1])
|
||||||
|
|
||||||
|
|
|
@ -0,0 +1,163 @@
|
||||||
|
from utils import *
|
||||||
|
from modules import *
|
||||||
|
import paddle.fluid.dygraph as dg
|
||||||
|
import paddle.fluid as fluid
|
||||||
|
from parakeet.g2p.text.symbols import symbols
|
||||||
|
from parakeet.modules.utils import *
|
||||||
|
from parakeet.modules.post_convnet import PostConvNet
|
||||||
|
|
||||||
|
class Encoder(dg.Layer):
    """Token embedding plus fixed position embedding, followed by a stack of
    ``FFTBlock`` layers."""

    def __init__(self,
                 n_src_vocab,
                 len_max_seq,
                 d_word_vec,
                 n_layers,
                 n_head,
                 d_k,
                 d_v,
                 d_model,
                 d_inner,
                 fft_conv1d_kernel,
                 fft_conv1d_padding,
                 dropout=0.1):
        """Create the embeddings and ``n_layers`` FFT blocks.

        Args:
            n_src_vocab: Number of rows of the token embedding.
            len_max_seq: Largest position index; the position table has
                ``len_max_seq + 1`` rows.
            d_word_vec: Width of both embeddings.
            n_layers: Number of ``FFTBlock`` layers.
            n_head, d_k, d_v, d_model, d_inner: Forwarded to each block.
            fft_conv1d_kernel: Filter size forwarded to each block.
            fft_conv1d_padding: Padding forwarded to each block.
            dropout: Dropout value forwarded to each block.
        """
        super(Encoder, self).__init__()
        n_position = len_max_seq + 1

        self.src_word_emb = dg.Embedding(size=[n_src_vocab, d_word_vec], padding_idx=0)
        # The position table is initialised from the sinusoid table and frozen.
        self.pos_inp = get_sinusoid_encoding_table(n_position, d_word_vec, padding_idx=0)
        frozen_table = fluid.ParamAttr(
            initializer=fluid.initializer.NumpyArrayInitializer(self.pos_inp),
            trainable=False)
        self.position_enc = dg.Embedding(size=[n_position, d_word_vec],
                                         padding_idx=0,
                                         param_attr=frozen_table)

        self.layer_stack = []
        for i in range(n_layers):
            block = FFTBlock(d_model, d_inner, n_head, d_k, d_v,
                             fft_conv1d_kernel, fft_conv1d_padding,
                             dropout=dropout)
            self.layer_stack.append(block)
        # A plain list is not tracked by the layer, so register each block.
        for i, block in enumerate(self.layer_stack):
            self.add_sublayer('fft_{}'.format(i), block)

    def forward(self, character, text_pos):
        """Encode a batch of token ids.

        Args:
            character: Token ids of shape (N, T), per the original author's
                note.
            text_pos: Position ids looked up in the position embedding.

        Returns:
            Tuple ``(enc_output, non_pad_mask, enc_slf_attn_list)``; the
            list holds one attention output per block.
        """
        # Masks are built from the token ids.
        slf_attn_mask = get_attn_key_pad_mask(seq_k=character, seq_q=character)
        non_pad_mask = get_non_pad_mask(character)

        # (N, T, C) per the original author's note.
        enc_output = self.src_word_emb(character) + self.position_enc(text_pos)

        enc_slf_attn_list = []
        for block in self.layer_stack:
            enc_output, attn = block(enc_output,
                                     non_pad_mask=non_pad_mask,
                                     slf_attn_mask=slf_attn_mask)
            enc_slf_attn_list.append(attn)

        return enc_output, non_pad_mask, enc_slf_attn_list
|
||||||
|
|
||||||
|
class Decoder(dg.Layer):
    """Fixed position embedding added to the input, followed by a stack of
    ``FFTBlock`` layers."""

    def __init__(self,
                 len_max_seq,
                 d_word_vec,
                 n_layers,
                 n_head,
                 d_k,
                 d_v,
                 d_model,
                 d_inner,
                 fft_conv1d_kernel,
                 fft_conv1d_padding,
                 dropout=0.1):
        """Create the position embedding and ``n_layers`` FFT blocks.

        Args:
            len_max_seq: Largest position index; the position table has
                ``len_max_seq + 1`` rows.
            d_word_vec: Width of the position embedding.
            n_layers: Number of ``FFTBlock`` layers.
            n_head, d_k, d_v, d_model, d_inner: Forwarded to each block.
            fft_conv1d_kernel: Filter size forwarded to each block.
            fft_conv1d_padding: Padding forwarded to each block.
            dropout: Dropout value forwarded to each block.
        """
        super(Decoder, self).__init__()

        n_position = len_max_seq + 1
        # The position table is initialised from the sinusoid table and frozen.
        self.pos_inp = get_sinusoid_encoding_table(n_position, d_word_vec, padding_idx=0)
        frozen_table = fluid.ParamAttr(
            initializer=fluid.initializer.NumpyArrayInitializer(self.pos_inp),
            trainable=False)
        self.position_enc = dg.Embedding(size=[n_position, d_word_vec],
                                         padding_idx=0,
                                         param_attr=frozen_table)

        self.layer_stack = []
        for i in range(n_layers):
            block = FFTBlock(d_model, d_inner, n_head, d_k, d_v,
                             fft_conv1d_kernel, fft_conv1d_padding,
                             dropout=dropout)
            self.layer_stack.append(block)
        # A plain list is not tracked by the layer, so register each block.
        for i, block in enumerate(self.layer_stack):
            self.add_sublayer('fft_{}'.format(i), block)

    def forward(self, enc_seq, enc_pos):
        """Run the FFT block stack over ``enc_seq``.

        Args:
            enc_seq: Input sequences; the position embedding is added to it.
            enc_pos: Position ids; also used to build both masks.

        Returns:
            Tuple ``(dec_output, dec_slf_attn_list)``; the list holds one
            attention output per block.
        """
        # Masks are built from the position ids.
        slf_attn_mask = get_attn_key_pad_mask(seq_k=enc_pos, seq_q=enc_pos)
        non_pad_mask = get_non_pad_mask(enc_pos)

        dec_output = enc_seq + self.position_enc(enc_pos)

        dec_slf_attn_list = []
        for block in self.layer_stack:
            dec_output, attn = block(dec_output,
                                     non_pad_mask=non_pad_mask,
                                     slf_attn_mask=slf_attn_mask)
            dec_slf_attn_list.append(attn)

        return dec_output, dec_slf_attn_list
|
||||||
|
|
||||||
|
class FastSpeech(dg.Layer):
    """FastSpeech model: encoder, length regulator, decoder, a linear
    projection to mel channels and a residual post-net."""

    def __init__(self, cfg):
        """Build all sub-layers from ``cfg``.

        Args:
            cfg: Settings object. Read here: ``max_sep_len``,
                ``embedding_size``, ``hidden_size``, ``dropout``,
                ``encoder_*`` / ``decoder_*`` sizes,
                ``duration_predictor_*``, ``fft_conv1d_*`` and
                ``audio.num_mels``.
        """
        super(FastSpeech, self).__init__()

        self.encoder = Encoder(n_src_vocab=len(symbols) + 1,
                               len_max_seq=cfg.max_sep_len,
                               d_word_vec=cfg.embedding_size,
                               n_layers=cfg.encoder_n_layer,
                               n_head=cfg.encoder_head,
                               d_k=64,
                               d_v=64,
                               d_model=cfg.hidden_size,
                               d_inner=cfg.encoder_conv1d_filter_size,
                               fft_conv1d_kernel=cfg.fft_conv1d_filter,
                               fft_conv1d_padding=cfg.fft_conv1d_padding,
                               dropout=0.1)
        self.length_regulator = LengthRegulator(input_size=cfg.hidden_size,
                                                out_channels=cfg.duration_predictor_output_size,
                                                filter_size=cfg.duration_predictor_filter_size,
                                                dropout=cfg.dropout)
        self.decoder = Decoder(len_max_seq=cfg.max_sep_len,
                               d_word_vec=cfg.embedding_size,
                               n_layers=cfg.decoder_n_layer,
                               n_head=cfg.decoder_head,
                               d_k=64,
                               d_v=64,
                               d_model=cfg.hidden_size,
                               d_inner=cfg.decoder_conv1d_filter_size,
                               fft_conv1d_kernel=cfg.fft_conv1d_filter,
                               fft_conv1d_padding=cfg.fft_conv1d_padding,
                               dropout=0.1)
        self.mel_linear = dg.Linear(cfg.decoder_output_size, cfg.audio.num_mels)
        # NOTE(review): n_mels is fixed at 80 while mel_linear follows
        # cfg.audio.num_mels - presumably they must agree; confirm before
        # changing num_mels.
        self.postnet = PostConvNet(n_mels=80,
                                   num_hidden=512,
                                   filter_size=5,
                                   padding=int(5 / 2),
                                   num_conv=5,
                                   outputs_per_step=1,
                                   use_cudnn=True,
                                   dropout=0.1)

    def forward(self, character, text_pos, mel_pos=None, length_target=None, alpha=1.0):
        """Run the model.

        Args:
            character: Token ids fed to the encoder.
            text_pos: Position ids of the tokens.
            mel_pos: Position ids of the mel frames; used in training mode.
            length_target: Ground-truth durations; used in training mode.
            alpha: Duration scale forwarded to the length regulator.

        Returns:
            In training mode the 5-tuple ``(mel_output, mel_output_postnet,
            duration_predictor_output, enc_slf_attn_list,
            dec_slf_attn_list)``; otherwise ``(mel_output,
            mel_output_postnet)``.
        """
        encoder_output, non_pad_mask, enc_slf_attn_list = self.encoder(character, text_pos)
        if fluid.framework._dygraph_tracer()._train_mode:
            length_regulator_output, duration_predictor_output = self.length_regulator(
                encoder_output,
                target=length_target,
                alpha=alpha)
            decoder_output, dec_slf_attn_list = self.decoder(length_regulator_output, mel_pos)

            mel_output = self.mel_linear(decoder_output)
            mel_output_postnet = self.postnet(mel_output) + mel_output

            return mel_output, mel_output_postnet, duration_predictor_output, enc_slf_attn_list, dec_slf_attn_list
        else:
            length_regulator_output, decoder_pos = self.length_regulator(encoder_output, alpha=alpha)
            # The decoder returns (output, attention list); only the output
            # is fed to the mel projection.
            decoder_output, _ = self.decoder(length_regulator_output, decoder_pos)

            mel_output = self.mel_linear(decoder_output)
            mel_output_postnet = self.postnet(mel_output) + mel_output

            return mel_output, mel_output_postnet
|
|
@ -0,0 +1,93 @@
|
||||||
|
import jsonargparse
|
||||||
|
|
||||||
|
def add_config_options_to_parser(parser):
    """Register every FastSpeech option on ``parser``.

    Options cover audio features (``--audio.*``), model sizes, optimisation
    settings and file paths, plus ``-c/--config`` for loading a config file
    through ``jsonargparse.ActionConfigFile``.

    Args:
        parser: Parser object exposing ``add_argument``; modified in place.
    """
    # Audio feature settings.
    parser.add_argument('--audio.num_mels', type=int, default=80,
        help="the number of mel bands when calculating mel spectrograms.")
    parser.add_argument('--audio.n_fft', type=int, default=2048,
        help="the number of fft components.")
    parser.add_argument('--audio.sr', type=int, default=22050,
        help="the sampling rate of audio data file.")
    parser.add_argument('--audio.preemphasis', type=float, default=0.97,
        help="the preemphasis coefficient.")
    # Hop and window lengths are counts of samples, so they are parsed as
    # integers rather than floats.
    parser.add_argument('--audio.hop_length', type=int, default=128,
        help="the number of samples to advance between frames.")
    parser.add_argument('--audio.win_length', type=int, default=1024,
        help="the length (width) of the window function.")
    parser.add_argument('--audio.power', type=float, default=1.4,
        help="the power to raise before griffin-lim.")
    parser.add_argument('--audio.min_level_db', type=int, default=-100,
        help="the minimum level db.")
    parser.add_argument('--audio.ref_level_db', type=int, default=20,
        help="the reference level db.")
    parser.add_argument('--audio.outputs_per_step', type=int, default=1,
        help="the outputs per step.")

    # Model sizes.
    parser.add_argument('--embedding_size', type=int, default=256,
        help="the dim size of embedding.")
    parser.add_argument('--encoder_n_layer', type=int, default=6,
        help="the number of FFT Block in encoder.")
    parser.add_argument('--encoder_head', type=int, default=2,
        help="the attention head number in encoder.")
    parser.add_argument('--encoder_conv1d_filter_size', type=int, default=1024,
        help="the filter size of conv1d in encoder.")
    parser.add_argument('--max_sep_len', type=int, default=2048,
        help="the max length of sequence.")
    parser.add_argument('--encoder_output_size', type=int, default=256,
        help="the output channel size of encoder.")
    parser.add_argument('--decoder_n_layer', type=int, default=6,
        help="the number of FFT Block in decoder.")
    parser.add_argument('--decoder_head', type=int, default=2,
        help="the attention head number in decoder.")
    parser.add_argument('--decoder_conv1d_filter_size', type=int, default=1024,
        help="the filter size of conv1d in decoder.")
    parser.add_argument('--decoder_output_size', type=int, default=256,
        help="the output channel size of decoder.")
    parser.add_argument('--hidden_size', type=int, default=256,
        help="the hidden size in model.")
    parser.add_argument('--duration_predictor_output_size', type=int, default=256,
        help="the output size of duration predictior.")
    parser.add_argument('--duration_predictor_filter_size', type=int, default=3,
        help="the filter size of conv1d in duration prediction.")
    parser.add_argument('--fft_conv1d_filter', type=int, default=3,
        help="the filter size of conv1d in fft.")
    parser.add_argument('--fft_conv1d_padding', type=int, default=1,
        help="the padding size of conv1d in fft.")
    parser.add_argument('--dropout', type=float, default=0.1,
        help="the dropout in network.")
    parser.add_argument('--transformer_head', type=int, default=4,
        help="the attention head num of transformerTTS.")

    # Optimisation and run-time settings.
    parser.add_argument('--warm_up_step', type=int, default=4000,
        help="the warm up step of learning rate.")
    parser.add_argument('--grad_clip_thresh', type=float, default=1.0,
        help="the threshold of grad clip.")
    parser.add_argument('--batch_size', type=int, default=32,
        help="batch size for training.")
    parser.add_argument('--epochs', type=int, default=10000,
        help="the number of epoch for training.")
    parser.add_argument('--lr', type=float, default=0.001,
        help="the learning rate for training.")
    parser.add_argument('--save_step', type=int, default=500,
        help="checkpointing interval during training.")
    # NOTE(review): with plain argparse semantics type=bool turns any
    # non-empty string (including "False") into True; confirm how the
    # jsonargparse version in use parses these two flags.
    parser.add_argument('--use_gpu', type=bool, default=True,
        help="use gpu or not during training.")
    parser.add_argument('--use_data_parallel', type=bool, default=False,
        help="use data parallel or not during training.")

    # Paths.
    parser.add_argument('--data_path', type=str, default='./dataset/LJSpeech-1.1',
        help="the path of dataset.")
    parser.add_argument('--checkpoint_path', type=str, default=None,
        help="the path to load checkpoint or pretrain model.")
    parser.add_argument('--save_path', type=str, default='./checkpoint',
        help="the path to save checkpoint.")
    parser.add_argument('--log_dir', type=str, default='./log',
        help="the directory to save tensorboard log.")
    parser.add_argument('--sample_path', type=str, default='./sample',
        help="the directory to save audio sample in synthesis.")
    parser.add_argument('--transtts_path', type=str, default='./log',
        help="the directory to load pretrain transformerTTS model.")
    parser.add_argument('--transformer_step', type=int, default=70000,
        help="the step to load transformerTTS model.")

    parser.add_argument('-c', '--config', action=jsonargparse.ActionConfigFile)
|
|
@ -0,0 +1,139 @@
|
||||||
|
import numpy as np
|
||||||
|
import argparse
|
||||||
|
import os
|
||||||
|
import time
|
||||||
|
import math
|
||||||
|
import jsonargparse
|
||||||
|
from pathlib import Path
|
||||||
|
from tqdm import tqdm
|
||||||
|
from tensorboardX import SummaryWriter
|
||||||
|
import paddle.fluid.dygraph as dg
|
||||||
|
import paddle.fluid.layers as layers
|
||||||
|
import paddle.fluid as fluid
|
||||||
|
from parse import add_config_options_to_parser
|
||||||
|
from pprint import pprint
|
||||||
|
from network import FastSpeech
|
||||||
|
from utils import get_alignment
|
||||||
|
from parakeet.models.dataloader.jlspeech import LJSpeechLoader
|
||||||
|
from parakeet.models.transformerTTS.network import TransformerTTS
|
||||||
|
|
||||||
|
class MyDataParallel(dg.parallel.DataParallel):
    """
    A data parallel proxy for model.

    Attribute lookups that fail on the proxy are forwarded to the wrapped
    layer stored under ``_sub_layers["_layers"]``.
    """

    def __init__(self, layers, strategy):
        """Wrap ``layers``; both arguments go to ``DataParallel.__init__``."""
        super(MyDataParallel, self).__init__(layers, strategy)

    def __getattr__(self, key):
        """Resolve ``key`` on the proxy first, then on the wrapped layer.

        Args:
            key: Attribute name that normal lookup did not find.

        Returns:
            The proxy's own attribute when ``key`` is in its ``__dict__``,
            the wrapped layer itself for ``"_layers"``, otherwise the
            wrapped layer's attribute ``key``.
        """
        if key in self.__dict__:
            return object.__getattribute__(self, key)
        # Strings are compared by value; identity comparison with a literal
        # depends on interning.
        elif key == "_layers":
            return object.__getattribute__(self, "_sub_layers")["_layers"]
        else:
            return getattr(
                object.__getattribute__(self, "_sub_layers")["_layers"], key)
|
||||||
|
|
||||||
|
def main(cfg):
    """Train a FastSpeech model, using a pretrained TransformerTTS for durations.

    A TransformerTTS checkpoint is loaded and put in eval mode; on every batch
    its attention probabilities are turned into an alignment by
    ``get_alignment`` and that alignment is passed to FastSpeech as
    ``length_target`` and used as the target of the duration loss.

    Args:
        cfg: parsed configuration namespace. Fields read here:
            ``use_data_parallel``, ``use_gpu``, ``log_dir``, ``transtts_path``,
            ``transformer_step``, ``warm_up_step``, ``lr``, ``checkpoint_path``,
            ``epochs``, ``transformer_head``, ``grad_clip_thresh``,
            ``save_step`` and ``save_path``. The whole namespace is also
            passed to the model and data-loader constructors.
    """
    local_rank = dg.parallel.Env().local_rank if cfg.use_data_parallel else 0
    nranks = dg.parallel.Env().nranks if cfg.use_data_parallel else 1

    if local_rank == 0:
        # Print the whole config setting.
        pprint(jsonargparse.namespace_to_dict(cfg))

    global_step = 0
    # NOTE: the conditional chain parses as
    # ``A if use_data_parallel else (B if use_gpu else C)``, so a CUDA place
    # is selected whenever data parallelism is enabled.
    place = (fluid.CUDAPlace(dg.parallel.Env().dev_id)
             if cfg.use_data_parallel else fluid.CUDAPlace(0)
             if cfg.use_gpu else fluid.CPUPlace())

    # Every rank executes this line; ``exist_ok`` avoids the race between an
    # existence check and ``mkdir``, and nested log paths are created too.
    os.makedirs(cfg.log_dir, exist_ok=True)
    path = os.path.join(cfg.log_dir,'fastspeech')

    # Only rank 0 writes tensorboard summaries.
    writer = SummaryWriter(path) if local_rank == 0 else None

    with dg.guard(place):
        # Pretrained TransformerTTS: weights are loaded from
        # <transtts_path>/transformer/<transformer_step>.
        transformerTTS = TransformerTTS(cfg)
        model_path = os.path.join(cfg.transtts_path, "transformer")
        model_dict, _ = fluid.dygraph.load_dygraph(os.path.join(model_path, str(cfg.transformer_step)))

        transformerTTS.set_dict(model_dict)
        transformerTTS.eval()

        model = FastSpeech(cfg)
        model.train()
        optimizer = fluid.optimizer.AdamOptimizer(learning_rate=dg.NoamDecay(1/(cfg.warm_up_step *( cfg.lr ** 2)), cfg.warm_up_step))

        reader = LJSpeechLoader(cfg, nranks, local_rank).reader()

        # Optionally resume both model and optimizer state.
        if cfg.checkpoint_path is not None:
            model_dict, opti_dict = fluid.dygraph.load_dygraph(cfg.checkpoint_path)
            model.set_dict(model_dict)
            optimizer.set_dict(opti_dict)
            print("load checkpoint!!!")

        if cfg.use_data_parallel:
            strategy = dg.parallel.prepare_context()
            model = MyDataParallel(model, strategy)

        for epoch in range(cfg.epochs):
            pbar = tqdm(reader)

            for data in pbar:
                pbar.set_description('Processing at epoch %d'%epoch)
                character, mel, mel_input, pos_text, pos_mel, text_length = data

                # Derive the duration targets from the pretrained model's
                # attention probabilities.
                _, _, attn_probs, _, _, _ = transformerTTS(character, mel_input, pos_text, pos_mel)
                alignment = dg.to_variable(get_alignment(attn_probs, cfg.transformer_head)).astype(np.float32)

                global_step += 1

                # Forward
                result = model(character,
                               pos_text,
                               mel_pos=pos_mel,
                               length_target=alignment)
                mel_output, mel_output_postnet, duration_predictor_output, _, _ = result
                mel_loss = layers.mse_loss(mel_output, mel)
                mel_postnet_loss = layers.mse_loss(mel_output_postnet, mel)
                # Mean absolute error between predicted durations and alignment.
                duration_loss = layers.mean(layers.abs(layers.elementwise_sub(duration_predictor_output, alignment)))
                total_loss = mel_loss + mel_postnet_loss + duration_loss

                if local_rank==0:
                    print('epoch:{}, step:{}, mel_loss:{}, mel_postnet_loss:{}, duration_loss:{}'.format(epoch, global_step, mel_loss.numpy(), mel_postnet_loss.numpy(), duration_loss.numpy()))

                    writer.add_scalar('mel_loss', mel_loss.numpy(), global_step)
                    writer.add_scalar('post_mel_loss', mel_postnet_loss.numpy(), global_step)
                    writer.add_scalar('duration_loss', duration_loss.numpy(), global_step)
                    writer.add_scalar('learning_rate', optimizer._learning_rate.step().numpy(), global_step)

                if cfg.use_data_parallel:
                    total_loss = model.scale_loss(total_loss)
                    total_loss.backward()
                    model.apply_collective_grads()
                else:
                    total_loss.backward()
                optimizer.minimize(total_loss, grad_clip = fluid.dygraph_grad_clip.GradClipByGlobalNorm(cfg.grad_clip_thresh))
                model.clear_gradients()

                # save checkpoint
                if local_rank==0 and global_step % cfg.save_step == 0:
                    os.makedirs(cfg.save_path, exist_ok=True)
                    save_path = os.path.join(cfg.save_path,'fastspeech/%d' % global_step)
                    dg.save_dygraph(model.state_dict(), save_path)
                    dg.save_dygraph(optimizer.state_dict(), save_path)

        if local_rank==0:
            writer.close()
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == '__main__':
    # Script entry point: build the parser, register the project options,
    # then parse a fixed argument list that points at the FastSpeech config.
    parser = jsonargparse.ArgumentParser(
        description="Train Fastspeech model",
        formatter_class='default_argparse',
    )
    add_config_options_to_parser(parser)

    argv = '-c config/fastspeech.yaml'.split()
    cfg = parser.parse_args(argv)
    main(cfg)
|
|
@ -0,0 +1,32 @@
|
||||||
|
import numpy as np
|
||||||
|
|
||||||
|
def get_alignment(attn_probs, n_head):
    """Pick the highest-scoring attention head and convert it to durations.

    Every element of ``attn_probs`` is converted with ``.numpy()`` and split
    along its first axis into ``n_head`` consecutive chunks of equal size
    (one chunk per head). Each chunk is scored with ``score_F``; the chunk
    with the highest score over all elements and heads is passed to
    ``compute_duration``.

    Args:
        attn_probs: non-empty sequence of objects exposing ``.shape`` and
            ``.numpy()``. Presumably one attention tensor per layer, laid
            out as (n_head * batch, query_len, key_len) -- confirm against
            the model that produces them.
        n_head: number of heads stacked along the first axis.

    Returns:
        The array returned by ``compute_duration`` for the selected head.

    Raises:
        AssertionError: if the first axis is not divisible by ``n_head``.
        ValueError: if no head yields a comparable score (e.g. all NaN).
    """
    assert attn_probs[0].shape[0] % n_head == 0
    batch_size = int(attn_probs[0].shape[0] // n_head)

    # Start below any real score so a head is always selected, even when
    # every score is 0 (the old ``max_F = 0`` start left the result unbound).
    best_score = -np.inf
    best_attn = None
    for layer_attn in attn_probs:
        multi_attn = layer_attn.numpy()
        for j in range(n_head):
            attn = multi_attn[j*batch_size:(j+1)*batch_size]
            score = score_F(attn)
            if best_score < score:
                best_score = score
                best_attn = attn

    if best_attn is None:
        raise ValueError("no attention head produced a comparable focus score")

    return compute_duration(best_attn)
|
||||||
|
|
||||||
|
def score_F(attn):
    """Return the mean, over all leading axes, of the maxima along the last axis.

    Args:
        attn: array-like of attention weights.

    Returns:
        A scalar: ``mean(max(attn, axis=-1))``.
    """
    # Largest weight along the last axis, then averaged into one number.
    peak_per_row = np.max(attn, axis=-1)
    return np.mean(peak_per_row)
|
||||||
|
|
||||||
|
def compute_duration(attn):
    """Count, per last-axis position, how many rows peak at that position.

    For every ``attn[i, j]`` the index of its maximum along the last axis is
    found (the first index on ties) and the counter ``alignment[i, index]``
    is incremented.

    Args:
        attn: 3-D numpy array. Presumably (batch, query_len, key_len)
            attention weights -- confirm against the caller.

    Returns:
        A float64 array of shape ``(attn.shape[0], attn.shape[2])`` holding
        the counts; each row sums to ``attn.shape[1]``.
    """
    alignment = np.zeros([attn.shape[0], attn.shape[2]])

    # argmax returns the first maximal index, matching list.index(max).
    peak_index = np.argmax(attn, axis=-1)  # shape (attn.shape[0], attn.shape[1])
    batch_index = np.arange(attn.shape[0])[:, None]

    # Unbuffered add, so repeated (batch, index) pairs all accumulate.
    np.add.at(alignment, (batch_index, peak_index), 1)

    return alignment
|
||||||
|
|
||||||
|
|
|
@ -10,9 +10,8 @@ audio:
|
||||||
ref_level_db: 20
|
ref_level_db: 20
|
||||||
outputs_per_step: 1
|
outputs_per_step: 1
|
||||||
|
|
||||||
network:
|
hidden_size: 256
|
||||||
hidden_size: 256
|
embedding_size: 512
|
||||||
embedding_size: 512
|
|
||||||
|
|
||||||
|
|
||||||
batch_size: 32
|
batch_size: 32
|
||||||
|
|
|
@ -10,15 +10,15 @@ audio:
|
||||||
ref_level_db: 20
|
ref_level_db: 20
|
||||||
outputs_per_step: 1
|
outputs_per_step: 1
|
||||||
|
|
||||||
network:
|
|
||||||
hidden_size: 256
|
hidden_size: 384 #256
|
||||||
embedding_size: 512
|
embedding_size: 384 #512
|
||||||
|
|
||||||
|
|
||||||
batch_size: 32
|
batch_size: 32
|
||||||
epochs: 10000
|
epochs: 10000
|
||||||
lr: 0.001
|
lr: 0.001
|
||||||
save_step: 500
|
save_step: 10
|
||||||
image_step: 2000
|
image_step: 2000
|
||||||
use_gpu: True
|
use_gpu: True
|
||||||
use_data_parallel: True
|
use_data_parallel: True
|
||||||
|
|
|
@ -3,10 +3,10 @@ import numpy as np
|
||||||
from paddle import fluid
|
from paddle import fluid
|
||||||
from parakeet.data.sampler import DistributedSampler
|
from parakeet.data.sampler import DistributedSampler
|
||||||
from parakeet.data.datacargo import DataCargo
|
from parakeet.data.datacargo import DataCargo
|
||||||
from preprocess import batch_examples, LJSpeech, batch_examples_postnet
|
from preprocess import batch_examples, LJSpeech, batch_examples_vocoder
|
||||||
|
|
||||||
class LJSpeechLoader:
|
class LJSpeechLoader:
|
||||||
def __init__(self, config, nranks, rank, is_postnet=False):
|
def __init__(self, config, nranks, rank, is_vocoder=False):
|
||||||
place = fluid.CUDAPlace(rank) if config.use_gpu else fluid.CPUPlace()
|
place = fluid.CUDAPlace(rank) if config.use_gpu else fluid.CPUPlace()
|
||||||
|
|
||||||
LJSPEECH_ROOT = Path(config.data_path)
|
LJSPEECH_ROOT = Path(config.data_path)
|
||||||
|
@ -15,8 +15,8 @@ class LJSpeechLoader:
|
||||||
|
|
||||||
assert config.batch_size % nranks == 0
|
assert config.batch_size % nranks == 0
|
||||||
each_bs = config.batch_size // nranks
|
each_bs = config.batch_size // nranks
|
||||||
if is_postnet:
|
if is_vocoder:
|
||||||
dataloader = DataCargo(dataset, sampler=sampler, batch_size=each_bs, shuffle=True, collate_fn=batch_examples_postnet, drop_last=True)
|
dataloader = DataCargo(dataset, sampler=sampler, batch_size=each_bs, shuffle=True, collate_fn=batch_examples_vocoder, drop_last=True)
|
||||||
else:
|
else:
|
||||||
dataloader = DataCargo(dataset, sampler=sampler, batch_size=each_bs, shuffle=True, collate_fn=batch_examples, drop_last=True)
|
dataloader = DataCargo(dataset, sampler=sampler, batch_size=each_bs, shuffle=True, collate_fn=batch_examples, drop_last=True)
|
||||||
|
|
||||||
|
|
|
@ -14,7 +14,6 @@ class Conv1D(dg.Layer):
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self,
|
def __init__(self,
|
||||||
name_scope,
|
|
||||||
in_channels,
|
in_channels,
|
||||||
num_filters,
|
num_filters,
|
||||||
filter_size=3,
|
filter_size=3,
|
||||||
|
@ -28,7 +27,7 @@ class Conv1D(dg.Layer):
|
||||||
act=None,
|
act=None,
|
||||||
data_format='NCT',
|
data_format='NCT',
|
||||||
dtype="float32"):
|
dtype="float32"):
|
||||||
super(Conv1D, self).__init__(name_scope, dtype=dtype)
|
super(Conv1D, self).__init__(dtype=dtype)
|
||||||
|
|
||||||
self.padding = padding
|
self.padding = padding
|
||||||
self.in_channels = in_channels
|
self.in_channels = in_channels
|
||||||
|
@ -41,7 +40,7 @@ class Conv1D(dg.Layer):
|
||||||
self.data_format = data_format
|
self.data_format = data_format
|
||||||
|
|
||||||
self.conv = dg.Conv2D(
|
self.conv = dg.Conv2D(
|
||||||
self.full_name(),
|
in_channels=in_channels,
|
||||||
num_filters=num_filters,
|
num_filters=num_filters,
|
||||||
filter_size=(1, filter_size),
|
filter_size=(1, filter_size),
|
||||||
stride=(1, stride),
|
stride=(1, stride),
|
||||||
|
@ -77,7 +76,6 @@ class Pool1D(dg.Layer):
|
||||||
A Pool 1D block implemented with Pool2D.
|
A Pool 1D block implemented with Pool2D.
|
||||||
"""
|
"""
|
||||||
def __init__(self,
|
def __init__(self,
|
||||||
name_scope,
|
|
||||||
pool_size=-1,
|
pool_size=-1,
|
||||||
pool_type='max',
|
pool_type='max',
|
||||||
pool_stride=1,
|
pool_stride=1,
|
||||||
|
@ -88,7 +86,7 @@ class Pool1D(dg.Layer):
|
||||||
exclusive=True,
|
exclusive=True,
|
||||||
data_format='NCT',
|
data_format='NCT',
|
||||||
dtype='float32'):
|
dtype='float32'):
|
||||||
super(Pool1D, self).__init__(name_scope, dtype=dtype)
|
super(Pool1D, self).__init__(dtype=dtype)
|
||||||
self.pool_size = pool_size
|
self.pool_size = pool_size
|
||||||
self.pool_type = pool_type
|
self.pool_type = pool_type
|
||||||
self.pool_stride = pool_stride
|
self.pool_stride = pool_stride
|
||||||
|
@ -101,7 +99,7 @@ class Pool1D(dg.Layer):
|
||||||
self.dtype = dtype
|
self.dtype = dtype
|
||||||
|
|
||||||
|
|
||||||
self.pool2d = dg.Pool2D(self.full_name(), [1,pool_size], pool_type = pool_type,
|
self.pool2d = dg.Pool2D([1,pool_size], pool_type = pool_type,
|
||||||
pool_stride = [1,pool_stride], pool_padding = [0, pool_padding],
|
pool_stride = [1,pool_stride], pool_padding = [0, pool_padding],
|
||||||
global_pooling = global_pooling, use_cudnn = use_cudnn,
|
global_pooling = global_pooling, use_cudnn = use_cudnn,
|
||||||
ceil_mode = ceil_mode, exclusive = exclusive, dtype = dtype)
|
ceil_mode = ceil_mode, exclusive = exclusive, dtype = dtype)
|
||||||
|
@ -127,7 +125,6 @@ class Pool1D(dg.Layer):
|
||||||
|
|
||||||
class DynamicGRU(dg.Layer):
|
class DynamicGRU(dg.Layer):
|
||||||
def __init__(self,
|
def __init__(self,
|
||||||
scope_name,
|
|
||||||
size,
|
size,
|
||||||
param_attr=None,
|
param_attr=None,
|
||||||
bias_attr=None,
|
bias_attr=None,
|
||||||
|
@ -137,9 +134,8 @@ class DynamicGRU(dg.Layer):
|
||||||
h_0=None,
|
h_0=None,
|
||||||
origin_mode=False,
|
origin_mode=False,
|
||||||
init_size=None):
|
init_size=None):
|
||||||
super(DynamicGRU, self).__init__(scope_name)
|
super(DynamicGRU, self).__init__()
|
||||||
self.gru_unit = dg.GRUUnit(
|
self.gru_unit = dg.GRUUnit(
|
||||||
self.full_name(),
|
|
||||||
size * 3,
|
size * 3,
|
||||||
param_attr=param_attr,
|
param_attr=param_attr,
|
||||||
bias_attr=bias_attr,
|
bias_attr=bias_attr,
|
||||||
|
|
|
@ -3,339 +3,63 @@ from parakeet.g2p.text.symbols import symbols
|
||||||
import paddle.fluid.dygraph as dg
|
import paddle.fluid.dygraph as dg
|
||||||
import paddle.fluid as fluid
|
import paddle.fluid as fluid
|
||||||
import paddle.fluid.layers as layers
|
import paddle.fluid.layers as layers
|
||||||
from layers import Conv1D, Pool1D, DynamicGRU
|
from parakeet.modules.layers import Conv1D, Pool1D
|
||||||
|
from parakeet.modules.dynamicGRU import DynamicGRU
|
||||||
import numpy as np
|
import numpy as np
|
||||||
|
|
||||||
class FC(dg.Layer):
|
|
||||||
def __init__(self, name_scope, in_features, out_features, is_bias=True, dtype="float32", gain=1):
|
|
||||||
super(FC, self).__init__(name_scope)
|
|
||||||
self.in_features = in_features
|
|
||||||
self.out_features = out_features
|
|
||||||
self.is_bias = is_bias
|
|
||||||
self.dtype = dtype
|
|
||||||
self.gain = gain
|
|
||||||
|
|
||||||
self.weight = self.create_parameter(fluid.ParamAttr(name='weight'), shape=(in_features, out_features),
|
|
||||||
dtype=dtype,
|
|
||||||
default_initializer = fluid.initializer.XavierInitializer())
|
|
||||||
#self.weight = gain * self.weight
|
|
||||||
# mind the implicit conversion to ParamAttr for many cases
|
|
||||||
if is_bias is not False:
|
|
||||||
k = math.sqrt(1 / in_features)
|
|
||||||
self.bias = self.create_parameter(fluid.ParamAttr(name='bias'), shape=(out_features, ),
|
|
||||||
is_bias=True,
|
|
||||||
dtype=dtype,
|
|
||||||
default_initializer = fluid.initializer.Uniform(low=-k, high=k))
|
|
||||||
|
|
||||||
# 默认初始化权重使用 Xavier 的方法,偏置使用均匀分布,范围是(-\sqrt{k},/sqrt{k}),k=1/infeature
|
|
||||||
|
|
||||||
def forward(self, x):
|
|
||||||
x = fluid.layers.matmul(x, self.weight)
|
|
||||||
if hasattr(self, "bias"):
|
|
||||||
x = fluid.layers.elementwise_add(x, self.bias)
|
|
||||||
return x
|
|
||||||
|
|
||||||
class Conv(dg.Layer):
|
|
||||||
def __init__(self, name_scope, in_channels, out_channels, filter_size=1,
|
|
||||||
padding=0, dilation=1, stride=1, use_cudnn=True,
|
|
||||||
data_format="NCT", is_bias=True, gain=1):
|
|
||||||
super(Conv, self).__init__(name_scope)
|
|
||||||
self.in_channels = in_channels
|
|
||||||
self.out_channels = out_channels
|
|
||||||
self.filter_size = filter_size
|
|
||||||
self.padding = padding
|
|
||||||
self.dilation = dilation
|
|
||||||
self.stride = stride
|
|
||||||
self.use_cudnn = use_cudnn
|
|
||||||
self.data_format = data_format
|
|
||||||
self.is_bias = is_bias
|
|
||||||
self.gain = gain
|
|
||||||
|
|
||||||
self.weight_attr = fluid.ParamAttr(name='weight', initializer=fluid.initializer.XavierInitializer())
|
|
||||||
self.bias_attr = None
|
|
||||||
if is_bias is not False:
|
|
||||||
k = math.sqrt(1 / in_channels)
|
|
||||||
self.bias_attr = fluid.ParamAttr(name='bias', initializer=fluid.initializer.Uniform(low=-k, high=k))
|
|
||||||
|
|
||||||
self.conv = Conv1D( self.full_name(),
|
|
||||||
in_channels = in_channels,
|
|
||||||
num_filters = out_channels,
|
|
||||||
filter_size = filter_size,
|
|
||||||
padding = padding,
|
|
||||||
dilation = dilation,
|
|
||||||
stride = stride,
|
|
||||||
param_attr = self.weight_attr,
|
|
||||||
bias_attr = self.bias_attr,
|
|
||||||
use_cudnn = use_cudnn,
|
|
||||||
data_format = data_format)
|
|
||||||
|
|
||||||
def forward(self, x):
|
|
||||||
x = self.conv(x)
|
|
||||||
return x
|
|
||||||
|
|
||||||
class EncoderPrenet(dg.Layer):
|
class EncoderPrenet(dg.Layer):
|
||||||
def __init__(self, name_scope, embedding_size, num_hidden, use_cudnn=True):
|
def __init__(self, embedding_size, num_hidden, use_cudnn=True):
|
||||||
super(EncoderPrenet, self).__init__(name_scope)
|
super(EncoderPrenet, self).__init__()
|
||||||
self.embedding_size = embedding_size
|
self.embedding_size = embedding_size
|
||||||
self.num_hidden = num_hidden
|
self.num_hidden = num_hidden
|
||||||
self.use_cudnn = use_cudnn
|
self.use_cudnn = use_cudnn
|
||||||
self.embedding = dg.Embedding(self.full_name(),
|
self.embedding = dg.Embedding( size = [len(symbols), embedding_size],
|
||||||
size = [len(symbols), embedding_size],
|
|
||||||
param_attr = fluid.ParamAttr(name='weight'),
|
param_attr = fluid.ParamAttr(name='weight'),
|
||||||
padding_idx = None)
|
padding_idx = None)
|
||||||
self.conv1 = Conv(self.full_name(),
|
self.conv_list = []
|
||||||
in_channels = embedding_size,
|
self.conv_list.append(Conv1D(in_channels = embedding_size,
|
||||||
out_channels = num_hidden,
|
out_channels = num_hidden,
|
||||||
filter_size = 5,
|
filter_size = 5,
|
||||||
padding = int(np.floor(5/2)),
|
padding = int(np.floor(5/2)),
|
||||||
use_cudnn = use_cudnn,
|
use_cudnn = use_cudnn,
|
||||||
data_format = "NCT",
|
data_format = "NCT"))
|
||||||
gain = math.sqrt(2))
|
for _ in range(2):
|
||||||
self.conv2 = Conv(self.full_name(),
|
self.conv_list = Conv1D(in_channels = num_hidden,
|
||||||
in_channels = num_hidden,
|
out_channels = num_hidden,
|
||||||
out_channels = num_hidden,
|
filter_size = 5,
|
||||||
filter_size = 5,
|
padding = int(np.floor(5/2)),
|
||||||
padding = int(np.floor(5/2)),
|
use_cudnn = use_cudnn,
|
||||||
use_cudnn = use_cudnn,
|
data_format = "NCT")
|
||||||
data_format = "NCT",
|
|
||||||
gain = math.sqrt(2))
|
|
||||||
self.conv3 = Conv(self.full_name(),
|
|
||||||
in_channels = num_hidden,
|
|
||||||
out_channels = num_hidden,
|
|
||||||
filter_size = 5,
|
|
||||||
padding = int(np.floor(5/2)),
|
|
||||||
use_cudnn = use_cudnn,
|
|
||||||
data_format = "NCT",
|
|
||||||
gain = math.sqrt(2))
|
|
||||||
|
|
||||||
self.batch_norm1 = dg.BatchNorm(self.full_name(), num_hidden,
|
|
||||||
param_attr = fluid.ParamAttr(name='weight'),
|
|
||||||
bias_attr = fluid.ParamAttr(name='bias'),
|
|
||||||
moving_mean_name = 'moving_mean',
|
|
||||||
moving_variance_name = 'moving_var',
|
|
||||||
data_layout='NCHW')
|
|
||||||
self.batch_norm2 = dg.BatchNorm(self.full_name(), num_hidden,
|
|
||||||
param_attr = fluid.ParamAttr(name='weight'),
|
|
||||||
bias_attr = fluid.ParamAttr(name='bias'),
|
|
||||||
moving_mean_name = 'moving_mean',
|
|
||||||
moving_variance_name = 'moving_var',
|
|
||||||
data_layout='NCHW')
|
|
||||||
self.batch_norm3 = dg.BatchNorm(self.full_name(), num_hidden,
|
|
||||||
param_attr = fluid.ParamAttr(name='weight'),
|
|
||||||
bias_attr = fluid.ParamAttr(name='bias'),
|
|
||||||
moving_mean_name = 'moving_mean',
|
|
||||||
moving_variance_name = 'moving_var',
|
|
||||||
data_layout='NCHW')
|
|
||||||
|
|
||||||
self.projection = FC(self.full_name(), num_hidden, num_hidden)
|
|
||||||
|
|
||||||
def forward(self, x):
|
|
||||||
x = self.embedding(x) #(batch_size, seq_len, embending_size)
|
|
||||||
x = layers.transpose(x,[0,2,1])
|
|
||||||
x = layers.dropout(layers.relu(self.batch_norm1(self.conv1(x))), 0.2)
|
|
||||||
x = layers.dropout(layers.relu(self.batch_norm2(self.conv2(x))), 0.2)
|
|
||||||
x = layers.dropout(layers.relu(self.batch_norm3(self.conv3(x))), 0.2)
|
|
||||||
x = layers.transpose(x,[0,2,1]) #(N,T,C)
|
|
||||||
x = self.projection(x)
|
|
||||||
return x
|
|
||||||
|
|
||||||
class FFN(dg.Layer):
|
|
||||||
def __init__(self, name_scope, num_hidden, use_cudnn=True):
|
|
||||||
super(FFN, self).__init__(name_scope)
|
|
||||||
self.num_hidden = num_hidden
|
|
||||||
self.use_cudnn = use_cudnn
|
|
||||||
self.w_1 = Conv(self.full_name(),
|
|
||||||
in_channels = num_hidden,
|
|
||||||
out_channels = num_hidden * 4,
|
|
||||||
filter_size = 1,
|
|
||||||
use_cudnn = use_cudnn,
|
|
||||||
data_format = "NCT",
|
|
||||||
gain = math.sqrt(2))
|
|
||||||
self.w_2 = Conv(self.full_name(),
|
|
||||||
in_channels = num_hidden * 4,
|
|
||||||
out_channels = num_hidden,
|
|
||||||
filter_size = 1,
|
|
||||||
use_cudnn = use_cudnn,
|
|
||||||
data_format = "NCT",
|
|
||||||
gain = math.sqrt(2))
|
|
||||||
self.layer_norm = dg.LayerNorm(self.full_name(), begin_norm_axis=2)
|
|
||||||
|
|
||||||
def forward(self, input):
|
|
||||||
#FFN Networt
|
|
||||||
x = layers.transpose(input, [0,2,1])
|
|
||||||
x = self.w_2(layers.relu(self.w_1(x)))
|
|
||||||
x = layers.transpose(x,[0,2,1])
|
|
||||||
|
|
||||||
# dropout
|
|
||||||
# x = layers.dropout(x, 0.1)
|
|
||||||
# not sure where dropout should be placed, in paper should before residual,
|
|
||||||
# but the diagonal alignment did not appear correctly in the attention plot.
|
|
||||||
|
|
||||||
# residual connection
|
|
||||||
x = x + input
|
|
||||||
|
|
||||||
|
|
||||||
#layer normalization
|
|
||||||
x = self.layer_norm(x)
|
|
||||||
|
|
||||||
return x
|
|
||||||
|
|
||||||
class DecoderPrenet(dg.Layer):
|
|
||||||
def __init__(self, name_scope, input_size, hidden_size, output_size, dropout_rate=0.5):
|
|
||||||
super(DecoderPrenet, self).__init__(name_scope)
|
|
||||||
self.input_size = input_size
|
|
||||||
self.hidden_size = hidden_size
|
|
||||||
self.output_size = output_size
|
|
||||||
self.dropout_rate = dropout_rate
|
|
||||||
|
|
||||||
self.fc1 = FC(self.full_name(), input_size, hidden_size) #in pytorch this gian=1
|
|
||||||
self.fc2 = FC(self.full_name(), hidden_size, output_size)
|
|
||||||
|
|
||||||
def forward(self, x):
|
|
||||||
x = layers.dropout(layers.relu(self.fc1(x)), self.dropout_rate)
|
|
||||||
x = layers.dropout(layers.relu(self.fc2(x)), self.dropout_rate)
|
|
||||||
return x
|
|
||||||
|
|
||||||
class ScaledDotProductAttention(dg.Layer):
|
|
||||||
def __init__(self, name_scope, d_key):
|
|
||||||
super(ScaledDotProductAttention, self).__init__(name_scope)
|
|
||||||
|
|
||||||
self.d_key = d_key
|
|
||||||
|
|
||||||
# please attention this mask is diff from pytorch
|
|
||||||
def forward(self, key, value, query, mask=None, query_mask=None):
|
|
||||||
# Compute attention score
|
|
||||||
attention = layers.matmul(query, key, transpose_y=True) #transpose the last dim in y
|
|
||||||
attention = attention / math.sqrt(self.d_key)
|
|
||||||
|
|
||||||
# Mask key to ignore padding
|
|
||||||
if mask is not None:
|
|
||||||
attention = attention * mask
|
|
||||||
mask = (mask == 0).astype(np.float32) * (-2 ** 32 + 1)
|
|
||||||
attention = attention + mask
|
|
||||||
|
|
||||||
|
|
||||||
attention = layers.softmax(attention)
|
|
||||||
# Mask query to ignore padding
|
|
||||||
# Not sure how to work
|
|
||||||
if query_mask is not None:
|
|
||||||
attention = attention * query_mask
|
|
||||||
|
|
||||||
result = layers.matmul(attention, value)
|
|
||||||
return result, attention
|
|
||||||
|
|
||||||
class MultiheadAttention(dg.Layer):
|
|
||||||
def __init__(self, name_scope, num_hidden, num_head=4):
|
|
||||||
super(MultiheadAttention, self).__init__(name_scope)
|
|
||||||
self.num_hidden = num_hidden
|
|
||||||
self.num_hidden_per_attn = num_hidden // num_head
|
|
||||||
self.num_head = num_head
|
|
||||||
|
|
||||||
self.key = FC(self.full_name(), num_hidden, num_hidden, is_bias=False)
|
|
||||||
self.value = FC(self.full_name(), num_hidden, num_hidden, is_bias=False)
|
|
||||||
self.query = FC(self.full_name(), num_hidden, num_hidden, is_bias=False)
|
|
||||||
|
|
||||||
self.scal_attn = ScaledDotProductAttention(self.full_name(), self.num_hidden_per_attn)
|
|
||||||
|
|
||||||
self.fc = FC(self.full_name(), num_hidden * 2, num_hidden)
|
|
||||||
|
|
||||||
self.layer_norm = dg.LayerNorm(self.full_name(), begin_norm_axis=2)
|
|
||||||
|
|
||||||
def forward(self, key, value, query_input, mask=None, query_mask=None):
|
|
||||||
batch_size = key.shape[0]
|
|
||||||
seq_len_key = key.shape[1]
|
|
||||||
seq_len_query = query_input.shape[1]
|
|
||||||
|
|
||||||
# repeat masks h times
|
|
||||||
if query_mask is not None:
|
|
||||||
query_mask = layers.unsqueeze(query_mask, axes=[-1])
|
|
||||||
query_mask = layers.expand(query_mask, [self.num_head, 1, seq_len_key])
|
|
||||||
if mask is not None:
|
|
||||||
mask = layers.expand(mask, (self.num_head, 1, 1))
|
|
||||||
|
|
||||||
# Make multihead attention
|
|
||||||
# key & value.shape = (batch_size, seq_len, feature)(feature = num_head * num_hidden_per_attn)
|
|
||||||
key = layers.reshape(self.key(key), [batch_size, seq_len_key, self.num_head, self.num_hidden_per_attn])
|
|
||||||
value = layers.reshape(self.value(value), [batch_size, seq_len_key, self.num_head, self.num_hidden_per_attn])
|
|
||||||
query = layers.reshape(self.query(query_input), [batch_size, seq_len_query, self.num_head, self.num_hidden_per_attn])
|
|
||||||
|
|
||||||
key = layers.reshape(layers.transpose(key, [2, 0, 1, 3]), [-1, seq_len_key, self.num_hidden_per_attn])
|
|
||||||
value = layers.reshape(layers.transpose(value, [2, 0, 1, 3]), [-1, seq_len_key, self.num_hidden_per_attn])
|
|
||||||
query = layers.reshape(layers.transpose(query, [2, 0, 1, 3]), [-1, seq_len_query, self.num_hidden_per_attn])
|
|
||||||
|
|
||||||
result, attention = self.scal_attn(key, value, query, mask=mask, query_mask=query_mask)
|
|
||||||
|
|
||||||
# concat all multihead result
|
|
||||||
result = layers.reshape(result, [self.num_head, batch_size, seq_len_query, self.num_hidden_per_attn])
|
|
||||||
result = layers.reshape(layers.transpose(result, [1,2,0,3]),[batch_size, seq_len_query, -1])
|
|
||||||
#print(result.().shape)
|
|
||||||
# concat result with input
|
|
||||||
result = layers.concat([query_input, result], axis=-1)
|
|
||||||
|
|
||||||
result = self.fc(result)
|
|
||||||
result = result + query_input
|
|
||||||
|
|
||||||
result = self.layer_norm(result)
|
|
||||||
return result, attention
|
|
||||||
|
|
||||||
class PostConvNet(dg.Layer):
|
|
||||||
def __init__(self, name_scope, config):
|
|
||||||
super(PostConvNet, self).__init__(name_scope)
|
|
||||||
|
|
||||||
num_hidden = config.network.hidden_size
|
|
||||||
self.num_hidden = num_hidden
|
|
||||||
self.conv1 = Conv(self.full_name(),
|
|
||||||
in_channels = config.audio.num_mels * config.audio.outputs_per_step,
|
|
||||||
out_channels = num_hidden,
|
|
||||||
filter_size = 5,
|
|
||||||
padding = 4,
|
|
||||||
use_cudnn = config.use_gpu,
|
|
||||||
data_format = "NCT",
|
|
||||||
gain = 5 / 3)
|
|
||||||
self.conv_list = [Conv(self.full_name(),
|
|
||||||
in_channels = num_hidden,
|
|
||||||
out_channels = num_hidden,
|
|
||||||
filter_size = 5,
|
|
||||||
padding = 4,
|
|
||||||
use_cudnn = config.use_gpu,
|
|
||||||
data_format = "NCT",
|
|
||||||
gain = 5 / 3) for _ in range(3)]
|
|
||||||
for i, layer in enumerate(self.conv_list):
|
for i, layer in enumerate(self.conv_list):
|
||||||
self.add_sublayer("conv_list_{}".format(i), layer)
|
self.add_sublayer("conv_list_{}".format(i), layer)
|
||||||
self.conv5 = Conv(self.full_name(),
|
|
||||||
in_channels = num_hidden,
|
|
||||||
out_channels = config.audio.num_mels * config.audio.outputs_per_step,
|
|
||||||
filter_size = 5,
|
|
||||||
padding = 4,
|
|
||||||
use_cudnn = config.use_gpu,
|
|
||||||
data_format = "NCT")
|
|
||||||
|
|
||||||
self.batch_norm_list = [dg.BatchNorm(self.full_name(), num_hidden,
|
self.batch_norm_list = [dg.BatchNorm(num_hidden,
|
||||||
param_attr = fluid.ParamAttr(name='weight'),
|
param_attr = fluid.ParamAttr(name='weight'),
|
||||||
bias_attr = fluid.ParamAttr(name='bias'),
|
bias_attr = fluid.ParamAttr(name='bias'),
|
||||||
moving_mean_name = 'moving_mean',
|
moving_mean_name = 'moving_mean',
|
||||||
moving_variance_name = 'moving_var',
|
moving_variance_name = 'moving_var',
|
||||||
data_layout='NCHW') for _ in range(3)]
|
data_layout='NCHW') for _ in range(3)]
|
||||||
|
|
||||||
for i, layer in enumerate(self.batch_norm_list):
|
for i, layer in enumerate(self.batch_norm_list):
|
||||||
self.add_sublayer("batch_norm_list_{}".format(i), layer)
|
self.add_sublayer("batch_norm_list_{}".format(i), layer)
|
||||||
self.batch_norm1 = dg.BatchNorm(self.full_name(), num_hidden,
|
|
||||||
param_attr = fluid.ParamAttr(name='weight'),
|
|
||||||
bias_attr = fluid.ParamAttr(name='bias'),
|
|
||||||
moving_mean_name = 'moving_mean',
|
|
||||||
moving_variance_name = 'moving_var',
|
|
||||||
data_layout='NCHW')
|
|
||||||
|
|
||||||
def forward(self, input):
|
self.projection = dg.Linear(num_hidden, num_hidden)
|
||||||
input = layers.dropout(layers.tanh(self.batch_norm1(self.conv1(input)[:, :, :-4])),0.1)
|
|
||||||
|
def forward(self, x):
|
||||||
|
x = self.embedding(x) #(batch_size, seq_len, embending_size)
|
||||||
|
x = layers.transpose(x,[0,2,1])
|
||||||
for batch_norm, conv in zip(self.batch_norm_list, self.conv_list):
|
for batch_norm, conv in zip(self.batch_norm_list, self.conv_list):
|
||||||
input = layers.dropout(layers.tanh(batch_norm(conv(input)[:, :, :-4])),0.1)
|
x = layers.dropout(layers.relu(batch_norm(conv(x))), 0.2)
|
||||||
input = self.conv5(input)[:, :, :-4]
|
x = layers.transpose(x,[0,2,1]) #(N,T,C)
|
||||||
return input
|
x = self.projection(x)
|
||||||
|
return x
|
||||||
|
|
||||||
class CBHG(dg.Layer):
|
class CBHG(dg.Layer):
|
||||||
def __init__(self, name_scope, config, K=16, projection_size = 256, num_gru_layers=2,
|
def __init__(self, hidden_size, batch_size, K=16, projection_size = 256, num_gru_layers=2,
|
||||||
max_pool_kernel_size=2, is_post=False):
|
max_pool_kernel_size=2, is_post=False):
|
||||||
super(CBHG, self).__init__(name_scope)
|
super(CBHG, self).__init__()
|
||||||
"""
|
"""
|
||||||
:param hidden_size: dimension of hidden unit
|
:param hidden_size: dimension of hidden unit
|
||||||
:param K: # of convolution banks
|
:param K: # of convolution banks
|
||||||
|
@ -344,19 +68,16 @@ class CBHG(dg.Layer):
|
||||||
:param max_pool_kernel_size: max pooling kernel size
|
:param max_pool_kernel_size: max pooling kernel size
|
||||||
:param is_post: whether post processing or not
|
:param is_post: whether post processing or not
|
||||||
"""
|
"""
|
||||||
hidden_size = config.network.hidden_size
|
|
||||||
self.hidden_size = hidden_size
|
self.hidden_size = hidden_size
|
||||||
self.projection_size = projection_size
|
self.projection_size = projection_size
|
||||||
self.conv_list = []
|
self.conv_list = []
|
||||||
self.conv_list.append(Conv(self.full_name(),
|
self.conv_list.append(Conv1D(in_channels = projection_size,
|
||||||
in_channels = projection_size,
|
|
||||||
out_channels = hidden_size,
|
out_channels = hidden_size,
|
||||||
filter_size = 1,
|
filter_size = 1,
|
||||||
padding = int(np.floor(1/2)),
|
padding = int(np.floor(1/2)),
|
||||||
data_format = "NCT"))
|
data_format = "NCT"))
|
||||||
for i in range(2,K+1):
|
for i in range(2,K+1):
|
||||||
self.conv_list.append(Conv(self.full_name(),
|
self.conv_list.append(Conv1D(in_channels = hidden_size,
|
||||||
in_channels = hidden_size,
|
|
||||||
out_channels = hidden_size,
|
out_channels = hidden_size,
|
||||||
filter_size = i,
|
filter_size = i,
|
||||||
padding = int(np.floor(i/2)),
|
padding = int(np.floor(i/2)),
|
||||||
|
@ -367,7 +88,7 @@ class CBHG(dg.Layer):
|
||||||
|
|
||||||
self.batchnorm_list = []
|
self.batchnorm_list = []
|
||||||
for i in range(K):
|
for i in range(K):
|
||||||
self.batchnorm_list.append(dg.BatchNorm(self.full_name(), hidden_size,
|
self.batchnorm_list.append(dg.BatchNorm(hidden_size,
|
||||||
param_attr = fluid.ParamAttr(name='weight'),
|
param_attr = fluid.ParamAttr(name='weight'),
|
||||||
bias_attr = fluid.ParamAttr(name='bias'),
|
bias_attr = fluid.ParamAttr(name='bias'),
|
||||||
moving_mean_name = 'moving_mean',
|
moving_mean_name = 'moving_mean',
|
||||||
|
@ -379,69 +100,63 @@ class CBHG(dg.Layer):
|
||||||
|
|
||||||
conv_outdim = hidden_size * K
|
conv_outdim = hidden_size * K
|
||||||
|
|
||||||
self.conv_projection_1 = Conv(self.full_name(),
|
self.conv_projection_1 = Conv1D(in_channels = conv_outdim,
|
||||||
in_channels = conv_outdim,
|
|
||||||
out_channels = hidden_size,
|
out_channels = hidden_size,
|
||||||
filter_size = 3,
|
filter_size = 3,
|
||||||
padding = int(np.floor(3/2)),
|
padding = int(np.floor(3/2)),
|
||||||
data_format = "NCT")
|
data_format = "NCT")
|
||||||
|
|
||||||
self.conv_projection_2 = Conv(self.full_name(),
|
self.conv_projection_2 = Conv1D(in_channels = hidden_size,
|
||||||
in_channels = hidden_size,
|
|
||||||
out_channels = projection_size,
|
out_channels = projection_size,
|
||||||
filter_size = 3,
|
filter_size = 3,
|
||||||
padding = int(np.floor(3/2)),
|
padding = int(np.floor(3/2)),
|
||||||
data_format = "NCT")
|
data_format = "NCT")
|
||||||
|
|
||||||
self.batchnorm_proj_1 = dg.BatchNorm(self.full_name(), hidden_size,
|
self.batchnorm_proj_1 = dg.BatchNorm(hidden_size,
|
||||||
param_attr = fluid.ParamAttr(name='weight'),
|
param_attr = fluid.ParamAttr(name='weight'),
|
||||||
bias_attr = fluid.ParamAttr(name='bias'),
|
bias_attr = fluid.ParamAttr(name='bias'),
|
||||||
moving_mean_name = 'moving_mean',
|
moving_mean_name = 'moving_mean',
|
||||||
moving_variance_name = 'moving_var',
|
moving_variance_name = 'moving_var',
|
||||||
data_layout='NCHW')
|
data_layout='NCHW')
|
||||||
self.batchnorm_proj_2 = dg.BatchNorm(self.full_name(), projection_size,
|
self.batchnorm_proj_2 = dg.BatchNorm(projection_size,
|
||||||
param_attr = fluid.ParamAttr(name='weight'),
|
param_attr = fluid.ParamAttr(name='weight'),
|
||||||
bias_attr = fluid.ParamAttr(name='bias'),
|
bias_attr = fluid.ParamAttr(name='bias'),
|
||||||
moving_mean_name = 'moving_mean',
|
moving_mean_name = 'moving_mean',
|
||||||
moving_variance_name = 'moving_var',
|
moving_variance_name = 'moving_var',
|
||||||
data_layout='NCHW')
|
data_layout='NCHW')
|
||||||
self.max_pool = Pool1D(self.full_name(), pool_size = max_pool_kernel_size,
|
self.max_pool = Pool1D(pool_size = max_pool_kernel_size,
|
||||||
pool_type='max',
|
pool_type='max',
|
||||||
pool_stride=1,
|
pool_stride=1,
|
||||||
pool_padding=1,
|
pool_padding=1,
|
||||||
data_format = "NCT")
|
data_format = "NCT")
|
||||||
self.highway = Highwaynet(self.full_name(), self.projection_size)
|
self.highway = Highwaynet(self.projection_size)
|
||||||
|
|
||||||
h_0 = np.zeros((config.batch_size, hidden_size // 2), dtype="float32")
|
h_0 = np.zeros((batch_size, hidden_size // 2), dtype="float32")
|
||||||
h_0 = dg.to_variable(h_0)
|
h_0 = dg.to_variable(h_0)
|
||||||
self.fc_forward1 = FC(self.full_name(), hidden_size, hidden_size // 2 * 3)
|
self.fc_forward1 = dg.Linear(hidden_size, hidden_size // 2 * 3)
|
||||||
self.fc_reverse1 = FC(self.full_name(), hidden_size, hidden_size // 2 * 3)
|
self.fc_reverse1 = dg.Linear(hidden_size, hidden_size // 2 * 3)
|
||||||
self.gru_forward1 = DynamicGRU(self.full_name(),
|
self.gru_forward1 = DynamicGRU(size = self.hidden_size // 2,
|
||||||
size = self.hidden_size // 2,
|
|
||||||
param_attr = fluid.ParamAttr(name='weight'),
|
param_attr = fluid.ParamAttr(name='weight'),
|
||||||
bias_attr = fluid.ParamAttr(name='bias'),
|
bias_attr = fluid.ParamAttr(name='bias'),
|
||||||
is_reverse = False,
|
is_reverse = False,
|
||||||
origin_mode = True,
|
origin_mode = True,
|
||||||
h_0 = h_0)
|
h_0 = h_0)
|
||||||
self.gru_reverse1 = DynamicGRU(self.full_name(),
|
self.gru_reverse1 = DynamicGRU(size = self.hidden_size // 2,
|
||||||
size = self.hidden_size // 2,
|
|
||||||
param_attr = fluid.ParamAttr(name='weight'),
|
param_attr = fluid.ParamAttr(name='weight'),
|
||||||
bias_attr = fluid.ParamAttr(name='bias'),
|
bias_attr = fluid.ParamAttr(name='bias'),
|
||||||
is_reverse=True,
|
is_reverse=True,
|
||||||
origin_mode=True,
|
origin_mode=True,
|
||||||
h_0 = h_0)
|
h_0 = h_0)
|
||||||
|
|
||||||
self.fc_forward2 = FC(self.full_name(), hidden_size, hidden_size // 2 * 3)
|
self.fc_forward2 = dg.Linear(hidden_size, hidden_size // 2 * 3)
|
||||||
self.fc_reverse2 = FC(self.full_name(), hidden_size, hidden_size // 2 * 3)
|
self.fc_reverse2 = dg.Linear(hidden_size, hidden_size // 2 * 3)
|
||||||
self.gru_forward2 = DynamicGRU(self.full_name(),
|
self.gru_forward2 = DynamicGRU(size = self.hidden_size // 2,
|
||||||
size = self.hidden_size // 2,
|
|
||||||
param_attr = fluid.ParamAttr(name='weight'),
|
param_attr = fluid.ParamAttr(name='weight'),
|
||||||
bias_attr = fluid.ParamAttr(name='bias'),
|
bias_attr = fluid.ParamAttr(name='bias'),
|
||||||
is_reverse = False,
|
is_reverse = False,
|
||||||
origin_mode = True,
|
origin_mode = True,
|
||||||
h_0 = h_0)
|
h_0 = h_0)
|
||||||
self.gru_reverse2 = DynamicGRU(self.full_name(),
|
self.gru_reverse2 = DynamicGRU(size = self.hidden_size // 2,
|
||||||
size = self.hidden_size // 2,
|
|
||||||
param_attr = fluid.ParamAttr(name='weight'),
|
param_attr = fluid.ParamAttr(name='weight'),
|
||||||
bias_attr = fluid.ParamAttr(name='bias'),
|
bias_attr = fluid.ParamAttr(name='bias'),
|
||||||
is_reverse=True,
|
is_reverse=True,
|
||||||
|
@ -491,8 +206,8 @@ class CBHG(dg.Layer):
|
||||||
return out
|
return out
|
||||||
|
|
||||||
class Highwaynet(dg.Layer):
|
class Highwaynet(dg.Layer):
|
||||||
def __init__(self, name_scope, num_units, num_layers=4):
|
def __init__(self, num_units, num_layers=4):
|
||||||
super(Highwaynet, self).__init__(name_scope)
|
super(Highwaynet, self).__init__()
|
||||||
self.num_units = num_units
|
self.num_units = num_units
|
||||||
self.num_layers = num_layers
|
self.num_layers = num_layers
|
||||||
|
|
||||||
|
@ -500,8 +215,8 @@ class Highwaynet(dg.Layer):
|
||||||
self.linears = []
|
self.linears = []
|
||||||
|
|
||||||
for i in range(num_layers):
|
for i in range(num_layers):
|
||||||
self.linears.append(FC(self.full_name(), num_units, num_units))
|
self.linears.append(dg.Linear(num_units, num_units))
|
||||||
self.gates.append(FC(self.full_name(), num_units, num_units))
|
self.gates.append(dg.Linear(num_units, num_units))
|
||||||
|
|
||||||
for i, (linear, gate) in enumerate(zip(self.linears,self.gates)):
|
for i, (linear, gate) in enumerate(zip(self.linears,self.gates)):
|
||||||
self.add_sublayer("linears_{}".format(i), linear)
|
self.add_sublayer("linears_{}".format(i), linear)
|
||||||
|
|
|
@ -1,39 +1,42 @@
|
||||||
from module import *
|
from parakeet.models.transformerTTS.module import *
|
||||||
from utils import get_positional_table, get_sinusoid_encoding_table
|
|
||||||
import paddle.fluid.dygraph as dg
|
import paddle.fluid.dygraph as dg
|
||||||
import paddle.fluid as fluid
|
import paddle.fluid as fluid
|
||||||
|
from parakeet.modules.layers import Conv1D
|
||||||
|
from parakeet.modules.utils import *
|
||||||
|
from parakeet.modules.multihead_attention import MultiheadAttention
|
||||||
|
from parakeet.modules.feed_forward import PositionwiseFeedForward
|
||||||
|
from parakeet.modules.prenet import PreNet
|
||||||
|
from parakeet.modules.post_convnet import PostConvNet
|
||||||
|
|
||||||
|
|
||||||
class Encoder(dg.Layer):
|
class Encoder(dg.Layer):
|
||||||
def __init__(self, name_scope, embedding_size, num_hidden, config):
|
def __init__(self, embedding_size, num_hidden, config):
|
||||||
super(Encoder, self).__init__(name_scope)
|
super(Encoder, self).__init__()
|
||||||
self.num_hidden = num_hidden
|
self.num_hidden = num_hidden
|
||||||
param = fluid.ParamAttr(name='alpha',
|
param = fluid.ParamAttr(name='alpha',
|
||||||
initializer=fluid.initializer.Constant(value=1.0))
|
initializer=fluid.initializer.Constant(value=1.0))
|
||||||
self.alpha = self.create_parameter(param, shape=(1, ), dtype='float32')
|
self.alpha = self.create_parameter(shape=(1, ), attr=param, dtype='float32')
|
||||||
self.pos_inp = get_sinusoid_encoding_table(1024, self.num_hidden, padding_idx=0)
|
self.pos_inp = get_sinusoid_encoding_table(1024, self.num_hidden, padding_idx=0)
|
||||||
self.pos_emb = dg.Embedding(name_scope=self.full_name(),
|
self.pos_emb = dg.Embedding(size=[1024, num_hidden],
|
||||||
size=[1024, num_hidden],
|
|
||||||
padding_idx=0,
|
padding_idx=0,
|
||||||
param_attr=fluid.ParamAttr(
|
param_attr=fluid.ParamAttr(
|
||||||
name='weight',
|
name='weight',
|
||||||
initializer=fluid.initializer.NumpyArrayInitializer(self.pos_inp),
|
initializer=fluid.initializer.NumpyArrayInitializer(self.pos_inp),
|
||||||
trainable=False))
|
trainable=False))
|
||||||
self.encoder_prenet = EncoderPrenet(name_scope = self.full_name(),
|
self.encoder_prenet = EncoderPrenet(embedding_size = embedding_size,
|
||||||
embedding_size = embedding_size,
|
|
||||||
num_hidden = num_hidden,
|
num_hidden = num_hidden,
|
||||||
use_cudnn=config.use_gpu)
|
use_cudnn=config.use_gpu)
|
||||||
self.layers = [MultiheadAttention(self.full_name(), num_hidden) for _ in range(3)]
|
self.layers = [MultiheadAttention(num_hidden, num_hidden, num_hidden) for _ in range(3)]
|
||||||
for i, layer in enumerate(self.layers):
|
for i, layer in enumerate(self.layers):
|
||||||
self.add_sublayer("self_attn_{}".format(i), layer)
|
self.add_sublayer("self_attn_{}".format(i), layer)
|
||||||
self.ffns = [FFN(self.full_name(), num_hidden, use_cudnn = config.use_gpu) for _ in range(3)]
|
self.ffns = [PositionwiseFeedForward(num_hidden, num_hidden*4, filter_size=1, use_cudnn = config.use_gpu) for _ in range(3)]
|
||||||
for i, layer in enumerate(self.ffns):
|
for i, layer in enumerate(self.ffns):
|
||||||
self.add_sublayer("ffns_{}".format(i), layer)
|
self.add_sublayer("ffns_{}".format(i), layer)
|
||||||
|
|
||||||
def forward(self, x, positional):
|
def forward(self, x, positional):
|
||||||
if fluid.framework._dygraph_tracer()._train_mode:
|
if fluid.framework._dygraph_tracer()._train_mode:
|
||||||
query_mask = (positional != 0).astype(np.float32)
|
query_mask = get_non_pad_mask(positional)
|
||||||
mask = (positional != 0).astype(np.float32)
|
mask = get_attn_key_pad_mask(positional, x)
|
||||||
mask = fluid.layers.expand(fluid.layers.unsqueeze(mask,[1]), [1,x.shape[1], 1])
|
|
||||||
else:
|
else:
|
||||||
query_mask, mask = None, None
|
query_mask, mask = None, None
|
||||||
|
|
||||||
|
@ -59,65 +62,60 @@ class Encoder(dg.Layer):
|
||||||
return x, query_mask, attentions
|
return x, query_mask, attentions
|
||||||
|
|
||||||
class Decoder(dg.Layer):
|
class Decoder(dg.Layer):
|
||||||
def __init__(self, name_scope, num_hidden, config):
|
def __init__(self, num_hidden, config):
|
||||||
super(Decoder, self).__init__(name_scope)
|
super(Decoder, self).__init__()
|
||||||
self.num_hidden = num_hidden
|
self.num_hidden = num_hidden
|
||||||
param = fluid.ParamAttr(name='alpha')
|
param = fluid.ParamAttr(name='alpha')
|
||||||
self.alpha = self.create_parameter(param, shape=(1,), dtype='float32',
|
self.alpha = self.create_parameter(shape=(1,), attr=param, dtype='float32',
|
||||||
default_initializer = fluid.initializer.ConstantInitializer(value=1.0))
|
default_initializer = fluid.initializer.ConstantInitializer(value=1.0))
|
||||||
self.pos_inp = get_sinusoid_encoding_table(1024, self.num_hidden, padding_idx=0)
|
self.pos_inp = get_sinusoid_encoding_table(1024, self.num_hidden, padding_idx=0)
|
||||||
self.pos_emb = dg.Embedding(name_scope=self.full_name(),
|
self.pos_emb = dg.Embedding(size=[1024, num_hidden],
|
||||||
size=[1024, num_hidden],
|
|
||||||
padding_idx=0,
|
padding_idx=0,
|
||||||
param_attr=fluid.ParamAttr(
|
param_attr=fluid.ParamAttr(
|
||||||
name='weight',
|
name='weight',
|
||||||
initializer=fluid.initializer.NumpyArrayInitializer(self.pos_inp),
|
initializer=fluid.initializer.NumpyArrayInitializer(self.pos_inp),
|
||||||
trainable=False))
|
trainable=False))
|
||||||
self.decoder_prenet = DecoderPrenet(self.full_name(),
|
self.decoder_prenet = PreNet(input_size = config.audio.num_mels,
|
||||||
input_size = config.audio.num_mels,
|
|
||||||
hidden_size = num_hidden * 2,
|
hidden_size = num_hidden * 2,
|
||||||
output_size = num_hidden,
|
output_size = num_hidden,
|
||||||
dropout_rate=0.2)
|
dropout_rate=0.2)
|
||||||
self.linear = FC(self.full_name(), num_hidden, num_hidden)
|
self.linear = dg.Linear(num_hidden, num_hidden)
|
||||||
|
|
||||||
self.selfattn_layers = [MultiheadAttention(self.full_name(), num_hidden) for _ in range(3)]
|
self.selfattn_layers = [MultiheadAttention(num_hidden, num_hidden, num_hidden) for _ in range(3)]
|
||||||
for i, layer in enumerate(self.selfattn_layers):
|
for i, layer in enumerate(self.selfattn_layers):
|
||||||
self.add_sublayer("self_attn_{}".format(i), layer)
|
self.add_sublayer("self_attn_{}".format(i), layer)
|
||||||
self.attn_layers = [MultiheadAttention(self.full_name(), num_hidden) for _ in range(3)]
|
self.attn_layers = [MultiheadAttention(num_hidden, num_hidden, num_hidden) for _ in range(3)]
|
||||||
for i, layer in enumerate(self.attn_layers):
|
for i, layer in enumerate(self.attn_layers):
|
||||||
self.add_sublayer("attn_{}".format(i), layer)
|
self.add_sublayer("attn_{}".format(i), layer)
|
||||||
self.ffns = [FFN(self.full_name(), num_hidden) for _ in range(3)]
|
self.ffns = [PositionwiseFeedForward(num_hidden, num_hidden*4, filter_size=1) for _ in range(3)]
|
||||||
for i, layer in enumerate(self.ffns):
|
for i, layer in enumerate(self.ffns):
|
||||||
self.add_sublayer("ffns_{}".format(i), layer)
|
self.add_sublayer("ffns_{}".format(i), layer)
|
||||||
self.mel_linear = FC(self.full_name(), num_hidden, config.audio.num_mels * config.audio.outputs_per_step)
|
self.mel_linear = dg.Linear(num_hidden, config.audio.num_mels * config.audio.outputs_per_step)
|
||||||
self.stop_linear = FC(self.full_name(), num_hidden, 1, gain = 1)
|
self.stop_linear = dg.Linear(num_hidden, 1)
|
||||||
|
|
||||||
self.postconvnet = PostConvNet(self.full_name(), config)
|
self.postconvnet = PostConvNet(config.audio.num_mels, config.hidden_size,
|
||||||
|
filter_size = 5, padding = 4, num_conv=5,
|
||||||
|
outputs_per_step=config.audio.outputs_per_step,
|
||||||
|
use_cudnn = config.use_gpu)
|
||||||
|
|
||||||
def forward(self, key, value, query, c_mask, positional):
|
def forward(self, key, value, query, c_mask, positional):
|
||||||
batch_size = key.shape[0]
|
|
||||||
decoder_len = query.shape[1]
|
|
||||||
|
|
||||||
# get decoder mask with triangular matrix
|
# get decoder mask with triangular matrix
|
||||||
|
|
||||||
if fluid.framework._dygraph_tracer()._train_mode:
|
if fluid.framework._dygraph_tracer()._train_mode:
|
||||||
#zeros = np.zeros(positional.shape, dtype=np.float32)
|
m_mask = get_non_pad_mask(positional)
|
||||||
m_mask = (positional != 0).astype(np.float32)
|
mask = get_attn_key_pad_mask(positional, query)
|
||||||
mask = np.repeat(np.expand_dims(m_mask.numpy() == 0, axis=1), decoder_len, axis=1)
|
triu_tensor = dg.to_variable(get_triu_tensor(query.numpy(), query.numpy())).astype(np.float32)
|
||||||
mask = mask + np.repeat(np.expand_dims(np.triu(np.ones([decoder_len, decoder_len]), 1), axis=0) ,batch_size, axis=0)
|
mask = mask + triu_tensor
|
||||||
mask = fluid.layers.cast(dg.to_variable(mask == 0), np.float32)
|
mask = fluid.layers.cast(mask != 0, np.float32)
|
||||||
|
|
||||||
|
|
||||||
# (batch_size, decoder_len, decoder_len)
|
# (batch_size, decoder_len, encoder_len)
|
||||||
zero_mask = fluid.layers.expand(fluid.layers.unsqueeze((c_mask != 0).astype(np.float32), axes=2), [1,1,decoder_len])
|
zero_mask = get_attn_key_pad_mask(layers.squeeze(c_mask,[-1]), query)
|
||||||
# (batch_size, decoder_len, seq_len)
|
|
||||||
zero_mask = fluid.layers.transpose(zero_mask, [0,2,1])
|
|
||||||
|
|
||||||
else:
|
else:
|
||||||
mask = np.repeat(np.expand_dims(np.triu(np.ones([decoder_len, decoder_len]), 1), axis=0) ,batch_size, axis=0)
|
mask = get_triu_tensor(query.numpy(), query.numpy()).astype(np.float32)
|
||||||
mask = fluid.layers.cast(dg.to_variable(mask == 0), np.float32)
|
mask = fluid.layers.cast(dg.to_variable(mask != 0), np.float32)
|
||||||
m_mask, zero_mask = None, None
|
m_mask, zero_mask = None, None
|
||||||
#import pdb; pdb.set_trace()
|
|
||||||
# Decoder pre-network
|
# Decoder pre-network
|
||||||
query = self.decoder_prenet(query)
|
query = self.decoder_prenet(query)
|
||||||
|
|
||||||
|
@ -145,21 +143,21 @@ class Decoder(dg.Layer):
|
||||||
# Mel linear projection
|
# Mel linear projection
|
||||||
mel_out = self.mel_linear(query)
|
mel_out = self.mel_linear(query)
|
||||||
# Post Mel Network
|
# Post Mel Network
|
||||||
postnet_input = layers.transpose(mel_out, [0,2,1])
|
out = self.postconvnet(mel_out)
|
||||||
out = self.postconvnet(postnet_input)
|
out = mel_out + out
|
||||||
out = postnet_input + out
|
|
||||||
out = layers.transpose(out, [0,2,1])
|
|
||||||
|
|
||||||
# Stop tokens
|
# Stop tokens
|
||||||
stop_tokens = self.stop_linear(query)
|
stop_tokens = self.stop_linear(query)
|
||||||
|
stop_tokens = layers.squeeze(stop_tokens, [-1])
|
||||||
|
stop_tokens = layers.sigmoid(stop_tokens)
|
||||||
|
|
||||||
return mel_out, out, attn_list, stop_tokens, selfattn_list
|
return mel_out, out, attn_list, stop_tokens, selfattn_list
|
||||||
|
|
||||||
class Model(dg.Layer):
|
class TransformerTTS(dg.Layer):
|
||||||
def __init__(self, name_scope, config):
|
def __init__(self, config):
|
||||||
super(Model, self).__init__(name_scope)
|
super(TransformerTTS, self).__init__()
|
||||||
self.encoder = Encoder(self.full_name(), config.network.embedding_size, config.network.hidden_size, config)
|
self.encoder = Encoder(config.embedding_size, config.hidden_size, config)
|
||||||
self.decoder = Decoder(self.full_name(), config.network.hidden_size, config)
|
self.decoder = Decoder(config.hidden_size, config)
|
||||||
self.config = config
|
self.config = config
|
||||||
|
|
||||||
def forward(self, characters, mel_input, pos_text, pos_mel):
|
def forward(self, characters, mel_input, pos_text, pos_mel):
|
||||||
|
@ -180,16 +178,16 @@ class ModelPostNet(dg.Layer):
|
||||||
"""
|
"""
|
||||||
CBHG Network (mel -> linear)
|
CBHG Network (mel -> linear)
|
||||||
"""
|
"""
|
||||||
def __init__(self, name_scope, config):
|
def __init__(self, config):
|
||||||
super(ModelPostNet, self).__init__(name_scope)
|
super(ModelPostNet, self).__init__()
|
||||||
self.pre_proj = Conv(self.full_name(),
|
self.pre_proj = Conv1D(in_channels = config.audio.num_mels,
|
||||||
in_channels = config.audio.num_mels,
|
out_channels = config.hidden_size,
|
||||||
out_channels = config.network.hidden_size,
|
filter_size=1,
|
||||||
data_format = "NCT")
|
data_format = "NCT")
|
||||||
self.cbhg = CBHG(self.full_name(), config)
|
self.cbhg = CBHG(config.hidden_size, config.batch_size)
|
||||||
self.post_proj = Conv(self.full_name(),
|
self.post_proj = Conv1D(in_channels = config.hidden_size,
|
||||||
in_channels = config.audio.num_mels,
|
|
||||||
out_channels = (config.audio.n_fft // 2) + 1,
|
out_channels = (config.audio.n_fft // 2) + 1,
|
||||||
|
filter_size=1,
|
||||||
data_format = "NCT")
|
data_format = "NCT")
|
||||||
|
|
||||||
def forward(self, mel):
|
def forward(self, mel):
|
||||||
|
|
|
@ -22,9 +22,9 @@ def add_config_options_to_parser(parser):
|
||||||
parser.add_argument('--audio.outputs_per_step', type=int, default=1,
|
parser.add_argument('--audio.outputs_per_step', type=int, default=1,
|
||||||
help="the outputs per step.")
|
help="the outputs per step.")
|
||||||
|
|
||||||
parser.add_argument('--network.hidden_size', type=int, default=256,
|
parser.add_argument('--hidden_size', type=int, default=256,
|
||||||
help="the hidden size in network.")
|
help="the hidden size in network.")
|
||||||
parser.add_argument('--network.embedding_size', type=int, default=512,
|
parser.add_argument('--embedding_size', type=int, default=512,
|
||||||
help="the embedding vector size.")
|
help="the embedding vector size.")
|
||||||
|
|
||||||
parser.add_argument('--batch_size', type=int, default=32,
|
parser.add_argument('--batch_size', type=int, default=32,
|
||||||
|
|
|
@ -62,20 +62,6 @@ class LJSpeech(Dataset):
|
||||||
phonemes = np.array(g2p.en.text_to_sequence(normalized_text), dtype=np.int64)
|
phonemes = np.array(g2p.en.text_to_sequence(normalized_text), dtype=np.int64)
|
||||||
return (mag, mel, phonemes) # maybe we need to implement it as a map in the future
|
return (mag, mel, phonemes) # maybe we need to implement it as a map in the future
|
||||||
|
|
||||||
def _batch_examples(self, minibatch):
|
|
||||||
mag_batch = []
|
|
||||||
mel_batch = []
|
|
||||||
phoneme_batch = []
|
|
||||||
for example in minibatch:
|
|
||||||
mag, mel, phoneme = example
|
|
||||||
mag_batch.append(mag)
|
|
||||||
mel_batch.append(mel)
|
|
||||||
phoneme_batch.append(phoneme)
|
|
||||||
mag_batch = SpecBatcher(pad_value=0.)(mag_batch)
|
|
||||||
mel_batch = SpecBatcher(pad_value=0.)(mel_batch)
|
|
||||||
phoneme_batch = TextIDBatcher(pad_id=0)(phoneme_batch)
|
|
||||||
return (mag_batch, mel_batch, phoneme_batch)
|
|
||||||
|
|
||||||
def __getitem__(self, index):
|
def __getitem__(self, index):
|
||||||
metadatum = self.metadata.iloc[index]
|
metadatum = self.metadata.iloc[index]
|
||||||
example = self._get_example(metadatum)
|
example = self._get_example(metadatum)
|
||||||
|
@ -121,7 +107,7 @@ def batch_examples(batch):
|
||||||
mel_inputs = np.transpose(SpecBatcher(pad_value=0.)(mel_inputs), axes=(0,2,1))
|
mel_inputs = np.transpose(SpecBatcher(pad_value=0.)(mel_inputs), axes=(0,2,1))
|
||||||
return (texts, mels, mel_inputs, pos_texts, pos_mels, np.array(text_lens))
|
return (texts, mels, mel_inputs, pos_texts, pos_mels, np.array(text_lens))
|
||||||
|
|
||||||
def batch_examples_postnet(batch):
|
def batch_examples_vocoder(batch):
|
||||||
mels=[]
|
mels=[]
|
||||||
mags=[]
|
mags=[]
|
||||||
for data in batch:
|
for data in batch:
|
||||||
|
|
|
@ -28,8 +28,8 @@ def synthesis(text_input, cfg):
|
||||||
writer = SummaryWriter(path)
|
writer = SummaryWriter(path)
|
||||||
|
|
||||||
with dg.guard(place):
|
with dg.guard(place):
|
||||||
model = Model('transtts', cfg)
|
model = Model(cfg)
|
||||||
model_postnet = ModelPostNet('postnet', cfg)
|
model_postnet = ModelPostNet(cfg)
|
||||||
|
|
||||||
model.set_dict(load_checkpoint(str(cfg.transformer_step), os.path.join(cfg.checkpoint_path, "transformer")))
|
model.set_dict(load_checkpoint(str(cfg.transformer_step), os.path.join(cfg.checkpoint_path, "transformer")))
|
||||||
model_postnet.set_dict(load_checkpoint(str(cfg.postnet_step), os.path.join(cfg.checkpoint_path, "postnet")))
|
model_postnet.set_dict(load_checkpoint(str(cfg.postnet_step), os.path.join(cfg.checkpoint_path, "postnet")))
|
||||||
|
|
|
@ -47,7 +47,7 @@ def main(cfg):
|
||||||
writer = SummaryWriter(path) if local_rank == 0 else None
|
writer = SummaryWriter(path) if local_rank == 0 else None
|
||||||
|
|
||||||
with dg.guard(place):
|
with dg.guard(place):
|
||||||
model = ModelPostNet('postnet', cfg)
|
model = ModelPostNet(cfg)
|
||||||
|
|
||||||
model.train()
|
model.train()
|
||||||
optimizer = fluid.optimizer.AdamOptimizer(learning_rate=dg.NoamDecay(1/(4000 *( cfg.lr ** 2)), 4000))
|
optimizer = fluid.optimizer.AdamOptimizer(learning_rate=dg.NoamDecay(1/(4000 *( cfg.lr ** 2)), 4000))
|
||||||
|
@ -62,7 +62,7 @@ def main(cfg):
|
||||||
strategy = dg.parallel.prepare_context()
|
strategy = dg.parallel.prepare_context()
|
||||||
model = MyDataParallel(model, strategy)
|
model = MyDataParallel(model, strategy)
|
||||||
|
|
||||||
reader = LJSpeechLoader(cfg, nranks, local_rank, is_postnet=True).reader()
|
reader = LJSpeechLoader(cfg, nranks, local_rank, is_vocoder=True).reader()
|
||||||
|
|
||||||
for epoch in range(cfg.epochs):
|
for epoch in range(cfg.epochs):
|
||||||
pbar = tqdm(reader)
|
pbar = tqdm(reader)
|
||||||
|
@ -74,7 +74,6 @@ def main(cfg):
|
||||||
global_step += 1
|
global_step += 1
|
||||||
|
|
||||||
mag_pred = model(mel)
|
mag_pred = model(mel)
|
||||||
|
|
||||||
loss = layers.mean(layers.abs(layers.elementwise_sub(mag_pred, mag)))
|
loss = layers.mean(layers.abs(layers.elementwise_sub(mag_pred, mag)))
|
||||||
if cfg.use_data_parallel:
|
if cfg.use_data_parallel:
|
||||||
loss = model.scale_loss(loss)
|
loss = model.scale_loss(loss)
|
||||||
|
|
|
@ -9,7 +9,8 @@ import jsonargparse
|
||||||
from parse import add_config_options_to_parser
|
from parse import add_config_options_to_parser
|
||||||
from pprint import pprint
|
from pprint import pprint
|
||||||
from matplotlib import cm
|
from matplotlib import cm
|
||||||
from data import LJSpeechLoader
|
from parakeet.modules.utils import cross_entropy
|
||||||
|
from parakeet.models.dataloader.jlspeech import LJSpeechLoader
|
||||||
|
|
||||||
class MyDataParallel(dg.parallel.DataParallel):
|
class MyDataParallel(dg.parallel.DataParallel):
|
||||||
"""
|
"""
|
||||||
|
@ -49,7 +50,7 @@ def main(cfg):
|
||||||
writer = SummaryWriter(path) if local_rank == 0 else None
|
writer = SummaryWriter(path) if local_rank == 0 else None
|
||||||
|
|
||||||
with dg.guard(place):
|
with dg.guard(place):
|
||||||
model = Model('transtts', cfg)
|
model = TransformerTTS(cfg)
|
||||||
|
|
||||||
model.train()
|
model.train()
|
||||||
optimizer = fluid.optimizer.AdamOptimizer(learning_rate=dg.NoamDecay(1/(4000 *( cfg.lr ** 2)), 4000))
|
optimizer = fluid.optimizer.AdamOptimizer(learning_rate=dg.NoamDecay(1/(4000 *( cfg.lr ** 2)), 4000))
|
||||||
|
@ -76,14 +77,21 @@ def main(cfg):
|
||||||
|
|
||||||
mel_pred, postnet_pred, attn_probs, stop_preds, attn_enc, attn_dec = model(character, mel_input, pos_text, pos_mel)
|
mel_pred, postnet_pred, attn_probs, stop_preds, attn_enc, attn_dec = model(character, mel_input, pos_text, pos_mel)
|
||||||
|
|
||||||
|
label = np.zeros(stop_preds.shape).astype(np.float32)
|
||||||
|
text_length = text_length.numpy()
|
||||||
|
for i in range(label.shape[0]):
|
||||||
|
label[i][text_length[i] - 1] = 1
|
||||||
|
|
||||||
mel_loss = layers.mean(layers.abs(layers.elementwise_sub(mel_pred, mel)))
|
mel_loss = layers.mean(layers.abs(layers.elementwise_sub(mel_pred, mel)))
|
||||||
post_mel_loss = layers.mean(layers.abs(layers.elementwise_sub(postnet_pred, mel)))
|
post_mel_loss = layers.mean(layers.abs(layers.elementwise_sub(postnet_pred, mel)))
|
||||||
loss = mel_loss + post_mel_loss
|
stop_loss = cross_entropy(stop_preds, dg.to_variable(label))
|
||||||
|
loss = mel_loss + post_mel_loss + stop_loss
|
||||||
|
|
||||||
if local_rank==0:
|
if local_rank==0:
|
||||||
writer.add_scalars('training_loss', {
|
writer.add_scalars('training_loss', {
|
||||||
'mel_loss':mel_loss.numpy(),
|
'mel_loss':mel_loss.numpy(),
|
||||||
'post_mel_loss':post_mel_loss.numpy(),
|
'post_mel_loss':post_mel_loss.numpy(),
|
||||||
|
'stop_loss':stop_loss.numpy()
|
||||||
}, global_step)
|
}, global_step)
|
||||||
|
|
||||||
writer.add_scalars('alphas', {
|
writer.add_scalars('alphas', {
|
||||||
|
@ -97,7 +105,7 @@ def main(cfg):
|
||||||
for i, prob in enumerate(attn_probs):
|
for i, prob in enumerate(attn_probs):
|
||||||
for j in range(4):
|
for j in range(4):
|
||||||
x = np.uint8(cm.viridis(prob.numpy()[j*16]) * 255)
|
x = np.uint8(cm.viridis(prob.numpy()[j*16]) * 255)
|
||||||
writer.add_image('Attention_enc_%d_0'%global_step, x, i*4+j, dataformats="HWC")
|
writer.add_image('Attention_%d_0'%global_step, x, i*4+j, dataformats="HWC")
|
||||||
|
|
||||||
for i, prob in enumerate(attn_enc):
|
for i, prob in enumerate(attn_enc):
|
||||||
for j in range(4):
|
for j in range(4):
|
||||||
|
|
|
@ -0,0 +1,44 @@
|
||||||
|
import paddle.fluid.dygraph as dg
|
||||||
|
import paddle.fluid.layers as layers
|
||||||
|
|
||||||
|
class DynamicGRU(dg.Layer):
    """
    Sequence-level GRU built by stepping a single ``dg.GRUUnit`` along
    axis 1 of the input.

    Args:
        size (int): hidden size; the wrapped GRUUnit is created with
            ``size * 3``.
        param_attr: forwarded unchanged to GRUUnit.
        bias_attr: forwarded unchanged to GRUUnit.
        is_reverse (bool): when True the steps are visited from the last
            index of axis 1 down to the first.
        gate_activation (str): forwarded as GRUUnit ``gate_activation``.
        candidate_activation (str): forwarded as GRUUnit ``activation``.
        h_0: hidden state fed to the first processed step.
        origin_mode (bool): forwarded unchanged to GRUUnit.
        init_size: accepted for interface compatibility; not used here.
    """

    def __init__(self,
                 size,
                 param_attr=None,
                 bias_attr=None,
                 is_reverse=False,
                 gate_activation='sigmoid',
                 candidate_activation='tanh',
                 h_0=None,
                 origin_mode=False,
                 init_size=None):
        super(DynamicGRU, self).__init__()
        self.gru_unit = dg.GRUUnit(
            size * 3,
            param_attr=param_attr,
            bias_attr=bias_attr,
            activation=candidate_activation,
            gate_activation=gate_activation,
            origin_mode=origin_mode)
        self.size = size
        self.h_0 = h_0
        self.is_reverse = is_reverse

    def forward(self, inputs):
        """
        Run the GRU unit over every index of axis 1 of ``inputs``.

        Args:
            inputs (Variable): 3-D input, sliced as ``inputs[:, t:t+1, :]``.
                NOTE(review): presumably (batch, time, features), already
                projected to the width GRUUnit expects -- confirm with caller.

        Returns:
            Variable: the per-step hidden states concatenated on axis 1,
                in the same index order as ``inputs``.
        """
        num_steps = inputs.shape[1]
        if self.is_reverse:
            step_order = range(num_steps - 1, -1, -1)
        else:
            step_order = range(num_steps)

        state = self.h_0
        outputs = []
        for t in step_order:
            # Take one step and drop its singleton axis 1.
            frame = inputs[:, t:t + 1, :]
            frame = layers.reshape(frame, [-1, frame.shape[2]], inplace=False)
            # GRUUnit also returns the reset-hidden and gate tensors; only
            # the updated hidden state is carried forward.
            state, _, _ = self.gru_unit(frame, state)
            outputs.append(
                layers.reshape(state, [-1, 1, state.shape[1]], inplace=False))

        if self.is_reverse:
            # States were produced last-to-first; restore input order.
            outputs = outputs[::-1]
        return layers.concat(outputs, axis=1)
|
||||||
|
|
|
@ -0,0 +1,40 @@
|
||||||
|
import paddle.fluid.dygraph as dg
|
||||||
|
import paddle.fluid.layers as layers
|
||||||
|
from parakeet.modules.layers import Conv1D
|
||||||
|
|
||||||
|
class PositionwiseFeedForward(dg.Layer):
    """
    Two-layer feed-forward block: Conv1D -> ReLU -> Conv1D, followed by
    dropout, a residual connection and layer normalization.

    Args:
        d_in (int): channel size of the input and of the block output.
        num_hidden (int): channel size between the two convolutions.
        filter_size (int): filter size used by both convolutions.
        padding (int): padding used by both convolutions.
        use_cudnn (bool): forwarded to both convolutions.
        dropout (float): dropout probability applied before the residual add.
    """

    def __init__(self, d_in, num_hidden, filter_size, padding=0, use_cudnn=True, dropout=0.1):
        super(PositionwiseFeedForward, self).__init__()
        self.num_hidden = num_hidden
        self.use_cudnn = use_cudnn
        self.dropout = dropout

        # Both convolutions share everything except their channel sizes.
        shared_conv_args = dict(filter_size=filter_size,
                                padding=padding,
                                use_cudnn=use_cudnn,
                                data_format="NTC")
        self.w_1 = Conv1D(in_channels=d_in,
                          out_channels=num_hidden,
                          **shared_conv_args)
        self.w_2 = Conv1D(in_channels=num_hidden,
                          out_channels=d_in,
                          **shared_conv_args)
        self.layer_norm = dg.LayerNorm(d_in)

    def forward(self, input):
        """
        Apply the feed-forward block to ``input``.

        Args:
            input (Variable): passed to convolutions built with
                ``data_format="NTC"``.
                NOTE(review): presumably (batch, time, d_in) -- confirm.

        Returns:
            Variable: layer-normalized sum of ``input`` and the block output.
        """
        # Feed-forward network: expand to num_hidden, then project back.
        expanded = layers.relu(self.w_1(input))
        projected = self.w_2(expanded)

        # Dropout on the projected branch only.
        projected = layers.dropout(projected, self.dropout)

        # Residual connection followed by layer normalization.
        return self.layer_norm(projected + input)
|
|
@ -0,0 +1,122 @@
|
||||||
|
import math
|
||||||
|
import numpy as np
|
||||||
|
|
||||||
|
import paddle
|
||||||
|
from paddle import fluid
|
||||||
|
import paddle.fluid.dygraph as dg
|
||||||
|
|
||||||
|
|
||||||
|
class Conv1D(dg.Layer):
    """
    A 1-D convolution implemented on top of ``dg.Conv2D``.

    The input gets an extra axis of size 1 inserted so that a 2-D
    convolution with a ``(1, filter_size)`` filter acts as a 1-D one.
    The original description states that, for simplicity and to keep the
    output as long as the input, stride > 1 is not allowed; nothing in
    this class enforces that.

    Args:
        in_channels (int): number of input channels.
        out_channels (int): number of filters (output channels).
        filter_size (int): filter width along the convolved axis.
        padding (int): padding along the convolved axis.
        dilation (int): dilation along the convolved axis.
        stride (int): stride along the convolved axis.
        groups: forwarded unchanged to Conv2D.
        param_attr: forwarded unchanged to Conv2D.
        bias_attr: forwarded unchanged to Conv2D.
        use_cudnn (bool): forwarded unchanged to Conv2D.
        act: forwarded unchanged to Conv2D.
        data_format (str): 'NTC' makes ``forward`` swap axes 1 and 2
            before and after the convolution; any other value leaves the
            layout untouched.
        dtype (str): forwarded to the base layer and to Conv2D.
    """

    def __init__(self,
                 in_channels,
                 out_channels,
                 filter_size=3,
                 padding=0,
                 dilation=1,
                 stride=1,
                 groups=None,
                 param_attr=None,
                 bias_attr=None,
                 use_cudnn=True,
                 act=None,
                 data_format='NCT',
                 dtype="float32"):
        super(Conv1D, self).__init__(dtype=dtype)

        # Keep the configuration around for inspection.
        self.in_channels = in_channels
        self.num_filters = out_channels
        self.filter_size = filter_size
        self.stride = stride
        self.dilation = dilation
        self.padding = padding
        self.act = act
        self.data_format = data_format

        # Every 1-D setting is lifted to 2-D with a no-op first component.
        self.conv = dg.Conv2D(
            num_channels=in_channels,
            num_filters=out_channels,
            filter_size=(1, filter_size),
            stride=(1, stride),
            dilation=(1, dilation),
            padding=(0, padding),
            groups=groups,
            param_attr=param_attr,
            bias_attr=bias_attr,
            use_cudnn=use_cudnn,
            act=act,
            dtype=dtype)

    def forward(self, x):
        """
        Args:
            x (Variable): 3-D input. With ``data_format='NTC'`` axes 1 and
                2 are swapped first.
                NOTE(review): presumably (B, C_in, T) for 'NCT' and
                (B, T, C_in) for 'NTC' -- confirm against callers.
        Returns:
            x (Variable): the convolved result, in the same axis layout
                as the input.
        """
        time_major = self.data_format == 'NTC'
        if time_major:
            x = fluid.layers.transpose(x, [0, 2, 1])

        # Insert a singleton axis so Conv2D can be used, then remove it.
        y = fluid.layers.unsqueeze(x, [2])
        y = self.conv(y)
        y = fluid.layers.squeeze(y, [2])

        if time_major:
            y = fluid.layers.transpose(y, [0, 2, 1])
        return y
|
||||||
|
|
||||||
|
class Pool1D(dg.Layer):
    """
    A 1-D pooling block implemented on top of ``dg.Pool2D``.

    The 3-D input gets a dummy axis of size 1 inserted at position 2, is
    pooled with a ``[1, pool_size]`` window, and the dummy axis is removed
    again.

    Args:
        pool_size (int): window width along the time axis.
        pool_type (str): pooling type, passed through to Pool2D.
        pool_stride (int): stride along the time axis.
        pool_padding (int): padding along the time axis.
        global_pooling, use_cudnn, ceil_mode, exclusive: passed through to
            ``dg.Pool2D``.
        data_format (str): 'NTC' means (batch, time, channels) input/output;
            any other value is treated as 'NCT' (batch, channels, time).
    """

    def __init__(self,
                 pool_size=-1,
                 pool_type='max',
                 pool_stride=1,
                 pool_padding=0,
                 global_pooling=False,
                 use_cudnn=True,
                 ceil_mode=False,
                 exclusive=True,
                 data_format='NCT'):
        super(Pool1D, self).__init__()

        # Remember the configuration.
        self.data_format = data_format
        self.pool_size = pool_size
        self.pool_type = pool_type
        self.pool_stride = pool_stride
        self.pool_padding = pool_padding
        self.global_pooling = global_pooling
        self.use_cudnn = use_cudnn
        self.ceil_mode = ceil_mode
        self.exclusive = exclusive

        # Height entries are neutral (window 1, stride 1, padding 0), so
        # pooling only happens along the time (width) axis.
        self.pool2d = dg.Pool2D(
            [1, pool_size],
            pool_type=pool_type,
            pool_stride=[1, pool_stride],
            pool_padding=[0, pool_padding],
            global_pooling=global_pooling,
            use_cudnn=use_cudnn,
            ceil_mode=ceil_mode,
            exclusive=exclusive)

    def forward(self, x):
        """
        Args:
            x (Variable): 3-D input. Shape (B, C, T) when data_format is
                'NCT', or (B, T, C) when data_format is 'NTC'.
        Returns:
            x (Variable): 3-D pooled output in the same layout as the input.
                The length of the time axis is whatever Pool2D produces for
                the configured window / stride / padding.
        """
        time_major = self.data_format == 'NTC'

        if time_major:
            # (B, T, C) -> (B, C, T)
            x = fluid.layers.transpose(x, [0, 2, 1])

        # Insert the dummy height axis, pool, and remove it again.
        expanded = fluid.layers.unsqueeze(x, [2])
        pooled = self.pool2d(expanded)
        x = fluid.layers.squeeze(pooled, [2])

        if time_major:
            # Back to the caller's (B, T, C) layout.
            x = fluid.layers.transpose(x, [0, 2, 1])
        return x
|
|
@ -0,0 +1,84 @@
|
||||||
|
import math
|
||||||
|
import numpy as np
|
||||||
|
import paddle.fluid.dygraph as dg
|
||||||
|
import paddle.fluid.layers as layers
|
||||||
|
|
||||||
|
class ScaledDotProductAttention(dg.Layer):
    """
    Scaled dot-product attention: softmax(Q K^T / sqrt(d_key)) V.

    Args:
        d_key (int): size used for the 1 / sqrt(d_key) scaling of the scores.
    """

    def __init__(self, d_key):
        super(ScaledDotProductAttention, self).__init__()

        self.d_key = d_key

    def forward(self, key, value, query, mask=None, query_mask=None):
        """
        Args:
            key, value, query (Variable): attention inputs; the scores are
                computed as ``matmul(query, key^T)``.
                # assumes 3-D (batch, length, feature) tensors - TODO confirm
            mask (Variable, optional): positions where ``mask`` is non-zero
                have their score zeroed and then shifted by
                ``mask * (-2**32 + 1)`` before the softmax; positions where
                ``mask == 0`` keep their score. Note this convention differs
                from the PyTorch reference implementation.
            query_mask (Variable, optional): multiplied element-wise into the
                attention weights after the softmax.
        Returns:
            (result, attention): the weighted sum of ``value`` and the
            attention weights.
        """
        # Raw similarity scores, scaled by sqrt(d_key).
        scores = layers.matmul(query, key, transpose_y=True)
        scores = scores / math.sqrt(self.d_key)

        if mask is not None:
            # Zero the masked scores, then push them to a huge negative value
            # so that they receive (almost) no weight after the softmax.
            keep = (mask == 0).astype(np.float32)
            scores = scores * keep
            large_negative = -(2 ** 32) + 1
            scores = scores + mask * large_negative

        weights = layers.softmax(scores)
        # Dropout probability is hard-coded to 0.0 here.
        weights = layers.dropout(weights, 0.0)

        if query_mask is not None:
            # Applied after the softmax, so rows are NOT re-normalised.
            weights = weights * query_mask

        context = layers.matmul(weights, value)
        return context, weights
|
||||||
|
|
||||||
|
class MultiheadAttention(dg.Layer):
    """
    Multi-head attention with a residual connection and layer normalization.

    Args:
        num_hidden (int): feature size of the inputs and of the output.
        d_k (int): per-head size of the key and value projections.
        d_q (int): per-head size of the query projection.
            NOTE(review): the score matmul and the final reshape only line up
            when d_q == d_k - confirm callers always pass equal values.
        num_head (int): number of attention heads.
        dropout (float): dropout probability applied to the output projection.
    """

    def __init__(self, num_hidden, d_k, d_q, num_head=4, dropout=0.1):
        super(MultiheadAttention, self).__init__()
        self.num_hidden = num_hidden
        self.num_head = num_head
        self.d_k = d_k
        self.d_q = d_q
        self.dropout = dropout

        # Input projections (all heads at once).
        self.key = dg.Linear(num_hidden, num_head * d_k)
        self.value = dg.Linear(num_hidden, num_head * d_k)
        self.query = dg.Linear(num_hidden, num_head * d_q)

        self.scal_attn = ScaledDotProductAttention(d_k)

        # Output projection back to the model size.
        self.fc = dg.Linear(num_head * d_q, num_hidden)

        self.layer_norm = dg.LayerNorm(num_hidden)

    def _split_heads(self, projected, batch_size, seq_len, depth):
        """Reshape (B, T, H * depth) into head-major (H * B, T, depth)."""
        per_head = layers.reshape(
            projected, [batch_size, seq_len, self.num_head, depth])
        head_first = layers.transpose(per_head, [2, 0, 1, 3])
        return layers.reshape(head_first, [-1, seq_len, depth])

    def forward(self, key, value, query_input, mask=None, query_mask=None):
        """
        Args:
            key, value, query_input (Variable): inputs read as
                (batch, length, num_hidden); ``value`` is reshaped with the
                length of ``key``.
            mask (Variable, optional): tiled ``num_head`` times along axis 0
                and handed to the scaled dot-product attention.
            query_mask (Variable, optional): tiled ``num_head`` times along
                axis 0 and ``seq_len_key`` times along axis 2 before being
                handed to the scaled dot-product attention.
                # presumably (batch, seq_len_query, 1) - TODO confirm
        Returns:
            (result, attention): the layer-normalised output and the
            attention weights returned by the inner attention.
        """
        batch_size, seq_len_key = key.shape[0], key.shape[1]
        seq_len_query = query_input.shape[1]

        # Repeat the masks once per head so that they match the head-major
        # batch axis built below.
        if query_mask is not None:
            query_mask = layers.expand(query_mask,
                                       [self.num_head, 1, seq_len_key])
        if mask is not None:
            mask = layers.expand(mask, (self.num_head, 1, 1))

        # Project and fold the heads into the batch axis.
        key = self._split_heads(
            self.key(key), batch_size, seq_len_key, self.d_k)
        value = self._split_heads(
            self.value(value), batch_size, seq_len_key, self.d_k)
        query = self._split_heads(
            self.query(query_input), batch_size, seq_len_query, self.d_q)

        result, attention = self.scal_attn(
            key, value, query, mask=mask, query_mask=query_mask)

        # Undo the head folding and concatenate the heads on the last axis.
        result = layers.reshape(
            result, [self.num_head, batch_size, seq_len_query, self.d_q])
        result = layers.transpose(result, [1, 2, 0, 3])
        result = layers.reshape(result, [batch_size, seq_len_query, -1])

        # Output projection, dropout, residual connection, layer norm.
        result = layers.dropout(self.fc(result), self.dropout)
        result = result + query_input
        result = self.layer_norm(result)
        return result, attention
|
|
@ -0,0 +1,67 @@
|
||||||
|
import paddle.fluid.dygraph as dg
|
||||||
|
import paddle.fluid as fluid
|
||||||
|
import paddle.fluid.layers as layers
|
||||||
|
from parakeet.modules.layers import Conv1D
|
||||||
|
|
||||||
|
class PostConvNet(dg.Layer):
    """
    Stack of Conv1D -> BatchNorm -> tanh -> dropout layers.

    The first convolution maps ``n_mels * outputs_per_step`` channels to
    ``num_hidden``, the middle ones keep ``num_hidden`` channels, and the
    last one maps back to ``n_mels * outputs_per_step``.

    Args:
        n_mels (int): number of mel bands.
        num_hidden (int): channel count of the hidden convolutions.
        filter_size (int): kernel width of every convolution.
        padding (int): padding of every convolution.
        num_conv (int): total number of convolutions (first + middle + last).
        outputs_per_step (int): multiplier on ``n_mels`` for the input and
            output channel count.
        use_cudnn (bool): passed through to every Conv1D.
        dropout (float): dropout probability applied after every tanh.
    """

    def __init__(self,
                 n_mels=80,
                 num_hidden=512,
                 filter_size=5,
                 padding=0,
                 num_conv=5,
                 outputs_per_step=1,
                 use_cudnn=True,
                 dropout=0.1):
        super(PostConvNet, self).__init__()

        self.dropout = dropout
        mel_channels = n_mels * outputs_per_step

        # (in_channels, out_channels) of every convolution, in order.
        channel_plan = [(mel_channels, num_hidden)]
        channel_plan += [(num_hidden, num_hidden)
                         for _ in range(1, num_conv - 1)]
        channel_plan.append((num_hidden, mel_channels))

        self.conv_list = [
            Conv1D(in_channels=n_in,
                   out_channels=n_out,
                   filter_size=filter_size,
                   padding=padding,
                   use_cudnn=use_cudnn,
                   data_format="NCT") for n_in, n_out in channel_plan
        ]
        # Plain Python lists are not tracked by dg.Layer, so every layer is
        # registered explicitly.
        for i, layer in enumerate(self.conv_list):
            self.add_sublayer("conv_list_{}".format(i), layer)

        # NOTE(review): every BatchNorm is given the same explicit parameter
        # and moving-statistics names - confirm this is intended.
        def make_batch_norm(num_channels):
            return dg.BatchNorm(num_channels,
                                param_attr=fluid.ParamAttr(name='weight'),
                                bias_attr=fluid.ParamAttr(name='bias'),
                                moving_mean_name='moving_mean',
                                moving_variance_name='moving_var',
                                data_layout='NCHW')

        self.batch_norm_list = [
            make_batch_norm(num_hidden) for _ in range(num_conv - 1)
        ]
        self.batch_norm_list.append(make_batch_norm(mel_channels))
        for i, layer in enumerate(self.batch_norm_list):
            self.add_sublayer("batch_norm_list_{}".format(i), layer)

    def forward(self, input):
        """
        Args:
            input (Variable): 3-D tensor whose last two axes are swapped
                before the convolutions.
                # presumably (B, T, n_mels * outputs_per_step) - TODO confirm
        Returns:
            Variable: the processed tensor with the last two axes swapped
            back to the input layout.
        """
        # The convolutions use the "NCT" layout, so swap the last two axes.
        x = layers.transpose(input, [0, 2, 1])
        seq_len = x.shape[-1]
        for batch_norm, conv in zip(self.batch_norm_list, self.conv_list):
            # Crop the conv output so it is never longer than the input.
            x = conv(x)[:, :, :seq_len]
            x = layers.dropout(layers.tanh(batch_norm(x)), self.dropout)
        return layers.transpose(x, [0, 2, 1])
|
|
@ -0,0 +1,26 @@
|
||||||
|
import paddle.fluid.dygraph as dg
|
||||||
|
import paddle.fluid.layers as layers
|
||||||
|
|
||||||
|
class PreNet(dg.Layer):
    """
    Two-layer feed-forward pre-net: (Linear -> ReLU -> dropout) applied twice.
    """

    def __init__(self, input_size, hidden_size, output_size, dropout_rate=0.2):
        """
        :param input_size: dimension of input
        :param hidden_size: dimension of hidden unit
        :param output_size: dimension of output
        :param dropout_rate: dropout probability used after each ReLU
        """
        super(PreNet, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.dropout_rate = dropout_rate

        self.linear1 = dg.Linear(input_size, hidden_size)
        self.linear2 = dg.Linear(hidden_size, output_size)

    def forward(self, x):
        """
        :param x: input whose last dimension is ``input_size``
        :return: output whose last dimension is ``output_size``
        """
        hidden = layers.relu(self.linear1(x))
        hidden = layers.dropout(hidden, self.dropout_rate)

        out = layers.relu(self.linear2(hidden))
        out = layers.dropout(out, self.dropout_rate)
        return out
|
|
@ -2,6 +2,7 @@ import numpy as np
|
||||||
import librosa
|
import librosa
|
||||||
import os, copy
|
import os, copy
|
||||||
from scipy import signal
|
from scipy import signal
|
||||||
|
import paddle.fluid.layers as layers
|
||||||
|
|
||||||
|
|
||||||
def get_positional_table(d_pos_vec, n_position=1024):
|
def get_positional_table(d_pos_vec, n_position=1024):
|
||||||
|
@ -33,6 +34,28 @@ def get_sinusoid_encoding_table(n_position, d_hid, padding_idx=None):
|
||||||
|
|
||||||
return sinusoid_table
|
return sinusoid_table
|
||||||
|
|
||||||
|
def get_non_pad_mask(seq):
    ''' Float mask of the non-padding positions of ``seq``.

    The mask is 1.0 where ``seq`` is non-zero and 0.0 where it is zero, with
    one extra trailing axis of size 1 appended.
    '''
    non_pad = (seq != 0).astype(np.float32)
    return layers.unsqueeze(non_pad, [-1])
|
||||||
|
|
||||||
|
def get_attn_key_pad_mask(seq_k, seq_q):
    ''' Build a key mask shaped like the key/query attention matrix.

    The result is 1.0 where ``seq_k`` is non-zero and 0.0 where it is zero,
    with that row repeated ``seq_q.shape[1]`` times along a new axis 1.

    NOTE(review): the value is 1.0 at NON-padding keys. Confirm that callers
    invert it where a "non-zero means masked" convention is expected.
    '''
    num_queries = seq_q.shape[1]
    key_is_real = (seq_k != 0).astype(np.float32)
    # Insert a query axis, then tile along it to fit the attention matrix.
    key_is_real = layers.unsqueeze(key_is_real, [1])
    return layers.expand(key_is_real, [1, num_queries, 1])
|
||||||
|
|
||||||
|
def get_triu_tensor(seq_k, seq_q):
    ''' Build a batch of strictly upper-triangular masks.

    Returns a NumPy float64 array of shape
    (seq_k.shape[0], seq_k.shape[1], seq_q.shape[1]) in which every batch
    entry has ones strictly above the main diagonal and zeros elsewhere.

    NOTE(review): the matrix axes are ordered (len_k, len_q); confirm this
    matches the attention layout when len_k != len_q.
    '''
    batch_size, len_k = seq_k.shape[0], seq_k.shape[1]
    len_q = seq_q.shape[1]

    # k=1 excludes the main diagonal itself.
    upper = np.triu(np.ones([len_k, len_q]), 1)
    # Same matrix for every item of the batch.
    return np.tile(upper[np.newaxis, :, :], (batch_size, 1, 1))
|
||||||
|
|
||||||
def guided_attention(N, T, g=0.2):
|
def guided_attention(N, T, g=0.2):
|
||||||
'''Guided attention. Refer to page 3 on the paper.'''
|
'''Guided attention. Refer to page 3 on the paper.'''
|
||||||
W = np.zeros((N, T), dtype=np.float32)
|
W = np.zeros((N, T), dtype=np.float32)
|
||||||
|
@ -40,3 +63,11 @@ def guided_attention(N, T, g=0.2):
|
||||||
for t_pos in range(W.shape[1]):
|
for t_pos in range(W.shape[1]):
|
||||||
W[n_pos, t_pos] = 1 - np.exp(-(t_pos / float(T) - n_pos / float(N)) ** 2 / (2 * g * g))
|
W[n_pos, t_pos] = 1 - np.exp(-(t_pos / float(T) - n_pos / float(N)) ** 2 / (2 * g * g))
|
||||||
return W
|
return W
|
||||||
|
|
||||||
|
|
||||||
|
def cross_entropy(input, label, position_weight=5.0, epsilon=0.0001):
    ''' Weighted binary cross entropy, summed over axes 0 and 1.

    Each element contributes
    -label * log(input + epsilon) - (1 - label) * log(1 - input + epsilon),
    scaled by ``label * (position_weight - 1) + 1``. That scale is
    ``position_weight`` where label is 1 and 1 where label is 0.
    ``epsilon`` keeps the logarithms away from log(0).
    '''
    positive_term = -1 * label * layers.log(input + epsilon)
    negative_term = (1 - label) * layers.log(1 - input + epsilon)
    element_loss = positive_term - negative_term

    weighted_loss = element_loss * (label * (position_weight - 1) + 1)
    return layers.reduce_sum(weighted_loss, dim=[0, 1])
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue