Merge branch 'master' into 'master'
Modified data.py of TransformerTTS See merge request !30
This commit is contained in:
commit
8e86389ea4
|
@ -3,8 +3,8 @@ audio:
|
||||||
n_fft: 2048
|
n_fft: 2048
|
||||||
sr: 22050
|
sr: 22050
|
||||||
preemphasis: 0.97
|
preemphasis: 0.97
|
||||||
hop_length: 275
|
hop_length: 256
|
||||||
win_length: 1102
|
win_length: 1024
|
||||||
power: 1.2
|
power: 1.2
|
||||||
min_level_db: -100
|
min_level_db: -100
|
||||||
ref_level_db: 20
|
ref_level_db: 20
|
|
@ -52,6 +52,12 @@ def add_config_options_to_parser(parser):
|
||||||
type=int,
|
type=int,
|
||||||
default=0,
|
default=0,
|
||||||
help="use data parallel or not during training.")
|
help="use data parallel or not during training.")
|
||||||
|
parser.add_argument(
|
||||||
|
'--alpha',
|
||||||
|
type=float,
|
||||||
|
default=1.0,
|
||||||
|
help="The hyperparameter to determine the length of the expanded sequence \
|
||||||
|
mel, thereby controlling the voice speed.")
|
||||||
|
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
'--data_path',
|
'--data_path',
|
||||||
|
|
|
@ -24,6 +24,7 @@ import paddle.fluid.dygraph as dg
|
||||||
from parakeet.g2p.en import text_to_sequence
|
from parakeet.g2p.en import text_to_sequence
|
||||||
from parakeet import audio
|
from parakeet import audio
|
||||||
from parakeet.models.fastspeech.fastspeech import FastSpeech
|
from parakeet.models.fastspeech.fastspeech import FastSpeech
|
||||||
|
from parakeet.models.transformer_tts.utils import *
|
||||||
|
|
||||||
|
|
||||||
def load_checkpoint(step, model_path):
|
def load_checkpoint(step, model_path):
|
||||||
|
@ -59,12 +60,26 @@ def synthesis(text_input, args):
|
||||||
model.eval()
|
model.eval()
|
||||||
|
|
||||||
text = np.asarray(text_to_sequence(text_input))
|
text = np.asarray(text_to_sequence(text_input))
|
||||||
text = fluid.layers.unsqueeze(dg.to_variable(text), [0])
|
text = np.expand_dims(text, axis=0)
|
||||||
pos_text = np.arange(1, text.shape[1] + 1)
|
pos_text = np.arange(1, text.shape[1] + 1)
|
||||||
pos_text = fluid.layers.unsqueeze(dg.to_variable(pos_text), [0])
|
pos_text = np.expand_dims(pos_text, axis=0)
|
||||||
|
enc_non_pad_mask = get_non_pad_mask(pos_text).astype(np.float32)
|
||||||
|
enc_slf_attn_mask = get_attn_key_pad_mask(pos_text,
|
||||||
|
text).astype(np.float32)
|
||||||
|
|
||||||
|
text = dg.to_variable(text)
|
||||||
|
pos_text = dg.to_variable(pos_text)
|
||||||
|
enc_non_pad_mask = dg.to_variable(enc_non_pad_mask)
|
||||||
|
enc_slf_attn_mask = dg.to_variable(enc_slf_attn_mask)
|
||||||
|
|
||||||
mel_output, mel_output_postnet = model(
|
mel_output, mel_output_postnet = model(
|
||||||
text, pos_text, alpha=args.alpha)
|
text,
|
||||||
|
pos_text,
|
||||||
|
alpha=args.alpha,
|
||||||
|
enc_non_pad_mask=enc_non_pad_mask,
|
||||||
|
enc_slf_attn_mask=enc_slf_attn_mask,
|
||||||
|
dec_non_pad_mask=None,
|
||||||
|
dec_slf_attn_mask=None)
|
||||||
|
|
||||||
_ljspeech_processor = audio.AudioProcessor(
|
_ljspeech_processor = audio.AudioProcessor(
|
||||||
sample_rate=cfg['audio']['sr'],
|
sample_rate=cfg['audio']['sr'],
|
||||||
|
|
|
@ -21,6 +21,7 @@ from parse import add_config_options_to_parser
|
||||||
from pprint import pprint
|
from pprint import pprint
|
||||||
from ruamel import yaml
|
from ruamel import yaml
|
||||||
from tqdm import tqdm
|
from tqdm import tqdm
|
||||||
|
from matplotlib import cm
|
||||||
from collections import OrderedDict
|
from collections import OrderedDict
|
||||||
from tensorboardX import SummaryWriter
|
from tensorboardX import SummaryWriter
|
||||||
import paddle.fluid.dygraph as dg
|
import paddle.fluid.dygraph as dg
|
||||||
|
@ -66,12 +67,12 @@ def main(args):
|
||||||
|
|
||||||
with dg.guard(place):
|
with dg.guard(place):
|
||||||
with fluid.unique_name.guard():
|
with fluid.unique_name.guard():
|
||||||
transformerTTS = TransformerTTS(cfg)
|
transformer_tts = TransformerTTS(cfg)
|
||||||
model_dict, _ = load_checkpoint(
|
model_dict, _ = load_checkpoint(
|
||||||
str(args.transformer_step),
|
str(args.transformer_step),
|
||||||
os.path.join(args.transtts_path, "transformer"))
|
os.path.join(args.transtts_path, "transformer"))
|
||||||
transformerTTS.set_dict(model_dict)
|
transformer_tts.set_dict(model_dict)
|
||||||
transformerTTS.eval()
|
transformer_tts.eval()
|
||||||
|
|
||||||
model = FastSpeech(cfg)
|
model = FastSpeech(cfg)
|
||||||
model.train()
|
model.train()
|
||||||
|
@ -100,13 +101,33 @@ def main(args):
|
||||||
|
|
||||||
for i, data in enumerate(pbar):
|
for i, data in enumerate(pbar):
|
||||||
pbar.set_description('Processing at epoch %d' % epoch)
|
pbar.set_description('Processing at epoch %d' % epoch)
|
||||||
character, mel, mel_input, pos_text, pos_mel, text_length, mel_lens = data
|
(character, mel, mel_input, pos_text, pos_mel, text_length,
|
||||||
|
mel_lens, enc_slf_mask, enc_query_mask, dec_slf_mask,
|
||||||
|
enc_dec_mask, dec_query_slf_mask, dec_query_mask) = data
|
||||||
|
|
||||||
_, _, attn_probs, _, _, _ = transformerTTS(
|
_, _, attn_probs, _, _, _ = transformer_tts(
|
||||||
character, mel_input, pos_text, pos_mel)
|
character,
|
||||||
alignment = dg.to_variable(
|
mel_input,
|
||||||
get_alignment(attn_probs, mel_lens, cfg[
|
pos_text,
|
||||||
'transformer_head'])).astype(np.float32)
|
pos_mel,
|
||||||
|
dec_slf_mask=dec_slf_mask,
|
||||||
|
enc_slf_mask=enc_slf_mask,
|
||||||
|
enc_query_mask=enc_query_mask,
|
||||||
|
enc_dec_mask=enc_dec_mask,
|
||||||
|
dec_query_slf_mask=dec_query_slf_mask,
|
||||||
|
dec_query_mask=dec_query_mask)
|
||||||
|
alignment, max_attn = get_alignment(attn_probs, mel_lens,
|
||||||
|
cfg['transformer_head'])
|
||||||
|
alignment = dg.to_variable(alignment).astype(np.float32)
|
||||||
|
|
||||||
|
if local_rank == 0 and global_step % 5 == 1:
|
||||||
|
x = np.uint8(
|
||||||
|
cm.viridis(max_attn[8, :mel_lens.numpy()[8]]) * 255)
|
||||||
|
writer.add_image(
|
||||||
|
'Attention_%d_0' % global_step,
|
||||||
|
x,
|
||||||
|
0,
|
||||||
|
dataformats="HWC")
|
||||||
|
|
||||||
global_step += 1
|
global_step += 1
|
||||||
|
|
||||||
|
@ -115,7 +136,11 @@ def main(args):
|
||||||
character,
|
character,
|
||||||
pos_text,
|
pos_text,
|
||||||
mel_pos=pos_mel,
|
mel_pos=pos_mel,
|
||||||
length_target=alignment)
|
length_target=alignment,
|
||||||
|
enc_non_pad_mask=enc_query_mask,
|
||||||
|
enc_slf_attn_mask=enc_slf_mask,
|
||||||
|
dec_non_pad_mask=dec_query_slf_mask,
|
||||||
|
dec_slf_attn_mask=dec_slf_mask)
|
||||||
mel_output, mel_output_postnet, duration_predictor_output, _, _ = result
|
mel_output, mel_output_postnet, duration_predictor_output, _, _ = result
|
||||||
mel_loss = layers.mse_loss(mel_output, mel)
|
mel_loss = layers.mse_loss(mel_output, mel)
|
||||||
mel_postnet_loss = layers.mse_loss(mel_output_postnet, mel)
|
mel_postnet_loss = layers.mse_loss(mel_output_postnet, mel)
|
||||||
|
|
|
@ -1,6 +1,6 @@
|
||||||
# train model
|
# train model
|
||||||
# if you wish to resume from an exists model, uncomment --checkpoint_path and --fastspeech_step
|
# if you wish to resume from an exists model, uncomment --checkpoint_path and --fastspeech_step
|
||||||
CUDA_VISIBLE_DEVICES=0\
|
export CUDA_VISIBLE_DEVICES=0
|
||||||
python -u train.py \
|
python -u train.py \
|
||||||
--batch_size=32 \
|
--batch_size=32 \
|
||||||
--epochs=10000 \
|
--epochs=10000 \
|
||||||
|
|
|
@ -9,3 +9,6 @@ audio:
|
||||||
min_level_db: -100
|
min_level_db: -100
|
||||||
ref_level_db: 20
|
ref_level_db: 20
|
||||||
outputs_per_step: 1
|
outputs_per_step: 1
|
||||||
|
|
||||||
|
hidden_size: 256
|
||||||
|
embedding_size: 512
|
|
@ -23,7 +23,8 @@ from parakeet import audio
|
||||||
from parakeet.data.sampler import *
|
from parakeet.data.sampler import *
|
||||||
from parakeet.data.datacargo import DataCargo
|
from parakeet.data.datacargo import DataCargo
|
||||||
from parakeet.data.batch import TextIDBatcher, SpecBatcher
|
from parakeet.data.batch import TextIDBatcher, SpecBatcher
|
||||||
from parakeet.data.dataset import DatasetMixin, TransformDataset
|
from parakeet.data.dataset import DatasetMixin, TransformDataset, CacheDataset
|
||||||
|
from parakeet.models.transformer_tts.utils import *
|
||||||
|
|
||||||
|
|
||||||
class LJSpeechLoader:
|
class LJSpeechLoader:
|
||||||
|
@ -40,6 +41,8 @@ class LJSpeechLoader:
|
||||||
metadata = LJSpeechMetaData(LJSPEECH_ROOT)
|
metadata = LJSpeechMetaData(LJSPEECH_ROOT)
|
||||||
transformer = LJSpeech(config)
|
transformer = LJSpeech(config)
|
||||||
dataset = TransformDataset(metadata, transformer)
|
dataset = TransformDataset(metadata, transformer)
|
||||||
|
dataset = CacheDataset(dataset)
|
||||||
|
|
||||||
sampler = DistributedSampler(
|
sampler = DistributedSampler(
|
||||||
len(metadata), nranks, rank, shuffle=shuffle)
|
len(metadata), nranks, rank, shuffle=shuffle)
|
||||||
|
|
||||||
|
@ -196,8 +199,18 @@ def batch_examples(batch):
|
||||||
SpecBatcher(pad_value=0.)(mels), axes=(0, 2, 1)) #(B,T,num_mels)
|
SpecBatcher(pad_value=0.)(mels), axes=(0, 2, 1)) #(B,T,num_mels)
|
||||||
mel_inputs = np.transpose(
|
mel_inputs = np.transpose(
|
||||||
SpecBatcher(pad_value=0.)(mel_inputs), axes=(0, 2, 1)) #(B,T,num_mels)
|
SpecBatcher(pad_value=0.)(mel_inputs), axes=(0, 2, 1)) #(B,T,num_mels)
|
||||||
|
enc_slf_mask = get_attn_key_pad_mask(pos_texts, texts).astype(np.float32)
|
||||||
|
enc_query_mask = get_non_pad_mask(pos_texts).astype(np.float32)
|
||||||
|
dec_slf_mask = get_dec_attn_key_pad_mask(pos_mels,
|
||||||
|
mel_inputs).astype(np.float32)
|
||||||
|
enc_dec_mask = get_attn_key_pad_mask(enc_query_mask[:, :, 0],
|
||||||
|
mel_inputs).astype(np.float32)
|
||||||
|
dec_query_slf_mask = get_non_pad_mask(pos_mels).astype(np.float32)
|
||||||
|
dec_query_mask = get_non_pad_mask(pos_mels).astype(np.float32)
|
||||||
|
|
||||||
return (texts, mels, mel_inputs, pos_texts, pos_mels, np.array(text_lens),
|
return (texts, mels, mel_inputs, pos_texts, pos_mels, np.array(text_lens),
|
||||||
np.array(mel_lens))
|
np.array(mel_lens), enc_slf_mask, enc_query_mask, dec_slf_mask,
|
||||||
|
enc_dec_mask, dec_query_slf_mask, dec_query_mask)
|
||||||
|
|
||||||
|
|
||||||
def batch_examples_vocoder(batch):
|
def batch_examples_vocoder(batch):
|
||||||
|
|
|
@ -16,6 +16,7 @@ from scipy.io.wavfile import write
|
||||||
from parakeet.g2p.en import text_to_sequence
|
from parakeet.g2p.en import text_to_sequence
|
||||||
import numpy as np
|
import numpy as np
|
||||||
from tqdm import tqdm
|
from tqdm import tqdm
|
||||||
|
from matplotlib import cm
|
||||||
from tensorboardX import SummaryWriter
|
from tensorboardX import SummaryWriter
|
||||||
from ruamel import yaml
|
from ruamel import yaml
|
||||||
import paddle.fluid as fluid
|
import paddle.fluid as fluid
|
||||||
|
@ -25,6 +26,7 @@ import argparse
|
||||||
from parse import add_config_options_to_parser
|
from parse import add_config_options_to_parser
|
||||||
from pprint import pprint
|
from pprint import pprint
|
||||||
from collections import OrderedDict
|
from collections import OrderedDict
|
||||||
|
from parakeet.models.transformer_tts.utils import *
|
||||||
from parakeet import audio
|
from parakeet import audio
|
||||||
from parakeet.models.transformer_tts.vocoder import Vocoder
|
from parakeet.models.transformer_tts.vocoder import Vocoder
|
||||||
from parakeet.models.transformer_tts.transformer_tts import TransformerTTS
|
from parakeet.models.transformer_tts.transformer_tts import TransformerTTS
|
||||||
|
@ -78,14 +80,18 @@ def synthesis(text_input, args):
|
||||||
pos_text = fluid.layers.unsqueeze(dg.to_variable(pos_text), [0])
|
pos_text = fluid.layers.unsqueeze(dg.to_variable(pos_text), [0])
|
||||||
|
|
||||||
pbar = tqdm(range(args.max_len))
|
pbar = tqdm(range(args.max_len))
|
||||||
|
|
||||||
for i in pbar:
|
for i in pbar:
|
||||||
|
dec_slf_mask = get_triu_tensor(
|
||||||
|
mel_input.numpy(), mel_input.numpy()).astype(np.float32)
|
||||||
|
dec_slf_mask = fluid.layers.cast(
|
||||||
|
dg.to_variable(dec_slf_mask == 0), np.float32)
|
||||||
pos_mel = np.arange(1, mel_input.shape[1] + 1)
|
pos_mel = np.arange(1, mel_input.shape[1] + 1)
|
||||||
pos_mel = fluid.layers.unsqueeze(dg.to_variable(pos_mel), [0])
|
pos_mel = fluid.layers.unsqueeze(dg.to_variable(pos_mel), [0])
|
||||||
mel_pred, postnet_pred, attn_probs, stop_preds, attn_enc, attn_dec = model(
|
mel_pred, postnet_pred, attn_probs, stop_preds, attn_enc, attn_dec = model(
|
||||||
text, mel_input, pos_text, pos_mel)
|
text, mel_input, pos_text, pos_mel, dec_slf_mask)
|
||||||
mel_input = fluid.layers.concat(
|
mel_input = fluid.layers.concat(
|
||||||
[mel_input, postnet_pred[:, -1:, :]], axis=1)
|
[mel_input, postnet_pred[:, -1:, :]], axis=1)
|
||||||
|
|
||||||
mag_pred = model_vocoder(postnet_pred)
|
mag_pred = model_vocoder(postnet_pred)
|
||||||
|
|
||||||
_ljspeech_processor = audio.AudioProcessor(
|
_ljspeech_processor = audio.AudioProcessor(
|
||||||
|
@ -111,6 +117,33 @@ def synthesis(text_input, args):
|
||||||
wav = _ljspeech_processor.inv_spectrogram(
|
wav = _ljspeech_processor.inv_spectrogram(
|
||||||
fluid.layers.transpose(
|
fluid.layers.transpose(
|
||||||
fluid.layers.squeeze(mag_pred, [0]), [1, 0]).numpy())
|
fluid.layers.squeeze(mag_pred, [0]), [1, 0]).numpy())
|
||||||
|
global_step = 0
|
||||||
|
for i, prob in enumerate(attn_probs):
|
||||||
|
for j in range(4):
|
||||||
|
x = np.uint8(cm.viridis(prob.numpy()[j]) * 255)
|
||||||
|
writer.add_image(
|
||||||
|
'Attention_%d_0' % global_step,
|
||||||
|
x,
|
||||||
|
i * 4 + j,
|
||||||
|
dataformats="HWC")
|
||||||
|
|
||||||
|
for i, prob in enumerate(attn_enc):
|
||||||
|
for j in range(4):
|
||||||
|
x = np.uint8(cm.viridis(prob.numpy()[j]) * 255)
|
||||||
|
writer.add_image(
|
||||||
|
'Attention_enc_%d_0' % global_step,
|
||||||
|
x,
|
||||||
|
i * 4 + j,
|
||||||
|
dataformats="HWC")
|
||||||
|
|
||||||
|
for i, prob in enumerate(attn_dec):
|
||||||
|
for j in range(4):
|
||||||
|
x = np.uint8(cm.viridis(prob.numpy()[j]) * 255)
|
||||||
|
writer.add_image(
|
||||||
|
'Attention_dec_%d_0' % global_step,
|
||||||
|
x,
|
||||||
|
i * 4 + j,
|
||||||
|
dataformats="HWC")
|
||||||
writer.add_audio(text_input, wav, 0, cfg['audio']['sr'])
|
writer.add_audio(text_input, wav, 0, cfg['audio']['sr'])
|
||||||
if not os.path.exists(args.sample_path):
|
if not os.path.exists(args.sample_path):
|
||||||
os.mkdir(args.sample_path)
|
os.mkdir(args.sample_path)
|
||||||
|
@ -124,4 +157,6 @@ if __name__ == '__main__':
|
||||||
parser = argparse.ArgumentParser(description="Synthesis model")
|
parser = argparse.ArgumentParser(description="Synthesis model")
|
||||||
add_config_options_to_parser(parser)
|
add_config_options_to_parser(parser)
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
synthesis("Transformer model is so fast!", args)
|
synthesis(
|
||||||
|
"They emphasized the necessity that the information now being furnished be handled with judgment and care.",
|
||||||
|
args)
|
||||||
|
|
|
@ -2,10 +2,10 @@
|
||||||
# train model
|
# train model
|
||||||
CUDA_VISIBLE_DEVICES=0 \
|
CUDA_VISIBLE_DEVICES=0 \
|
||||||
python -u synthesis.py \
|
python -u synthesis.py \
|
||||||
--max_len=50 \
|
--max_len=600 \
|
||||||
--transformer_step=160000 \
|
--transformer_step=160000 \
|
||||||
--vocoder_step=70000 \
|
--vocoder_step=90000 \
|
||||||
--use_gpu=1
|
--use_gpu=1 \
|
||||||
--checkpoint_path='./checkpoint' \
|
--checkpoint_path='./checkpoint' \
|
||||||
--log_dir='./log' \
|
--log_dir='./log' \
|
||||||
--sample_path='./sample' \
|
--sample_path='./sample' \
|
||||||
|
|
|
@ -14,7 +14,7 @@
|
||||||
import os
|
import os
|
||||||
from tqdm import tqdm
|
from tqdm import tqdm
|
||||||
from tensorboardX import SummaryWriter
|
from tensorboardX import SummaryWriter
|
||||||
from pathlib import Path
|
#from pathlib import Path
|
||||||
from collections import OrderedDict
|
from collections import OrderedDict
|
||||||
import argparse
|
import argparse
|
||||||
from parse import add_config_options_to_parser
|
from parse import add_config_options_to_parser
|
||||||
|
@ -89,21 +89,31 @@ def main(args):
|
||||||
pbar = tqdm(reader)
|
pbar = tqdm(reader)
|
||||||
for i, data in enumerate(pbar):
|
for i, data in enumerate(pbar):
|
||||||
pbar.set_description('Processing at epoch %d' % epoch)
|
pbar.set_description('Processing at epoch %d' % epoch)
|
||||||
character, mel, mel_input, pos_text, pos_mel, text_length, _ = data
|
character, mel, mel_input, pos_text, pos_mel, text_length, _, enc_slf_mask, enc_query_mask, dec_slf_mask, enc_dec_mask, dec_query_slf_mask, dec_query_mask = data
|
||||||
|
|
||||||
global_step += 1
|
global_step += 1
|
||||||
mel_pred, postnet_pred, attn_probs, stop_preds, attn_enc, attn_dec = model(
|
|
||||||
character, mel_input, pos_text, pos_mel)
|
|
||||||
|
|
||||||
label = (pos_mel == 0).astype(np.float32)
|
mel_pred, postnet_pred, attn_probs, stop_preds, attn_enc, attn_dec = model(
|
||||||
|
character,
|
||||||
|
mel_input,
|
||||||
|
pos_text,
|
||||||
|
pos_mel,
|
||||||
|
dec_slf_mask=dec_slf_mask,
|
||||||
|
enc_slf_mask=enc_slf_mask,
|
||||||
|
enc_query_mask=enc_query_mask,
|
||||||
|
enc_dec_mask=enc_dec_mask,
|
||||||
|
dec_query_slf_mask=dec_query_slf_mask,
|
||||||
|
dec_query_mask=dec_query_mask)
|
||||||
|
|
||||||
mel_loss = layers.mean(
|
mel_loss = layers.mean(
|
||||||
layers.abs(layers.elementwise_sub(mel_pred, mel)))
|
layers.abs(layers.elementwise_sub(mel_pred, mel)))
|
||||||
post_mel_loss = layers.mean(
|
post_mel_loss = layers.mean(
|
||||||
layers.abs(layers.elementwise_sub(postnet_pred, mel)))
|
layers.abs(layers.elementwise_sub(postnet_pred, mel)))
|
||||||
loss = mel_loss + post_mel_loss
|
loss = mel_loss + post_mel_loss
|
||||||
|
|
||||||
# Note: When used stop token loss the learning did not work.
|
# Note: When used stop token loss the learning did not work.
|
||||||
if args.stop_token:
|
if args.stop_token:
|
||||||
|
label = (pos_mel == 0).astype(np.float32)
|
||||||
stop_loss = cross_entropy(stop_preds, label)
|
stop_loss = cross_entropy(stop_preds, label)
|
||||||
loss = loss + stop_loss
|
loss = loss + stop_loss
|
||||||
|
|
||||||
|
|
|
@ -1,7 +1,7 @@
|
||||||
|
|
||||||
# train model
|
# train model
|
||||||
# if you wish to resume from an exists model, uncomment --checkpoint_path and --transformer_step
|
# if you wish to resume from an exists model, uncomment --checkpoint_path and --transformer_step
|
||||||
CUDA_VISIBLE_DEVICES=0 \
|
export CUDA_VISIBLE_DEVICES=2
|
||||||
python -u train_transformer.py \
|
python -u train_transformer.py \
|
||||||
--batch_size=32 \
|
--batch_size=32 \
|
||||||
--epochs=10000 \
|
--epochs=10000 \
|
||||||
|
|
|
@ -14,6 +14,7 @@
|
||||||
|
|
||||||
import six
|
import six
|
||||||
import numpy as np
|
import numpy as np
|
||||||
|
from tqdm import tqdm
|
||||||
|
|
||||||
|
|
||||||
class DatasetMixin(object):
|
class DatasetMixin(object):
|
||||||
|
|
|
@ -32,6 +32,7 @@ class Decoder(dg.Layer):
|
||||||
super(Decoder, self).__init__()
|
super(Decoder, self).__init__()
|
||||||
|
|
||||||
n_position = len_max_seq + 1
|
n_position = len_max_seq + 1
|
||||||
|
self.n_head = n_head
|
||||||
self.pos_inp = get_sinusoid_encoding_table(
|
self.pos_inp = get_sinusoid_encoding_table(
|
||||||
n_position, d_model, padding_idx=0)
|
n_position, d_model, padding_idx=0)
|
||||||
self.position_enc = dg.Embedding(
|
self.position_enc = dg.Embedding(
|
||||||
|
@ -55,7 +56,7 @@ class Decoder(dg.Layer):
|
||||||
for i, layer in enumerate(self.layer_stack):
|
for i, layer in enumerate(self.layer_stack):
|
||||||
self.add_sublayer('fft_{}'.format(i), layer)
|
self.add_sublayer('fft_{}'.format(i), layer)
|
||||||
|
|
||||||
def forward(self, enc_seq, enc_pos):
|
def forward(self, enc_seq, enc_pos, non_pad_mask, slf_attn_mask=None):
|
||||||
"""
|
"""
|
||||||
Decoder layer of FastSpeech.
|
Decoder layer of FastSpeech.
|
||||||
|
|
||||||
|
@ -69,10 +70,7 @@ class Decoder(dg.Layer):
|
||||||
dec_slf_attn_list (Variable), Shape(B, mel_T, mel_T), the decoder self attention list.
|
dec_slf_attn_list (Variable), Shape(B, mel_T, mel_T), the decoder self attention list.
|
||||||
"""
|
"""
|
||||||
dec_slf_attn_list = []
|
dec_slf_attn_list = []
|
||||||
|
slf_attn_mask = layers.expand(slf_attn_mask, [self.n_head, 1, 1])
|
||||||
# -- Prepare masks
|
|
||||||
slf_attn_mask = get_attn_key_pad_mask(seq_k=enc_pos, seq_q=enc_pos)
|
|
||||||
non_pad_mask = get_non_pad_mask(enc_pos)
|
|
||||||
|
|
||||||
# -- Forward
|
# -- Forward
|
||||||
dec_output = enc_seq + self.position_enc(enc_pos)
|
dec_output = enc_seq + self.position_enc(enc_pos)
|
||||||
|
|
|
@ -32,14 +32,17 @@ class Encoder(dg.Layer):
|
||||||
dropout=0.1):
|
dropout=0.1):
|
||||||
super(Encoder, self).__init__()
|
super(Encoder, self).__init__()
|
||||||
n_position = len_max_seq + 1
|
n_position = len_max_seq + 1
|
||||||
|
self.n_head = n_head
|
||||||
|
|
||||||
self.src_word_emb = dg.Embedding(
|
self.src_word_emb = dg.Embedding(
|
||||||
size=[n_src_vocab, d_model], padding_idx=0)
|
size=[n_src_vocab, d_model],
|
||||||
|
padding_idx=0,
|
||||||
|
param_attr=fluid.initializer.Normal(
|
||||||
|
loc=0.0, scale=1.0))
|
||||||
self.pos_inp = get_sinusoid_encoding_table(
|
self.pos_inp = get_sinusoid_encoding_table(
|
||||||
n_position, d_model, padding_idx=0)
|
n_position, d_model, padding_idx=0)
|
||||||
self.position_enc = dg.Embedding(
|
self.position_enc = dg.Embedding(
|
||||||
size=[n_position, d_model],
|
size=[n_position, d_model],
|
||||||
padding_idx=0,
|
|
||||||
param_attr=fluid.ParamAttr(
|
param_attr=fluid.ParamAttr(
|
||||||
initializer=fluid.initializer.NumpyArrayInitializer(
|
initializer=fluid.initializer.NumpyArrayInitializer(
|
||||||
self.pos_inp),
|
self.pos_inp),
|
||||||
|
@ -58,7 +61,7 @@ class Encoder(dg.Layer):
|
||||||
for i, layer in enumerate(self.layer_stack):
|
for i, layer in enumerate(self.layer_stack):
|
||||||
self.add_sublayer('fft_{}'.format(i), layer)
|
self.add_sublayer('fft_{}'.format(i), layer)
|
||||||
|
|
||||||
def forward(self, character, text_pos):
|
def forward(self, character, text_pos, non_pad_mask, slf_attn_mask=None):
|
||||||
"""
|
"""
|
||||||
Encoder layer of FastSpeech.
|
Encoder layer of FastSpeech.
|
||||||
|
|
||||||
|
@ -74,10 +77,7 @@ class Encoder(dg.Layer):
|
||||||
enc_slf_attn_list (list<Variable>), Len(n_layers), Shape(B * n_head, text_T, text_T), the encoder self attention list.
|
enc_slf_attn_list (list<Variable>), Len(n_layers), Shape(B * n_head, text_T, text_T), the encoder self attention list.
|
||||||
"""
|
"""
|
||||||
enc_slf_attn_list = []
|
enc_slf_attn_list = []
|
||||||
# -- prepare masks
|
slf_attn_mask = layers.expand(slf_attn_mask, [self.n_head, 1, 1])
|
||||||
# shape character (N, T)
|
|
||||||
slf_attn_mask = get_attn_key_pad_mask(seq_k=character, seq_q=character)
|
|
||||||
non_pad_mask = get_non_pad_mask(character)
|
|
||||||
|
|
||||||
# -- Forward
|
# -- Forward
|
||||||
enc_output = self.src_word_emb(character) + self.position_enc(
|
enc_output = self.src_word_emb(character) + self.position_enc(
|
||||||
|
@ -90,4 +90,4 @@ class Encoder(dg.Layer):
|
||||||
slf_attn_mask=slf_attn_mask)
|
slf_attn_mask=slf_attn_mask)
|
||||||
enc_slf_attn_list += [enc_slf_attn]
|
enc_slf_attn_list += [enc_slf_attn]
|
||||||
|
|
||||||
return enc_output, non_pad_mask, enc_slf_attn_list
|
return enc_output, enc_slf_attn_list
|
||||||
|
|
|
@ -12,9 +12,11 @@
|
||||||
# See the License for the specific language governing permissions and
|
# See the License for the specific language governing permissions and
|
||||||
# limitations under the License.
|
# limitations under the License.
|
||||||
import math
|
import math
|
||||||
|
import numpy as np
|
||||||
import paddle.fluid.dygraph as dg
|
import paddle.fluid.dygraph as dg
|
||||||
import paddle.fluid as fluid
|
import paddle.fluid as fluid
|
||||||
from parakeet.g2p.text.symbols import symbols
|
from parakeet.g2p.text.symbols import symbols
|
||||||
|
from parakeet.models.transformer_tts.utils import *
|
||||||
from parakeet.models.transformer_tts.post_convnet import PostConvNet
|
from parakeet.models.transformer_tts.post_convnet import PostConvNet
|
||||||
from parakeet.models.fastspeech.length_regulator import LengthRegulator
|
from parakeet.models.fastspeech.length_regulator import LengthRegulator
|
||||||
from parakeet.models.fastspeech.encoder import Encoder
|
from parakeet.models.fastspeech.encoder import Encoder
|
||||||
|
@ -78,6 +80,10 @@ class FastSpeech(dg.Layer):
|
||||||
def forward(self,
|
def forward(self,
|
||||||
character,
|
character,
|
||||||
text_pos,
|
text_pos,
|
||||||
|
enc_non_pad_mask,
|
||||||
|
dec_non_pad_mask,
|
||||||
|
enc_slf_attn_mask=None,
|
||||||
|
dec_slf_attn_mask=None,
|
||||||
mel_pos=None,
|
mel_pos=None,
|
||||||
length_target=None,
|
length_target=None,
|
||||||
alpha=1.0):
|
alpha=1.0):
|
||||||
|
@ -106,14 +112,20 @@ class FastSpeech(dg.Layer):
|
||||||
dec_slf_attn_list (Variable), Shape(B, mel_T, mel_T), the decoder self attention list.
|
dec_slf_attn_list (Variable), Shape(B, mel_T, mel_T), the decoder self attention list.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
encoder_output, non_pad_mask, enc_slf_attn_list = self.encoder(
|
encoder_output, enc_slf_attn_list = self.encoder(
|
||||||
character, text_pos)
|
character,
|
||||||
|
text_pos,
|
||||||
|
enc_non_pad_mask,
|
||||||
|
slf_attn_mask=enc_slf_attn_mask)
|
||||||
if fluid.framework._dygraph_tracer()._train_mode:
|
if fluid.framework._dygraph_tracer()._train_mode:
|
||||||
|
|
||||||
length_regulator_output, duration_predictor_output = self.length_regulator(
|
length_regulator_output, duration_predictor_output = self.length_regulator(
|
||||||
encoder_output, target=length_target, alpha=alpha)
|
encoder_output, target=length_target, alpha=alpha)
|
||||||
decoder_output, dec_slf_attn_list = self.decoder(
|
decoder_output, dec_slf_attn_list = self.decoder(
|
||||||
length_regulator_output, mel_pos)
|
length_regulator_output,
|
||||||
|
mel_pos,
|
||||||
|
dec_non_pad_mask,
|
||||||
|
slf_attn_mask=dec_slf_attn_mask)
|
||||||
|
|
||||||
mel_output = self.mel_linear(decoder_output)
|
mel_output = self.mel_linear(decoder_output)
|
||||||
mel_output_postnet = self.postnet(mel_output) + mel_output
|
mel_output_postnet = self.postnet(mel_output) + mel_output
|
||||||
|
@ -122,8 +134,18 @@ class FastSpeech(dg.Layer):
|
||||||
else:
|
else:
|
||||||
length_regulator_output, decoder_pos = self.length_regulator(
|
length_regulator_output, decoder_pos = self.length_regulator(
|
||||||
encoder_output, alpha=alpha)
|
encoder_output, alpha=alpha)
|
||||||
decoder_output, _ = self.decoder(length_regulator_output,
|
slf_attn_mask = get_triu_tensor(
|
||||||
decoder_pos)
|
decoder_pos.numpy(), decoder_pos.numpy()).astype(np.float32)
|
||||||
|
slf_attn_mask = fluid.layers.cast(
|
||||||
|
dg.to_variable(slf_attn_mask == 0), np.float32)
|
||||||
|
slf_attn_mask = dg.to_variable(slf_attn_mask)
|
||||||
|
dec_non_pad_mask = fluid.layers.unsqueeze(
|
||||||
|
(decoder_pos != 0).astype(np.float32), [-1])
|
||||||
|
decoder_output, _ = self.decoder(
|
||||||
|
length_regulator_output,
|
||||||
|
decoder_pos,
|
||||||
|
dec_non_pad_mask,
|
||||||
|
slf_attn_mask=slf_attn_mask)
|
||||||
mel_output = self.mel_linear(decoder_output)
|
mel_output = self.mel_linear(decoder_output)
|
||||||
mel_output_postnet = self.postnet(mel_output) + mel_output
|
mel_output_postnet = self.postnet(mel_output) + mel_output
|
||||||
|
|
||||||
|
|
|
@ -46,7 +46,7 @@ class FFTBlock(dg.Layer):
|
||||||
padding=padding,
|
padding=padding,
|
||||||
dropout=dropout)
|
dropout=dropout)
|
||||||
|
|
||||||
def forward(self, enc_input, non_pad_mask=None, slf_attn_mask=None):
|
def forward(self, enc_input, non_pad_mask, slf_attn_mask=None):
|
||||||
"""
|
"""
|
||||||
Feed Forward Transformer block in FastSpeech.
|
Feed Forward Transformer block in FastSpeech.
|
||||||
|
|
||||||
|
@ -63,6 +63,7 @@ class FFTBlock(dg.Layer):
|
||||||
"""
|
"""
|
||||||
output, slf_attn = self.slf_attn(
|
output, slf_attn = self.slf_attn(
|
||||||
enc_input, enc_input, enc_input, mask=slf_attn_mask)
|
enc_input, enc_input, enc_input, mask=slf_attn_mask)
|
||||||
|
|
||||||
output *= non_pad_mask
|
output *= non_pad_mask
|
||||||
|
|
||||||
output = self.pos_ffn(output)
|
output = self.pos_ffn(output)
|
||||||
|
|
|
@ -146,11 +146,17 @@ class DurationPredictor(dg.Layer):
|
||||||
out = layers.transpose(encoder_output, [0, 2, 1])
|
out = layers.transpose(encoder_output, [0, 2, 1])
|
||||||
out = self.conv1(out)
|
out = self.conv1(out)
|
||||||
out = layers.transpose(out, [0, 2, 1])
|
out = layers.transpose(out, [0, 2, 1])
|
||||||
out = layers.dropout(layers.relu(self.layer_norm1(out)), self.dropout)
|
out = layers.dropout(
|
||||||
|
layers.relu(self.layer_norm1(out)),
|
||||||
|
self.dropout,
|
||||||
|
dropout_implementation='upscale_in_train')
|
||||||
out = layers.transpose(out, [0, 2, 1])
|
out = layers.transpose(out, [0, 2, 1])
|
||||||
out = self.conv2(out)
|
out = self.conv2(out)
|
||||||
out = layers.transpose(out, [0, 2, 1])
|
out = layers.transpose(out, [0, 2, 1])
|
||||||
out = layers.dropout(layers.relu(self.layer_norm2(out)), self.dropout)
|
out = layers.dropout(
|
||||||
|
layers.relu(self.layer_norm2(out)),
|
||||||
|
self.dropout,
|
||||||
|
dropout_implementation='upscale_in_train')
|
||||||
out = layers.relu(self.linear(out))
|
out = layers.relu(self.linear(out))
|
||||||
out = layers.squeeze(out, axes=[-1])
|
out = layers.squeeze(out, axes=[-1])
|
||||||
|
|
||||||
|
|
|
@ -18,7 +18,6 @@ def get_alignment(attn_probs, mel_lens, n_head):
|
||||||
max_F = 0
|
max_F = 0
|
||||||
assert attn_probs[0].shape[0] % n_head == 0
|
assert attn_probs[0].shape[0] % n_head == 0
|
||||||
batch_size = int(attn_probs[0].shape[0] // n_head)
|
batch_size = int(attn_probs[0].shape[0] // n_head)
|
||||||
#max_attn = attn_probs[0].numpy()[0,batch_size]
|
|
||||||
for i in range(len(attn_probs)):
|
for i in range(len(attn_probs)):
|
||||||
multi_attn = attn_probs[i].numpy()
|
multi_attn = attn_probs[i].numpy()
|
||||||
for j in range(n_head):
|
for j in range(n_head):
|
||||||
|
@ -28,7 +27,7 @@ def get_alignment(attn_probs, mel_lens, n_head):
|
||||||
max_F = F
|
max_F = F
|
||||||
max_attn = attn
|
max_attn = attn
|
||||||
alignment = compute_duration(max_attn, mel_lens)
|
alignment = compute_duration(max_attn, mel_lens)
|
||||||
return alignment
|
return alignment, max_attn
|
||||||
|
|
||||||
|
|
||||||
def score_F(attn):
|
def score_F(attn):
|
||||||
|
|
|
@ -14,7 +14,7 @@
|
||||||
import math
|
import math
|
||||||
import paddle.fluid.dygraph as dg
|
import paddle.fluid.dygraph as dg
|
||||||
import paddle.fluid as fluid
|
import paddle.fluid as fluid
|
||||||
from parakeet.modules.utils import *
|
from parakeet.models.transformer_tts.utils import *
|
||||||
from parakeet.modules.multihead_attention import MultiheadAttention
|
from parakeet.modules.multihead_attention import MultiheadAttention
|
||||||
from parakeet.modules.ffn import PositionwiseFeedForward
|
from parakeet.modules.ffn import PositionwiseFeedForward
|
||||||
from parakeet.models.transformer_tts.prenet import PreNet
|
from parakeet.models.transformer_tts.prenet import PreNet
|
||||||
|
@ -25,6 +25,7 @@ class Decoder(dg.Layer):
|
||||||
def __init__(self, num_hidden, config, num_head=4):
|
def __init__(self, num_hidden, config, num_head=4):
|
||||||
super(Decoder, self).__init__()
|
super(Decoder, self).__init__()
|
||||||
self.num_hidden = num_hidden
|
self.num_hidden = num_hidden
|
||||||
|
self.num_head = num_head
|
||||||
param = fluid.ParamAttr()
|
param = fluid.ParamAttr()
|
||||||
self.alpha = self.create_parameter(
|
self.alpha = self.create_parameter(
|
||||||
shape=(1, ),
|
shape=(1, ),
|
||||||
|
@ -98,28 +99,27 @@ class Decoder(dg.Layer):
|
||||||
outputs_per_step=config['audio']['outputs_per_step'],
|
outputs_per_step=config['audio']['outputs_per_step'],
|
||||||
use_cudnn=True)
|
use_cudnn=True)
|
||||||
|
|
||||||
def forward(self, key, value, query, c_mask, positional):
|
def forward(self,
|
||||||
|
key,
|
||||||
|
value,
|
||||||
|
query,
|
||||||
|
positional,
|
||||||
|
mask,
|
||||||
|
m_mask=None,
|
||||||
|
m_self_mask=None,
|
||||||
|
zero_mask=None):
|
||||||
|
|
||||||
# get decoder mask with triangular matrix
|
# get decoder mask with triangular matrix
|
||||||
|
|
||||||
if fluid.framework._dygraph_tracer()._train_mode:
|
if fluid.framework._dygraph_tracer()._train_mode:
|
||||||
m_mask = get_non_pad_mask(positional)
|
m_mask = layers.expand(m_mask, [self.num_head, 1, key.shape[1]])
|
||||||
mask = get_attn_key_pad_mask((positional == 0).astype(np.float32),
|
m_self_mask = layers.expand(m_self_mask,
|
||||||
query)
|
[self.num_head, 1, query.shape[1]])
|
||||||
triu_tensor = dg.to_variable(
|
mask = layers.expand(mask, [self.num_head, 1, 1])
|
||||||
get_triu_tensor(query.numpy(), query.numpy())).astype(
|
zero_mask = layers.expand(zero_mask, [self.num_head, 1, 1])
|
||||||
np.float32)
|
|
||||||
mask = mask + triu_tensor
|
|
||||||
mask = fluid.layers.cast(mask == 0, np.float32)
|
|
||||||
|
|
||||||
# (batch_size, decoder_len, encoder_len)
|
|
||||||
zero_mask = get_attn_key_pad_mask(
|
|
||||||
layers.squeeze(c_mask, [-1]), query)
|
|
||||||
else:
|
else:
|
||||||
mask = get_triu_tensor(query.numpy(),
|
m_mask, m_self_mask, zero_mask = None, None, None
|
||||||
query.numpy()).astype(np.float32)
|
|
||||||
mask = fluid.layers.cast(dg.to_variable(mask == 0), np.float32)
|
|
||||||
m_mask, zero_mask = None, None
|
|
||||||
|
|
||||||
# Decoder pre-network
|
# Decoder pre-network
|
||||||
query = self.decoder_prenet(query)
|
query = self.decoder_prenet(query)
|
||||||
|
@ -132,7 +132,8 @@ class Decoder(dg.Layer):
|
||||||
query = positional * self.alpha + query
|
query = positional * self.alpha + query
|
||||||
|
|
||||||
#positional dropout
|
#positional dropout
|
||||||
query = fluid.layers.dropout(query, 0.1)
|
query = fluid.layers.dropout(
|
||||||
|
query, 0.1, dropout_implementation='upscale_in_train')
|
||||||
|
|
||||||
# Attention decoder-decoder, encoder-decoder
|
# Attention decoder-decoder, encoder-decoder
|
||||||
selfattn_list = list()
|
selfattn_list = list()
|
||||||
|
@ -141,12 +142,13 @@ class Decoder(dg.Layer):
|
||||||
for selfattn, attn, ffn in zip(self.selfattn_layers, self.attn_layers,
|
for selfattn, attn, ffn in zip(self.selfattn_layers, self.attn_layers,
|
||||||
self.ffns):
|
self.ffns):
|
||||||
query, attn_dec = selfattn(
|
query, attn_dec = selfattn(
|
||||||
query, query, query, mask=mask, query_mask=m_mask)
|
query, query, query, mask=mask, query_mask=m_self_mask)
|
||||||
query, attn_dot = attn(
|
query, attn_dot = attn(
|
||||||
key, value, query, mask=zero_mask, query_mask=m_mask)
|
key, value, query, mask=zero_mask, query_mask=m_mask)
|
||||||
query = ffn(query)
|
query = ffn(query)
|
||||||
selfattn_list.append(attn_dec)
|
selfattn_list.append(attn_dec)
|
||||||
attn_list.append(attn_dot)
|
attn_list.append(attn_dot)
|
||||||
|
|
||||||
# Mel linear projection
|
# Mel linear projection
|
||||||
mel_out = self.mel_linear(query)
|
mel_out = self.mel_linear(query)
|
||||||
# Post Mel Network
|
# Post Mel Network
|
||||||
|
|
|
@ -23,6 +23,7 @@ class Encoder(dg.Layer):
|
||||||
def __init__(self, embedding_size, num_hidden, num_head=4):
|
def __init__(self, embedding_size, num_hidden, num_head=4):
|
||||||
super(Encoder, self).__init__()
|
super(Encoder, self).__init__()
|
||||||
self.num_hidden = num_hidden
|
self.num_hidden = num_hidden
|
||||||
|
self.num_head = num_head
|
||||||
param = fluid.ParamAttr(initializer=fluid.initializer.Constant(
|
param = fluid.ParamAttr(initializer=fluid.initializer.Constant(
|
||||||
value=1.0))
|
value=1.0))
|
||||||
self.alpha = self.create_parameter(
|
self.alpha = self.create_parameter(
|
||||||
|
@ -31,7 +32,6 @@ class Encoder(dg.Layer):
|
||||||
1024, self.num_hidden, padding_idx=0)
|
1024, self.num_hidden, padding_idx=0)
|
||||||
self.pos_emb = dg.Embedding(
|
self.pos_emb = dg.Embedding(
|
||||||
size=[1024, num_hidden],
|
size=[1024, num_hidden],
|
||||||
padding_idx=0,
|
|
||||||
param_attr=fluid.ParamAttr(
|
param_attr=fluid.ParamAttr(
|
||||||
initializer=fluid.initializer.NumpyArrayInitializer(
|
initializer=fluid.initializer.NumpyArrayInitializer(
|
||||||
self.pos_inp),
|
self.pos_inp),
|
||||||
|
@ -56,13 +56,15 @@ class Encoder(dg.Layer):
|
||||||
for i, layer in enumerate(self.ffns):
|
for i, layer in enumerate(self.ffns):
|
||||||
self.add_sublayer("ffns_{}".format(i), layer)
|
self.add_sublayer("ffns_{}".format(i), layer)
|
||||||
|
|
||||||
def forward(self, x, positional):
|
def forward(self, x, positional, mask=None, query_mask=None):
|
||||||
|
|
||||||
if fluid.framework._dygraph_tracer()._train_mode:
|
if fluid.framework._dygraph_tracer()._train_mode:
|
||||||
query_mask = get_non_pad_mask(positional)
|
seq_len_key = x.shape[1]
|
||||||
mask = get_attn_key_pad_mask(positional, x)
|
query_mask = layers.expand(query_mask,
|
||||||
|
[self.num_head, 1, seq_len_key])
|
||||||
|
mask = layers.expand(mask, [self.num_head, 1, 1])
|
||||||
else:
|
else:
|
||||||
query_mask, mask = None, None
|
query_mask, mask = None, None
|
||||||
|
|
||||||
# Encoder pre_network
|
# Encoder pre_network
|
||||||
x = self.encoder_prenet(x) #(N,T,C)
|
x = self.encoder_prenet(x) #(N,T,C)
|
||||||
|
|
||||||
|
@ -72,7 +74,7 @@ class Encoder(dg.Layer):
|
||||||
x = positional * self.alpha + x #(N, T, C)
|
x = positional * self.alpha + x #(N, T, C)
|
||||||
|
|
||||||
# Positional dropout
|
# Positional dropout
|
||||||
x = layers.dropout(x, 0.1)
|
x = layers.dropout(x, 0.1, dropout_implementation='upscale_in_train')
|
||||||
|
|
||||||
# Self attention encoder
|
# Self attention encoder
|
||||||
attentions = list()
|
attentions = list()
|
||||||
|
@ -81,4 +83,4 @@ class Encoder(dg.Layer):
|
||||||
x = ffn(x)
|
x = ffn(x)
|
||||||
attentions.append(attention)
|
attentions.append(attention)
|
||||||
|
|
||||||
return x, query_mask, attentions
|
return x, attentions
|
||||||
|
|
|
@ -27,7 +27,10 @@ class EncoderPrenet(dg.Layer):
|
||||||
self.num_hidden = num_hidden
|
self.num_hidden = num_hidden
|
||||||
self.use_cudnn = use_cudnn
|
self.use_cudnn = use_cudnn
|
||||||
self.embedding = dg.Embedding(
|
self.embedding = dg.Embedding(
|
||||||
size=[len(symbols), embedding_size], padding_idx=None)
|
size=[len(symbols), embedding_size],
|
||||||
|
padding_idx=0,
|
||||||
|
param_attr=fluid.initializer.Normal(
|
||||||
|
loc=0.0, scale=1.0))
|
||||||
self.conv_list = []
|
self.conv_list = []
|
||||||
k = math.sqrt(1 / embedding_size)
|
k = math.sqrt(1 / embedding_size)
|
||||||
self.conv_list.append(
|
self.conv_list.append(
|
||||||
|
@ -78,10 +81,14 @@ class EncoderPrenet(dg.Layer):
|
||||||
low=-k, high=k)))
|
low=-k, high=k)))
|
||||||
|
|
||||||
def forward(self, x):
|
def forward(self, x):
|
||||||
|
|
||||||
x = self.embedding(x) #(batch_size, seq_len, embending_size)
|
x = self.embedding(x) #(batch_size, seq_len, embending_size)
|
||||||
x = layers.transpose(x, [0, 2, 1])
|
x = layers.transpose(x, [0, 2, 1])
|
||||||
for batch_norm, conv in zip(self.batch_norm_list, self.conv_list):
|
for batch_norm, conv in zip(self.batch_norm_list, self.conv_list):
|
||||||
x = layers.dropout(layers.relu(batch_norm(conv(x))), 0.2)
|
x = layers.dropout(
|
||||||
|
layers.relu(batch_norm(conv(x))),
|
||||||
|
0.2,
|
||||||
|
dropout_implementation='upscale_in_train')
|
||||||
x = layers.transpose(x, [0, 2, 1]) #(N,T,C)
|
x = layers.transpose(x, [0, 2, 1]) #(N,T,C)
|
||||||
x = self.projection(x)
|
x = self.projection(x)
|
||||||
|
|
||||||
|
|
|
@ -108,11 +108,16 @@ class PostConvNet(dg.Layer):
|
||||||
conv = self.conv_list[i]
|
conv = self.conv_list[i]
|
||||||
|
|
||||||
input = layers.dropout(
|
input = layers.dropout(
|
||||||
layers.tanh(batch_norm(conv(input)[:, :, :len])), self.dropout)
|
layers.tanh(batch_norm(conv(input)[:, :, :len])),
|
||||||
|
self.dropout,
|
||||||
|
dropout_implementation='upscale_in_train')
|
||||||
conv = self.conv_list[self.num_conv - 1]
|
conv = self.conv_list[self.num_conv - 1]
|
||||||
input = conv(input)[:, :, :len]
|
input = conv(input)[:, :, :len]
|
||||||
if self.batchnorm_last:
|
if self.batchnorm_last:
|
||||||
batch_norm = self.batch_norm_list[self.num_conv - 1]
|
batch_norm = self.batch_norm_list[self.num_conv - 1]
|
||||||
input = layers.dropout(batch_norm(input), self.dropout)
|
input = layers.dropout(
|
||||||
|
batch_norm(input),
|
||||||
|
self.dropout,
|
||||||
|
dropout_implementation='upscale_in_train')
|
||||||
output = layers.transpose(input, [0, 2, 1])
|
output = layers.transpose(input, [0, 2, 1])
|
||||||
return output
|
return output
|
||||||
|
|
|
@ -56,6 +56,12 @@ class PreNet(dg.Layer):
|
||||||
Returns:
|
Returns:
|
||||||
x (Variable), Shape(B, T, C), the result after pernet.
|
x (Variable), Shape(B, T, C), the result after pernet.
|
||||||
"""
|
"""
|
||||||
x = layers.dropout(layers.relu(self.linear1(x)), self.dropout_rate)
|
x = layers.dropout(
|
||||||
x = layers.dropout(layers.relu(self.linear2(x)), self.dropout_rate)
|
layers.relu(self.linear1(x)),
|
||||||
|
self.dropout_rate,
|
||||||
|
dropout_implementation='upscale_in_train')
|
||||||
|
x = layers.dropout(
|
||||||
|
layers.relu(self.linear2(x)),
|
||||||
|
self.dropout_rate,
|
||||||
|
dropout_implementation='upscale_in_train')
|
||||||
return x
|
return x
|
||||||
|
|
|
@ -24,11 +24,29 @@ class TransformerTTS(dg.Layer):
|
||||||
self.decoder = Decoder(config['hidden_size'], config)
|
self.decoder = Decoder(config['hidden_size'], config)
|
||||||
self.config = config
|
self.config = config
|
||||||
|
|
||||||
def forward(self, characters, mel_input, pos_text, pos_mel):
|
def forward(self,
|
||||||
|
characters,
|
||||||
key, c_mask, attns_enc = self.encoder(characters, pos_text)
|
mel_input,
|
||||||
|
pos_text,
|
||||||
|
pos_mel,
|
||||||
|
dec_slf_mask,
|
||||||
|
enc_slf_mask=None,
|
||||||
|
enc_query_mask=None,
|
||||||
|
enc_dec_mask=None,
|
||||||
|
dec_query_slf_mask=None,
|
||||||
|
dec_query_mask=None):
|
||||||
|
key, attns_enc = self.encoder(
|
||||||
|
characters, pos_text, mask=enc_slf_mask, query_mask=enc_query_mask)
|
||||||
|
|
||||||
mel_output, postnet_output, attn_probs, stop_preds, attns_dec = self.decoder(
|
mel_output, postnet_output, attn_probs, stop_preds, attns_dec = self.decoder(
|
||||||
key, key, mel_input, c_mask, pos_mel)
|
key,
|
||||||
|
key,
|
||||||
|
mel_input,
|
||||||
|
pos_mel,
|
||||||
|
mask=dec_slf_mask,
|
||||||
|
zero_mask=enc_dec_mask,
|
||||||
|
m_self_mask=dec_query_slf_mask,
|
||||||
|
m_mask=dec_query_mask)
|
||||||
|
return mel_output, postnet_output, attn_probs, stop_preds, attns_enc, attns_dec
|
||||||
|
|
||||||
return mel_output, postnet_output, attn_probs, stop_preds, attns_enc, attns_dec
|
return mel_output, postnet_output, attn_probs, stop_preds, attns_enc, attns_dec
|
||||||
|
|
|
@ -51,7 +51,9 @@ def get_sinusoid_encoding_table(n_position, d_hid, padding_idx=None):
|
||||||
|
|
||||||
|
|
||||||
def get_non_pad_mask(seq):
|
def get_non_pad_mask(seq):
|
||||||
return layers.unsqueeze((seq != 0).astype(np.float32), [-1])
|
mask = (seq != 0).astype(np.float32)
|
||||||
|
mask = np.expand_dims(mask, axis=-1)
|
||||||
|
return mask
|
||||||
|
|
||||||
|
|
||||||
def get_attn_key_pad_mask(seq_k, seq_q):
|
def get_attn_key_pad_mask(seq_k, seq_q):
|
||||||
|
@ -60,8 +62,22 @@ def get_attn_key_pad_mask(seq_k, seq_q):
|
||||||
# Expand to fit the shape of key query attention matrix.
|
# Expand to fit the shape of key query attention matrix.
|
||||||
len_q = seq_q.shape[1]
|
len_q = seq_q.shape[1]
|
||||||
padding_mask = (seq_k != 0).astype(np.float32)
|
padding_mask = (seq_k != 0).astype(np.float32)
|
||||||
padding_mask = layers.expand(
|
padding_mask = np.expand_dims(padding_mask, axis=1)
|
||||||
layers.unsqueeze(padding_mask, [1]), [1, len_q, 1])
|
padding_mask = padding_mask.repeat([len_q], axis=1)
|
||||||
|
padding_mask = (padding_mask == 0).astype(np.float32) * (-2**32 + 1)
|
||||||
|
return padding_mask
|
||||||
|
|
||||||
|
|
||||||
|
def get_dec_attn_key_pad_mask(seq_k, seq_q):
|
||||||
|
''' For masking out the padding part of key sequence. '''
|
||||||
|
|
||||||
|
# Expand to fit the shape of key query attention matrix.
|
||||||
|
len_q = seq_q.shape[1]
|
||||||
|
padding_mask = (seq_k == 0).astype(np.float32)
|
||||||
|
padding_mask = np.expand_dims(padding_mask, axis=1)
|
||||||
|
triu_tensor = get_triu_tensor(seq_q, seq_q)
|
||||||
|
padding_mask = padding_mask.repeat([len_q], axis=1) + triu_tensor
|
||||||
|
padding_mask = (padding_mask != 0).astype(np.float32) * (-2**32 + 1)
|
||||||
return padding_mask
|
return padding_mask
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -53,11 +53,9 @@ class DynamicGRU(dg.Layer):
|
||||||
if self.is_reverse:
|
if self.is_reverse:
|
||||||
i = inputs.shape[1] - 1 - i
|
i = inputs.shape[1] - 1 - i
|
||||||
input_ = inputs[:, i:i + 1, :]
|
input_ = inputs[:, i:i + 1, :]
|
||||||
input_ = layers.reshape(
|
input_ = layers.reshape(input_, [-1, input_.shape[2]])
|
||||||
input_, [-1, input_.shape[2]], inplace=False)
|
|
||||||
hidden, reset, gate = self.gru_unit(input_, hidden)
|
hidden, reset, gate = self.gru_unit(input_, hidden)
|
||||||
hidden_ = layers.reshape(
|
hidden_ = layers.reshape(hidden, [-1, 1, hidden.shape[1]])
|
||||||
hidden, [-1, 1, hidden.shape[1]], inplace=False)
|
|
||||||
res.append(hidden_)
|
res.append(hidden_)
|
||||||
if self.is_reverse:
|
if self.is_reverse:
|
||||||
res = res[::-1]
|
res = res[::-1]
|
||||||
|
|
|
@ -71,7 +71,8 @@ class PositionwiseFeedForward(dg.Layer):
|
||||||
x = self.w_2(layers.relu(self.w_1(x)))
|
x = self.w_2(layers.relu(self.w_1(x)))
|
||||||
|
|
||||||
# dropout
|
# dropout
|
||||||
x = layers.dropout(x, self.dropout)
|
x = layers.dropout(
|
||||||
|
x, self.dropout, dropout_implementation='upscale_in_train')
|
||||||
|
|
||||||
x = layers.transpose(x, [0, 2, 1])
|
x = layers.transpose(x, [0, 2, 1])
|
||||||
# residual connection
|
# residual connection
|
||||||
|
|
|
@ -0,0 +1,610 @@
|
||||||
|
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
|
||||||
|
import paddle
|
||||||
|
from paddle import fluid
|
||||||
|
import paddle.fluid.dygraph as dg
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
|
||||||
|
from . import conv
|
||||||
|
from . import weight_norm
|
||||||
|
|
||||||
|
|
||||||
|
def FC(name_scope,
|
||||||
|
in_features,
|
||||||
|
size,
|
||||||
|
num_flatten_dims=1,
|
||||||
|
relu=False,
|
||||||
|
dropout=0.0,
|
||||||
|
epsilon=1e-30,
|
||||||
|
act=None,
|
||||||
|
is_test=False,
|
||||||
|
dtype="float32"):
|
||||||
|
"""
|
||||||
|
A special Linear Layer, when it is used with dropout, the weight is
|
||||||
|
initialized as normal(0, std=np.sqrt((1-dropout) / in_features))
|
||||||
|
"""
|
||||||
|
|
||||||
|
# stds
|
||||||
|
if isinstance(in_features, int):
|
||||||
|
in_features = [in_features]
|
||||||
|
|
||||||
|
stds = [np.sqrt((1 - dropout) / in_feature) for in_feature in in_features]
|
||||||
|
if relu:
|
||||||
|
stds = [std * np.sqrt(2.0) for std in stds]
|
||||||
|
|
||||||
|
weight_inits = [
|
||||||
|
fluid.initializer.NormalInitializer(scale=std) for std in stds
|
||||||
|
]
|
||||||
|
bias_init = fluid.initializer.ConstantInitializer(0.0)
|
||||||
|
|
||||||
|
# param attrs
|
||||||
|
weight_attrs = [fluid.ParamAttr(initializer=init) for init in weight_inits]
|
||||||
|
bias_attr = fluid.ParamAttr(initializer=bias_init)
|
||||||
|
|
||||||
|
layer = weight_norm.FC(name_scope,
|
||||||
|
size,
|
||||||
|
num_flatten_dims=num_flatten_dims,
|
||||||
|
param_attr=weight_attrs,
|
||||||
|
bias_attr=bias_attr,
|
||||||
|
act=act,
|
||||||
|
dtype=dtype)
|
||||||
|
return layer
|
||||||
|
|
||||||
|
|
||||||
|
def Conv1D(name_scope,
|
||||||
|
in_channels,
|
||||||
|
num_filters,
|
||||||
|
filter_size=3,
|
||||||
|
dilation=1,
|
||||||
|
groups=None,
|
||||||
|
causal=False,
|
||||||
|
std_mul=1.0,
|
||||||
|
dropout=0.0,
|
||||||
|
use_cudnn=True,
|
||||||
|
act=None,
|
||||||
|
dtype="float32"):
|
||||||
|
"""
|
||||||
|
A special Conv1D Layer, when it is used with dropout, the weight is
|
||||||
|
initialized as
|
||||||
|
normal(0, std=np.sqrt(std_mul * (1-dropout) / (filter_size * in_features)))
|
||||||
|
"""
|
||||||
|
# std
|
||||||
|
std = np.sqrt((std_mul * (1 - dropout)) / (filter_size * in_channels))
|
||||||
|
weight_init = fluid.initializer.NormalInitializer(loc=0.0, scale=std)
|
||||||
|
bias_init = fluid.initializer.ConstantInitializer(0.0)
|
||||||
|
|
||||||
|
# param attrs
|
||||||
|
weight_attr = fluid.ParamAttr(initializer=weight_init)
|
||||||
|
bias_attr = fluid.ParamAttr(initializer=bias_init)
|
||||||
|
|
||||||
|
layer = conv.Conv1D(
|
||||||
|
name_scope,
|
||||||
|
in_channels,
|
||||||
|
num_filters,
|
||||||
|
filter_size,
|
||||||
|
dilation,
|
||||||
|
groups=groups,
|
||||||
|
causal=causal,
|
||||||
|
param_attr=weight_attr,
|
||||||
|
bias_attr=bias_attr,
|
||||||
|
use_cudnn=use_cudnn,
|
||||||
|
act=act,
|
||||||
|
dtype=dtype)
|
||||||
|
return layer
|
||||||
|
|
||||||
|
|
||||||
|
def Embedding(name_scope,
|
||||||
|
num_embeddings,
|
||||||
|
embed_dim,
|
||||||
|
is_sparse=False,
|
||||||
|
is_distributed=False,
|
||||||
|
padding_idx=None,
|
||||||
|
std=0.01,
|
||||||
|
dtype="float32"):
|
||||||
|
# param attrs
|
||||||
|
weight_attr = fluid.ParamAttr(initializer=fluid.initializer.Normal(
|
||||||
|
scale=std))
|
||||||
|
layer = dg.Embedding(
|
||||||
|
name_scope, (num_embeddings, embed_dim),
|
||||||
|
padding_idx=padding_idx,
|
||||||
|
param_attr=weight_attr,
|
||||||
|
dtype=dtype)
|
||||||
|
return layer
|
||||||
|
|
||||||
|
|
||||||
|
class Conv1DGLU(dg.Layer):
|
||||||
|
"""
|
||||||
|
A Convolution 1D block with GLU activation. It also applys dropout for the
|
||||||
|
input x. It fuses speaker embeddings through a FC activated by softsign. It
|
||||||
|
has residual connection from the input x, and scale the output by
|
||||||
|
np.sqrt(0.5).
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(self,
|
||||||
|
name_scope,
|
||||||
|
n_speakers,
|
||||||
|
speaker_dim,
|
||||||
|
in_channels,
|
||||||
|
num_filters,
|
||||||
|
filter_size,
|
||||||
|
dilation,
|
||||||
|
std_mul=4.0,
|
||||||
|
dropout=0.0,
|
||||||
|
causal=False,
|
||||||
|
residual=True,
|
||||||
|
dtype="float32"):
|
||||||
|
super(Conv1DGLU, self).__init__(name_scope, dtype=dtype)
|
||||||
|
|
||||||
|
# conv spec
|
||||||
|
self.in_channels = in_channels
|
||||||
|
self.n_speakers = n_speakers
|
||||||
|
self.speaker_dim = speaker_dim
|
||||||
|
self.num_filters = num_filters
|
||||||
|
self.filter_size = filter_size
|
||||||
|
self.dilation = dilation
|
||||||
|
self.causal = causal
|
||||||
|
self.residual = residual
|
||||||
|
|
||||||
|
# weight init and dropout
|
||||||
|
self.std_mul = std_mul
|
||||||
|
self.dropout = dropout
|
||||||
|
|
||||||
|
if residual:
|
||||||
|
assert (
|
||||||
|
in_channels == num_filters
|
||||||
|
), "this block uses residual connection"\
|
||||||
|
"the input_channes should equals num_filters"
|
||||||
|
|
||||||
|
self.conv = Conv1D(
|
||||||
|
self.full_name(),
|
||||||
|
in_channels,
|
||||||
|
2 * num_filters,
|
||||||
|
filter_size,
|
||||||
|
dilation,
|
||||||
|
causal=causal,
|
||||||
|
std_mul=std_mul,
|
||||||
|
dropout=dropout,
|
||||||
|
dtype=dtype)
|
||||||
|
|
||||||
|
if n_speakers > 1:
|
||||||
|
assert (speaker_dim is not None
|
||||||
|
), "speaker embed should not be null in multi-speaker case"
|
||||||
|
self.fc = Conv1D(
|
||||||
|
self.full_name(),
|
||||||
|
speaker_dim,
|
||||||
|
num_filters,
|
||||||
|
filter_size=1,
|
||||||
|
dilation=1,
|
||||||
|
causal=False,
|
||||||
|
act="softsign",
|
||||||
|
dtype=dtype)
|
||||||
|
|
||||||
|
def forward(self, x, speaker_embed_bc1t=None):
|
||||||
|
"""
|
||||||
|
Args:
|
||||||
|
x (Variable): Shape(B, C_in, 1, T), the input of Conv1DGLU
|
||||||
|
layer, where B means batch_size, C_in means the input channels
|
||||||
|
T means input time steps.
|
||||||
|
speaker_embed_bct1 (Variable): Shape(B, C_sp, 1, T), expanded
|
||||||
|
speaker embed, where C_sp means speaker embedding size. Note
|
||||||
|
that when using residual connection, the Conv1DGLU does not
|
||||||
|
change the number of channels, so out channels equals input
|
||||||
|
channels.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
x (Variable): Shape(B, C_out, 1, T), the output of Conv1DGLU, where
|
||||||
|
C_out means the output channels of Conv1DGLU.
|
||||||
|
"""
|
||||||
|
|
||||||
|
residual = x
|
||||||
|
x = fluid.layers.dropout(x, self.dropout)
|
||||||
|
x = self.conv(x)
|
||||||
|
|
||||||
|
content, gate = fluid.layers.split(x, num_or_sections=2, dim=1)
|
||||||
|
|
||||||
|
if speaker_embed_bc1t is not None:
|
||||||
|
sp = self.fc(speaker_embed_bc1t)
|
||||||
|
content = content + sp
|
||||||
|
|
||||||
|
# glu
|
||||||
|
x = fluid.layers.elementwise_mul(fluid.layers.sigmoid(gate), content)
|
||||||
|
|
||||||
|
if self.residual:
|
||||||
|
x = fluid.layers.scale(x + residual, np.sqrt(0.5))
|
||||||
|
return x
|
||||||
|
|
||||||
|
def add_input(self, x, speaker_embed_bc11=None):
|
||||||
|
"""
|
||||||
|
Inputs:
|
||||||
|
x: shape(B, num_filters, 1, time_steps)
|
||||||
|
speaker_embed_bc11: shape(B, speaker_dim, 1, time_steps)
|
||||||
|
|
||||||
|
Outputs:
|
||||||
|
out: shape(B, num_filters, 1, time_steps), where time_steps = 1
|
||||||
|
"""
|
||||||
|
|
||||||
|
residual = x
|
||||||
|
|
||||||
|
# add step input and produce step output
|
||||||
|
x = fluid.layers.dropout(x, self.dropout)
|
||||||
|
x = self.conv.add_input(x)
|
||||||
|
|
||||||
|
content, gate = fluid.layers.split(x, num_or_sections=2, dim=1)
|
||||||
|
|
||||||
|
if speaker_embed_bc11 is not None:
|
||||||
|
sp = self.fc(speaker_embed_bc11)
|
||||||
|
content = content + sp
|
||||||
|
|
||||||
|
x = fluid.layers.elementwise_mul(fluid.layers.sigmoid(gate), content)
|
||||||
|
|
||||||
|
if self.residual:
|
||||||
|
x = fluid.layers.scale(x + residual, np.sqrt(0.5))
|
||||||
|
return x
|
||||||
|
|
||||||
|
|
||||||
|
def Conv1DTranspose(name_scope,
|
||||||
|
in_channels,
|
||||||
|
num_filters,
|
||||||
|
filter_size,
|
||||||
|
padding=0,
|
||||||
|
stride=1,
|
||||||
|
dilation=1,
|
||||||
|
groups=None,
|
||||||
|
std_mul=1.0,
|
||||||
|
dropout=0.0,
|
||||||
|
use_cudnn=True,
|
||||||
|
act=None,
|
||||||
|
dtype="float32"):
|
||||||
|
std = np.sqrt(std_mul * (1 - dropout) / (in_channels * filter_size))
|
||||||
|
weight_init = fluid.initializer.NormalInitializer(scale=std)
|
||||||
|
weight_attr = fluid.ParamAttr(initializer=weight_init)
|
||||||
|
bias_init = fluid.initializer.ConstantInitializer(0.0)
|
||||||
|
bias_attr = fluid.ParamAttr(initializer=bias_init)
|
||||||
|
layer = conv.Conv1DTranspose(
|
||||||
|
name_scope,
|
||||||
|
in_channels,
|
||||||
|
num_filters,
|
||||||
|
filter_size,
|
||||||
|
padding=padding,
|
||||||
|
stride=stride,
|
||||||
|
dilation=dilation,
|
||||||
|
groups=groups,
|
||||||
|
param_attr=weight_attr,
|
||||||
|
bias_attr=bias_attr,
|
||||||
|
use_cudnn=use_cudnn,
|
||||||
|
act=act,
|
||||||
|
dtype=dtype)
|
||||||
|
return layer
|
||||||
|
|
||||||
|
|
||||||
|
def compute_position_embedding(rad):
|
||||||
|
# rad is a transposed radius, shape(embed_dim, n_vocab)
|
||||||
|
embed_dim, n_vocab = rad.shape
|
||||||
|
|
||||||
|
even_dims = dg.to_variable(np.arange(0, embed_dim, 2).astype("int32"))
|
||||||
|
odd_dims = dg.to_variable(np.arange(1, embed_dim, 2).astype("int32"))
|
||||||
|
|
||||||
|
even_rads = fluid.layers.gather(rad, even_dims)
|
||||||
|
odd_rads = fluid.layers.gather(rad, odd_dims)
|
||||||
|
|
||||||
|
sines = fluid.layers.sin(even_rads)
|
||||||
|
cosines = fluid.layers.cos(odd_rads)
|
||||||
|
|
||||||
|
temp = fluid.layers.scatter(rad, even_dims, sines)
|
||||||
|
out = fluid.layers.scatter(temp, odd_dims, cosines)
|
||||||
|
out = fluid.layers.transpose(out, perm=[1, 0])
|
||||||
|
return out
|
||||||
|
|
||||||
|
|
||||||
|
def position_encoding_init(n_position,
|
||||||
|
d_pos_vec,
|
||||||
|
position_rate=1.0,
|
||||||
|
sinusoidal=True):
|
||||||
|
""" Init the sinusoid position encoding table """
|
||||||
|
|
||||||
|
# keep idx 0 for padding token position encoding zero vector
|
||||||
|
position_enc = np.array([[
|
||||||
|
position_rate * pos / np.power(10000, 2 * (i // 2) / d_pos_vec)
|
||||||
|
for i in range(d_pos_vec)
|
||||||
|
] if pos != 0 else np.zeros(d_pos_vec) for pos in range(n_position)])
|
||||||
|
|
||||||
|
if sinusoidal:
|
||||||
|
position_enc[1:, 0::2] = np.sin(position_enc[1:, 0::2]) # dim 2i
|
||||||
|
position_enc[1:, 1::2] = np.cos(position_enc[1:, 1::2]) # dim 2i+1
|
||||||
|
|
||||||
|
return position_enc
|
||||||
|
|
||||||
|
|
||||||
|
class PositionEmbedding(dg.Layer):
|
||||||
|
def __init__(self,
|
||||||
|
name_scope,
|
||||||
|
n_position,
|
||||||
|
d_pos_vec,
|
||||||
|
position_rate=1.0,
|
||||||
|
is_sparse=False,
|
||||||
|
is_distributed=False,
|
||||||
|
param_attr=None,
|
||||||
|
max_norm=None,
|
||||||
|
padding_idx=None,
|
||||||
|
dtype="float32"):
|
||||||
|
super(PositionEmbedding, self).__init__(name_scope, dtype=dtype)
|
||||||
|
self.embed = dg.Embedding(
|
||||||
|
self.full_name(),
|
||||||
|
size=(n_position, d_pos_vec),
|
||||||
|
is_sparse=is_sparse,
|
||||||
|
is_distributed=is_distributed,
|
||||||
|
padding_idx=None,
|
||||||
|
param_attr=param_attr,
|
||||||
|
dtype=dtype)
|
||||||
|
self.set_weight(
|
||||||
|
position_encoding_init(
|
||||||
|
n_position,
|
||||||
|
d_pos_vec,
|
||||||
|
position_rate=position_rate,
|
||||||
|
sinusoidal=False).astype(dtype))
|
||||||
|
|
||||||
|
self._is_sparse = is_sparse
|
||||||
|
self._is_distributed = is_distributed
|
||||||
|
self._remote_prefetch = self._is_sparse and (not self._is_distributed)
|
||||||
|
if self._remote_prefetch:
|
||||||
|
assert self._is_sparse is True and self._is_distributed is False
|
||||||
|
|
||||||
|
self._padding_idx = (-1 if padding_idx is None else padding_idx if
|
||||||
|
padding_idx >= 0 else (n_position + padding_idx))
|
||||||
|
self._position_rate = position_rate
|
||||||
|
self._max_norm = max_norm
|
||||||
|
self._dtype = dtype
|
||||||
|
|
||||||
|
def set_weight(self, array):
|
||||||
|
assert self.embed._w.shape == list(array.shape), "shape does not match"
|
||||||
|
self.embed._w._ivar.value().get_tensor().set(
|
||||||
|
array, fluid.framework._current_expected_place())
|
||||||
|
|
||||||
|
def forward(self, indices, speaker_position_rate=None):
|
||||||
|
"""
|
||||||
|
Args:
|
||||||
|
indices (Variable): Shape (B, T, 1), dtype: int64, position
|
||||||
|
indices, where B means the batch size, T means the time steps.
|
||||||
|
speaker_position_rate (Variable | float, optional), position
|
||||||
|
rate. It can be a float point number or a Variable with
|
||||||
|
shape (1,), then this speaker_position_rate is used for every
|
||||||
|
example. It can also be a Variable with shape (B, 1), which
|
||||||
|
contains a speaker position rate for each speaker.
|
||||||
|
Returns:
|
||||||
|
out (Variable): Shape(B, C_pos), position embedding, where C_pos
|
||||||
|
means position embedding size.
|
||||||
|
"""
|
||||||
|
rad = fluid.layers.transpose(self.embed._w, perm=[1, 0])
|
||||||
|
batch_size = indices.shape[0]
|
||||||
|
|
||||||
|
if speaker_position_rate is None:
|
||||||
|
weight = compute_position_embedding(rad)
|
||||||
|
out = self._helper.create_variable_for_type_inference(self._dtype)
|
||||||
|
self._helper.append_op(
|
||||||
|
type="lookup_table",
|
||||||
|
inputs={"Ids": indices,
|
||||||
|
"W": weight},
|
||||||
|
outputs={"Out": out},
|
||||||
|
attrs={
|
||||||
|
"is_sparse": self._is_sparse,
|
||||||
|
"is_distributed": self._is_distributed,
|
||||||
|
"remote_prefetch": self._remote_prefetch,
|
||||||
|
"padding_idx":
|
||||||
|
self._padding_idx, # special value for lookup table op
|
||||||
|
})
|
||||||
|
return out
|
||||||
|
|
||||||
|
elif (np.isscalar(speaker_position_rate) or
|
||||||
|
isinstance(speaker_position_rate, fluid.framework.Variable) and
|
||||||
|
speaker_position_rate.shape == [1, 1]):
|
||||||
|
# # make a weight
|
||||||
|
# scale the weight (the operand for sin & cos)
|
||||||
|
if np.isscalar(speaker_position_rate):
|
||||||
|
scaled_rad = fluid.layers.scale(rad, speaker_position_rate)
|
||||||
|
else:
|
||||||
|
scaled_rad = fluid.layers.elementwise_mul(
|
||||||
|
rad, speaker_position_rate[0])
|
||||||
|
weight = compute_position_embedding(scaled_rad)
|
||||||
|
out = self._helper.create_variable_for_type_inference(self._dtype)
|
||||||
|
self._helper.append_op(
|
||||||
|
type="lookup_table",
|
||||||
|
inputs={"Ids": indices,
|
||||||
|
"W": weight},
|
||||||
|
outputs={"Out": out},
|
||||||
|
attrs={
|
||||||
|
"is_sparse": self._is_sparse,
|
||||||
|
"is_distributed": self._is_distributed,
|
||||||
|
"remote_prefetch": self._remote_prefetch,
|
||||||
|
"padding_idx":
|
||||||
|
self._padding_idx, # special value for lookup table op
|
||||||
|
})
|
||||||
|
return out
|
||||||
|
|
||||||
|
elif np.prod(speaker_position_rate.shape) > 1:
|
||||||
|
assert speaker_position_rate.shape == [batch_size, 1]
|
||||||
|
outputs = []
|
||||||
|
for i in range(batch_size):
|
||||||
|
rate = speaker_position_rate[i] # rate has shape [1]
|
||||||
|
scaled_rad = fluid.layers.elementwise_mul(rad, rate)
|
||||||
|
weight = compute_position_embedding(scaled_rad)
|
||||||
|
out = self._helper.create_variable_for_type_inference(
|
||||||
|
self._dtype)
|
||||||
|
sequence = indices[i]
|
||||||
|
self._helper.append_op(
|
||||||
|
type="lookup_table",
|
||||||
|
inputs={"Ids": sequence,
|
||||||
|
"W": weight},
|
||||||
|
outputs={"Out": out},
|
||||||
|
attrs={
|
||||||
|
"is_sparse": self._is_sparse,
|
||||||
|
"is_distributed": self._is_distributed,
|
||||||
|
"remote_prefetch": self._remote_prefetch,
|
||||||
|
"padding_idx": -1,
|
||||||
|
})
|
||||||
|
outputs.append(out)
|
||||||
|
out = fluid.layers.stack(outputs)
|
||||||
|
return out
|
||||||
|
else:
|
||||||
|
raise Exception("Then you can just use position rate at init")
|
||||||
|
|
||||||
|
|
||||||
|
class Conv1D_GU(dg.Layer):
|
||||||
|
def __init__(self,
|
||||||
|
name_scope,
|
||||||
|
conditioner_dim,
|
||||||
|
in_channels,
|
||||||
|
num_filters,
|
||||||
|
filter_size,
|
||||||
|
dilation,
|
||||||
|
causal=False,
|
||||||
|
residual=True,
|
||||||
|
dtype="float32"):
|
||||||
|
super(Conv1D_GU, self).__init__(name_scope, dtype=dtype)
|
||||||
|
|
||||||
|
self.conditioner_dim = conditioner_dim
|
||||||
|
self.in_channels = in_channels
|
||||||
|
self.num_filters = num_filters
|
||||||
|
self.filter_size = filter_size
|
||||||
|
self.dilation = dilation
|
||||||
|
self.causal = causal
|
||||||
|
self.residual = residual
|
||||||
|
|
||||||
|
if residual:
|
||||||
|
assert (
|
||||||
|
in_channels == num_filters
|
||||||
|
), "this block uses residual connection"\
|
||||||
|
"the input_channels should equals num_filters"
|
||||||
|
|
||||||
|
self.conv = Conv1D(
|
||||||
|
self.full_name(),
|
||||||
|
in_channels,
|
||||||
|
2 * num_filters,
|
||||||
|
filter_size,
|
||||||
|
dilation,
|
||||||
|
causal=causal,
|
||||||
|
dtype=dtype)
|
||||||
|
|
||||||
|
self.fc = Conv1D(
|
||||||
|
self.full_name(),
|
||||||
|
conditioner_dim,
|
||||||
|
2 * num_filters,
|
||||||
|
filter_size=1,
|
||||||
|
dilation=1,
|
||||||
|
causal=False,
|
||||||
|
dtype=dtype)
|
||||||
|
|
||||||
|
def forward(self, x, skip=None, conditioner=None):
|
||||||
|
"""
|
||||||
|
Args:
|
||||||
|
x (Variable): Shape(B, C_in, 1, T), the input of Conv1D_GU
|
||||||
|
layer, where B means batch_size, C_in means the input channels
|
||||||
|
T means input time steps.
|
||||||
|
skip (Variable): Shape(B, C_in, 1, T), skip connection.
|
||||||
|
conditioner (Variable): Shape(B, C_con, 1, T), expanded mel
|
||||||
|
conditioner, where C_con is conditioner hidden dim which
|
||||||
|
equals the num of mel bands. Note that when using residual
|
||||||
|
connection, the Conv1D_GU does not change the number of
|
||||||
|
channels, so out channels equals input channels.
|
||||||
|
Returns:
|
||||||
|
x (Variable): Shape(B, C_out, 1, T), the output of Conv1D_GU, where
|
||||||
|
C_out means the output channels of Conv1D_GU.
|
||||||
|
skip (Variable): Shape(B, C_out, 1, T), skip connection.
|
||||||
|
"""
|
||||||
|
residual = x
|
||||||
|
x = self.conv(x)
|
||||||
|
|
||||||
|
if conditioner is not None:
|
||||||
|
cond_bias = self.fc(conditioner)
|
||||||
|
x += cond_bias
|
||||||
|
|
||||||
|
content, gate = fluid.layers.split(x, num_or_sections=2, dim=1)
|
||||||
|
|
||||||
|
# Gated Unit.
|
||||||
|
x = fluid.layers.elementwise_mul(
|
||||||
|
fluid.layers.sigmoid(gate), fluid.layers.tanh(content))
|
||||||
|
|
||||||
|
if skip is None:
|
||||||
|
skip = x
|
||||||
|
else:
|
||||||
|
skip = fluid.layers.scale(skip + x, np.sqrt(0.5))
|
||||||
|
|
||||||
|
if self.residual:
|
||||||
|
x = fluid.layers.scale(residual + x, np.sqrt(0.5))
|
||||||
|
|
||||||
|
return x, skip
|
||||||
|
|
||||||
|
def add_input(self, x, skip=None, conditioner=None):
|
||||||
|
"""
|
||||||
|
Inputs:
|
||||||
|
x: shape(B, num_filters, 1, time_steps)
|
||||||
|
skip: shape(B, num_filters, 1, time_steps), skip connection
|
||||||
|
conditioner: shape(B, conditioner_dim, 1, time_steps)
|
||||||
|
Outputs:
|
||||||
|
x: shape(B, num_filters, 1, time_steps), where time_steps = 1
|
||||||
|
skip: skip connection, same shape as x
|
||||||
|
"""
|
||||||
|
residual = x
|
||||||
|
|
||||||
|
# add step input and produce step output
|
||||||
|
x = self.conv.add_input(x)
|
||||||
|
|
||||||
|
if conditioner is not None:
|
||||||
|
cond_bias = self.fc(conditioner)
|
||||||
|
x += cond_bias
|
||||||
|
|
||||||
|
content, gate = fluid.layers.split(x, num_or_sections=2, dim=1)
|
||||||
|
|
||||||
|
# Gated Unit.
|
||||||
|
x = fluid.layers.elementwise_mul(
|
||||||
|
fluid.layers.sigmoid(gate), fluid.layers.tanh(content))
|
||||||
|
|
||||||
|
if skip is None:
|
||||||
|
skip = x
|
||||||
|
else:
|
||||||
|
skip = fluid.layers.scale(skip + x, np.sqrt(0.5))
|
||||||
|
|
||||||
|
if self.residual:
|
||||||
|
x = fluid.layers.scale(residual + x, np.sqrt(0.5))
|
||||||
|
|
||||||
|
return x, skip
|
||||||
|
|
||||||
|
|
||||||
|
def Conv2DTranspose(name_scope,
|
||||||
|
num_filters,
|
||||||
|
filter_size,
|
||||||
|
padding=0,
|
||||||
|
stride=1,
|
||||||
|
dilation=1,
|
||||||
|
use_cudnn=True,
|
||||||
|
act=None,
|
||||||
|
dtype="float32"):
|
||||||
|
val = 1.0 / (filter_size[0] * filter_size[1])
|
||||||
|
weight_init = fluid.initializer.ConstantInitializer(val)
|
||||||
|
weight_attr = fluid.ParamAttr(initializer=weight_init)
|
||||||
|
|
||||||
|
layer = weight_norm.Conv2DTranspose(
|
||||||
|
name_scope,
|
||||||
|
num_filters,
|
||||||
|
filter_size=filter_size,
|
||||||
|
padding=padding,
|
||||||
|
stride=stride,
|
||||||
|
dilation=dilation,
|
||||||
|
param_attr=weight_attr,
|
||||||
|
use_cudnn=use_cudnn,
|
||||||
|
act=act,
|
||||||
|
dtype=dtype)
|
||||||
|
|
||||||
|
return layer
|
|
@ -78,17 +78,15 @@ class ScaledDotProductAttention(dg.Layer):
|
||||||
"""
|
"""
|
||||||
# Compute attention score
|
# Compute attention score
|
||||||
attention = layers.matmul(
|
attention = layers.matmul(
|
||||||
query, key, transpose_y=True) #transpose the last dim in y
|
query, key, transpose_y=True, alpha=self.d_key
|
||||||
attention = attention / math.sqrt(self.d_key)
|
**-0.5) #transpose the last dim in y
|
||||||
|
|
||||||
# Mask key to ignore padding
|
# Mask key to ignore padding
|
||||||
if mask is not None:
|
if mask is not None:
|
||||||
attention = attention * mask
|
|
||||||
mask = (mask == 0).astype(np.float32) * (-2**32 + 1)
|
|
||||||
attention = attention + mask
|
attention = attention + mask
|
||||||
|
|
||||||
attention = layers.softmax(attention)
|
attention = layers.softmax(attention)
|
||||||
attention = layers.dropout(attention, dropout)
|
attention = layers.dropout(
|
||||||
|
attention, dropout, dropout_implementation='upscale_in_train')
|
||||||
|
|
||||||
# Mask query to ignore padding
|
# Mask query to ignore padding
|
||||||
if query_mask is not None:
|
if query_mask is not None:
|
||||||
|
@ -142,17 +140,11 @@ class MultiheadAttention(dg.Layer):
|
||||||
result (Variable), Shape(B, T, C), the result of mutihead attention.
|
result (Variable), Shape(B, T, C), the result of mutihead attention.
|
||||||
attention (Variable), Shape(n_head * B, T, C), the attention of key.
|
attention (Variable), Shape(n_head * B, T, C), the attention of key.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
batch_size = key.shape[0]
|
batch_size = key.shape[0]
|
||||||
seq_len_key = key.shape[1]
|
seq_len_key = key.shape[1]
|
||||||
seq_len_query = query_input.shape[1]
|
seq_len_query = query_input.shape[1]
|
||||||
|
|
||||||
# repeat masks h times
|
|
||||||
if query_mask is not None:
|
|
||||||
query_mask = layers.expand(query_mask,
|
|
||||||
[self.num_head, 1, seq_len_key])
|
|
||||||
if mask is not None:
|
|
||||||
mask = layers.expand(mask, (self.num_head, 1, 1))
|
|
||||||
|
|
||||||
# Make multihead attention
|
# Make multihead attention
|
||||||
# key & value.shape = (batch_size, seq_len, feature)(feature = num_head * num_hidden_per_attn)
|
# key & value.shape = (batch_size, seq_len, feature)(feature = num_head * num_hidden_per_attn)
|
||||||
key = layers.reshape(
|
key = layers.reshape(
|
||||||
|
@ -176,6 +168,18 @@ class MultiheadAttention(dg.Layer):
|
||||||
result, attention = self.scal_attn(
|
result, attention = self.scal_attn(
|
||||||
key, value, query, mask=mask, query_mask=query_mask)
|
key, value, query, mask=mask, query_mask=query_mask)
|
||||||
|
|
||||||
|
key = layers.reshape(
|
||||||
|
layers.transpose(key, [2, 0, 1, 3]), [-1, seq_len_key, self.d_k])
|
||||||
|
value = layers.reshape(
|
||||||
|
layers.transpose(value, [2, 0, 1, 3]),
|
||||||
|
[-1, seq_len_key, self.d_k])
|
||||||
|
query = layers.reshape(
|
||||||
|
layers.transpose(query, [2, 0, 1, 3]),
|
||||||
|
[-1, seq_len_query, self.d_q])
|
||||||
|
|
||||||
|
result, attention = self.scal_attn(
|
||||||
|
key, value, query, mask=mask, query_mask=query_mask)
|
||||||
|
|
||||||
# concat all multihead result
|
# concat all multihead result
|
||||||
result = layers.reshape(
|
result = layers.reshape(
|
||||||
result, [self.num_head, batch_size, seq_len_query, self.d_q])
|
result, [self.num_head, batch_size, seq_len_query, self.d_q])
|
||||||
|
@ -184,7 +188,10 @@ class MultiheadAttention(dg.Layer):
|
||||||
[batch_size, seq_len_query, -1])
|
[batch_size, seq_len_query, -1])
|
||||||
if self.is_concat:
|
if self.is_concat:
|
||||||
result = layers.concat([query_input, result], axis=-1)
|
result = layers.concat([query_input, result], axis=-1)
|
||||||
result = layers.dropout(self.fc(result), self.dropout)
|
result = layers.dropout(
|
||||||
|
self.fc(result),
|
||||||
|
self.dropout,
|
||||||
|
dropout_implementation='upscale_in_train')
|
||||||
result = result + query_input
|
result = result + query_input
|
||||||
|
|
||||||
result = self.layer_norm(result)
|
result = self.layer_norm(result)
|
||||||
|
|
Loading…
Reference in New Issue