modified the process of generating masks to speed up batching

parent ae01b8e14c
commit 75d464221c

@@ -18,7 +18,7 @@ def add_config_options_to_parser(parser):
     parser.add_argument(
         '--config_path',
         type=str,
-        default='config/fastspeech.yaml',
+        default='configs/fastspeech.yaml',
         help="the yaml config file path.")
     parser.add_argument(
         '--batch_size', type=int, default=32, help="batch size for training.")

@@ -87,7 +87,7 @@ def add_config_options_to_parser(parser):
     parser.add_argument(
         '--transtts_path',
         type=str,
-        default='./log',
+        default='../transformer_tts/checkpoint',
         help="the directory to load pretrain transformerTTS model.")
     parser.add_argument(
         '--transformer_step',

@@ -10,7 +10,7 @@ python -u train.py \
 --use_data_parallel=0 \
 --data_path='../../dataset/LJSpeech-1.1' \
 --transtts_path='../transformer_tts/checkpoint' \
---transformer_step=160000 \
+--transformer_step=120000 \
 --save_path='./checkpoint' \
 --log_dir='./log' \
 --config_path='configs/fastspeech.yaml' \

@@ -55,6 +55,8 @@ python -m paddle.distributed.launch --selected_gpus=0,1,2,3 --log_dir ./mylog tr
 
 If you wish to resume from an existing model, please set ``--checkpoint_path`` and ``--transformer_step``.
 
+**Note: In order to ensure the training effect, we recommend using multi-GPU training to enlarge the batch size, and at least 16 samples in single batch per GPU.**
+
 For more help on arguments:
 ``python train_transformer.py --help``.
 

@@ -23,7 +23,7 @@ from parakeet import audio
 from parakeet.data.sampler import *
 from parakeet.data.datacargo import DataCargo
 from parakeet.data.batch import TextIDBatcher, SpecBatcher
-from parakeet.data.dataset import DatasetMixin, TransformDataset, CacheDataset
+from parakeet.data.dataset import DatasetMixin, TransformDataset, CacheDataset, SliceDataset
 from parakeet.models.transformer_tts.utils import *
 
 

@@ -44,7 +44,7 @@ class LJSpeechLoader:
         dataset = CacheDataset(dataset)
 
         sampler = DistributedSampler(
-            len(metadata), nranks, rank, shuffle=shuffle)
+            len(dataset), nranks, rank, shuffle=shuffle)
 
         assert args.batch_size % nranks == 0
         each_bs = args.batch_size // nranks

@@ -64,7 +64,6 @@ class LJSpeechLoader:
             shuffle=shuffle,
             batch_fn=batch_examples,
             drop_last=True)
-
         self.reader = fluid.io.DataLoader.from_generator(
             capacity=32,
             iterable=True,

@@ -199,12 +198,13 @@ def batch_examples(batch):
         SpecBatcher(pad_value=0.)(mels), axes=(0, 2, 1)) #(B,T,num_mels)
     mel_inputs = np.transpose(
         SpecBatcher(pad_value=0.)(mel_inputs), axes=(0, 2, 1)) #(B,T,num_mels)
-    enc_slf_mask = get_attn_key_pad_mask(pos_texts, texts).astype(np.float32)
+
+    enc_slf_mask = get_attn_key_pad_mask(pos_texts).astype(np.float32)
     enc_query_mask = get_non_pad_mask(pos_texts).astype(np.float32)
     dec_slf_mask = get_dec_attn_key_pad_mask(pos_mels,
                                              mel_inputs).astype(np.float32)
-    enc_dec_mask = get_attn_key_pad_mask(enc_query_mask[:, :, 0],
-                                         mel_inputs).astype(np.float32)
+    enc_dec_mask = get_attn_key_pad_mask(enc_query_mask[:, :, 0]).astype(
+        np.float32)
     dec_query_slf_mask = get_non_pad_mask(pos_mels).astype(np.float32)
     dec_query_mask = get_non_pad_mask(pos_mels).astype(np.float32)
 

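For context on what `batch_examples` is batching: the `SpecBatcher(pad_value=0.)(...)` calls above pad a list of variable-length spectrograms into one dense array, and the transpose flips it to `(B, T, num_mels)`. A rough NumPy stand-in, assuming each mel is stored as `(num_mels, T)`; the `pad_specs` helper below is hypothetical, not the real `SpecBatcher`:

```python
import numpy as np


def pad_specs(specs, pad_value=0.):
    # Hypothetical stand-in for SpecBatcher: pad (num_mels, T_i) arrays
    # to a common length and stack them into (B, num_mels, T_max).
    num_mels = specs[0].shape[0]
    t_max = max(s.shape[1] for s in specs)
    out = np.full((len(specs), num_mels, t_max), pad_value, dtype=np.float32)
    for i, s in enumerate(specs):
        out[i, :, :s.shape[1]] = s
    return out


mels = [np.random.randn(80, 120), np.random.randn(80, 95)]  # two utterances
batched = np.transpose(pad_specs(mels), axes=(0, 2, 1))     # (B, T, num_mels)
print(batched.shape)                                        # (2, 120, 80)
```
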
@@ -18,7 +18,7 @@ def add_config_options_to_parser(parser):
     parser.add_argument(
         '--config_path',
         type=str,
-        default='config/train_transformer.yaml',
+        default='configs/train_transformer.yaml',
         help="the yaml config file path.")
     parser.add_argument(
         '--batch_size', type=int, default=32, help="batch size for training.")

@@ -14,7 +14,6 @@
 import os
 from tqdm import tqdm
 from tensorboardX import SummaryWriter
-#from pathlib import Path
 from collections import OrderedDict
 import argparse
 from parse import add_config_options_to_parser

@@ -69,9 +68,6 @@ def main(args):
             cfg['warm_up_step'] * (args.lr**2)), cfg['warm_up_step']),
         parameter_list=model.parameters())
 
-    reader = LJSpeechLoader(
-        cfg, args, nranks, local_rank, shuffle=True).reader()
-
     if args.checkpoint_path is not None:
         model_dict, opti_dict = load_checkpoint(
             str(args.transformer_step),

@@ -85,6 +81,9 @@ def main(args):
         strategy = dg.parallel.prepare_context()
         model = fluid.dygraph.parallel.DataParallel(model, strategy)
 
+    reader = LJSpeechLoader(
+        cfg, args, nranks, local_rank, shuffle=True).reader()
+
     for epoch in range(args.epochs):
         pbar = tqdm(reader)
         for i, data in enumerate(pbar):

@@ -148,7 +147,8 @@ def main(args):
                     for i, prob in enumerate(attn_probs):
                         for j in range(4):
                             x = np.uint8(
-                                cm.viridis(prob.numpy()[j * 16]) * 255)
+                                cm.viridis(prob.numpy()[j * args.batch_size
+                                                        // 2]) * 255)
                             writer.add_image(
                                 'Attention_%d_0' % global_step,
                                 x,

@@ -158,7 +158,8 @@ def main(args):
                     for i, prob in enumerate(attn_enc):
                         for j in range(4):
                             x = np.uint8(
-                                cm.viridis(prob.numpy()[j * 16]) * 255)
+                                cm.viridis(prob.numpy()[j * args.batch_size
+                                                        // 2]) * 255)
                             writer.add_image(
                                 'Attention_enc_%d_0' % global_step,
                                 x,

@@ -168,7 +169,8 @@ def main(args):
                     for i, prob in enumerate(attn_dec):
                         for j in range(4):
                             x = np.uint8(
-                                cm.viridis(prob.numpy()[j * 16]) * 255)
+                                cm.viridis(prob.numpy()[j * args.batch_size
+                                                        // 2]) * 255)
                             writer.add_image(
                                 'Attention_dec_%d_0' % global_step,
                                 x,

@@ -56,15 +56,13 @@ def get_non_pad_mask(seq):
     return mask
 
 
-def get_attn_key_pad_mask(seq_k, seq_q):
+def get_attn_key_pad_mask(seq_k):
     ''' For masking out the padding part of key sequence. '''
-
     # Expand to fit the shape of key query attention matrix.
-    len_q = seq_q.shape[1]
     padding_mask = (seq_k != 0).astype(np.float32)
     padding_mask = np.expand_dims(padding_mask, axis=1)
-    padding_mask = (padding_mask == 0).astype(np.float32) * (-2**32 + 1)
+    padding_mask = (
+        padding_mask == 0).astype(np.float32) * -1e30 #* (-2**32 + 1)
     return padding_mask
 
 

@@ -72,12 +70,12 @@ def get_dec_attn_key_pad_mask(seq_k, seq_q):
     ''' For masking out the padding part of key sequence. '''
 
     # Expand to fit the shape of key query attention matrix.
-    len_q = seq_q.shape[1]
     padding_mask = (seq_k == 0).astype(np.float32)
     padding_mask = np.expand_dims(padding_mask, axis=1)
     triu_tensor = get_triu_tensor(seq_q, seq_q)
-    padding_mask = padding_mask.repeat([len_q], axis=1) + triu_tensor
-    padding_mask = (padding_mask != 0).astype(np.float32) * (-2**32 + 1)
+    padding_mask = padding_mask + triu_tensor
+    padding_mask = (
+        padding_mask != 0).astype(np.float32) * -1e30 #* (-2**32 + 1)
     return padding_mask
 
 

@@ -85,12 +83,7 @@ def get_triu_tensor(seq_k, seq_q):
     ''' For make a triu tensor '''
     len_k = seq_k.shape[1]
     len_q = seq_q.shape[1]
-    batch_size = seq_k.shape[0]
     triu_tensor = np.triu(np.ones([len_k, len_q]), 1)
-    triu_tensor = np.repeat(
-        np.expand_dims(
-            triu_tensor, axis=0), batch_size, axis=0)
-
     return triu_tensor
 
 

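Taken together, the reworked helpers return masks that broadcast instead of being tiled per query position, which is where the batching speed-up comes from. A minimal NumPy sketch that condenses the new-side functions from the hunks above (intermediate assignments folded into returns; the toy `pos_texts`/`pos_mels` values are made up for illustration):

```python
import numpy as np


def get_attn_key_pad_mask(seq_k):
    # (B, 1, T_k) additive mask; broadcasts over the query axis later.
    padding_mask = (seq_k != 0).astype(np.float32)
    padding_mask = np.expand_dims(padding_mask, axis=1)
    return (padding_mask == 0).astype(np.float32) * -1e30


def get_triu_tensor(seq_k, seq_q):
    # (T_k, T_q) upper-triangular matrix shared by the whole batch.
    return np.triu(np.ones([seq_k.shape[1], seq_q.shape[1]]), 1)


def get_dec_attn_key_pad_mask(seq_k, seq_q):
    # Padding mask (B, 1, T) plus causal triu (T, T) broadcasts to (B, T, T).
    padding_mask = (seq_k == 0).astype(np.float32)
    padding_mask = np.expand_dims(padding_mask, axis=1)
    padding_mask = padding_mask + get_triu_tensor(seq_q, seq_q)
    return (padding_mask != 0).astype(np.float32) * -1e30


# Toy position ids: 0 marks padding, as in batch_examples above.
pos_texts = np.array([[1, 2, 3, 0], [1, 2, 0, 0]])  # (B=2, T_text=4)
pos_mels = np.array([[1, 2, 3], [1, 2, 0]])         # (B=2, T_mel=3)

print(get_attn_key_pad_mask(pos_texts).shape)               # (2, 1, 4)
print(get_dec_attn_key_pad_mask(pos_mels, pos_mels).shape)  # (2, 3, 3)
```

Because the padding mask stays `(B, 1, T)` and the triangular matrix stays 2-D, the old per-example `repeat` calls are no longer needed; broadcasting expands them only when they are added to the attention scores.
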
@@ -89,7 +89,7 @@ class ScaledDotProductAttention(dg.Layer):
         # Mask key to ignore padding
         if mask is not None:
             attention = attention + mask
-        attention = layers.softmax(attention)
+        attention = layers.softmax(attention, use_cudnn=True)
         attention = layers.dropout(
             attention, dropout, dropout_implementation='upscale_in_train')
 
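
The masks above follow the additive convention this layer expects: padded positions carry a large negative value, so adding the mask before the softmax drives their attention weights to effectively zero, while `use_cudnn=True` simply asks Paddle for the cuDNN softmax kernel. A small NumPy sketch of the masking step (toy scores, not the Paddle layer):

```python
import numpy as np


def softmax(x, axis=-1):
    # Numerically stable softmax over the last axis.
    e = np.exp(x - x.max(axis=axis, keepdims=True))
    return e / e.sum(axis=axis, keepdims=True)


scores = np.random.randn(2, 3, 4)            # (B, T_q, T_k) attention logits
mask = np.zeros((2, 1, 4), dtype=np.float32)
mask[:, :, -1] = -1e30                       # pretend the last key is padding

weights = softmax(scores + mask)             # mask broadcasts over T_q
print(weights[..., -1].max())                # ~0: padded key gets no weight
```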