Add a simple strategy to support multi-speaker synthesis in Tacotron2.

This commit is contained in:
iclementine 2021-03-31 15:23:41 +08:00
parent 2dd393349f
commit 7cc3e8c340
1 changed files with 19 additions and 46 deletions

View File

@ -551,6 +551,8 @@ class Tacotron2(nn.Layer):
"""
def __init__(self,
vocab_size,
num_speakers=1,
d_speaker:int = 32,
d_mels: int = 80,
d_encoder: int = 512,
encoder_conv_layers: int = 3,
@ -577,6 +579,11 @@ class Tacotron2(nn.Layer):
self.embedding = nn.Embedding(vocab_size,
d_encoder,
weight_attr=I.Uniform(-val, val))
if num_speakers > 1:
self.num_speakers = num_speakers
self.speaker_embedding = nn.Embedding(num_speakers, d_speaker)
self.speaker_fc = nn.Linear(d_speaker, d_encoder)
self.encoder = Tacotron2Encoder(d_encoder, encoder_conv_layers,
encoder_kernel_size, p_encoder_dropout)
self.decoder = Tacotron2Decoder(
@ -590,7 +597,7 @@ class Tacotron2(nn.Layer):
num_layers=postnet_conv_layers,
dropout=p_postnet_dropout)
def forward(self, text_inputs, mels, text_lens, output_lens=None):
def forward(self, text_inputs, mels, text_lens, output_lens=None, speaker_ids=None):
"""Calculate forward propagation of tacotron2.
Parameters
@ -621,6 +628,11 @@ class Tacotron2(nn.Layer):
"""
embedded_inputs = self.embedding(text_inputs)
encoder_outputs = self.encoder(embedded_inputs, text_lens)
if self.num_speakers > 1:
speaker_embedding = self.speaker_embedding(speaker_ids)
speaker_feature = F.softplus(self.speaker_fc(speaker_embedding))
encoder_outputs += speaker_feature.unsqueeze(1)
# [B, T_enc, 1]
mask = paddle.unsqueeze(
@ -646,7 +658,7 @@ class Tacotron2(nn.Layer):
return outputs
@paddle.no_grad()
def infer(self, text_inputs, max_decoder_steps=1000):
def infer(self, text_inputs, max_decoder_steps=1000, speaker_ids=None):
"""Generate the mel sepctrogram of features given the sequences of character ids.
Parameters
@ -671,6 +683,11 @@ class Tacotron2(nn.Layer):
"""
embedded_inputs = self.embedding(text_inputs)
encoder_outputs = self.encoder(embedded_inputs)
if self.num_speakers > 1:
speaker_embedding = self.speaker_embedding(speaker_ids)
speaker_feature = F.softplus(self.speaker_fc(speaker_embedding))
encoder_outputs += speaker_feature.unsqueeze(1)
mel_outputs, alignments = self.decoder.infer(
encoder_outputs, max_decoder_steps=max_decoder_steps)
@ -685,50 +702,6 @@ class Tacotron2(nn.Layer):
return outputs
@classmethod
def from_pretrained(cls, config, checkpoint_path):
    """Build a tacotron2 model from a pretrained model.

    Parameters
    ----------
    config: yacs.config.CfgNode
        Model configs. Hyperparameters are read from ``config.model``,
        except ``d_mels`` which is read from ``config.data``. The
        multi-speaker options ``num_speakers`` and ``d_speaker`` are
        forwarded only when ``config.model`` defines them; otherwise the
        constructor defaults apply.
    checkpoint_path: Path or str
        The path of pretrained model checkpoint, without extension name.

    Returns
    -------
    Tacotron2
        The model built from the pretrained checkpoint.
    """
    model_config = config.model

    # Speaker settings are optional so that configs written before
    # multi-speaker support existed keep loading unchanged.
    speaker_kwargs = {
        name: getattr(model_config, name)
        for name in ("num_speakers", "d_speaker")
        if hasattr(model_config, name)
    }

    model = cls(
        vocab_size=model_config.vocab_size,
        d_mels=config.data.d_mels,
        d_encoder=model_config.d_encoder,
        encoder_conv_layers=model_config.encoder_conv_layers,
        encoder_kernel_size=model_config.encoder_kernel_size,
        d_prenet=model_config.d_prenet,
        d_attention_rnn=model_config.d_attention_rnn,
        d_decoder_rnn=model_config.d_decoder_rnn,
        attention_filters=model_config.attention_filters,
        attention_kernel_size=model_config.attention_kernel_size,
        d_attention=model_config.d_attention,
        d_postnet=model_config.d_postnet,
        postnet_kernel_size=model_config.postnet_kernel_size,
        postnet_conv_layers=model_config.postnet_conv_layers,
        reduction_factor=model_config.reduction_factor,
        p_encoder_dropout=model_config.p_encoder_dropout,
        p_prenet_dropout=model_config.p_prenet_dropout,
        p_attention_dropout=model_config.p_attention_dropout,
        p_decoder_dropout=model_config.p_decoder_dropout,
        p_postnet_dropout=model_config.p_postnet_dropout,
        **speaker_kwargs)

    # Restore the trained weights into the freshly constructed model.
    checkpoint.load_parameters(model, checkpoint_path=checkpoint_path)
    return model
class Tacotron2Loss(nn.Layer):
""" Tacotron2 Loss module