1. add interfaces for inference;;

2. add a function to recursively remove weight norm;
3. wavenet: fix weight norm dimension: explicitly specify dim=1 instead of -1.
This commit is contained in:
chenfeiyu 2020-12-12 18:21:20 +08:00
parent b2bd479f46
commit 796e0b1e1f
4 changed files with 143 additions and 95 deletions

View File

@ -26,6 +26,7 @@ from parakeet.modules import masking
from parakeet.modules.conv import Conv1dBatchNorm from parakeet.modules.conv import Conv1dBatchNorm
from parakeet.modules import positional_encoding as pe from parakeet.modules import positional_encoding as pe
from parakeet.modules import losses as L from parakeet.modules import losses as L
from parakeet.utils import checkpoint, scheduler
__all__ = ["TransformerTTS", "TransformerTTSLoss"] __all__ = ["TransformerTTS", "TransformerTTSLoss"]
@ -285,8 +286,7 @@ class TransformerDecoder(nn.LayerList):
d_model, n_heads, d_ffn, dropout, d_encoder=d_encoder)) d_model, n_heads, d_ffn, dropout, d_encoder=d_encoder))
def forward(self, q, k, v, encoder_mask, decoder_mask, drop_n_heads=0): def forward(self, q, k, v, encoder_mask, decoder_mask, drop_n_heads=0):
"""[summary] """
Args: Args:
q (Tensor): shape(batch_size, time_steps_q, d_model) q (Tensor): shape(batch_size, time_steps_q, d_model)
k (Tensor): shape(batch_size, time_steps_k, d_encoder) k (Tensor): shape(batch_size, time_steps_k, d_encoder)
@ -330,40 +330,6 @@ class MLPPreNet(nn.Layer):
return l3 return l3
# NOTE: not used in
class CNNPreNet(nn.Layer):
def __init__(self,
d_input,
d_hidden,
d_output,
kernel_size,
n_layers,
dropout=0.):
# (conv + bn + relu + dropout) * n + last projection
super(CNNPreNet, self).__init__()
self.convs = nn.LayerList()
c_in = d_input
for _ in range(n_layers):
self.convs.append(
Conv1dBatchNorm(
c_in,
d_hidden,
kernel_size,
weight_attr=I.XavierUniform(),
padding="same",
data_format="NLC"))
c_in = d_hidden
self.affine_out = nn.Linear(d_hidden, d_output)
self.dropout = dropout
def forward(self, x):
for layer in self.convs:
x = F.dropout(
F.relu(layer(x)), self.dropout, training=self.training)
x = self.affine_out(x)
return x
class CNNPostNet(nn.Layer): class CNNPostNet(nn.Layer):
def __init__(self, d_input, d_hidden, d_output, kernel_size, n_layers): def __init__(self, d_input, d_hidden, d_output, kernel_size, n_layers):
super(CNNPostNet, self).__init__() super(CNNPostNet, self).__init__()
@ -536,7 +502,8 @@ class TransformerTTS(nn.Layer):
return mel_output, mel_intermediate, cross_attention_weights, stop_logits return mel_output, mel_intermediate, cross_attention_weights, stop_logits
def predict(self, input, raw_input=True, max_length=1000, verbose=True): @paddle.no_grad()
def infer(self, input, max_length=1000, verbose=True):
"""Predict log scale magnitude mel spectrogram from text input. """Predict log scale magnitude mel spectrogram from text input.
Args: Args:
@ -544,19 +511,13 @@ class TransformerTTS(nn.Layer):
max_length (int, optional): max decoder steps. Defaults to 1000. max_length (int, optional): max decoder steps. Defaults to 1000.
verbose (bool, optional): display progress bar. Defaults to True. verbose (bool, optional): display progress bar. Defaults to True.
""" """
if raw_input:
text_ids = paddle.to_tensor(self.frontend(input))
text_input = paddle.unsqueeze(text_ids, 0) # (1, T)
else:
text_input = input
decoder_input = paddle.unsqueeze(self.start_vec, 0) # (B=1, T, C) decoder_input = paddle.unsqueeze(self.start_vec, 0) # (B=1, T, C)
decoder_output = paddle.unsqueeze(self.start_vec, 0) # (B=1, T, C) decoder_output = paddle.unsqueeze(self.start_vec, 0) # (B=1, T, C)
# encoder the text sequence # encoder the text sequence
encoder_output, encoder_attentions, encoder_padding_mask = self.encode( encoder_output, encoder_attentions, encoder_padding_mask = self.encode(
text_input) input)
for _ in range(int(max_length // self.r) + 1): for _ in trange(int(max_length // self.r) + 1):
mel_output, _, cross_attention_weights, stop_logits = self.decode( mel_output, _, cross_attention_weights, stop_logits = self.decode(
encoder_output, decoder_input, encoder_padding_mask) encoder_output, decoder_input, encoder_padding_mask)
@ -584,10 +545,45 @@ class TransformerTTS(nn.Layer):
} }
return outputs return outputs
@paddle.no_grad()
def predict(self, input, max_length=1000, verbose=True):
text_ids = paddle.to_tensor(self.frontend(input))
input = paddle.unsqueeze(text_ids, 0) # (1, T)
outputs = self.infer(input, max_length=max_length, verbose=verbose)
outputs = {k: v[0].numpy() for k, v in outputs.items()}
return outputs
def set_constants(self, reduction_factor, drop_n_heads): def set_constants(self, reduction_factor, drop_n_heads):
self.r = reduction_factor self.r = reduction_factor
self.drop_n_heads = drop_n_heads self.drop_n_heads = drop_n_heads
@classmethod
def from_pretrained(cls, frontend, config, checkpoint_path):
model = TransformerTTS(
frontend,
d_encoder=config.model.d_encoder,
d_decoder=config.model.d_decoder,
d_mel=config.data.d_mel,
n_heads=config.model.n_heads,
d_ffn=config.model.d_ffn,
encoder_layers=config.model.encoder_layers,
decoder_layers=config.model.decoder_layers,
d_prenet=config.model.d_prenet,
d_postnet=config.model.d_postnet,
postnet_layers=config.model.postnet_layers,
postnet_kernel_size=config.model.postnet_kernel_size,
max_reduction_factor=config.model.max_reduction_factor,
decoder_prenet_dropout=config.model.decoder_prenet_dropout,
dropout=config.model.dropout)
iteration = checkpoint.load_parameters(model, checkpoint_path=checkpoint_path)
drop_n_heads = scheduler.StepWise(config.training.drop_n_heads)
reduction_factor = scheduler.StepWise(config.training.reduction_factor)
model.set_constants(
reduction_factor=reduction_factor(iteration),
drop_n_heads=drop_n_heads(iteration))
return model
class TransformerTTSLoss(nn.Layer): class TransformerTTSLoss(nn.Layer):
def __init__(self, stop_loss_scale): def __init__(self, stop_loss_scale):
@ -618,34 +614,3 @@ class TransformerTTSLoss(nn.Layer):
stop_loss=stop_loss # stop prob loss stop_loss=stop_loss # stop prob loss
) )
return losses return losses
class AdaptiveTransformerTTSLoss(nn.Layer):
def __init__(self):
super(AdaptiveTransformerTTSLoss, self).__init__()
def forward(self, mel_output, mel_intermediate, mel_target, stop_logits,
stop_probs):
mask = masking.feature_mask(
mel_target, axis=-1, dtype=mel_target.dtype)
mask1 = paddle.unsqueeze(mask, -1)
mel_loss1 = L.masked_l1_loss(mel_output, mel_target, mask1)
mel_loss2 = L.masked_l1_loss(mel_intermediate, mel_target, mask1)
batch_size, mel_len = mask.shape
valid_lengths = mask.sum(-1).astype("int64")
last_position = F.one_hot(valid_lengths - 1, num_classes=mel_len)
stop_loss_scale = valid_lengths.sum() / batch_size - 1
mask2 = mask + last_position.scale(stop_loss_scale - 1).astype(
mask.dtype)
stop_loss = L.masked_softmax_with_cross_entropy(
stop_logits, stop_probs.unsqueeze(-1), mask2.unsqueeze(-1))
loss = mel_loss1 + mel_loss2 + stop_loss
losses = dict(
loss=loss, # total loss
mel_loss1=mel_loss1, # ouput mel loss
mel_loss2=mel_loss2, # intermediate mel loss
stop_loss=stop_loss # stop prob loss
)
return losses

View File

@ -1,10 +1,12 @@
import math import math
import numpy as np import numpy as np
from typing import List, Union
import paddle import paddle
from paddle import nn from paddle import nn
from paddle.nn import functional as F from paddle.nn import functional as F
from paddle.nn import initializer as I from paddle.nn import initializer as I
from parakeet.utils import checkpoint
from parakeet.modules import geometry as geo from parakeet.modules import geometry as geo
__all__ = ["UpsampleNet", "WaveFlow", "ConditionalWaveFlow", "WaveFlowLoss"] __all__ = ["UpsampleNet", "WaveFlow", "ConditionalWaveFlow", "WaveFlowLoss"]
@ -478,10 +480,23 @@ class WaveFlow(nn.LayerList):
class ConditionalWaveFlow(nn.LayerList): class ConditionalWaveFlow(nn.LayerList):
def __init__(self, encoder, decoder): def __init__(self,
upsample_factors: List[int],
n_flows: int,
n_layers: int,
n_group: int,
channels: int,
n_mels: int,
kernel_size: Union[int, List[int]]):
super(ConditionalWaveFlow, self).__init__() super(ConditionalWaveFlow, self).__init__()
self.encoder = encoder self.encoder = UpsampleNet(upsample_factors)
self.decoder = decoder self.decoder = WaveFlow(
n_flows=n_flows,
n_layers=n_layers,
n_group=n_group,
channels=channels,
mel_bands=n_mels,
kernel_size=kernel_size)
def forward(self, audio, mel): def forward(self, audio, mel):
condition = self.encoder(mel) condition = self.encoder(mel)
@ -489,13 +504,34 @@ class ConditionalWaveFlow(nn.LayerList):
return z, log_det_jacobian return z, log_det_jacobian
@paddle.no_grad() @paddle.no_grad()
def synthesize(self, mel): def infer(self, mel):
condition = self.encoder(mel, trim_conv_artifact=True) #(B, C, T) condition = self.encoder(mel, trim_conv_artifact=True) #(B, C, T)
batch_size, _, time_steps = condition.shape batch_size, _, time_steps = condition.shape
z = paddle.randn([batch_size, time_steps], dtype=mel.dtype) z = paddle.randn([batch_size, time_steps], dtype=mel.dtype)
x = self.decoder.inverse(z, condition) x = self.decoder.inverse(z, condition)
return x return x
@paddle.no_grad()
def predict(self, mel):
mel = paddle.to_tensor(mel)
mel = paddle.unsqueeze(mel, 0)
audio = self.infer(mel)
audio = audio[0].numpy()
return audio
@classmethod
def from_pretrained(cls, config, checkpoint_path):
model = cls(
upsample_factors=config.model.upsample_factors,
n_flows=config.model.n_flows,
n_layers=config.model.n_layers,
n_group=config.model.n_group,
channels=config.model.channels,
n_mels=config.data.n_mels,
kernel_size=config.model.kernel_size)
checkpoint.load_parameters(model, checkpoint_path=checkpoint_path)
return model
class WaveFlowLoss(nn.Layer): class WaveFlowLoss(nn.Layer):
def __init__(self, sigma=1.0): def __init__(self, sigma=1.0):

View File

@ -14,7 +14,7 @@
import math import math
import time import time
from typing import Union, Sequence from typing import Union, Sequence, List
from tqdm import trange from tqdm import trange
import numpy as np import numpy as np
@ -26,6 +26,7 @@ import paddle.fluid.layers.distributions as D
from parakeet.modules.conv import Conv1dCell from parakeet.modules.conv import Conv1dCell
from parakeet.modules.audio import quantize, dequantize, STFT from parakeet.modules.audio import quantize, dequantize, STFT
from parakeet.utils import checkpoint, layer_tools
def crop(x, audio_start, audio_length): def crop(x, audio_start, audio_length):
@ -290,18 +291,18 @@ class WaveNet(nn.Layer):
if (output_dim % 3 != 0): if (output_dim % 3 != 0):
raise ValueError( raise ValueError(
"with Mixture of Gaussians(mog) output, the output dim must be divisible by 3, but get {}".format(output_dim)) "with Mixture of Gaussians(mog) output, the output dim must be divisible by 3, but get {}".format(output_dim))
self.embed = nn.utils.weight_norm(nn.Linear(1, residual_channels), dim=-1) self.embed = nn.utils.weight_norm(nn.Linear(1, residual_channels), dim=1)
self.resnet = ResidualNet(n_stack, n_loop, residual_channels, self.resnet = ResidualNet(n_stack, n_loop, residual_channels,
condition_dim, filter_size) condition_dim, filter_size)
self.context_size = self.resnet.context_size self.context_size = self.resnet.context_size
skip_channels = residual_channels # assume the same channel skip_channels = residual_channels # assume the same channel
self.proj1 = nn.utils.weight_norm(nn.Linear(skip_channels, skip_channels), dim=-1) self.proj1 = nn.utils.weight_norm(nn.Linear(skip_channels, skip_channels), dim=1)
self.proj2 = nn.utils.weight_norm(nn.Linear(skip_channels, skip_channels), dim=-1) self.proj2 = nn.utils.weight_norm(nn.Linear(skip_channels, skip_channels), dim=1)
# if loss_type is softmax, output_dim is n_vocab of waveform magnitude. # if loss_type is softmax, output_dim is n_vocab of waveform magnitude.
# if loss_type is mog, output_dim is 3 * gaussian, (weight, mean and stddev) # if loss_type is mog, output_dim is 3 * gaussian, (weight, mean and stddev)
self.proj3 = nn.utils.weight_norm(nn.Linear(skip_channels, output_dim), dim=-1) self.proj3 = nn.utils.weight_norm(nn.Linear(skip_channels, output_dim), dim=1)
self.loss_type = loss_type self.loss_type = loss_type
self.output_dim = output_dim self.output_dim = output_dim
@ -509,17 +510,29 @@ class WaveNet(nn.Layer):
return self.compute_mog_loss(y, t) return self.compute_mog_loss(y, t)
class ConditionalWavenet(nn.Layer): class ConditionalWaveNet(nn.Layer):
def __init__(self, encoder, decoder): def __init__(self,
upsample_factors: List[int],
n_stack: int,
n_loop: int,
residual_channels: int,
output_dim: int,
n_mels: int,
filter_size: int=2,
loss_type: str="mog",
log_scale_min: float=-9.0):
"""Conditional Wavenet, which contains an UpsampleNet as the encoder and a WaveNet as the decoder. It is an autoregressive model. """Conditional Wavenet, which contains an UpsampleNet as the encoder and a WaveNet as the decoder. It is an autoregressive model.
Args:
encoder (UpsampleNet): the UpsampleNet as the encoder.
decoder (WaveNet): the WaveNet as the decoder.
""" """
super(ConditionalWavenet, self).__init__() super(ConditionalWaveNet, self).__init__()
self.encoder = encoder self.encoder = UpsampleNet(upsample_factors)
self.decoder = decoder self.decoder = WaveNet(n_stack=n_stack,
n_loop=n_loop,
residual_channels=residual_channels,
output_dim=output_dim,
condition_dim=n_mels,
filter_size=filter_size,
loss_type=loss_type,
log_scale_min=log_scale_min)
def forward(self, audio, mel, audio_start): def forward(self, audio, mel, audio_start):
"""Compute the output distribution given the mel spectrogram and the input(for teacher force training). """Compute the output distribution given the mel spectrogram and the input(for teacher force training).
@ -570,7 +583,7 @@ class ConditionalWavenet(nn.Layer):
return samples return samples
@paddle.no_grad() @paddle.no_grad()
def synthesis(self, mel): def infer(self, mel):
"""Synthesize waveform from mel spectrogram. """Synthesize waveform from mel spectrogram.
Args: Args:
@ -595,3 +608,29 @@ class ConditionalWavenet(nn.Layer):
samples = paddle.concat(samples, -1) samples = paddle.concat(samples, -1)
return samples return samples
@paddle.no_grad()
def predict(self, mel):
mel = paddle.to_tensor(mel)
mel = paddle.unsqueeze(mel, 0)
audio = self.infer(mel)
audio = audio[0].numpy()
return audio
@classmethod
def from_pretrained(cls, config, checkpoint_path):
model = cls(
upsample_factors=config.model.upsample_factors,
n_stack=config.model.n_stack,
n_loop=config.model.n_loop,
residual_channels=config.model.residual_channels,
output_dim=config.model.output_dim,
n_mels=config.data.n_mels,
filter_size=config.model.filter_size,
loss_type=config.model.loss_type,
log_scale_min=config.model.log_scale_min)
layer_tools.summary(model)
checkpoint.load_parameters(model, checkpoint_path=checkpoint_path)
return model

View File

@ -36,6 +36,14 @@ def gradient_norm(layer: nn.Layer):
grad_norm_dict[name] = np.linalg.norm(grad) / grad.size grad_norm_dict[name] = np.linalg.norm(grad) / grad.size
return grad_norm_dict return grad_norm_dict
def recursively_remove_weight_norm(layer: nn.Layer):
for layer in layer.sublayers():
try:
nn.utils.remove_weight_norm(layer)
except:
# ther is not weight norm hoom in this layer
pass
def freeze(layer: nn.Layer): def freeze(layer: nn.Layer):
for param in layer.parameters(): for param in layer.parameters():
param.trainable = False param.trainable = False