update docstring for waveflow

This commit is contained in:
iclementine 2020-12-19 18:33:07 +08:00
parent f2a35a17d4
commit b6efb43990
3 changed files with 622 additions and 192 deletions

View File

@ -80,10 +80,6 @@ class Experiment(ExperimentBase):
z, log_det_jocobian = self.model(wav, mel) z, log_det_jocobian = self.model(wav, mel)
return z, log_det_jocobian return z, log_det_jocobian
def compute_losses(self, outputs):
loss = self.criterion(outputs)
return loss
def train_batch(self): def train_batch(self):
start = time.time() start = time.time()
batch = self.read_batch() batch = self.read_batch()
@ -92,8 +88,8 @@ class Experiment(ExperimentBase):
self.model.train() self.model.train()
self.optimizer.clear_grad() self.optimizer.clear_grad()
mel, wav = batch mel, wav = batch
outputs = self.compute_outputs(mel, wav) z, log_det_jocobian = self.compute_outputs(mel, wav)
loss = self.compute_losses(outputs) loss = self.criterion(z, log_det_jocobian)
loss.backward() loss.backward()
self.optimizer.step() self.optimizer.step()
iteration_time = time.time() - start iteration_time = time.time() - start
@ -112,8 +108,8 @@ class Experiment(ExperimentBase):
valid_iterator = iter(self.valid_loader) valid_iterator = iter(self.valid_loader)
valid_losses = [] valid_losses = []
mel, wav = next(valid_iterator) mel, wav = next(valid_iterator)
outputs = self.compute_outputs(mel, wav) z, log_det_jocobian = self.compute_outputs(mel, wav)
loss = self.compute_losses(outputs) loss = self.criterion(z, log_det_jocobian)
valid_losses.append(float(loss)) valid_losses.append(float(loss))
valid_loss = np.mean(valid_losses) valid_loss = np.mean(valid_losses)
self.visualizer.add_scalar("valid/loss", valid_loss, global_step=self.iteration) self.visualizer.add_scalar("valid/loss", valid_loss, global_step=self.iteration)

View File

@ -1,6 +1,6 @@
import math import math
import numpy as np import numpy as np
from typing import List, Union from typing import List, Union, Tuple
import paddle import paddle
from paddle import nn from paddle import nn
from paddle.nn import functional as F from paddle.nn import functional as F
@ -9,27 +9,56 @@ from paddle.nn import initializer as I
from parakeet.utils import checkpoint from parakeet.utils import checkpoint
from parakeet.modules import geometry as geo from parakeet.modules import geometry as geo
__all__ = ["UpsampleNet", "WaveFlow", "ConditionalWaveFlow", "WaveFlowLoss"] __all__ = ["WaveFlow", "ConditionalWaveFlow", "WaveFlowLoss"]
def fold(x, n_group): def fold(x, n_group):
"""Fold audio or spectrogram's temporal dimension in to groups. r"""Fold audio or spectrogram's temporal dimension in to groups.
Args: Parameters
x (Tensor): shape(*, time_steps), the input tensor ----------
n_group (int): the size of a group. x : Tensor [shape=(\*, time_steps)
The input tensor.
n_group : int
The size of a group.
Returns: Returns
Tensor: shape(*, time_steps // n_group, group), folded tensor. ---------
Tensor : [shape=(`*, time_steps // n_group, group)]
Folded tensor.
""" """
*spatial_shape, time_steps = x.shape *spatial_shape, time_steps = x.shape
new_shape = spatial_shape + [time_steps // n_group, n_group] new_shape = spatial_shape + [time_steps // n_group, n_group]
return paddle.reshape(x, new_shape) return paddle.reshape(x, new_shape)
class UpsampleNet(nn.LayerList): class UpsampleNet(nn.LayerList):
""" """Layer to upsample mel spectrogram to the same temporal resolution with
Layer to upsample mel spectrogram to the same temporal resolution with the corresponding waveform.
the corresponding waveform. It consists of several conv2dtranspose layers
which perform de convolution on mel and time dimension. It consists of several conv2dtranspose layers which perform deconvolution
on mel and time dimension.
Parameters
----------
upscale_factors : List[int], optional
Time upsampling factors for each Conv2DTranspose Layer.
The ``UpsampleNet`` contains ``len(upscale_factor)`` Conv2DTranspose
Layers. Each upscale_factor is used as the ``stride`` for the
corresponding Conv2DTranspose. Defaults to [16, 16], this the default
upsampling factor is 256.
Notes
------
``np.prod(upscale_factors)`` should equals the ``hop_length`` of the stft
transformation used to extract spectrogram features from audio.
For example, ``16 * 16 = 256``, then the spectrogram extracted with a stft
transformation whose ``hop_length`` equals 256 is suitable.
See Also
---------
``librosa.core.stft``
""" """
def __init__(self, upsample_factors): def __init__(self, upsample_factors):
super(UpsampleNet, self).__init__() super(UpsampleNet, self).__init__()
@ -49,17 +78,25 @@ class UpsampleNet(nn.LayerList):
self.upsample_factors = upsample_factors self.upsample_factors = upsample_factors
def forward(self, x, trim_conv_artifact=False): def forward(self, x, trim_conv_artifact=False):
""" r"""Forward pass of the ``UpsampleNet``.
Args:
x (Tensor): shape(batch_size, input_channels, time_steps), the input Parameters
spectrogram. -----------
trim_conv_artifact (bool, optional): trim deconvolution artifact at x : Tensor [shape=(batch_size, input_channels, time_steps)]
each layer. Defaults to False. The input spectrogram.
trim_conv_artifact : bool, optional
Trim deconvolution artifact at each layer. Defaults to False.
Returns: Returns
Tensor: shape(batch_size, input_channels, time_steps * upsample_factor). --------
If trim_conv_artifact is True, the output time steps is less Tensor: [shape=(batch_size, input_channels, time_steps \* upsample_factor)]
than time_steps * upsample_factors. The upsampled spectrogram.
Notes
--------
If trim_conv_artifact is ``True``, the output time steps is less
than ``time_steps \* upsample_factors``.
""" """
x = paddle.unsqueeze(x, 1) #(B, C, T) -> (B, 1, C, T) x = paddle.unsqueeze(x, 1) #(B, C, T) -> (B, 1, C, T)
for layer in self: for layer in self:
@ -72,11 +109,27 @@ class UpsampleNet(nn.LayerList):
return x return x
#TODO write doc
class ResidualBlock(nn.Layer): class ResidualBlock(nn.Layer):
""" """ResidualBlock, the basic unit of ResidualNet used in WaveFlow.
ResidualBlock, the basic unit of ResidualNet. It has a conv2d layer, which
has causal padding in height dimension and same paddign in width dimension. It has a conv2d layer, which has causal padding in height dimension and
It also has projection for the condition and output. same paddign in width dimension. It also has projection for the condition
and output.
Parameters
----------
channels : int
Feature size of the input.
cond_channels : int
Featuer size of the condition.
kernel_size : Tuple[int]
Kernel size of the Convolution2d applied to the input.
dilations : int
Dilations of the Convolution2d applied to the input.
""" """
def __init__(self, channels, cond_channels, kernel_size, dilations): def __init__(self, channels, cond_channels, kernel_size, dilations):
super(ResidualBlock, self).__init__() super(ResidualBlock, self).__init__()
@ -113,14 +166,21 @@ class ResidualBlock(nn.Layer):
def forward(self, x, condition): def forward(self, x, condition):
"""Compute output for a whole folded sequence. """Compute output for a whole folded sequence.
Args: Parameters
x (Tensor): shape(batch_size, channel, height, width), the input. ----------
condition (Tensor): shape(batch_size, condition_channel, height, width), x : Tensor [shape=(batch_size, channel, height, width)]
the local condition. The input.
condition : Tensor [shape=(batch_size, condition_channel, height, width)]
The local condition.
Returns: Returns
res (Tensor): shape(batch_size, channel, height, width), the residual output. -------
res (Tensor): shape(batch_size, channel, height, width), the skip output. res : Tensor [shape=(batch_size, channel, height, width)]
The residual output.
skip : Tensor [shape=(batch_size, channel, height, width)]
The skip output.
""" """
x_in = x x_in = x
x = self.conv(x) x = self.conv(x)
@ -131,10 +191,12 @@ class ResidualBlock(nn.Layer):
x = self.out_proj(x) x = self.out_proj(x)
res, skip = paddle.chunk(x, 2, axis=1) res, skip = paddle.chunk(x, 2, axis=1)
return x_in + res, skip res = x_in + res
return res, skip
def start_sequence(self): def start_sequence(self):
"""Prepare the layer for incremental computation of causal convolution. Reset the buffer for causal convolution. """Prepare the layer for incremental computation of causal
convolution. Reset the buffer for causal convolution.
Raises: Raises:
ValueError: If not in evaluation mode. ValueError: If not in evaluation mode.
@ -155,13 +217,21 @@ class ResidualBlock(nn.Layer):
def add_input(self, x_row, condition_row): def add_input(self, x_row, condition_row):
"""Compute the output for a row and update the buffer. """Compute the output for a row and update the buffer.
Args: Parameters
x_row (Tensor): shape(batch_size, channel, 1, width), a row of the input. ----------
condition_row (Tensor): shape(batch_size, condition_channel, 1, width), a row of the input. x_row : Tensor [shape=(batch_size, channel, 1, width)]
A row of the input.
condition_row : Tensor [shape=(batch_size, condition_channel, 1, width)]
A row of the condition.
Returns: Returns
res (Tensor): shape(batch_size, channel, 1, width), the residual output. -------
res (Tensor): shape(batch_size, channel, 1, width), the skip output. res : Tensor [shape=(batch_size, channel, 1, width)]
A row of the the residual output.
res : Tensor [shape=(batch_size, channel, 1, width)]
A row of the skip output.
""" """
x_row_in = x_row x_row_in = x_row
if self._conv_buffer is None: if self._conv_buffer is None:
@ -182,7 +252,8 @@ class ResidualBlock(nn.Layer):
x_row = self.out_proj(x_row) x_row = self.out_proj(x_row)
res, skip = paddle.chunk(x_row, 2, axis=1) res, skip = paddle.chunk(x_row, 2, axis=1)
return x_row_in + res, skip res = x_row_in + res
return res, skip
def _init_buffer(self, input): def _init_buffer(self, input):
batch_size, channels, _, width = input.shape batch_size, channels, _, width = input.shape
@ -195,11 +266,36 @@ class ResidualBlock(nn.Layer):
class ResidualNet(nn.LayerList): class ResidualNet(nn.LayerList):
"""A stack of several ResidualBlocks. It merges condition at each layer.
Parameters
----------
n_layer : int
Number of ResidualBlocks in the ResidualNet.
residual_channels : int
Feature size of each ResidualBlocks.
condition_channels : int
Feature size of the condition.
kernel_size : Tuple[int]
Kernel size of each ResidualBlock.
dilations_h : List[int]
Dilation in height dimension of every ResidualBlock.
Raises
------
ValueError
If the length of dilations_h does not equals n_layers.
""" """
A stack of several ResidualBlocks. It merges condition at each layer. All def __init__(self,
skip outputs are collected. n_layer: int,
""" residual_channels: int,
def __init__(self, n_layer, residual_channels, condition_channels, kernel_size, dilations_h): condition_channels: int,
kernel_size: Tuple[int],
dilations_h: List[int]):
if len(dilations_h) != n_layer: if len(dilations_h) != n_layer:
raise ValueError("number of dilations_h should equals num of layers") raise ValueError("number of dilations_h should equals num of layers")
super(ResidualNet, self).__init__() super(ResidualNet, self).__init__()
@ -211,14 +307,18 @@ class ResidualNet(nn.LayerList):
def forward(self, x, condition): def forward(self, x, condition):
"""Comput the output of given the input and the condition. """Comput the output of given the input and the condition.
Args: Parameters
x (Tensor): shape(batch_size, channel, height, width), the input. -----------
condition (Tensor): shape(batch_size, condition_channel, height, width), x : Tensor [shape=(batch_size, channel, height, width)]
the local condition. The input.
condition : Tensor [shape=(batch_size, condition_channel, height, width)]
The local condition.
Returns: Returns
Tensor: shape(batch_size, channel, height, width), the output, which --------
is an aggregation of all the skip outputs. Tensor : [shape=(batch_size, channel, height, width)]
The output, which is an aggregation of all the skip outputs.
""" """
skip_connections = [] skip_connections = []
for layer in self: for layer in self:
@ -228,20 +328,29 @@ class ResidualNet(nn.LayerList):
return out return out
def start_sequence(self): def start_sequence(self):
"""Prepare the layer for incremental computation.""" """Prepare the layer for incremental computation.
"""
for layer in self: for layer in self:
layer.start_sequence() layer.start_sequence()
def add_input(self, x_row, condition_row): def add_input(self, x_row, condition_row):
"""Compute the output for a row and update the buffer. """Compute the output for a row and update the buffers.
Args: Parameters
x_row (Tensor): shape(batch_size, channel, 1, width), a row of the input. ----------
condition_row (Tensor): shape(batch_size, condition_channel, 1, width), a row of the input. x_row : Tensor [shape=(batch_size, channel, 1, width)]
A row of the input.
condition_row : Tensor [shape=(batch_size, condition_channel, 1, width)]
A row of the condition.
Returns: Returns
Tensor: shape(batch_size, channel, 1, width), the output, which is -------
an aggregation of all the skip outputs. res : Tensor [shape=(batch_size, channel, 1, width)]
A row of the the residual output.
res : Tensor [shape=(batch_size, channel, 1, width)]
A row of the skip output.
""" """
skip_connections = [] skip_connections = []
for layer in self: for layer in self:
@ -252,12 +361,29 @@ class ResidualNet(nn.LayerList):
class Flow(nn.Layer): class Flow(nn.Layer):
""" """A bijection (Reversable layer) that transform a density of latent
A bijection (Reversable layer) that transform a density of latent variables variables p(Z) into a complex data distribution p(X).
p(Z) into a complex data distribution p(X).
It's a auto regressive flow. The `forward` method implements the probability It's an auto regressive flow. The `forward` method implements the
density estimation. The `inverse` method implements the sampling. probability density estimation. The `inverse` method implements the
sampling.
Parameters
----------
n_layers : int
Number of ResidualBlocks in the Flow.
channels : int
Feature size of the ResidualBlocks.
mel_bands : int
Feature size of the mel spectrogram (mel bands).
kernel_size : Tuple[int]
Kernel size of each ResisualBlocks in the Flow.
n_group : int
Number of timesteps to the folded into a group.
""" """
dilations_dict = { dilations_dict = {
8: [1, 1, 1, 1, 1, 1, 1, 1], 8: [1, 1, 1, 1, 1, 1, 1, 1],
@ -301,18 +427,29 @@ class Flow(nn.Layer):
return z_out return z_out
def forward(self, x, condition): def forward(self, x, condition):
"""Probability density estimation. It is done by inversely transform a sample """Probability density estimation. It is done by inversely transform
from p(X) back into a sample from p(Z). a sample from p(X) into a sample from p(Z).
Args: Parameters
x (Tensor): shape(batch, 1, height, width), a input sample of the distribution p(X). -----------
condition (Tensor): shape(batch, condition_channel, height, width), the local condition. x : Tensor [shape=(batch, 1, height, width)]
A input sample of the distribution p(X).
condition : Tensor [shape=(batch, condition_channel, height, width)]
The local condition.
Returns: Returns
(z, (logs, b)) --------
z (Tensor): shape(batch, 1, height, width), the transformed sample. z (Tensor): shape(batch, 1, height, width), the transformed sample.
logs (Tensor): shape(batch, 1, height - 1, width), the log scale of the inverse transformation.
b (Tensor): shape(batch, 1, height - 1, width), the shift of the inverse transformation. Tuple[Tensor, Tensor]
The parameter of the transformation.
logs (Tensor): shape(batch, 1, height - 1, width), the log scale
of the transformation from x to z.
b (Tensor): shape(batch, 1, height - 1, width), the shift of the
transformation from x to z.
""" """
# (B, C, H-1, W) # (B, C, H-1, W)
logs, b = self._predict_parameters( logs, b = self._predict_parameters(
@ -340,18 +477,30 @@ class Flow(nn.Layer):
self.resnet.start_sequence() self.resnet.start_sequence()
def inverse(self, z, condition): def inverse(self, z, condition):
"""Sampling from the the distrition p(X). It is done by sample form p(Z) """Sampling from the the distrition p(X). It is done by sample form
and transform the sample. It is a auto regressive transformation. p(Z) and transform the sample. It is a auto regressive transformation.
Args: Parameters
z (Tensor): shape(batch, 1, height, width), a input sample of the distribution p(Z). -----------
condition (Tensor): shape(batch, condition_channel, height, width), the local condition. z : Tensor [shape=(batch, 1, height, width)]
A sample of the distribution p(Z).
condition : Tensor [shape=(batch, condition_channel, height, width)]
The local condition.
Returns: Returns
(x, (logs, b)) ---------
x (Tensor): shape(batch, 1, height, width), the transformed sample. x : Tensor [shape=(batch, 1, height, width)]
logs (Tensor): shape(batch, 1, height - 1, width), the log scale of the inverse transformation. The transformed sample.
b (Tensor): shape(batch, 1, height - 1, width), the shift of the inverse transformation.
Tuple[Tensor, Tensor]
The parameter of the transformation.
logs (Tensor): shape(batch, 1, height - 1, width), the log scale
of the transformation from x to z.
b (Tensor): shape(batch, 1, height - 1, width), the shift of the
transformation from x to z.
""" """
z_0 = z[:, :, :1, :] z_0 = z[:, :, :1, :]
x = [] x = []
@ -377,7 +526,29 @@ class Flow(nn.Layer):
class WaveFlow(nn.LayerList): class WaveFlow(nn.LayerList):
"""An Deep Reversible layer that is composed of a stack of auto regressive flows.s""" """An Deep Reversible layer that is composed of severel auto regressive
flows.
Parameters
-----------
n_flows : int
Number of flows in the WaveFlow model.
n_layers : int
Number of ResidualBlocks in each Flow.
n_group : int
Number of timesteps to fold as a group.
channels : int
Feature size of each ResidualBlock.
mel_bands : int
Feature size of mel spectrogram (mel bands).
kernel_size : Union[int, List[int]]
Kernel size of the convolution layer in each ResidualBlock.
"""
def __init__(self, n_flows, n_layers, n_group, channels, mel_bands, kernel_size): def __init__(self, n_flows, n_layers, n_group, channels, mel_bands, kernel_size):
if n_group % 2 or n_flows % 2: if n_group % 2 or n_flows % 2:
raise ValueError("number of flows and number of group must be even " raise ValueError("number of flows and number of group must be even "
@ -416,15 +587,25 @@ class WaveFlow(nn.LayerList):
return x, condition return x, condition
def forward(self, x, condition): def forward(self, x, condition):
"""Probability density estimation. """Probability density estimation of random variable x given the
condition.
Args: Parameters
x (Tensor): shape(batch_size, time_steps), the audio. -----------
condition (Tensor): shape(batch_size, condition channel, time_steps), the local condition. x : Tensor [shape=(batch_size, time_steps)]
The audio.
condition : Tensor [shape=(batch_size, condition channel, time_steps)]
The local condition (mel spectrogram here).
Returns: Returns
z: (Tensor): shape(batch_size, time_steps), the transformed sample. --------
log_det_jacobian: (Tensor), shape(1,), the log determinant of the jacobian of (dz/dx). z : Tensor [shape=(batch_size, time_steps)]
The transformed random variable.
log_det_jacobian: Tensor [shape=(1,)]
The log determinant of the jacobian of the transformation from x
to z.
""" """
# x: (B, T) # x: (B, T)
# condition: (B, C, T) upsampled condition # condition: (B, C, T) upsampled condition
@ -451,15 +632,24 @@ class WaveFlow(nn.LayerList):
return z, log_det_jacobian return z, log_det_jacobian
def inverse(self, z, condition): def inverse(self, z, condition):
"""Sampling from the the distrition p(X). It is done by sample form p(Z) """Sampling from the the distrition p(X).
and transform the sample. It is a auto regressive transformation.
It is done by sample a ``z`` form p(Z) and transform it into ``x``.
Each Flow transform .. math:: `z_{i-1}` to .. math:: `z_{i}` in an
autoregressive manner.
Args: Parameters
z (Tensor): shape(batch, 1, time_steps), a input sample of the distribution p(Z). ----------
condition (Tensor): shape(batch, condition_channel, time_steps), the local condition. z : Tensor [shape=(batch, 1, time_steps]
A sample of the distribution p(Z).
condition : Tensor [shape=(batch, condition_channel, time_steps)]
The local condition.
Returns: Returns
x: (Tensor): shape(batch_size, time_steps), the transformed sample. --------
x : Tensor [shape=(batch_size, time_steps)]
The transformed sample (audio here).
""" """
z, condition = self._trim(z, condition) z, condition = self._trim(z, condition)
@ -480,6 +670,31 @@ class WaveFlow(nn.LayerList):
class ConditionalWaveFlow(nn.LayerList): class ConditionalWaveFlow(nn.LayerList):
"""ConditionalWaveFlow, a UpsampleNet with a WaveFlow model.
Parameters
----------
upsample_factors : List[int]
Upsample factors for the upsample net.
n_flows : int
Number of flows in the WaveFlow model.
n_layers : int
Number of ResidualBlocks in each Flow.
n_group : int
Number of timesteps to fold as a group.
channels : int
Feature size of each ResidualBlock.
n_mels : int
Feature size of mel spectrogram (mel bands).
kernel_size : Union[int, List[int]]
Kernel size of the convolution layer in each ResidualBlock.
"""
def __init__(self, def __init__(self,
upsample_factors: List[int], upsample_factors: List[int],
n_flows: int, n_flows: int,
@ -499,12 +714,44 @@ class ConditionalWaveFlow(nn.LayerList):
kernel_size=kernel_size) kernel_size=kernel_size)
def forward(self, audio, mel): def forward(self, audio, mel):
"""Compute the transformed random variable z (x to z) and the log of
the determinant of the jacobian of the transformation from x to z.
Parameters
----------
audio : Tensor [shape=(B, T)]
The audio.
mel : Tensor [shape=(B, C_mel, T_mel)]
The mel spectrogram.
Returns
-------
z : Tensor [shape=(B, T)]
The inversely transformed random variable z (x to z)
log_det_jacobian: Tensor [shape=(1,)]
the log of the determinant of the jacobian of the transformation
from x to z.
"""
condition = self.encoder(mel) condition = self.encoder(mel)
z, log_det_jacobian = self.decoder(audio, condition) z, log_det_jacobian = self.decoder(audio, condition)
return z, log_det_jacobian return z, log_det_jacobian
@paddle.no_grad() @paddle.no_grad()
def infer(self, mel): def infer(self, mel):
r"""Generate raw audio given mel spectrogram.
Parameters
----------
mel : Tensor [shape=(B, C_mel, T_mel)]
Mel spectrogram (in log-magnitude).
Returns
-------
Tensor : [shape=(B, T)]
The synthesized audio, where``T <= T_mel \* upsample_factors``.
"""
condition = self.encoder(mel, trim_conv_artifact=True) #(B, C, T) condition = self.encoder(mel, trim_conv_artifact=True) #(B, C, T)
batch_size, _, time_steps = condition.shape batch_size, _, time_steps = condition.shape
z = paddle.randn([batch_size, time_steps], dtype=mel.dtype) z = paddle.randn([batch_size, time_steps], dtype=mel.dtype)
@ -513,6 +760,18 @@ class ConditionalWaveFlow(nn.LayerList):
@paddle.no_grad() @paddle.no_grad()
def predict(self, mel): def predict(self, mel):
"""Generate raw audio given mel spectrogram.
Parameters
----------
mel : np.ndarray [shape=(C_mel, T_mel)]
Mel spectrogram of an utterance(in log-magnitude).
Returns
-------
np.ndarray [shape=(T,)]
The synthesized audio.
"""
mel = paddle.to_tensor(mel) mel = paddle.to_tensor(mel)
mel = paddle.unsqueeze(mel, 0) mel = paddle.unsqueeze(mel, 0)
audio = self.infer(mel) audio = self.infer(mel)
@ -521,6 +780,21 @@ class ConditionalWaveFlow(nn.LayerList):
@classmethod @classmethod
def from_pretrained(cls, config, checkpoint_path): def from_pretrained(cls, config, checkpoint_path):
"""Build a ConditionalWaveFlow model from a pretrained model.
Parameters
----------
config: yacs.config.CfgNode
model configs
checkpoint_path: Path or str
the path of pretrained model checkpoint, without extension name
Returns
-------
ConditionalWaveFlow
The model built from pretrained result.
"""
model = cls( model = cls(
upsample_factors=config.model.upsample_factors, upsample_factors=config.model.upsample_factors,
n_flows=config.model.n_flows, n_flows=config.model.n_flows,
@ -534,14 +808,37 @@ class ConditionalWaveFlow(nn.LayerList):
class WaveFlowLoss(nn.Layer): class WaveFlowLoss(nn.Layer):
"""Criterion of a WaveFlow model.
Parameters
----------
sigma : float
The standard deviation of the gaussian noise used in WaveFlow, by
default 1.0.
"""
def __init__(self, sigma=1.0): def __init__(self, sigma=1.0):
super(WaveFlowLoss, self).__init__() super(WaveFlowLoss, self).__init__()
self.sigma = sigma self.sigma = sigma
self.const = 0.5 * np.log(2 * np.pi) + np.log(self.sigma) self.const = 0.5 * np.log(2 * np.pi) + np.log(self.sigma)
def forward(self, model_output): def forward(self, z, log_det_jacobian):
z, log_det_jacobian = model_output """Compute the loss given the transformed random variable z and the
log_det_jacobian of transformation from x to z.
Parameters
----------
z : Tensor [shape=(B, T)]
The transformed random variable (x to z).
log_det_jacobian : Tensor [shape=(1,)]
The log of the determinant of the jacobian matrix of the
transformation from x to z.
Returns
-------
Tensor [shape=(1,)]
The loss.
"""
loss = paddle.sum(z * z) / (2 * self.sigma * self.sigma) - log_det_jacobian loss = paddle.sum(z * z) / (2 * self.sigma * self.sigma) - log_det_jacobian
loss = loss / np.prod(z.shape) loss = loss / np.prod(z.shape)
return loss + self.const return loss + self.const

View File

@ -28,6 +28,7 @@ from parakeet.modules.conv import Conv1dCell
from parakeet.modules.audio import quantize, dequantize, STFT from parakeet.modules.audio import quantize, dequantize, STFT
from parakeet.utils import checkpoint, layer_tools from parakeet.utils import checkpoint, layer_tools
__all__ = ["WaveNet", "ConditionalWaveNet"]
def crop(x, audio_start, audio_length): def crop(x, audio_start, audio_length):
"""Crop the upsampled condition to match audio_length. """Crop the upsampled condition to match audio_length.
@ -285,21 +286,35 @@ class ResidualBlock(nn.Layer):
class ResidualNet(nn.LayerList): class ResidualNet(nn.LayerList):
"""The residual network in wavenet.
It consists of ``n_stack`` stacks, each of which consists of ``n_loop``
ResidualBlocks.
Parameters
----------
n_stack : int
Number of stacks in the ``ResidualNet``.
n_loop : int
Number of ResidualBlocks in a stack.
residual_channels : int
Input feature size of each ``ResidualBlock``'s input.
condition_dim : int
Feature size of the condition.
filter_size : int
Kernel size of the internal ``Conv1dCell`` of each ``ResidualBlock``.
"""
def __init__(self, def __init__(self,
n_stack: int, n_stack: int,
n_loop: int, n_loop: int,
residual_channels: int, residual_channels: int,
condition_dim: int, condition_dim: int,
filter_size: int): filter_size: int):
"""The residual network in wavenet. It consists of `n_layer` stacks, each of which consists of `n_loop` ResidualBlocks.
Args:
n_stack (int): number of stacks in the `ResidualNet`.
n_loop (int): number of ResidualBlocks in a stack.
residual_channels (int): channels of each `ResidualBlock`'s input.
condition_dim (int): channels of the condition.
filter_size (int): filter size of the internal Conv1DCell of each `ResidualBlock`.
"""
super(ResidualNet, self).__init__() super(ResidualNet, self).__init__()
# double the dilation at each layer in a stack # double the dilation at each layer in a stack
dilations = [2**i for i in range(n_loop)] * n_stack dilations = [2**i for i in range(n_loop)] * n_stack
@ -308,13 +323,21 @@ class ResidualNet(nn.LayerList):
self.append(ResidualBlock(residual_channels, condition_dim, filter_size, dilation)) self.append(ResidualBlock(residual_channels, condition_dim, filter_size, dilation))
def forward(self, x, condition=None): def forward(self, x, condition=None):
""" """Forward pass of ``ResidualNet``.
Args:
x (Tensor): shape(B, C_res, T), dtype float32, the input. (B stands for batch_size, C_res stands for residual channels, T stands for time steps.) Parameters
condition (Tensor, optional): shape(B, C_cond, T), dtype float32, the condition, it has been upsampled in time steps, so it has the same time steps as the input does.(C_cond stands for the condition's channels) Defaults to None. ----------
x : Tensor [shape=(B, C, T)]
The input.
condition : Tensor, optional [shape=(B, C_cond, T)]
The condition, it has been upsampled in time steps, so it has the
same time steps as the input does. Defaults to None.
Returns: Returns
skip_connection (Tensor): shape(B, C_res, T), dtype float32, the output. --------
Tensor [shape=(B, C, T)]
The output.
""" """
for i, func in enumerate(self): for i, func in enumerate(self):
x, skip = func(x, condition) x, skip = func(x, condition)
@ -326,22 +349,32 @@ class ResidualNet(nn.LayerList):
return skip_connections return skip_connections
def start_sequence(self): def start_sequence(self):
"""Prepare the ResidualNet to generate a new sequence. This method should be called before starting calling `add_input` multiple times. """Prepare the ResidualNet to generate a new sequence. This method
should be called before starting calling `add_input` multiple times.
""" """
for block in self: for block in self:
block.start_sequence() block.start_sequence()
def add_input(self, x, condition=None): def add_input(self, x, condition=None):
"""Add a step input. This method works similarily with `forward` but in a `step-in-step-out` fashion. """Take a step input and return a step output.
This method works similarily with ``forward`` but in a
``step-in-step-out`` fashion.
Args: Parameters
x (Tensor): shape(B, C_res), dtype float32, input for a step. ----------
condition (Tensor, optional): shape(B, C_cond), dtype float32, condition for a step. Defaults to None. x : Tensor [shape=(B, C)]
Input for a step.
condition : Tensor, optional [shape=(B, C_cond)]
Condition for a step. Defaults to None.
Returns: Returns
skip_connection (Tensor): shape(B, C_res), dtype float32, the output for a step. ----------
Tensor [shape=(B, C)]
T he skip connection for a step. This output is accumulated with
that of other ResidualBlocks.
""" """
for i, func in enumerate(self): for i, func in enumerate(self):
x, skip = func.add_input(x, condition) x, skip = func.add_input(x, condition)
if i == 0: if i == 0:
@ -353,20 +386,49 @@ class ResidualNet(nn.LayerList):
class WaveNet(nn.Layer): class WaveNet(nn.Layer):
"""Wavenet that transform upsampled mel spectrogram into waveform.
Parameters
-----------
n_stack : int
``n_stack`` for the internal ``ResidualNet``.
n_loop : int
``n_loop`` for the internal ``ResidualNet``.
residual_channels : int
Feature size of the input.
output_dim : int
Feature size of the input.
condition_dim : int
Feature size of the condition (mel spectrogram bands).
filter_size : int
Kernel size of the internal ``ResidualNet``.
loss_type : str, optional ["mog" or "softmax"]
The output type and loss type of the model, by default "mog".
If "softmax", the model input is first quantized audio and the model
outputs a discret categorical distribution.
If "mog", the model input is audio in floating point format, and the
model outputs parameters for a mixture of gaussian distributions.
Namely, the weight, mean and log scale of each gaussian distribution.
Thus, the ``output_size`` should be a multiple of 3.
log_scale_min : float, optional
Minimum value of the log scale of gaussian distributions, by default
-9.0.
This is only used for computing loss when ``loss_type`` is "mog", If
the predicted log scale is less than -9.0, it is clipped at -9.0.
"""
def __init__(self, n_stack, n_loop, residual_channels, output_dim, def __init__(self, n_stack, n_loop, residual_channels, output_dim,
condition_dim, filter_size, loss_type, log_scale_min): condition_dim, filter_size, loss_type, log_scale_min):
"""Wavenet that transform upsampled mel spectrogram into waveform.
Args:
n_stack (int): n_stack for the internal ResidualNet.
n_loop (int): n_loop for the internal ResidualNet.
residual_channels (int): the channel of the input.
output_dim (int): the channel of the output distribution.
condition_dim (int): the channel of the condition.
filter_size (int): the filter size of the internal ResidualNet.
loss_type (str): loss type of the wavenet. Possible values are 'softmax' and 'mog'. If `loss_type` is 'softmax', the output is the logits of the catrgotical(multinomial) distribution, `output_dim` means the number of classes of the categorical distribution. If `loss_type` is mog(mixture of gaussians), the output is the parameters of a mixture of gaussians, which consists of weight(in the form of logit) of each gaussian distribution and its mean and log standard deviaton. So when `loss_type` is 'mog', `output_dim` should be perfectly divided by 3.
log_scale_min (int): the minimum value of log standard deviation of the output gaussian distributions. Note that this value is only used for computing loss if `loss_type` is 'mog', values less than `log_scale_min` is clipped when computing loss.
"""
super(WaveNet, self).__init__() super(WaveNet, self).__init__()
if loss_type not in ["softmax", "mog"]: if loss_type not in ["softmax", "mog"]:
raise ValueError("loss_type {} is not supported".format(loss_type)) raise ValueError("loss_type {} is not supported".format(loss_type))
@ -396,14 +458,19 @@ class WaveNet(nn.Layer):
self.log_scale_min = log_scale_min self.log_scale_min = log_scale_min
def forward(self, x, condition=None): def forward(self, x, condition=None):
"""compute the output distribution (represented by its parameters). """Forward pass of ``WaveNet``.
Args: Parameters
x (Tensor): shape(B, T), dtype float32, the input waveform. -----------
condition (Tensor, optional): shape(B, C_cond, T), dtype float32, the upsampled condition. Defaults to None. x : Tensor [shape=(B, T)]
The input waveform.
condition : Tensor, optional [shape=(B, C_cond, T)]
the upsampled condition. Defaults to None.
Returns: Returns
Tensor: shape(B, T, C_output), dtype float32, the parameter of the output distributions. -------
Tensor: [shape=(B, T, C_output)]
The parameters of the output distributions.
""" """
# Causal Conv # Causal Conv
@ -426,19 +493,28 @@ class WaveNet(nn.Layer):
return y return y
def start_sequence(self): def start_sequence(self):
"""Prepare the WaveNet to generate a new sequence. This method should be called before starting calling `add_input` multiple times. """Prepare the WaveNet to generate a new sequence. This method should
be called before starting calling ``add_input`` multiple times.
""" """
self.resnet.start_sequence() self.resnet.start_sequence()
def add_input(self, x, condition=None): def add_input(self, x, condition=None):
"""compute the output distribution (represented by its parameters) for a step. It works similarily with the `forward` method but in a `step-in-step-out` fashion. """Compute the output distribution (represented by its parameters) for
a step. It works similarily with the ``forward`` method but in a
``step-in-step-out`` fashion.
Args: Parameters
x (Tensor): shape(B,), dtype float32, a step of the input waveform. -----------
condition (Tensor, optional): shape(B, C_cond, ), dtype float32, a step of the upsampled condition. Defaults to None. x : Tensor [shape=(B,)]
A step of the input waveform.
condition : Tensor, optional [shape=(B, C_cond)]
A step of the upsampled condition. Defaults to None.
Returns: Returns
Tensor: shape(B, C_output), dtype float32, the parameter of the output distributions. --------
Tensor: [shape=(B, C_output)]
A steo of the parameters of the output distributions.
""" """
# Causal Conv # Causal Conv
if self.loss_type == "softmax": if self.loss_type == "softmax":
@ -458,14 +534,28 @@ class WaveNet(nn.Layer):
return y return y
def compute_softmax_loss(self, y, t): def compute_softmax_loss(self, y, t):
"""compute the loss where output distribution is a categorial distribution. """Compute the loss when output distributions are categorial
distributions.
Args: Parameters
y (Tensor): shape(B, T, C_output), dtype float32, the logits of the output distribution. ----------
t (Tensor): shape(B, T), dtype float32, the target audio. Note that the target's corresponding time index is one step ahead of the output distribution. And output distribution whose input contains padding is neglected in loss computation. y : Tensor [shape=(B, T, C_output)]
The logits of the output distributions.
t : Tensor [shape=(B, T)]
The target audio. The audio is first quantized then used as the
target.
Notes
-------
Output distributions whose input contains padding is neglected in
loss computation. So the first ``context_size`` steps does not
contribute to the loss.
Returns: Returns
Tensor: shape(1, ), dtype float32, the loss. --------
Tensor: [shape=(1,)]
The loss.
""" """
# context size is not taken into account # context size is not taken into account
y = y[:, self.context_size:, :] y = y[:, self.context_size:, :]
@ -479,13 +569,18 @@ class WaveNet(nn.Layer):
return reduced_loss return reduced_loss
def sample_from_softmax(self, y): def sample_from_softmax(self, y):
"""Sample from the output distribution where the output distribution is a categorical distriobution. """Sample from the output distribution when the output distributions
are categorical distriobutions.
Args: Parameters
y (Tensor): shape(B, T, C_output), the logits of the output distribution ----------
y : Tensor [shape=(B, T, C_output)]
The logits of the output distributions.
Returns: Returns
Tensor: shape(B, T), waveform sampled from the output distribution. --------
Tensor [shape=(B, T)]
Waveform sampled from the output distribution.
""" """
# dequantize # dequantize
batch_size, time_steps, output_dim, = y.shape batch_size, time_steps, output_dim, = y.shape
@ -497,14 +592,32 @@ class WaveNet(nn.Layer):
return samples return samples
def compute_mog_loss(self, y, t): def compute_mog_loss(self, y, t):
"""compute the loss where output distribution is a mixture of Gaussians. """Compute the loss where output distributions is a mixture of
Gaussians distributions.
Args: Parameters
y (Tensor): shape(B, T, C_output), dtype float32, the parameterd of the output distribution. It is the concatenation of 3 parts, the logits of every distribution, the mean of each distribution and the log standard deviation of each distribution. Each part's shape is (B, T, n_mixture), where `n_mixture` means the number of Gaussians in the mixture. -----------
t (Tensor): shape(B, T), dtype float32, the target audio. Note that the target's corresponding time index is one step ahead of the output distribution. And output distribution whose input contains padding is neglected in loss computation. y : Tensor [shape=(B, T, C_output)]
The parameterd of the output distribution. It is the concatenation
of 3 parts, the logits of every distribution, the mean of each
distribution and the log standard deviation of each distribution.
Each part's shape is (B, T, n_mixture), where ``n_mixture`` means
the number of Gaussians in the mixture.
t : Tensor [shape=(B, T)]
The target audio.
Notes
-------
Output distributions whose input contains padding is neglected in
loss computation. So the first ``context_size`` steps does not
contribute to the loss.
Returns: Returns
Tensor: shape(1, ), dtype float32, the loss. --------
Tensor: [shape=(1,)]
The loss.
""" """
n_mixture = self.output_dim // 3 n_mixture = self.output_dim // 3
@ -536,12 +649,23 @@ class WaveNet(nn.Layer):
return loss return loss
def sample_from_mog(self, y): def sample_from_mog(self, y):
"""Sample from the output distribution where the output distribution is a mixture of Gaussians. """Sample from the output distribution when the output distribution
Args: is a mixture of Gaussian distributions.
y (Tensor): shape(B, T, C_output), dtype float32, the parameterd of the output distribution. It is the concatenation of 3 parts, the logits of every distribution, the mean of each distribution and the log standard deviation of each distribution. Each part's shape is (B, T, n_mixture), where `n_mixture` means the number of Gaussians in the mixture.
Parameters
------------
y : Tensor [shape=(B, T, C_output)]
The parameterd of the output distribution. It is the concatenation
of 3 parts, the logits of every distribution, the mean of each
distribution and the log standard deviation of each distribution.
Each part's shape is (B, T, n_mixture), where ``n_mixture`` means
the number of Gaussians in the mixture.
Returns: Returns
Tensor: shape(B, T), waveform sampled from the output distribution. --------
Tensor: [shape=(B, T)]
Waveform sampled from the output distribution.
""" """
batch_size, time_steps, output_dim = y.shape batch_size, time_steps, output_dim = y.shape
n_mixture = output_dim // 3 n_mixture = output_dim // 3
@ -568,11 +692,16 @@ class WaveNet(nn.Layer):
def sample(self, y): def sample(self, y):
"""Sample from the output distribution. """Sample from the output distribution.
Args:
y (Tensor): shape(B, T, C_output), dtype float32, the parameterd of the output distribution. Parameters
----------
y : Tensor [shape=(B, T, C_output)]
The parameterd of the output distribution.
Returns: Returns
Tensor: shape(B, T), waveform sampled from the output distribution. --------
Tensor [shape=(B, T)]
Waveform sampled from the output distribution.
""" """
if self.loss_type == "softmax": if self.loss_type == "softmax":
return self.sample_from_softmax(y) return self.sample_from_softmax(y)
@ -580,14 +709,20 @@ class WaveNet(nn.Layer):
return self.sample_from_mog(y) return self.sample_from_mog(y)
def loss(self, y, t): def loss(self, y, t):
"""compute the loss where output distribution is a mixture of Gaussians. """Compute the loss given the output distribution and the target.
Args: Parameters
y (Tensor): shape(B, T, C_output), dtype float32, the parameterd of the output distribution. ----------
t (Tensor): shape(B, T), dtype float32, the target audio. Note that the target's corresponding time index is one step ahead of the output distribution. And output distribution whose input contains padding is neglected in loss computation. y : Tensor [shape=(B, T, C_output)]
The parameterd of the output distribution.
t : Tensor [shape=(B, T)]
The target audio.
Returns: Returns
Tensor: shape(1, ), dtype float32, the loss. ---------
Tensor: [shape=(1,)]
The loss.
""" """
if self.loss_type == "softmax": if self.loss_type == "softmax":
return self.compute_softmax_loss(y, t) return self.compute_softmax_loss(y, t)
@ -640,9 +775,11 @@ class ConditionalWaveNet(nn.Layer):
Thus, the ``output_size`` should be a multiple of 3. Thus, the ``output_size`` should be a multiple of 3.
log_scale_min : float, optional log_scale_min : float, optional
Minimum value of the log probability density, by default -9.0. Minimum value of the log scale of gaussian distributions, by default
-9.0.
This is only used for computing loss when ``loss_type`` is "mog", If the This is only used for computing loss when ``loss_type`` is "mog", If
the predicted log scale is less than -9.0, it is clipped at -9.0.
""" """
def __init__(self, def __init__(self,
upsample_factors: List[int], upsample_factors: List[int],