update docstring for waveflow
This commit is contained in:
parent
f2a35a17d4
commit
b6efb43990
|
@ -80,10 +80,6 @@ class Experiment(ExperimentBase):
|
||||||
z, log_det_jocobian = self.model(wav, mel)
|
z, log_det_jocobian = self.model(wav, mel)
|
||||||
return z, log_det_jocobian
|
return z, log_det_jocobian
|
||||||
|
|
||||||
def compute_losses(self, outputs):
|
|
||||||
loss = self.criterion(outputs)
|
|
||||||
return loss
|
|
||||||
|
|
||||||
def train_batch(self):
|
def train_batch(self):
|
||||||
start = time.time()
|
start = time.time()
|
||||||
batch = self.read_batch()
|
batch = self.read_batch()
|
||||||
|
@ -92,8 +88,8 @@ class Experiment(ExperimentBase):
|
||||||
self.model.train()
|
self.model.train()
|
||||||
self.optimizer.clear_grad()
|
self.optimizer.clear_grad()
|
||||||
mel, wav = batch
|
mel, wav = batch
|
||||||
outputs = self.compute_outputs(mel, wav)
|
z, log_det_jocobian = self.compute_outputs(mel, wav)
|
||||||
loss = self.compute_losses(outputs)
|
loss = self.criterion(z, log_det_jocobian)
|
||||||
loss.backward()
|
loss.backward()
|
||||||
self.optimizer.step()
|
self.optimizer.step()
|
||||||
iteration_time = time.time() - start
|
iteration_time = time.time() - start
|
||||||
|
@ -112,8 +108,8 @@ class Experiment(ExperimentBase):
|
||||||
valid_iterator = iter(self.valid_loader)
|
valid_iterator = iter(self.valid_loader)
|
||||||
valid_losses = []
|
valid_losses = []
|
||||||
mel, wav = next(valid_iterator)
|
mel, wav = next(valid_iterator)
|
||||||
outputs = self.compute_outputs(mel, wav)
|
z, log_det_jocobian = self.compute_outputs(mel, wav)
|
||||||
loss = self.compute_losses(outputs)
|
loss = self.criterion(z, log_det_jocobian)
|
||||||
valid_losses.append(float(loss))
|
valid_losses.append(float(loss))
|
||||||
valid_loss = np.mean(valid_losses)
|
valid_loss = np.mean(valid_losses)
|
||||||
self.visualizer.add_scalar("valid/loss", valid_loss, global_step=self.iteration)
|
self.visualizer.add_scalar("valid/loss", valid_loss, global_step=self.iteration)
|
||||||
|
|
|
@ -1,6 +1,6 @@
|
||||||
import math
|
import math
|
||||||
import numpy as np
|
import numpy as np
|
||||||
from typing import List, Union
|
from typing import List, Union, Tuple
|
||||||
import paddle
|
import paddle
|
||||||
from paddle import nn
|
from paddle import nn
|
||||||
from paddle.nn import functional as F
|
from paddle.nn import functional as F
|
||||||
|
@ -9,27 +9,56 @@ from paddle.nn import initializer as I
|
||||||
from parakeet.utils import checkpoint
|
from parakeet.utils import checkpoint
|
||||||
from parakeet.modules import geometry as geo
|
from parakeet.modules import geometry as geo
|
||||||
|
|
||||||
__all__ = ["UpsampleNet", "WaveFlow", "ConditionalWaveFlow", "WaveFlowLoss"]
|
__all__ = ["WaveFlow", "ConditionalWaveFlow", "WaveFlowLoss"]
|
||||||
|
|
||||||
def fold(x, n_group):
|
def fold(x, n_group):
|
||||||
"""Fold audio or spectrogram's temporal dimension in to groups.
|
r"""Fold audio or spectrogram's temporal dimension in to groups.
|
||||||
|
|
||||||
Args:
|
Parameters
|
||||||
x (Tensor): shape(*, time_steps), the input tensor
|
----------
|
||||||
n_group (int): the size of a group.
|
x : Tensor [shape=(\*, time_steps)]
|
||||||
|
The input tensor.
|
||||||
|
|
||||||
|
n_group : int
|
||||||
|
The size of a group.
|
||||||
|
|
||||||
Returns:
|
Returns
|
||||||
Tensor: shape(*, time_steps // n_group, group), folded tensor.
|
---------
|
||||||
|
Tensor : [shape=(\*, time_steps // n_group, group)]
|
||||||
|
Folded tensor.
|
||||||
"""
|
"""
|
||||||
*spatial_shape, time_steps = x.shape
|
*spatial_shape, time_steps = x.shape
|
||||||
new_shape = spatial_shape + [time_steps // n_group, n_group]
|
new_shape = spatial_shape + [time_steps // n_group, n_group]
|
||||||
return paddle.reshape(x, new_shape)
|
return paddle.reshape(x, new_shape)
|
||||||
|
|
||||||
class UpsampleNet(nn.LayerList):
|
class UpsampleNet(nn.LayerList):
|
||||||
"""
|
"""Layer to upsample mel spectrogram to the same temporal resolution with
|
||||||
Layer to upsample mel spectrogram to the same temporal resolution with
|
the corresponding waveform.
|
||||||
the corresponding waveform. It consists of several conv2dtranspose layers
|
|
||||||
which perform de convolution on mel and time dimension.
|
It consists of several conv2dtranspose layers which perform deconvolution
|
||||||
|
on mel and time dimension.
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
upscale_factors : List[int], optional
|
||||||
|
Time upsampling factors for each Conv2DTranspose Layer.
|
||||||
|
|
||||||
|
The ``UpsampleNet`` contains ``len(upscale_factor)`` Conv2DTranspose
|
||||||
|
Layers. Each upscale_factor is used as the ``stride`` for the
|
||||||
|
corresponding Conv2DTranspose. Defaults to [16, 16], thus the default
|
||||||
|
upsampling factor is 256.
|
||||||
|
|
||||||
|
Notes
|
||||||
|
------
|
||||||
|
``np.prod(upscale_factors)`` should equal the ``hop_length`` of the stft
|
||||||
|
transformation used to extract spectrogram features from audio.
|
||||||
|
|
||||||
|
For example, ``16 * 16 = 256``, then the spectrogram extracted with a stft
|
||||||
|
transformation whose ``hop_length`` equals 256 is suitable.
|
||||||
|
|
||||||
|
See Also
|
||||||
|
---------
|
||||||
|
``librosa.core.stft``
|
||||||
"""
|
"""
|
||||||
def __init__(self, upsample_factors):
|
def __init__(self, upsample_factors):
|
||||||
super(UpsampleNet, self).__init__()
|
super(UpsampleNet, self).__init__()
|
||||||
|
@ -49,17 +78,25 @@ class UpsampleNet(nn.LayerList):
|
||||||
self.upsample_factors = upsample_factors
|
self.upsample_factors = upsample_factors
|
||||||
|
|
||||||
def forward(self, x, trim_conv_artifact=False):
|
def forward(self, x, trim_conv_artifact=False):
|
||||||
"""
|
r"""Forward pass of the ``UpsampleNet``.
|
||||||
Args:
|
|
||||||
x (Tensor): shape(batch_size, input_channels, time_steps), the input
|
Parameters
|
||||||
spectrogram.
|
-----------
|
||||||
trim_conv_artifact (bool, optional): trim deconvolution artifact at
|
x : Tensor [shape=(batch_size, input_channels, time_steps)]
|
||||||
each layer. Defaults to False.
|
The input spectrogram.
|
||||||
|
|
||||||
|
trim_conv_artifact : bool, optional
|
||||||
|
Trim deconvolution artifact at each layer. Defaults to False.
|
||||||
|
|
||||||
Returns:
|
Returns
|
||||||
Tensor: shape(batch_size, input_channels, time_steps * upsample_factor).
|
--------
|
||||||
If trim_conv_artifact is True, the output time steps is less
|
Tensor: [shape=(batch_size, input_channels, time_steps \* upsample_factor)]
|
||||||
than time_steps * upsample_factors.
|
The upsampled spectrogram.
|
||||||
|
|
||||||
|
Notes
|
||||||
|
--------
|
||||||
|
If trim_conv_artifact is ``True``, the output time steps is less
|
||||||
|
than ``time_steps \* upsample_factors``.
|
||||||
"""
|
"""
|
||||||
x = paddle.unsqueeze(x, 1) #(B, C, T) -> (B, 1, C, T)
|
x = paddle.unsqueeze(x, 1) #(B, C, T) -> (B, 1, C, T)
|
||||||
for layer in self:
|
for layer in self:
|
||||||
|
@ -72,11 +109,27 @@ class UpsampleNet(nn.LayerList):
|
||||||
return x
|
return x
|
||||||
|
|
||||||
|
|
||||||
|
#TODO write doc
|
||||||
class ResidualBlock(nn.Layer):
|
class ResidualBlock(nn.Layer):
|
||||||
"""
|
"""ResidualBlock, the basic unit of ResidualNet used in WaveFlow.
|
||||||
ResidualBlock, the basic unit of ResidualNet. It has a conv2d layer, which
|
|
||||||
has causal padding in height dimension and same paddign in width dimension.
|
It has a conv2d layer, which has causal padding in height dimension and
|
||||||
It also has projection for the condition and output.
|
same padding in width dimension. It also has projection for the condition
|
||||||
|
and output.
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
channels : int
|
||||||
|
Feature size of the input.
|
||||||
|
|
||||||
|
cond_channels : int
|
||||||
|
Feature size of the condition.
|
||||||
|
|
||||||
|
kernel_size : Tuple[int]
|
||||||
|
Kernel size of the Convolution2d applied to the input.
|
||||||
|
|
||||||
|
dilations : int
|
||||||
|
Dilations of the Convolution2d applied to the input.
|
||||||
"""
|
"""
|
||||||
def __init__(self, channels, cond_channels, kernel_size, dilations):
|
def __init__(self, channels, cond_channels, kernel_size, dilations):
|
||||||
super(ResidualBlock, self).__init__()
|
super(ResidualBlock, self).__init__()
|
||||||
|
@ -113,14 +166,21 @@ class ResidualBlock(nn.Layer):
|
||||||
def forward(self, x, condition):
|
def forward(self, x, condition):
|
||||||
"""Compute output for a whole folded sequence.
|
"""Compute output for a whole folded sequence.
|
||||||
|
|
||||||
Args:
|
Parameters
|
||||||
x (Tensor): shape(batch_size, channel, height, width), the input.
|
----------
|
||||||
condition (Tensor): shape(batch_size, condition_channel, height, width),
|
x : Tensor [shape=(batch_size, channel, height, width)]
|
||||||
the local condition.
|
The input.
|
||||||
|
|
||||||
|
condition : Tensor [shape=(batch_size, condition_channel, height, width)]
|
||||||
|
The local condition.
|
||||||
|
|
||||||
Returns:
|
Returns
|
||||||
res (Tensor): shape(batch_size, channel, height, width), the residual output.
|
-------
|
||||||
res (Tensor): shape(batch_size, channel, height, width), the skip output.
|
res : Tensor [shape=(batch_size, channel, height, width)]
|
||||||
|
The residual output.
|
||||||
|
|
||||||
|
skip : Tensor [shape=(batch_size, channel, height, width)]
|
||||||
|
The skip output.
|
||||||
"""
|
"""
|
||||||
x_in = x
|
x_in = x
|
||||||
x = self.conv(x)
|
x = self.conv(x)
|
||||||
|
@ -131,10 +191,12 @@ class ResidualBlock(nn.Layer):
|
||||||
|
|
||||||
x = self.out_proj(x)
|
x = self.out_proj(x)
|
||||||
res, skip = paddle.chunk(x, 2, axis=1)
|
res, skip = paddle.chunk(x, 2, axis=1)
|
||||||
return x_in + res, skip
|
res = x_in + res
|
||||||
|
return res, skip
|
||||||
|
|
||||||
def start_sequence(self):
|
def start_sequence(self):
|
||||||
"""Prepare the layer for incremental computation of causal convolution. Reset the buffer for causal convolution.
|
"""Prepare the layer for incremental computation of causal
|
||||||
|
convolution. Reset the buffer for causal convolution.
|
||||||
|
|
||||||
Raises:
|
Raises:
|
||||||
ValueError: If not in evaluation mode.
|
ValueError: If not in evaluation mode.
|
||||||
|
@ -155,13 +217,21 @@ class ResidualBlock(nn.Layer):
|
||||||
def add_input(self, x_row, condition_row):
|
def add_input(self, x_row, condition_row):
|
||||||
"""Compute the output for a row and update the buffer.
|
"""Compute the output for a row and update the buffer.
|
||||||
|
|
||||||
Args:
|
Parameters
|
||||||
x_row (Tensor): shape(batch_size, channel, 1, width), a row of the input.
|
----------
|
||||||
condition_row (Tensor): shape(batch_size, condition_channel, 1, width), a row of the input.
|
x_row : Tensor [shape=(batch_size, channel, 1, width)]
|
||||||
|
A row of the input.
|
||||||
|
|
||||||
|
condition_row : Tensor [shape=(batch_size, condition_channel, 1, width)]
|
||||||
|
A row of the condition.
|
||||||
|
|
||||||
Returns:
|
Returns
|
||||||
res (Tensor): shape(batch_size, channel, 1, width), the residual output.
|
-------
|
||||||
res (Tensor): shape(batch_size, channel, 1, width), the skip output.
|
res : Tensor [shape=(batch_size, channel, 1, width)]
|
||||||
|
A row of the residual output.
|
||||||
|
|
||||||
|
skip : Tensor [shape=(batch_size, channel, 1, width)]
|
||||||
|
A row of the skip output.
|
||||||
"""
|
"""
|
||||||
x_row_in = x_row
|
x_row_in = x_row
|
||||||
if self._conv_buffer is None:
|
if self._conv_buffer is None:
|
||||||
|
@ -182,7 +252,8 @@ class ResidualBlock(nn.Layer):
|
||||||
|
|
||||||
x_row = self.out_proj(x_row)
|
x_row = self.out_proj(x_row)
|
||||||
res, skip = paddle.chunk(x_row, 2, axis=1)
|
res, skip = paddle.chunk(x_row, 2, axis=1)
|
||||||
return x_row_in + res, skip
|
res = x_row_in + res
|
||||||
|
return res, skip
|
||||||
|
|
||||||
def _init_buffer(self, input):
|
def _init_buffer(self, input):
|
||||||
batch_size, channels, _, width = input.shape
|
batch_size, channels, _, width = input.shape
|
||||||
|
@ -195,11 +266,36 @@ class ResidualBlock(nn.Layer):
|
||||||
|
|
||||||
|
|
||||||
class ResidualNet(nn.LayerList):
|
class ResidualNet(nn.LayerList):
|
||||||
|
"""A stack of several ResidualBlocks. It merges condition at each layer.
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
n_layer : int
|
||||||
|
Number of ResidualBlocks in the ResidualNet.
|
||||||
|
|
||||||
|
residual_channels : int
|
||||||
|
Feature size of each ResidualBlocks.
|
||||||
|
|
||||||
|
condition_channels : int
|
||||||
|
Feature size of the condition.
|
||||||
|
|
||||||
|
kernel_size : Tuple[int]
|
||||||
|
Kernel size of each ResidualBlock.
|
||||||
|
|
||||||
|
dilations_h : List[int]
|
||||||
|
Dilation in height dimension of every ResidualBlock.
|
||||||
|
|
||||||
|
Raises
|
||||||
|
------
|
||||||
|
ValueError
|
||||||
|
If the length of dilations_h does not equal n_layer.
|
||||||
"""
|
"""
|
||||||
A stack of several ResidualBlocks. It merges condition at each layer. All
|
def __init__(self,
|
||||||
skip outputs are collected.
|
n_layer: int,
|
||||||
"""
|
residual_channels: int,
|
||||||
def __init__(self, n_layer, residual_channels, condition_channels, kernel_size, dilations_h):
|
condition_channels: int,
|
||||||
|
kernel_size: Tuple[int],
|
||||||
|
dilations_h: List[int]):
|
||||||
if len(dilations_h) != n_layer:
|
if len(dilations_h) != n_layer:
|
||||||
raise ValueError("number of dilations_h should equals num of layers")
|
raise ValueError("number of dilations_h should equals num of layers")
|
||||||
super(ResidualNet, self).__init__()
|
super(ResidualNet, self).__init__()
|
||||||
|
@ -211,14 +307,18 @@ class ResidualNet(nn.LayerList):
|
||||||
def forward(self, x, condition):
|
def forward(self, x, condition):
|
||||||
"""Comput the output of given the input and the condition.
|
"""Compute the output given the input and the condition.
|
||||||
|
|
||||||
Args:
|
Parameters
|
||||||
x (Tensor): shape(batch_size, channel, height, width), the input.
|
-----------
|
||||||
condition (Tensor): shape(batch_size, condition_channel, height, width),
|
x : Tensor [shape=(batch_size, channel, height, width)]
|
||||||
the local condition.
|
The input.
|
||||||
|
|
||||||
|
condition : Tensor [shape=(batch_size, condition_channel, height, width)]
|
||||||
|
The local condition.
|
||||||
|
|
||||||
Returns:
|
Returns
|
||||||
Tensor: shape(batch_size, channel, height, width), the output, which
|
--------
|
||||||
is an aggregation of all the skip outputs.
|
Tensor : [shape=(batch_size, channel, height, width)]
|
||||||
|
The output, which is an aggregation of all the skip outputs.
|
||||||
"""
|
"""
|
||||||
skip_connections = []
|
skip_connections = []
|
||||||
for layer in self:
|
for layer in self:
|
||||||
|
@ -228,20 +328,29 @@ class ResidualNet(nn.LayerList):
|
||||||
return out
|
return out
|
||||||
|
|
||||||
def start_sequence(self):
|
def start_sequence(self):
|
||||||
"""Prepare the layer for incremental computation."""
|
"""Prepare the layer for incremental computation.
|
||||||
|
"""
|
||||||
for layer in self:
|
for layer in self:
|
||||||
layer.start_sequence()
|
layer.start_sequence()
|
||||||
|
|
||||||
def add_input(self, x_row, condition_row):
|
def add_input(self, x_row, condition_row):
|
||||||
"""Compute the output for a row and update the buffer.
|
"""Compute the output for a row and update the buffers.
|
||||||
|
|
||||||
Args:
|
Parameters
|
||||||
x_row (Tensor): shape(batch_size, channel, 1, width), a row of the input.
|
----------
|
||||||
condition_row (Tensor): shape(batch_size, condition_channel, 1, width), a row of the input.
|
x_row : Tensor [shape=(batch_size, channel, 1, width)]
|
||||||
|
A row of the input.
|
||||||
|
|
||||||
|
condition_row : Tensor [shape=(batch_size, condition_channel, 1, width)]
|
||||||
|
A row of the condition.
|
||||||
|
|
||||||
Returns:
|
Returns
|
||||||
Tensor: shape(batch_size, channel, 1, width), the output, which is
|
-------
|
||||||
an aggregation of all the skip outputs.
|
res : Tensor [shape=(batch_size, channel, 1, width)]
|
||||||
|
A row of the residual output.
|
||||||
|
|
||||||
|
res : Tensor [shape=(batch_size, channel, 1, width)]
|
||||||
|
A row of the skip output.
|
||||||
"""
|
"""
|
||||||
skip_connections = []
|
skip_connections = []
|
||||||
for layer in self:
|
for layer in self:
|
||||||
|
@ -252,12 +361,29 @@ class ResidualNet(nn.LayerList):
|
||||||
|
|
||||||
|
|
||||||
class Flow(nn.Layer):
|
class Flow(nn.Layer):
|
||||||
"""
|
"""A bijection (Reversible layer) that transforms a density of latent
|
||||||
A bijection (Reversable layer) that transform a density of latent variables
|
variables p(Z) into a complex data distribution p(X).
|
||||||
p(Z) into a complex data distribution p(X).
|
|
||||||
|
|
||||||
It's a auto regressive flow. The `forward` method implements the probability
|
It's an auto regressive flow. The `forward` method implements the
|
||||||
density estimation. The `inverse` method implements the sampling.
|
probability density estimation. The `inverse` method implements the
|
||||||
|
sampling.
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
n_layers : int
|
||||||
|
Number of ResidualBlocks in the Flow.
|
||||||
|
|
||||||
|
channels : int
|
||||||
|
Feature size of the ResidualBlocks.
|
||||||
|
|
||||||
|
mel_bands : int
|
||||||
|
Feature size of the mel spectrogram (mel bands).
|
||||||
|
|
||||||
|
kernel_size : Tuple[int]
|
||||||
|
Kernel size of each ResidualBlock in the Flow.
|
||||||
|
|
||||||
|
n_group : int
|
||||||
|
Number of timesteps to be folded into a group.
|
||||||
"""
|
"""
|
||||||
dilations_dict = {
|
dilations_dict = {
|
||||||
8: [1, 1, 1, 1, 1, 1, 1, 1],
|
8: [1, 1, 1, 1, 1, 1, 1, 1],
|
||||||
|
@ -301,18 +427,29 @@ class Flow(nn.Layer):
|
||||||
return z_out
|
return z_out
|
||||||
|
|
||||||
def forward(self, x, condition):
|
def forward(self, x, condition):
|
||||||
"""Probability density estimation. It is done by inversely transform a sample
|
"""Probability density estimation. It is done by inversely transforming
|
||||||
from p(X) back into a sample from p(Z).
|
a sample from p(X) into a sample from p(Z).
|
||||||
|
|
||||||
Args:
|
Parameters
|
||||||
x (Tensor): shape(batch, 1, height, width), a input sample of the distribution p(X).
|
-----------
|
||||||
condition (Tensor): shape(batch, condition_channel, height, width), the local condition.
|
x : Tensor [shape=(batch, 1, height, width)]
|
||||||
|
An input sample of the distribution p(X).
|
||||||
|
|
||||||
|
condition : Tensor [shape=(batch, condition_channel, height, width)]
|
||||||
|
The local condition.
|
||||||
|
|
||||||
Returns:
|
Returns
|
||||||
(z, (logs, b))
|
--------
|
||||||
z (Tensor): shape(batch, 1, height, width), the transformed sample.
|
z (Tensor): shape(batch, 1, height, width), the transformed sample.
|
||||||
logs (Tensor): shape(batch, 1, height - 1, width), the log scale of the inverse transformation.
|
|
||||||
b (Tensor): shape(batch, 1, height - 1, width), the shift of the inverse transformation.
|
Tuple[Tensor, Tensor]
|
||||||
|
The parameter of the transformation.
|
||||||
|
|
||||||
|
logs (Tensor): shape(batch, 1, height - 1, width), the log scale
|
||||||
|
of the transformation from x to z.
|
||||||
|
|
||||||
|
b (Tensor): shape(batch, 1, height - 1, width), the shift of the
|
||||||
|
transformation from x to z.
|
||||||
"""
|
"""
|
||||||
# (B, C, H-1, W)
|
# (B, C, H-1, W)
|
||||||
logs, b = self._predict_parameters(
|
logs, b = self._predict_parameters(
|
||||||
|
@ -340,18 +477,30 @@ class Flow(nn.Layer):
|
||||||
self.resnet.start_sequence()
|
self.resnet.start_sequence()
|
||||||
|
|
||||||
def inverse(self, z, condition):
|
def inverse(self, z, condition):
|
||||||
"""Sampling from the the distrition p(X). It is done by sample form p(Z)
|
"""Sampling from the distribution p(X). It is done by sampling from
|
||||||
and transform the sample. It is a auto regressive transformation.
|
p(Z) and transforming the sample. It is an auto regressive transformation.
|
||||||
|
|
||||||
Args:
|
Parameters
|
||||||
z (Tensor): shape(batch, 1, height, width), a input sample of the distribution p(Z).
|
-----------
|
||||||
condition (Tensor): shape(batch, condition_channel, height, width), the local condition.
|
z : Tensor [shape=(batch, 1, height, width)]
|
||||||
|
A sample of the distribution p(Z).
|
||||||
|
|
||||||
|
condition : Tensor [shape=(batch, condition_channel, height, width)]
|
||||||
|
The local condition.
|
||||||
|
|
||||||
Returns:
|
Returns
|
||||||
(x, (logs, b))
|
---------
|
||||||
x (Tensor): shape(batch, 1, height, width), the transformed sample.
|
x : Tensor [shape=(batch, 1, height, width)]
|
||||||
logs (Tensor): shape(batch, 1, height - 1, width), the log scale of the inverse transformation.
|
The transformed sample.
|
||||||
b (Tensor): shape(batch, 1, height - 1, width), the shift of the inverse transformation.
|
|
||||||
|
Tuple[Tensor, Tensor]
|
||||||
|
The parameter of the transformation.
|
||||||
|
|
||||||
|
logs (Tensor): shape(batch, 1, height - 1, width), the log scale
|
||||||
|
of the transformation from x to z.
|
||||||
|
|
||||||
|
b (Tensor): shape(batch, 1, height - 1, width), the shift of the
|
||||||
|
transformation from x to z.
|
||||||
"""
|
"""
|
||||||
z_0 = z[:, :, :1, :]
|
z_0 = z[:, :, :1, :]
|
||||||
x = []
|
x = []
|
||||||
|
@ -377,7 +526,29 @@ class Flow(nn.Layer):
|
||||||
|
|
||||||
|
|
||||||
class WaveFlow(nn.LayerList):
|
class WaveFlow(nn.LayerList):
|
||||||
"""An Deep Reversible layer that is composed of a stack of auto regressive flows.s"""
|
"""A deep reversible layer that is composed of several auto regressive
|
||||||
|
flows.
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
-----------
|
||||||
|
n_flows : int
|
||||||
|
Number of flows in the WaveFlow model.
|
||||||
|
|
||||||
|
n_layers : int
|
||||||
|
Number of ResidualBlocks in each Flow.
|
||||||
|
|
||||||
|
n_group : int
|
||||||
|
Number of timesteps to fold as a group.
|
||||||
|
|
||||||
|
channels : int
|
||||||
|
Feature size of each ResidualBlock.
|
||||||
|
|
||||||
|
mel_bands : int
|
||||||
|
Feature size of mel spectrogram (mel bands).
|
||||||
|
|
||||||
|
kernel_size : Union[int, List[int]]
|
||||||
|
Kernel size of the convolution layer in each ResidualBlock.
|
||||||
|
"""
|
||||||
def __init__(self, n_flows, n_layers, n_group, channels, mel_bands, kernel_size):
|
def __init__(self, n_flows, n_layers, n_group, channels, mel_bands, kernel_size):
|
||||||
if n_group % 2 or n_flows % 2:
|
if n_group % 2 or n_flows % 2:
|
||||||
raise ValueError("number of flows and number of group must be even "
|
raise ValueError("number of flows and number of group must be even "
|
||||||
|
@ -416,15 +587,25 @@ class WaveFlow(nn.LayerList):
|
||||||
return x, condition
|
return x, condition
|
||||||
|
|
||||||
def forward(self, x, condition):
|
def forward(self, x, condition):
|
||||||
"""Probability density estimation.
|
"""Probability density estimation of random variable x given the
|
||||||
|
condition.
|
||||||
|
|
||||||
Args:
|
Parameters
|
||||||
x (Tensor): shape(batch_size, time_steps), the audio.
|
-----------
|
||||||
condition (Tensor): shape(batch_size, condition channel, time_steps), the local condition.
|
x : Tensor [shape=(batch_size, time_steps)]
|
||||||
|
The audio.
|
||||||
|
|
||||||
|
condition : Tensor [shape=(batch_size, condition channel, time_steps)]
|
||||||
|
The local condition (mel spectrogram here).
|
||||||
|
|
||||||
Returns:
|
Returns
|
||||||
z: (Tensor): shape(batch_size, time_steps), the transformed sample.
|
--------
|
||||||
log_det_jacobian: (Tensor), shape(1,), the log determinant of the jacobian of (dz/dx).
|
z : Tensor [shape=(batch_size, time_steps)]
|
||||||
|
The transformed random variable.
|
||||||
|
|
||||||
|
log_det_jacobian: Tensor [shape=(1,)]
|
||||||
|
The log determinant of the jacobian of the transformation from x
|
||||||
|
to z.
|
||||||
"""
|
"""
|
||||||
# x: (B, T)
|
# x: (B, T)
|
||||||
# condition: (B, C, T) upsampled condition
|
# condition: (B, C, T) upsampled condition
|
||||||
|
@ -451,15 +632,24 @@ class WaveFlow(nn.LayerList):
|
||||||
return z, log_det_jacobian
|
return z, log_det_jacobian
|
||||||
|
|
||||||
def inverse(self, z, condition):
|
def inverse(self, z, condition):
|
||||||
"""Sampling from the the distrition p(X). It is done by sample form p(Z)
|
"""Sampling from the distribution p(X).
|
||||||
and transform the sample. It is a auto regressive transformation.
|
|
||||||
|
It is done by sampling a ``z`` from p(Z) and transforming it into ``x``.
|
||||||
|
Each Flow transforms :math:`z_{i-1}` to :math:`z_{i}` in an
|
||||||
|
autoregressive manner.
|
||||||
|
|
||||||
Args:
|
Parameters
|
||||||
z (Tensor): shape(batch, 1, time_steps), a input sample of the distribution p(Z).
|
----------
|
||||||
condition (Tensor): shape(batch, condition_channel, time_steps), the local condition.
|
z : Tensor [shape=(batch, 1, time_steps)]
|
||||||
|
A sample of the distribution p(Z).
|
||||||
|
|
||||||
|
condition : Tensor [shape=(batch, condition_channel, time_steps)]
|
||||||
|
The local condition.
|
||||||
|
|
||||||
Returns:
|
Returns
|
||||||
x: (Tensor): shape(batch_size, time_steps), the transformed sample.
|
--------
|
||||||
|
x : Tensor [shape=(batch_size, time_steps)]
|
||||||
|
The transformed sample (audio here).
|
||||||
"""
|
"""
|
||||||
|
|
||||||
z, condition = self._trim(z, condition)
|
z, condition = self._trim(z, condition)
|
||||||
|
@ -480,6 +670,31 @@ class WaveFlow(nn.LayerList):
|
||||||
|
|
||||||
|
|
||||||
class ConditionalWaveFlow(nn.LayerList):
|
class ConditionalWaveFlow(nn.LayerList):
|
||||||
|
"""ConditionalWaveFlow, a UpsampleNet with a WaveFlow model.
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
upsample_factors : List[int]
|
||||||
|
Upsample factors for the upsample net.
|
||||||
|
|
||||||
|
n_flows : int
|
||||||
|
Number of flows in the WaveFlow model.
|
||||||
|
|
||||||
|
n_layers : int
|
||||||
|
Number of ResidualBlocks in each Flow.
|
||||||
|
|
||||||
|
n_group : int
|
||||||
|
Number of timesteps to fold as a group.
|
||||||
|
|
||||||
|
channels : int
|
||||||
|
Feature size of each ResidualBlock.
|
||||||
|
|
||||||
|
n_mels : int
|
||||||
|
Feature size of mel spectrogram (mel bands).
|
||||||
|
|
||||||
|
kernel_size : Union[int, List[int]]
|
||||||
|
Kernel size of the convolution layer in each ResidualBlock.
|
||||||
|
"""
|
||||||
def __init__(self,
|
def __init__(self,
|
||||||
upsample_factors: List[int],
|
upsample_factors: List[int],
|
||||||
n_flows: int,
|
n_flows: int,
|
||||||
|
@ -499,12 +714,44 @@ class ConditionalWaveFlow(nn.LayerList):
|
||||||
kernel_size=kernel_size)
|
kernel_size=kernel_size)
|
||||||
|
|
||||||
def forward(self, audio, mel):
|
def forward(self, audio, mel):
|
||||||
|
"""Compute the transformed random variable z (x to z) and the log of
|
||||||
|
the determinant of the jacobian of the transformation from x to z.
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
audio : Tensor [shape=(B, T)]
|
||||||
|
The audio.
|
||||||
|
|
||||||
|
mel : Tensor [shape=(B, C_mel, T_mel)]
|
||||||
|
The mel spectrogram.
|
||||||
|
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
z : Tensor [shape=(B, T)]
|
||||||
|
The inversely transformed random variable z (x to z)
|
||||||
|
|
||||||
|
log_det_jacobian: Tensor [shape=(1,)]
|
||||||
|
the log of the determinant of the jacobian of the transformation
|
||||||
|
from x to z.
|
||||||
|
"""
|
||||||
condition = self.encoder(mel)
|
condition = self.encoder(mel)
|
||||||
z, log_det_jacobian = self.decoder(audio, condition)
|
z, log_det_jacobian = self.decoder(audio, condition)
|
||||||
return z, log_det_jacobian
|
return z, log_det_jacobian
|
||||||
|
|
||||||
@paddle.no_grad()
|
@paddle.no_grad()
|
||||||
def infer(self, mel):
|
def infer(self, mel):
|
||||||
|
r"""Generate raw audio given mel spectrogram.
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
mel : Tensor [shape=(B, C_mel, T_mel)]
|
||||||
|
Mel spectrogram (in log-magnitude).
|
||||||
|
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
Tensor : [shape=(B, T)]
|
||||||
|
The synthesized audio, where ``T <= T_mel \* upsample_factors``.
|
||||||
|
"""
|
||||||
condition = self.encoder(mel, trim_conv_artifact=True) #(B, C, T)
|
condition = self.encoder(mel, trim_conv_artifact=True) #(B, C, T)
|
||||||
batch_size, _, time_steps = condition.shape
|
batch_size, _, time_steps = condition.shape
|
||||||
z = paddle.randn([batch_size, time_steps], dtype=mel.dtype)
|
z = paddle.randn([batch_size, time_steps], dtype=mel.dtype)
|
||||||
|
@ -513,6 +760,18 @@ class ConditionalWaveFlow(nn.LayerList):
|
||||||
|
|
||||||
@paddle.no_grad()
|
@paddle.no_grad()
|
||||||
def predict(self, mel):
|
def predict(self, mel):
|
||||||
|
"""Generate raw audio given mel spectrogram.
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
mel : np.ndarray [shape=(C_mel, T_mel)]
|
||||||
|
Mel spectrogram of an utterance (in log-magnitude).
|
||||||
|
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
np.ndarray [shape=(T,)]
|
||||||
|
The synthesized audio.
|
||||||
|
"""
|
||||||
mel = paddle.to_tensor(mel)
|
mel = paddle.to_tensor(mel)
|
||||||
mel = paddle.unsqueeze(mel, 0)
|
mel = paddle.unsqueeze(mel, 0)
|
||||||
audio = self.infer(mel)
|
audio = self.infer(mel)
|
||||||
|
@ -521,6 +780,21 @@ class ConditionalWaveFlow(nn.LayerList):
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def from_pretrained(cls, config, checkpoint_path):
|
def from_pretrained(cls, config, checkpoint_path):
|
||||||
|
"""Build a ConditionalWaveFlow model from a pretrained model.
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
config: yacs.config.CfgNode
|
||||||
|
model configs
|
||||||
|
|
||||||
|
checkpoint_path: Path or str
|
||||||
|
the path of pretrained model checkpoint, without extension name
|
||||||
|
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
ConditionalWaveFlow
|
||||||
|
The model built from pretrained result.
|
||||||
|
"""
|
||||||
model = cls(
|
model = cls(
|
||||||
upsample_factors=config.model.upsample_factors,
|
upsample_factors=config.model.upsample_factors,
|
||||||
n_flows=config.model.n_flows,
|
n_flows=config.model.n_flows,
|
||||||
|
@ -534,14 +808,37 @@ class ConditionalWaveFlow(nn.LayerList):
|
||||||
|
|
||||||
|
|
||||||
class WaveFlowLoss(nn.Layer):
|
class WaveFlowLoss(nn.Layer):
|
||||||
|
"""Criterion of a WaveFlow model.
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
sigma : float
|
||||||
|
The standard deviation of the gaussian noise used in WaveFlow, by
|
||||||
|
default 1.0.
|
||||||
|
"""
|
||||||
def __init__(self, sigma=1.0):
|
def __init__(self, sigma=1.0):
|
||||||
super(WaveFlowLoss, self).__init__()
|
super(WaveFlowLoss, self).__init__()
|
||||||
self.sigma = sigma
|
self.sigma = sigma
|
||||||
self.const = 0.5 * np.log(2 * np.pi) + np.log(self.sigma)
|
self.const = 0.5 * np.log(2 * np.pi) + np.log(self.sigma)
|
||||||
|
|
||||||
def forward(self, model_output):
|
def forward(self, z, log_det_jacobian):
|
||||||
z, log_det_jacobian = model_output
|
"""Compute the loss given the transformed random variable z and the
|
||||||
|
log_det_jacobian of transformation from x to z.
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
z : Tensor [shape=(B, T)]
|
||||||
|
The transformed random variable (x to z).
|
||||||
|
|
||||||
|
log_det_jacobian : Tensor [shape=(1,)]
|
||||||
|
The log of the determinant of the jacobian matrix of the
|
||||||
|
transformation from x to z.
|
||||||
|
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
Tensor [shape=(1,)]
|
||||||
|
The loss.
|
||||||
|
"""
|
||||||
loss = paddle.sum(z * z) / (2 * self.sigma * self.sigma) - log_det_jacobian
|
loss = paddle.sum(z * z) / (2 * self.sigma * self.sigma) - log_det_jacobian
|
||||||
loss = loss / np.prod(z.shape)
|
loss = loss / np.prod(z.shape)
|
||||||
return loss + self.const
|
return loss + self.const
|
||||||
|
|
|
@ -28,6 +28,7 @@ from parakeet.modules.conv import Conv1dCell
|
||||||
from parakeet.modules.audio import quantize, dequantize, STFT
|
from parakeet.modules.audio import quantize, dequantize, STFT
|
||||||
from parakeet.utils import checkpoint, layer_tools
|
from parakeet.utils import checkpoint, layer_tools
|
||||||
|
|
||||||
|
__all__ = ["WaveNet", "ConditionalWaveNet"]
|
||||||
|
|
||||||
def crop(x, audio_start, audio_length):
|
def crop(x, audio_start, audio_length):
|
||||||
"""Crop the upsampled condition to match audio_length.
|
"""Crop the upsampled condition to match audio_length.
|
||||||
|
@ -285,21 +286,35 @@ class ResidualBlock(nn.Layer):
|
||||||
|
|
||||||
|
|
||||||
class ResidualNet(nn.LayerList):
|
class ResidualNet(nn.LayerList):
|
||||||
|
"""The residual network in wavenet.
|
||||||
|
|
||||||
|
It consists of ``n_stack`` stacks, each of which consists of ``n_loop``
|
||||||
|
ResidualBlocks.
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
n_stack : int
|
||||||
|
Number of stacks in the ``ResidualNet``.
|
||||||
|
|
||||||
|
n_loop : int
|
||||||
|
Number of ResidualBlocks in a stack.
|
||||||
|
|
||||||
|
residual_channels : int
|
||||||
|
Input feature size of each ``ResidualBlock``'s input.
|
||||||
|
|
||||||
|
condition_dim : int
|
||||||
|
Feature size of the condition.
|
||||||
|
|
||||||
|
filter_size : int
|
||||||
|
Kernel size of the internal ``Conv1dCell`` of each ``ResidualBlock``.
|
||||||
|
|
||||||
|
"""
|
||||||
def __init__(self,
|
def __init__(self,
|
||||||
n_stack: int,
|
n_stack: int,
|
||||||
n_loop: int,
|
n_loop: int,
|
||||||
residual_channels: int,
|
residual_channels: int,
|
||||||
condition_dim: int,
|
condition_dim: int,
|
||||||
filter_size: int):
|
filter_size: int):
|
||||||
"""The residual network in wavenet. It consists of `n_layer` stacks, each of which consists of `n_loop` ResidualBlocks.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
n_stack (int): number of stacks in the `ResidualNet`.
|
|
||||||
n_loop (int): number of ResidualBlocks in a stack.
|
|
||||||
residual_channels (int): channels of each `ResidualBlock`'s input.
|
|
||||||
condition_dim (int): channels of the condition.
|
|
||||||
filter_size (int): filter size of the internal Conv1DCell of each `ResidualBlock`.
|
|
||||||
"""
|
|
||||||
super(ResidualNet, self).__init__()
|
super(ResidualNet, self).__init__()
|
||||||
# double the dilation at each layer in a stack
|
# double the dilation at each layer in a stack
|
||||||
dilations = [2**i for i in range(n_loop)] * n_stack
|
dilations = [2**i for i in range(n_loop)] * n_stack
|
||||||
|
@ -308,13 +323,21 @@ class ResidualNet(nn.LayerList):
|
||||||
self.append(ResidualBlock(residual_channels, condition_dim, filter_size, dilation))
|
self.append(ResidualBlock(residual_channels, condition_dim, filter_size, dilation))
|
||||||
|
|
||||||
def forward(self, x, condition=None):
|
def forward(self, x, condition=None):
|
||||||
"""
|
"""Forward pass of ``ResidualNet``.
|
||||||
Args:
|
|
||||||
x (Tensor): shape(B, C_res, T), dtype float32, the input. (B stands for batch_size, C_res stands for residual channels, T stands for time steps.)
|
Parameters
|
||||||
condition (Tensor, optional): shape(B, C_cond, T), dtype float32, the condition, it has been upsampled in time steps, so it has the same time steps as the input does.(C_cond stands for the condition's channels) Defaults to None.
|
----------
|
||||||
|
x : Tensor [shape=(B, C, T)]
|
||||||
|
The input.
|
||||||
|
|
||||||
|
condition : Tensor, optional [shape=(B, C_cond, T)]
|
||||||
|
The condition, it has been upsampled in time steps, so it has the
|
||||||
|
same time steps as the input does. Defaults to None.
|
||||||
|
|
||||||
Returns:
|
Returns
|
||||||
skip_connection (Tensor): shape(B, C_res, T), dtype float32, the output.
|
--------
|
||||||
|
Tensor [shape=(B, C, T)]
|
||||||
|
The output.
|
||||||
"""
|
"""
|
||||||
for i, func in enumerate(self):
|
for i, func in enumerate(self):
|
||||||
x, skip = func(x, condition)
|
x, skip = func(x, condition)
|
||||||
|
@ -326,22 +349,32 @@ class ResidualNet(nn.LayerList):
|
||||||
return skip_connections
|
return skip_connections
|
||||||
|
|
||||||
def start_sequence(self):
|
def start_sequence(self):
|
||||||
"""Prepare the ResidualNet to generate a new sequence. This method should be called before starting calling `add_input` multiple times.
|
"""Prepare the ResidualNet to generate a new sequence. This method
|
||||||
|
should be called before starting calling `add_input` multiple times.
|
||||||
"""
|
"""
|
||||||
for block in self:
|
for block in self:
|
||||||
block.start_sequence()
|
block.start_sequence()
|
||||||
|
|
||||||
def add_input(self, x, condition=None):
|
def add_input(self, x, condition=None):
|
||||||
"""Add a step input. This method works similarily with `forward` but in a `step-in-step-out` fashion.
|
"""Take a step input and return a step output.
|
||||||
|
|
||||||
|
This method works similarly to ``forward`` but in a
|
||||||
|
``step-in-step-out`` fashion.
|
||||||
|
|
||||||
Args:
|
Parameters
|
||||||
x (Tensor): shape(B, C_res), dtype float32, input for a step.
|
----------
|
||||||
condition (Tensor, optional): shape(B, C_cond), dtype float32, condition for a step. Defaults to None.
|
x : Tensor [shape=(B, C)]
|
||||||
|
Input for a step.
|
||||||
|
|
||||||
|
condition : Tensor, optional [shape=(B, C_cond)]
|
||||||
|
Condition for a step. Defaults to None.
|
||||||
|
|
||||||
Returns:
|
Returns
|
||||||
skip_connection (Tensor): shape(B, C_res), dtype float32, the output for a step.
|
----------
|
||||||
|
Tensor [shape=(B, C)]
|
||||||
|
The skip connection for a step. This output is accumulated with
|
||||||
|
that of other ResidualBlocks.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
for i, func in enumerate(self):
|
for i, func in enumerate(self):
|
||||||
x, skip = func.add_input(x, condition)
|
x, skip = func.add_input(x, condition)
|
||||||
if i == 0:
|
if i == 0:
|
||||||
|
@ -353,20 +386,49 @@ class ResidualNet(nn.LayerList):
|
||||||
|
|
||||||
|
|
||||||
class WaveNet(nn.Layer):
|
class WaveNet(nn.Layer):
|
||||||
|
"""Wavenet that transform upsampled mel spectrogram into waveform.
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
-----------
|
||||||
|
n_stack : int
|
||||||
|
``n_stack`` for the internal ``ResidualNet``.
|
||||||
|
|
||||||
|
n_loop : int
|
||||||
|
``n_loop`` for the internal ``ResidualNet``.
|
||||||
|
|
||||||
|
residual_channels : int
|
||||||
|
Feature size of the input.
|
||||||
|
|
||||||
|
output_dim : int
|
||||||
|
Feature size of the output.
|
||||||
|
|
||||||
|
condition_dim : int
|
||||||
|
Feature size of the condition (mel spectrogram bands).
|
||||||
|
|
||||||
|
filter_size : int
|
||||||
|
Kernel size of the internal ``ResidualNet``.
|
||||||
|
|
||||||
|
loss_type : str, optional ["mog" or "softmax"]
|
||||||
|
The output type and loss type of the model, by default "mog".
|
||||||
|
|
||||||
|
If "softmax", the model input is first quantized audio and the model
|
||||||
|
outputs a discrete categorical distribution.
|
||||||
|
|
||||||
|
If "mog", the model input is audio in floating point format, and the
|
||||||
|
model outputs parameters for a mixture of gaussian distributions.
|
||||||
|
Namely, the weight, mean and log scale of each gaussian distribution.
|
||||||
|
Thus, the ``output_dim`` should be a multiple of 3.
|
||||||
|
|
||||||
|
log_scale_min : float, optional
|
||||||
|
Minimum value of the log scale of gaussian distributions, by default
|
||||||
|
-9.0.
|
||||||
|
|
||||||
|
This is only used for computing loss when ``loss_type`` is "mog", If
|
||||||
|
the predicted log scale is less than -9.0, it is clipped at -9.0.
|
||||||
|
"""
|
||||||
def __init__(self, n_stack, n_loop, residual_channels, output_dim,
|
def __init__(self, n_stack, n_loop, residual_channels, output_dim,
|
||||||
condition_dim, filter_size, loss_type, log_scale_min):
|
condition_dim, filter_size, loss_type, log_scale_min):
|
||||||
"""Wavenet that transform upsampled mel spectrogram into waveform.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
n_stack (int): n_stack for the internal ResidualNet.
|
|
||||||
n_loop (int): n_loop for the internal ResidualNet.
|
|
||||||
residual_channels (int): the channel of the input.
|
|
||||||
output_dim (int): the channel of the output distribution.
|
|
||||||
condition_dim (int): the channel of the condition.
|
|
||||||
filter_size (int): the filter size of the internal ResidualNet.
|
|
||||||
loss_type (str): loss type of the wavenet. Possible values are 'softmax' and 'mog'. If `loss_type` is 'softmax', the output is the logits of the catrgotical(multinomial) distribution, `output_dim` means the number of classes of the categorical distribution. If `loss_type` is mog(mixture of gaussians), the output is the parameters of a mixture of gaussians, which consists of weight(in the form of logit) of each gaussian distribution and its mean and log standard deviaton. So when `loss_type` is 'mog', `output_dim` should be perfectly divided by 3.
|
|
||||||
log_scale_min (int): the minimum value of log standard deviation of the output gaussian distributions. Note that this value is only used for computing loss if `loss_type` is 'mog', values less than `log_scale_min` is clipped when computing loss.
|
|
||||||
"""
|
|
||||||
super(WaveNet, self).__init__()
|
super(WaveNet, self).__init__()
|
||||||
if loss_type not in ["softmax", "mog"]:
|
if loss_type not in ["softmax", "mog"]:
|
||||||
raise ValueError("loss_type {} is not supported".format(loss_type))
|
raise ValueError("loss_type {} is not supported".format(loss_type))
|
||||||
|
@ -396,14 +458,19 @@ class WaveNet(nn.Layer):
|
||||||
self.log_scale_min = log_scale_min
|
self.log_scale_min = log_scale_min
|
||||||
|
|
||||||
def forward(self, x, condition=None):
|
def forward(self, x, condition=None):
|
||||||
"""compute the output distribution (represented by its parameters).
|
"""Forward pass of ``WaveNet``.
|
||||||
|
|
||||||
Args:
|
Parameters
|
||||||
x (Tensor): shape(B, T), dtype float32, the input waveform.
|
-----------
|
||||||
condition (Tensor, optional): shape(B, C_cond, T), dtype float32, the upsampled condition. Defaults to None.
|
x : Tensor [shape=(B, T)]
|
||||||
|
The input waveform.
|
||||||
|
condition : Tensor, optional [shape=(B, C_cond, T)]
|
||||||
|
the upsampled condition. Defaults to None.
|
||||||
|
|
||||||
Returns:
|
Returns
|
||||||
Tensor: shape(B, T, C_output), dtype float32, the parameter of the output distributions.
|
-------
|
||||||
|
Tensor: [shape=(B, T, C_output)]
|
||||||
|
The parameters of the output distributions.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
# Causal Conv
|
# Causal Conv
|
||||||
|
@ -426,19 +493,28 @@ class WaveNet(nn.Layer):
|
||||||
return y
|
return y
|
||||||
|
|
||||||
def start_sequence(self):
|
def start_sequence(self):
|
||||||
"""Prepare the WaveNet to generate a new sequence. This method should be called before starting calling `add_input` multiple times.
|
"""Prepare the WaveNet to generate a new sequence. This method should
|
||||||
|
be called before starting calling ``add_input`` multiple times.
|
||||||
"""
|
"""
|
||||||
self.resnet.start_sequence()
|
self.resnet.start_sequence()
|
||||||
|
|
||||||
def add_input(self, x, condition=None):
|
def add_input(self, x, condition=None):
|
||||||
"""compute the output distribution (represented by its parameters) for a step. It works similarily with the `forward` method but in a `step-in-step-out` fashion.
|
"""Compute the output distribution (represented by its parameters) for
|
||||||
|
a step. It works similarly to the ``forward`` method but in a
|
||||||
|
``step-in-step-out`` fashion.
|
||||||
|
|
||||||
Args:
|
Parameters
|
||||||
x (Tensor): shape(B,), dtype float32, a step of the input waveform.
|
-----------
|
||||||
condition (Tensor, optional): shape(B, C_cond, ), dtype float32, a step of the upsampled condition. Defaults to None.
|
x : Tensor [shape=(B,)]
|
||||||
|
A step of the input waveform.
|
||||||
|
|
||||||
|
condition : Tensor, optional [shape=(B, C_cond)]
|
||||||
|
A step of the upsampled condition. Defaults to None.
|
||||||
|
|
||||||
Returns:
|
Returns
|
||||||
Tensor: shape(B, C_output), dtype float32, the parameter of the output distributions.
|
--------
|
||||||
|
Tensor: [shape=(B, C_output)]
|
||||||
|
A step of the parameters of the output distributions.
|
||||||
"""
|
"""
|
||||||
# Causal Conv
|
# Causal Conv
|
||||||
if self.loss_type == "softmax":
|
if self.loss_type == "softmax":
|
||||||
|
@ -458,14 +534,28 @@ class WaveNet(nn.Layer):
|
||||||
return y
|
return y
|
||||||
|
|
||||||
def compute_softmax_loss(self, y, t):
|
def compute_softmax_loss(self, y, t):
|
||||||
"""compute the loss where output distribution is a categorial distribution.
|
"""Compute the loss when output distributions are categorial
|
||||||
|
distributions.
|
||||||
|
|
||||||
Args:
|
Parameters
|
||||||
y (Tensor): shape(B, T, C_output), dtype float32, the logits of the output distribution.
|
----------
|
||||||
t (Tensor): shape(B, T), dtype float32, the target audio. Note that the target's corresponding time index is one step ahead of the output distribution. And output distribution whose input contains padding is neglected in loss computation.
|
y : Tensor [shape=(B, T, C_output)]
|
||||||
|
The logits of the output distributions.
|
||||||
|
|
||||||
|
t : Tensor [shape=(B, T)]
|
||||||
|
The target audio. The audio is first quantized then used as the
|
||||||
|
target.
|
||||||
|
|
||||||
|
Notes
|
||||||
|
-------
|
||||||
|
Output distributions whose input contains padding are neglected in
|
||||||
|
loss computation. So the first ``context_size`` steps do not
|
||||||
|
contribute to the loss.
|
||||||
|
|
||||||
Returns:
|
Returns
|
||||||
Tensor: shape(1, ), dtype float32, the loss.
|
--------
|
||||||
|
Tensor: [shape=(1,)]
|
||||||
|
The loss.
|
||||||
"""
|
"""
|
||||||
# context size is not taken into account
|
# context size is not taken into account
|
||||||
y = y[:, self.context_size:, :]
|
y = y[:, self.context_size:, :]
|
||||||
|
@ -479,13 +569,18 @@ class WaveNet(nn.Layer):
|
||||||
return reduced_loss
|
return reduced_loss
|
||||||
|
|
||||||
def sample_from_softmax(self, y):
|
def sample_from_softmax(self, y):
|
||||||
"""Sample from the output distribution where the output distribution is a categorical distriobution.
|
"""Sample from the output distribution when the output distributions
|
||||||
|
are categorical distributions.
|
||||||
|
|
||||||
Args:
|
Parameters
|
||||||
y (Tensor): shape(B, T, C_output), the logits of the output distribution
|
----------
|
||||||
|
y : Tensor [shape=(B, T, C_output)]
|
||||||
|
The logits of the output distributions.
|
||||||
|
|
||||||
Returns:
|
Returns
|
||||||
Tensor: shape(B, T), waveform sampled from the output distribution.
|
--------
|
||||||
|
Tensor [shape=(B, T)]
|
||||||
|
Waveform sampled from the output distribution.
|
||||||
"""
|
"""
|
||||||
# dequantize
|
# dequantize
|
||||||
batch_size, time_steps, output_dim, = y.shape
|
batch_size, time_steps, output_dim, = y.shape
|
||||||
|
@ -497,14 +592,32 @@ class WaveNet(nn.Layer):
|
||||||
return samples
|
return samples
|
||||||
|
|
||||||
def compute_mog_loss(self, y, t):
|
def compute_mog_loss(self, y, t):
|
||||||
"""compute the loss where output distribution is a mixture of Gaussians.
|
"""Compute the loss where output distributions is a mixture of
|
||||||
|
Gaussian distributions.
|
||||||
|
|
||||||
Args:
|
Parameters
|
||||||
y (Tensor): shape(B, T, C_output), dtype float32, the parameterd of the output distribution. It is the concatenation of 3 parts, the logits of every distribution, the mean of each distribution and the log standard deviation of each distribution. Each part's shape is (B, T, n_mixture), where `n_mixture` means the number of Gaussians in the mixture.
|
-----------
|
||||||
t (Tensor): shape(B, T), dtype float32, the target audio. Note that the target's corresponding time index is one step ahead of the output distribution. And output distribution whose input contains padding is neglected in loss computation.
|
y : Tensor [shape=(B, T, C_output)]
|
||||||
|
The parameters of the output distribution. It is the concatenation
|
||||||
|
of 3 parts, the logits of every distribution, the mean of each
|
||||||
|
distribution and the log standard deviation of each distribution.
|
||||||
|
|
||||||
|
Each part's shape is (B, T, n_mixture), where ``n_mixture`` means
|
||||||
|
the number of Gaussians in the mixture.
|
||||||
|
|
||||||
|
t : Tensor [shape=(B, T)]
|
||||||
|
The target audio.
|
||||||
|
|
||||||
|
Notes
|
||||||
|
-------
|
||||||
|
Output distributions whose input contains padding are neglected in
|
||||||
|
loss computation. So the first ``context_size`` steps do not
|
||||||
|
contribute to the loss.
|
||||||
|
|
||||||
Returns:
|
Returns
|
||||||
Tensor: shape(1, ), dtype float32, the loss.
|
--------
|
||||||
|
Tensor: [shape=(1,)]
|
||||||
|
The loss.
|
||||||
"""
|
"""
|
||||||
n_mixture = self.output_dim // 3
|
n_mixture = self.output_dim // 3
|
||||||
|
|
||||||
|
@ -536,12 +649,23 @@ class WaveNet(nn.Layer):
|
||||||
return loss
|
return loss
|
||||||
|
|
||||||
def sample_from_mog(self, y):
|
def sample_from_mog(self, y):
|
||||||
"""Sample from the output distribution where the output distribution is a mixture of Gaussians.
|
"""Sample from the output distribution when the output distribution
|
||||||
Args:
|
is a mixture of Gaussian distributions.
|
||||||
y (Tensor): shape(B, T, C_output), dtype float32, the parameterd of the output distribution. It is the concatenation of 3 parts, the logits of every distribution, the mean of each distribution and the log standard deviation of each distribution. Each part's shape is (B, T, n_mixture), where `n_mixture` means the number of Gaussians in the mixture.
|
|
||||||
|
Parameters
|
||||||
|
------------
|
||||||
|
y : Tensor [shape=(B, T, C_output)]
|
||||||
|
The parameters of the output distribution. It is the concatenation
|
||||||
|
of 3 parts, the logits of every distribution, the mean of each
|
||||||
|
distribution and the log standard deviation of each distribution.
|
||||||
|
|
||||||
|
Each part's shape is (B, T, n_mixture), where ``n_mixture`` means
|
||||||
|
the number of Gaussians in the mixture.
|
||||||
|
|
||||||
Returns:
|
Returns
|
||||||
Tensor: shape(B, T), waveform sampled from the output distribution.
|
--------
|
||||||
|
Tensor: [shape=(B, T)]
|
||||||
|
Waveform sampled from the output distribution.
|
||||||
"""
|
"""
|
||||||
batch_size, time_steps, output_dim = y.shape
|
batch_size, time_steps, output_dim = y.shape
|
||||||
n_mixture = output_dim // 3
|
n_mixture = output_dim // 3
|
||||||
|
@ -568,11 +692,16 @@ class WaveNet(nn.Layer):
|
||||||
|
|
||||||
def sample(self, y):
|
def sample(self, y):
|
||||||
"""Sample from the output distribution.
|
"""Sample from the output distribution.
|
||||||
Args:
|
|
||||||
y (Tensor): shape(B, T, C_output), dtype float32, the parameterd of the output distribution.
|
Parameters
|
||||||
|
----------
|
||||||
|
y : Tensor [shape=(B, T, C_output)]
|
||||||
|
The parameters of the output distribution.
|
||||||
|
|
||||||
Returns:
|
Returns
|
||||||
Tensor: shape(B, T), waveform sampled from the output distribution.
|
--------
|
||||||
|
Tensor [shape=(B, T)]
|
||||||
|
Waveform sampled from the output distribution.
|
||||||
"""
|
"""
|
||||||
if self.loss_type == "softmax":
|
if self.loss_type == "softmax":
|
||||||
return self.sample_from_softmax(y)
|
return self.sample_from_softmax(y)
|
||||||
|
@ -580,14 +709,20 @@ class WaveNet(nn.Layer):
|
||||||
return self.sample_from_mog(y)
|
return self.sample_from_mog(y)
|
||||||
|
|
||||||
def loss(self, y, t):
|
def loss(self, y, t):
|
||||||
"""compute the loss where output distribution is a mixture of Gaussians.
|
"""Compute the loss given the output distribution and the target.
|
||||||
|
|
||||||
Args:
|
Parameters
|
||||||
y (Tensor): shape(B, T, C_output), dtype float32, the parameterd of the output distribution.
|
----------
|
||||||
t (Tensor): shape(B, T), dtype float32, the target audio. Note that the target's corresponding time index is one step ahead of the output distribution. And output distribution whose input contains padding is neglected in loss computation.
|
y : Tensor [shape=(B, T, C_output)]
|
||||||
|
The parameters of the output distribution.
|
||||||
|
|
||||||
|
t : Tensor [shape=(B, T)]
|
||||||
|
The target audio.
|
||||||
|
|
||||||
Returns:
|
Returns
|
||||||
Tensor: shape(1, ), dtype float32, the loss.
|
---------
|
||||||
|
Tensor: [shape=(1,)]
|
||||||
|
The loss.
|
||||||
"""
|
"""
|
||||||
if self.loss_type == "softmax":
|
if self.loss_type == "softmax":
|
||||||
return self.compute_softmax_loss(y, t)
|
return self.compute_softmax_loss(y, t)
|
||||||
|
@ -640,9 +775,11 @@ class ConditionalWaveNet(nn.Layer):
|
||||||
Thus, the ``output_size`` should be a multiple of 3.
|
Thus, the ``output_size`` should be a multiple of 3.
|
||||||
|
|
||||||
log_scale_min : float, optional
|
log_scale_min : float, optional
|
||||||
Minimum value of the log probability density, by default -9.0.
|
Minimum value of the log scale of gaussian distributions, by default
|
||||||
|
-9.0.
|
||||||
|
|
||||||
This is only used for computing loss when ``loss_type`` is "mog", If the
|
This is only used for computing loss when ``loss_type`` is "mog", If
|
||||||
|
the predicted log scale is less than -9.0, it is clipped at -9.0.
|
||||||
"""
|
"""
|
||||||
def __init__(self,
|
def __init__(self,
|
||||||
upsample_factors: List[int],
|
upsample_factors: List[int],
|
||||||
|
|
Loading…
Reference in New Issue