548 lines
21 KiB
Python
548 lines
21 KiB
Python
import math
|
|
import numpy as np
|
|
from typing import List, Union
|
|
import paddle
|
|
from paddle import nn
|
|
from paddle.nn import functional as F
|
|
from paddle.nn import initializer as I
|
|
|
|
from parakeet.utils import checkpoint
|
|
from parakeet.modules import geometry as geo
|
|
|
|
__all__ = ["UpsampleNet", "WaveFlow", "ConditionalWaveFlow", "WaveFlowLoss"]
|
|
|
|
def fold(x, n_group):
    """Split the last (temporal) axis of a tensor into fixed-size groups.

    The element count is kept by the reshape, so ``time_steps`` should be a
    multiple of ``n_group``.

    Args:
        x (Tensor): shape(*, time_steps), the audio or spectrogram to fold.
        n_group (int): number of consecutive time steps in one group.

    Returns:
        Tensor: shape(*, time_steps // n_group, n_group), the folded tensor.
    """
    *leading_dims, n_steps = x.shape
    n_frames = n_steps // n_group
    return paddle.reshape(x, leading_dims + [n_frames, n_group])
|
|
|
|
class UpsampleNet(nn.LayerList):
    """Upsample a mel spectrogram towards the waveform's temporal resolution.

    The network is a list of weight-normalized ``Conv2DTranspose`` layers, one
    per upsample factor. Each layer treats the spectrogram as a one-channel
    image and stretches its time (last) axis by the layer's factor.
    """

    def __init__(self, upsample_factors):
        super().__init__()
        for factor in upsample_factors:
            kernel_width = 2 * factor
            # Uniform init bound derived from the kernel size (3 x kernel_width).
            bound = math.sqrt(1 / (3 * kernel_width))
            init = I.Uniform(-bound, bound)
            deconv = nn.Conv2DTranspose(
                1,
                1,
                (3, kernel_width),
                padding=(1, factor // 2),
                stride=(1, factor),
                weight_attr=init,
                bias_attr=init)
            self.append(nn.utils.weight_norm(deconv))

        # Overall and per-layer upsample factors.
        self.upsample_factor = np.prod(upsample_factors)
        self.upsample_factors = upsample_factors

    def forward(self, x, trim_conv_artifact=False):
        """Upsample the spectrogram along time.

        Args:
            x (Tensor): shape(batch_size, input_channels, time_steps), the
                input spectrogram.
            trim_conv_artifact (bool, optional): if True, drop the trailing
                ``kernel_size - stride`` time steps after every layer to
                remove the deconvolution artifact. Defaults to False.

        Returns:
            Tensor: shape(batch_size, input_channels, time_steps * upsample_factor).
                If trim_conv_artifact is True, the output has fewer time
                steps than time_steps * upsample_factor.
        """
        # (B, C, T) -> (B, 1, C, T): the mel axis becomes the image height.
        out = paddle.unsqueeze(x, 1)
        for deconv in self:
            out = deconv(out)
            if trim_conv_artifact:
                n_extra = deconv._kernel_size[1] - deconv._stride[1]
                out = out[:, :, :, :-n_extra]
            out = F.leaky_relu(out, 0.4)
        # Back to (B, C, T).
        return paddle.squeeze(out, 1)
|
|
|
|
|
|
class ResidualBlock(nn.Layer):
    """The basic unit of ResidualNet.

    It holds a dilated 2-D convolution that is padded causally along the
    height dimension and with "same" padding along the width dimension, a
    1x1 projection of the local condition, and a 1x1 output projection that
    yields the residual and the skip outputs.
    """

    def __init__(self, channels, cond_channels, kernel_size, dilations):
        """
        Args:
            channels (int): number of input/residual channels.
            cond_channels (int): number of channels of the local condition.
            kernel_size (int or sequence of 2 ints): (height, width) of the
                convolution kernel. A single int means a square kernel.
            dilations (sequence of 2 ints): (height, width) dilations.
        """
        super(ResidualBlock, self).__init__()
        # A bare int is a square kernel; expand it so that the fan-in and the
        # per-dimension receptive field below see both dimensions.
        if isinstance(kernel_size, int):
            kernel_size = (kernel_size, kernel_size)

        # input conv
        # The init bound is sqrt(1 / fan_in) with fan_in = channels * kh * kw,
        # matching the 1 / fan_in bounds of the two projections below.
        std = math.sqrt(1 / (channels * np.prod(kernel_size)))
        init = I.Uniform(-std, std)
        receptive_field = [
            1 + (k - 1) * d for (k, d) in zip(kernel_size, dilations)
        ]
        rh, rw = receptive_field
        # causal padding in height (all on the top), same padding in width
        paddings = [rh - 1, 0, rw // 2, (rw - 1) // 2]
        conv = nn.Conv2D(
            channels,
            2 * channels,
            kernel_size,
            padding=paddings,
            dilation=dilations,
            weight_attr=init,
            bias_attr=init)
        self.conv = nn.utils.weight_norm(conv)
        self.rh = rh
        self.rw = rw
        self.dilations = dilations

        # condition projection
        std = math.sqrt(1 / cond_channels)
        init = I.Uniform(-std, std)
        condition_proj = nn.Conv2D(
            cond_channels,
            2 * channels, (1, 1),
            weight_attr=init,
            bias_attr=init)
        self.condition_proj = nn.utils.weight_norm(condition_proj)

        # parametric residual & skip connection
        std = math.sqrt(1 / channels)
        init = I.Uniform(-std, std)
        out_proj = nn.Conv2D(
            channels, 2 * channels, (1, 1), weight_attr=init, bias_attr=init)
        self.out_proj = nn.utils.weight_norm(out_proj)

    def forward(self, x, condition):
        """Compute output for a whole folded sequence.

        Args:
            x (Tensor): shape(batch_size, channel, height, width), the input.
            condition (Tensor): shape(batch_size, condition_channel, height, width),
                the local condition.

        Returns:
            res (Tensor): shape(batch_size, channel, height, width), the residual output.
            skip (Tensor): shape(batch_size, channel, height, width), the skip output.
        """
        x_in = x
        x = self.conv(x)
        x += self.condition_proj(condition)

        # gated activation: one half is the content, the other half the gate
        content, gate = paddle.chunk(x, 2, axis=1)
        x = paddle.tanh(content) * F.sigmoid(gate)

        x = self.out_proj(x)
        res, skip = paddle.chunk(x, 2, axis=1)
        return x_in + res, skip

    def start_sequence(self):
        """Prepare the layer for incremental computation of causal convolution.
        Reset the buffer for causal convolution.

        Raises:
            ValueError: If not in evaluation mode.
        """
        if self.training:
            raise ValueError("Only use start sequence at evaluation mode.")
        self._conv_buffer = None

        # NOTE: call self.conv's weight norm hook explicitly since
        # its weight will be visited directly in `add_input` without
        # calling its `__call__` method. If we do not trigger the weight
        # norm hook, the weight may be outdated. e.g. after loading from
        # a saved checkpoint
        # see also: https://github.com/pytorch/pytorch/issues/47588
        for hook in self.conv._forward_pre_hooks.values():
            hook(self.conv, None)

    def add_input(self, x_row, condition_row):
        """Compute the output for a row and update the buffer.

        `start_sequence` must have been called beforehand, since it creates
        the (empty) buffer this method reads.

        Args:
            x_row (Tensor): shape(batch_size, channel, 1, width), a row of the input.
            condition_row (Tensor): shape(batch_size, condition_channel, 1, width),
                a row of the condition.

        Returns:
            res (Tensor): shape(batch_size, channel, 1, width), the residual output.
            skip (Tensor): shape(batch_size, channel, 1, width), the skip output.
        """
        x_row_in = x_row
        if self._conv_buffer is None:
            self._init_buffer(x_row)
        self._update_buffer(x_row)

        rw = self.rw
        # The buffer already holds the `rh` rows the kernel needs, so no
        # padding in height; keep the same padding in width as `self.conv`.
        x_row = F.conv2d(
            self._conv_buffer,
            self.conv.weight,
            self.conv.bias,
            padding=[0, 0, rw // 2, (rw - 1) // 2],
            dilation=self.dilations)
        x_row += self.condition_proj(condition_row)

        content, gate = paddle.chunk(x_row, 2, axis=1)
        x_row = paddle.tanh(content) * F.sigmoid(gate)

        x_row = self.out_proj(x_row)
        res, skip = paddle.chunk(x_row, 2, axis=1)
        return x_row_in + res, skip

    def _init_buffer(self, input):
        # Zero-filled history of `rh` rows, shaped after the first input row.
        batch_size, channels, _, width = input.shape
        self._conv_buffer = paddle.zeros(
            [batch_size, channels, self.rh, width], dtype=input.dtype)

    def _update_buffer(self, input):
        # Drop the oldest row and append the newest one at the bottom.
        self._conv_buffer = paddle.concat(
            [self._conv_buffer[:, :, 1:, :], input], axis=2)
|
|
|
|
|
|
class ResidualNet(nn.LayerList):
    """A list of ResidualBlocks applied one after another.

    Every block receives the same condition; the skip outputs of all blocks
    are summed to form the output.
    """

    def __init__(self, n_layer, residual_channels, condition_channels,
                 kernel_size, dilations_h):
        if len(dilations_h) != n_layer:
            raise ValueError("number of dilations_h should equals num of layers")
        super().__init__()
        for depth, dilation_h in enumerate(dilations_h):
            # Height dilation comes from the caller; the width dilation
            # doubles with every layer.
            block = ResidualBlock(residual_channels, condition_channels,
                                  kernel_size, (dilation_h, 2 ** depth))
            self.append(block)

    def forward(self, x, condition):
        """Compute the output given the input and the condition.

        Args:
            x (Tensor): shape(batch_size, channel, height, width), the input.
            condition (Tensor): shape(batch_size, condition_channel, height, width),
                the local condition.

        Returns:
            Tensor: shape(batch_size, channel, height, width), the sum of the
                skip outputs of all blocks.
        """
        hidden = x
        skips = []
        for block in self:
            hidden, skip = block(hidden, condition)
            skips.append(skip)
        return paddle.sum(paddle.stack(skips, 0), 0)

    def start_sequence(self):
        """Prepare every block for incremental computation."""
        for block in self:
            block.start_sequence()

    def add_input(self, x_row, condition_row):
        """Compute the output for one row and update the blocks' buffers.

        Args:
            x_row (Tensor): shape(batch_size, channel, 1, width), a row of the input.
            condition_row (Tensor): shape(batch_size, condition_channel, 1, width),
                a row of the condition.

        Returns:
            Tensor: shape(batch_size, channel, 1, width), the sum of the skip
                outputs of all blocks.
        """
        hidden_row = x_row
        skips = []
        for block in self:
            hidden_row, skip = block.add_input(hidden_row, condition_row)
            skips.append(skip)
        return paddle.sum(paddle.stack(skips, 0), 0)
|
|
|
|
|
|
class Flow(nn.Layer):
    """A bijection (reversible layer) between a density of latent variables
    p(Z) and a complex data distribution p(X).

    It is an autoregressive flow along the height dimension. `forward`
    implements probability density estimation (X -> Z); `inverse` implements
    sampling (Z -> X), row by row.
    """
    # Height dilations of the residual layers, keyed by the group size.
    dilations_dict = {
        8: [1, 1, 1, 1, 1, 1, 1, 1],
        16: [1, 1, 1, 1, 1, 1, 1, 1],
        32: [1, 2, 4, 1, 2, 4, 1, 2],
        64: [1, 2, 4, 8, 16, 1, 2, 4],
        128: [1, 2, 4, 8, 16, 32, 64, 1]
    }

    def __init__(self, n_layers, channels, mel_bands, kernel_size, n_group):
        super().__init__()
        # Input projection: 1 -> channels.
        input_conv = nn.Conv2D(
            1,
            channels, (1, 1),
            weight_attr=I.Uniform(-1., 1.),
            bias_attr=I.Uniform(-1., 1.))
        self.input_proj = nn.utils.weight_norm(input_conv)

        # Residual net; its height dilations depend on the group size.
        dilations_h = self.dilations_dict[n_group]
        self.resnet = ResidualNet(n_layers, channels, mel_bands, kernel_size,
                                  dilations_h)

        # Output projection: channels -> (logs, b). It is zero-initialized,
        # so logs and b start at zero.
        self.output_proj = nn.Conv2D(
            channels,
            2, (1, 1),
            weight_attr=I.Constant(0.),
            bias_attr=I.Constant(0.))

        # Specs.
        self.n_group = n_group

    def _predict_parameters(self, x, condition):
        # Predict the log scale and the shift for a whole block of rows.
        hidden = self.input_proj(x)
        hidden = self.resnet(hidden, condition)
        logs, b = paddle.chunk(self.output_proj(hidden), 2, axis=1)
        return logs, b

    def _transform(self, x, logs, b):
        # The first row is copied as is; the other rows are scaled and shifted.
        first_row = x[:, :, :1, :]
        other_rows = x[:, :, 1:, :] * paddle.exp(logs) + b
        return paddle.concat([first_row, other_rows], axis=2)

    def forward(self, x, condition):
        """Probability density estimation: transform a sample from p(X) into
        a sample from p(Z).

        Args:
            x (Tensor): shape(batch, 1, height, width), an input sample of the
                distribution p(X).
            condition (Tensor): shape(batch, condition_channel, height, width),
                the local condition.

        Returns:
            (z, (logs, b))
            z (Tensor): shape(batch, 1, height, width), the transformed sample.
            logs (Tensor): shape(batch, 1, height - 1, width), the log scale
                of the transformation.
            b (Tensor): shape(batch, 1, height - 1, width), the shift of the
                transformation.
        """
        # Rows 0..H-2 of x, paired with rows 1..H-1 of the condition, predict
        # the parameters for rows 1..H-1: (B, C, H-1, W).
        context_rows = x[:, :, :-1, :]
        target_condition = condition[:, :, 1:, :]
        logs, b = self._predict_parameters(context_rows, target_condition)
        z = self._transform(x, logs, b)
        return z, (logs, b)

    def _predict_row_parameters(self, x_row, condition_row):
        # Incremental counterpart of `_predict_parameters` for a single row.
        hidden_row = self.input_proj(x_row)
        hidden_row = self.resnet.add_input(hidden_row, condition_row)
        logs, b = paddle.chunk(self.output_proj(hidden_row), 2, axis=1)
        return logs, b

    def _inverse_transform_row(self, z_row, logs, b):
        # Undo `z = x * exp(logs) + b` for one row.
        return (z_row - b) * paddle.exp(-logs)

    def _inverse_row(self, z_row, x_row, condition_row):
        # Recover the next row of x from the previous row of x.
        logs, b = self._predict_row_parameters(x_row, condition_row)
        next_x_row = self._inverse_transform_row(z_row, logs, b)
        return next_x_row, (logs, b)

    def _start_sequence(self):
        self.resnet.start_sequence()

    def inverse(self, z, condition):
        """Sampling from the distribution p(X): transform a sample from p(Z)
        row by row. It is an autoregressive transformation.

        Args:
            z (Tensor): shape(batch, 1, height, width), an input sample of the
                distribution p(Z).
            condition (Tensor): shape(batch, condition_channel, height, width),
                the local condition.

        Returns:
            (x, (logs, b))
            x (Tensor): shape(batch, 1, height, width), the transformed sample.
            logs (Tensor): shape(batch, 1, height - 1, width), the log scale
                of the inverse transformation.
            b (Tensor): shape(batch, 1, height - 1, width), the shift of the
                inverse transformation.
        """
        # The first row of x is the first row of z.
        x_rows = [z[:, :, :1, :]]
        logs_rows = []
        b_rows = []

        self._start_sequence()
        for row in range(1, self.n_group):
            prev_x_row = x_rows[-1]  # row - 1 of x
            z_row = z[:, :, row:row + 1, :]
            condition_row = condition[:, :, row:row + 1, :]

            next_x_row, (logs, b) = self._inverse_row(z_row, prev_x_row,
                                                      condition_row)
            x_rows.append(next_x_row)
            logs_rows.append(logs)
            b_rows.append(b)

        x = paddle.concat(x_rows, 2)
        logs = paddle.concat(logs_rows, 2)
        b = paddle.concat(b_rows, 2)
        return x, (logs, b)
|
|
|
|
|
|
class WaveFlow(nn.LayerList):
    """A deep reversible layer composed of a stack of autoregressive flows."""

    def __init__(self, n_flows, n_layers, n_group, channels, mel_bands,
                 kernel_size):
        if n_group % 2 != 0 or n_flows % 2 != 0:
            raise ValueError("number of flows and number of group must be even "
                             "since a permutation along group among flows is used.")
        super().__init__()
        for _ in range(n_flows):
            self.append(
                Flow(n_layers, channels, mel_bands, kernel_size, n_group))

        # One permutation of the height (group) dimension per flow.
        self.perms = self._create_perm(n_group, n_flows)

        # Specs.
        self.n_group = n_group
        self.n_flows = n_flows

    def _create_perm(self, n_group, n_flows):
        # The first half of the flows reverses the whole group dimension; the
        # second half reverses each half of the group dimension separately.
        indices = list(range(n_group))
        half = n_group // 2
        n_full_reverse = n_flows // 2
        perms = []
        for flow_idx in range(n_flows):
            if flow_idx >= n_full_reverse:
                perms.append(indices[:half][::-1] + indices[half:][::-1])
            else:
                perms.append(indices[::-1])
        return perms

    def _trim(self, x, condition):
        # Cut both inputs down to the largest multiple of n_group that fits
        # into x, so that they can be folded.
        assert condition.shape[-1] >= x.shape[-1]
        n_steps = x.shape[-1]
        pruned_len = int(n_steps // self.n_group * self.n_group)

        if n_steps > pruned_len:
            x = x[:, :pruned_len]
        if condition.shape[-1] > pruned_len:
            condition = condition[:, :, :pruned_len]
        return x, condition

    def _to_grid(self, x, condition):
        # (B, T) -> (B, 1, h, T//h) and (B, C, T) -> (B, C, h, T//h)
        x_grid = paddle.transpose(fold(x, self.n_group), [0, 2, 1])
        x_grid = paddle.unsqueeze(x_grid, 1)
        condition_grid = paddle.transpose(
            fold(condition, self.n_group), [0, 1, 3, 2])
        return x_grid, condition_grid

    def _to_sequence(self, x_grid):
        # (B, 1, H, W) -> (B, H, W) -> (B, W, H) -> (B, T)
        x = paddle.squeeze(x_grid, 1)
        batch_size = x.shape[0]
        return paddle.reshape(
            paddle.transpose(x, [0, 2, 1]), [batch_size, -1])

    def forward(self, x, condition):
        """Probability density estimation.

        Args:
            x (Tensor): shape(batch_size, time_steps), the audio.
            condition (Tensor): shape(batch_size, condition channel, time_steps),
                the (upsampled) local condition. It must have at least as
                many time steps as the audio.

        Returns:
            z: (Tensor): shape(batch_size, time_steps), the transformed sample.
                time_steps is trimmed to a multiple of n_group.
            log_det_jacobian: (Tensor), the log determinant of the jacobian of
                (dz/dx), i.e. the sum of the log scales of all flows.
        """
        x, condition = self._trim(x, condition)
        x, condition = self._to_grid(x, condition)

        logs_per_flow = []
        for flow_idx, flow in enumerate(self):
            x, (logs, _) = flow(x, condition)
            logs_per_flow.append(logs)
            # Permute the height dimension of both tensors after each flow.
            perm = self.perms[flow_idx]
            x = geo.shuffle_dim(x, 2, perm=perm)
            condition = geo.shuffle_dim(condition, 2, perm=perm)

        z = self._to_sequence(x)
        log_det_jacobian = paddle.sum(paddle.stack(logs_per_flow))
        return z, log_det_jacobian

    def inverse(self, z, condition):
        """Sampling from the distribution p(X): transform a sample from p(Z),
        undoing the flows one by one in reverse order.

        Args:
            z (Tensor): shape(batch_size, time_steps), an input sample of the
                distribution p(Z).
            condition (Tensor): shape(batch_size, condition_channel, time_steps),
                the local condition.

        Returns:
            x: (Tensor): shape(batch_size, time_steps), the transformed sample.
        """
        z, condition = self._trim(z, condition)
        z, condition = self._to_grid(z, condition)

        # Reverse it flow by flow: re-apply the flow's permutation first, then
        # invert the flow itself.
        for flow_idx in range(self.n_flows - 1, -1, -1):
            perm = self.perms[flow_idx]
            z = geo.shuffle_dim(z, 2, perm=perm)
            condition = geo.shuffle_dim(condition, 2, perm=perm)
            z, _ = self[flow_idx].inverse(z, condition)

        return self._to_sequence(z)
|
|
|
|
|
|
class ConditionalWaveFlow(nn.Layer):
    """WaveFlow conditioned on a mel spectrogram.

    It combines an `UpsampleNet` (the encoder), which upsamples the mel
    spectrogram into a local condition, with a `WaveFlow` (the decoder),
    which maps audio to latent variables and back.

    NOTE: this class holds two named sublayers and never uses list-style
    access, so it derives from `nn.Layer` rather than `nn.LayerList`. The
    sublayer names (`encoder`, `decoder`), and hence the parameter names, are
    unchanged.
    """

    def __init__(self,
                 upsample_factors: List[int],
                 n_flows: int,
                 n_layers: int,
                 n_group: int,
                 channels: int,
                 n_mels: int,
                 kernel_size: Union[int, List[int]]):
        """
        Args:
            upsample_factors (List[int]): upsample factor of each layer of the
                encoder.
            n_flows (int): number of flows in the decoder.
            n_layers (int): number of residual layers in each flow.
            n_group (int): size of a group, i.e. the height of folded audio.
            channels (int): number of residual channels in each flow.
            n_mels (int): number of mel bands, used as the number of condition
                channels.
            kernel_size (Union[int, List[int]]): kernel size of the
                convolutions in the residual layers.
        """
        super(ConditionalWaveFlow, self).__init__()
        self.encoder = UpsampleNet(upsample_factors)
        self.decoder = WaveFlow(
            n_flows=n_flows,
            n_layers=n_layers,
            n_group=n_group,
            channels=channels,
            mel_bands=n_mels,
            kernel_size=kernel_size)

    def forward(self, audio, mel):
        """Probability density estimation of the audio given the spectrogram.

        Args:
            audio (Tensor): shape(batch_size, time_steps), the audio.
            mel (Tensor): shape(batch_size, n_mels, frames), the mel
                spectrogram.

        Returns:
            z (Tensor): shape(batch_size, time_steps), the transformed sample.
            log_det_jacobian (Tensor): the log determinant of the jacobian of
                the transformation.
        """
        condition = self.encoder(mel)
        z, log_det_jacobian = self.decoder(audio, condition)
        return z, log_det_jacobian

    @paddle.no_grad()
    def infer(self, mel):
        """Generate audio from a batch of mel spectrograms.

        The decoder only supports incremental computation in evaluation mode,
        so a ValueError is raised if the model is in training mode.

        Args:
            mel (Tensor): shape(batch_size, n_mels, frames), the mel
                spectrogram.

        Returns:
            Tensor: shape(batch_size, time_steps), the generated audio.
        """
        condition = self.encoder(mel, trim_conv_artifact=True)  #(B, C, T)
        batch_size, _, time_steps = condition.shape
        # One latent sample per (upsampled) time step.
        z = paddle.randn([batch_size, time_steps], dtype=mel.dtype)
        x = self.decoder.inverse(z, condition)
        return x

    @paddle.no_grad()
    def predict(self, mel):
        """Generate audio from a single, unbatched mel spectrogram.

        Args:
            mel: the mel spectrogram without a batch dimension; anything
                accepted by `paddle.to_tensor`.

        Returns:
            np.ndarray: the generated audio of the single example.
        """
        mel = paddle.to_tensor(mel)
        mel = paddle.unsqueeze(mel, 0)  # add the batch dimension
        audio = self.infer(mel)
        audio = audio[0].numpy()
        return audio

    @classmethod
    def from_pretrained(cls, config, checkpoint_path):
        """Build a model from a config and load its parameters.

        Args:
            config: configuration object; `config.model` provides
                upsample_factors, n_flows, n_layers, n_group, channels and
                kernel_size, and `config.data` provides n_mels.
            checkpoint_path: path of the checkpoint, passed to
                `checkpoint.load_parameters`.

        Returns:
            ConditionalWaveFlow: the model with loaded parameters.
        """
        model = cls(
            upsample_factors=config.model.upsample_factors,
            n_flows=config.model.n_flows,
            n_layers=config.model.n_layers,
            n_group=config.model.n_group,
            channels=config.model.channels,
            n_mels=config.data.n_mels,
            kernel_size=config.model.kernel_size)
        checkpoint.load_parameters(model, checkpoint_path=checkpoint_path)
        return model
|
|
|
|
|
|
class WaveFlowLoss(nn.Layer):
    """Per-element negative log likelihood of the audio under a WaveFlow
    model whose latent variables follow a zero-mean gaussian with standard
    deviation ``sigma``.
    """

    def __init__(self, sigma=1.0):
        super().__init__()
        self.sigma = sigma
        # Constant term of the gaussian negative log density.
        log_sigma = np.log(self.sigma)
        self.const = 0.5 * np.log(2 * np.pi) + log_sigma

    def forward(self, model_output):
        """Compute the loss.

        Args:
            model_output: a pair (z, log_det_jacobian), where z is the
                transformed sample and log_det_jacobian is the log
                determinant of the jacobian of the transformation.

        Returns:
            Tensor: the negative log likelihood averaged over the elements
                of z.
        """
        z, log_det_jacobian = model_output

        two_variance = 2 * self.sigma * self.sigma
        total = paddle.sum(z * z) / two_variance - log_det_jacobian
        n_elements = np.prod(z.shape)
        mean_loss = total / n_elements
        return mean_loss + self.const
|