commit 98841ee48a ("clean code"), parent b15c313423
@@ -163,6 +163,35 @@ class WeightedRandomSampler(Sampler):
         return self.num_samples
 
 
+class DistributedSampler(Sampler):
+    def __init__(self, dataset_size, num_trainers, rank, shuffle=True):
+        self.dataset_size = dataset_size
+        self.num_trainers = num_trainers
+        self.rank = rank
+        self.num_samples = int(np.ceil(dataset_size / num_trainers))
+        self.total_size = self.num_samples * num_trainers
+        assert self.total_size >= self.dataset_size
+        self.shuffle = shuffle
+
+    def __iter__(self):
+        indices = list(range(self.dataset_size))
+        if self.shuffle:
+            random.shuffle(indices)
+
+        # Append extra samples to make it evenly distributed on all trainers.
+        indices += indices[:(self.total_size - self.dataset_size)]
+        assert len(indices) == self.total_size
+
+        # Subset samples for each trainer.
+        indices = indices[self.rank:self.total_size:self.num_trainers]
+        assert len(indices) == self.num_samples
+
+        return iter(indices)
+
+    def __len__(self):
+        return self.num_samples
+
+
 class BatchSampler(Sampler):
     r"""Wraps another sampler to yield a mini-batch of indices.
     Args:
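Note (reviewer sketch, not part of the commit): the stride-based split in DistributedSampler above can be sanity-checked with plain Python; all names below are local to the sketch.

    import numpy as np

    # Re-create the index math from DistributedSampler for 3 trainers and 10 samples.
    dataset_size, num_trainers = 10, 3
    num_samples = int(np.ceil(dataset_size / num_trainers))   # 4 per trainer
    total_size = num_samples * num_trainers                   # 12 after padding
    indices = list(range(dataset_size))
    indices += indices[:total_size - dataset_size]            # pad by repeating the head
    shards = [indices[rank:total_size:num_trainers] for rank in range(num_trainers)]
    print(shards)  # [[0, 3, 6, 9], [1, 4, 7, 0], [2, 5, 8, 1]]
    assert {i for shard in shards for i in shard} == set(range(dataset_size))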
@@ -206,4 +235,4 @@ class BatchSampler(Sampler):
         if self.drop_last:
             return len(self.sampler) // self.batch_size
         else:
             return (len(self.sampler) + self.batch_size - 1) // self.batch_size
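Note (not part of the commit): the else branch is the usual integer ceil-division trick; a one-liner confirms the identity it relies on.

    import math
    assert all((n + b - 1) // b == math.ceil(n / b)
               for n in range(1, 100) for b in range(1, 10))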
@@ -0,0 +1,32 @@
valid_size: 16
train_clip_second: 0.5
sample_rate: 22050
fft_window_shift: 256
fft_window_size: 1024
fft_size: 2048
mel_bands: 80

seed: 1
batch_size: 8
test_every: 2000
save_every: 10000
max_iterations: 2000000

layers: 30
kernel_width: 2
dilation_block: [1, 2, 4, 8, 16, 32, 64, 128, 256, 512]
residual_channels: 128
skip_channels: 128
loss_type: mix-gaussian-pdf
num_mixtures: 10
log_scale_min: -9.0

conditioner:
  filter_sizes: [[32, 3], [32, 3]]
  upsample_factors: [16, 16]

learning_rate: 0.001
gradient_max_norm: 100.0
anneal:
  every: 200000
  rate: 0.5
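Note (reviewer observation, not stated in the commit): the two upsample_factors multiply out to fft_window_shift (16 * 16 = 256), which suggests the conditioner stretches each mel frame to exactly one hop of audio samples. A trivial check:

    upsample_factors = [16, 16]
    fft_window_shift = 256
    product = 1
    for f in upsample_factors:
        product *= f
    assert product == fft_window_shift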
@@ -0,0 +1,31 @@
valid_size: 16
train_clip_second: 0.5
sample_rate: 22050
fft_window_shift: 256
fft_window_size: 1024
fft_size: 2048
mel_bands: 80

seed: 1
batch_size: 8
test_every: 2000
save_every: 10000
max_iterations: 2000000

layers: 30
kernel_width: 2
dilation_block: [1, 2, 4, 8, 16, 32, 64, 128, 256, 512]
residual_channels: 128
skip_channels: 128
loss_type: softmax
num_channels: 2048

conditioner:
  filter_sizes: [[32, 3], [32, 3]]
  upsample_factors: [16, 16]

learning_rate: 0.001
gradient_max_norm: 100.0
anneal:
  every: 200000
  rate: 0.5
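Note (not part of the commit): with loss_type: softmax the waveform is treated as num_channels discrete classes. The model code further below maps a sample in [-1, 1) to a bin with a linear cast; the same mapping in NumPy, for reference:

    import numpy as np

    num_channels = 2048
    x = np.array([-1.0, -0.5, 0.0, 0.5, 0.99999], dtype="float32")
    quantized = ((x + 1.0) / 2.0 * num_channels).astype("int64")
    print(quantized)  # [   0  512 1024 1536 2047]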
@@ -1,5 +1,3 @@
-import math
-import os
 import random
 
 import librosa
@@ -9,7 +7,7 @@ from paddle import fluid
 import utils
 from parakeet.datasets import ljspeech
 from parakeet.data import dataset
-from parakeet.data.sampler import Sampler, BatchSampler, SequentialSampler
+from parakeet.data.sampler import DistributedSampler, BatchSampler
 from parakeet.data.datacargo import DataCargo
 
 
@@ -20,7 +18,7 @@ class Dataset(ljspeech.LJSpeech):
         self.fft_window_shift = config.fft_window_shift
         # Calculate context frames.
         frames_per_second = config.sample_rate // self.fft_window_shift
-        train_clip_frames = int(math.ceil(
+        train_clip_frames = int(np.ceil(
             config.train_clip_second * frames_per_second))
         context_frames = config.context_size // self.fft_window_shift
         self.num_frames = train_clip_frames + context_frames
@@ -39,7 +37,7 @@ class Dataset(ljspeech.LJSpeech):
         assert loaded_sr == sr
 
         # Pad audio to the right size.
-        frames = math.ceil(float(audio.size) / fft_window_shift)
+        frames = int(np.ceil(float(audio.size) / fft_window_shift))
         fft_padding = (fft_size - fft_window_shift) // 2
         desired_length = frames * fft_window_shift + fft_padding * 2
         pad_amount = (desired_length - audio.size) // 2
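Note (not part of the commit): a quick numeric check of the padding arithmetic above, using an arbitrary clip length and the config's FFT settings:

    import numpy as np

    fft_size, fft_window_shift = 2048, 256
    audio_size = 10000                                            # hypothetical length
    frames = int(np.ceil(float(audio_size) / fft_window_shift))   # 40
    fft_padding = (fft_size - fft_window_shift) // 2              # 896
    desired_length = frames * fft_window_shift + fft_padding * 2  # 12032
    pad_amount = (desired_length - audio_size) // 2               # 1016
    assert desired_length == 12032 and pad_amount == 1016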
@@ -125,35 +123,6 @@ class Subset(dataset.Dataset):
         return len(self.indices)
 
 
-class DistributedSampler(Sampler):
-    def __init__(self, dataset_size, num_trainers, rank, shuffle=True):
-        self.dataset_size = dataset_size
-        self.num_trainers = num_trainers
-        self.rank = rank
-        self.num_samples = int(math.ceil(dataset_size / num_trainers))
-        self.total_size = self.num_samples * num_trainers
-        assert self.total_size >= self.dataset_size
-        self.shuffle = shuffle
-
-    def __iter__(self):
-        indices = list(range(self.dataset_size))
-        if self.shuffle:
-            random.shuffle(indices)
-
-        # Append extra samples to make it evenly distributed on all trainers.
-        indices += indices[:(self.total_size - self.dataset_size)]
-        assert len(indices) == self.total_size
-
-        # Subset samples for each trainer.
-        indices = indices[self.rank:self.total_size:self.num_trainers]
-        assert len(indices) == self.num_samples
-
-        return iter(indices)
-
-    def __len__(self):
-        return self.num_samples
-
-
 class LJSpeech:
     def __init__(self, config, nranks, rank):
         place = fluid.CUDAPlace(rank) if config.use_gpu else fluid.CPUPlace()
@@ -1,249 +0,0 @@
import paddle
from paddle import fluid
import paddle.fluid.dygraph as dg
import numpy as np

import weight_norm


def Embedding(name_scope,
              num_embeddings,
              embed_dim,
              padding_idx=None,
              std=0.1,
              dtype="float32"):
    # param attrs
    weight_attr = fluid.ParamAttr(initializer=fluid.initializer.Normal(
        scale=std))
    layer = dg.Embedding(
        name_scope, (num_embeddings, embed_dim),
        padding_idx=padding_idx,
        param_attr=weight_attr,
        dtype=dtype)
    return layer


def FC(name_scope,
       in_features,
       size,
       num_flatten_dims=1,
       relu=False,
       dropout=0.0,
       act=None,
       dtype="float32"):
    """
    A special Linear Layer, when it is used with dropout, the weight is
    initialized as normal(0, std=np.sqrt((1-dropout) / in_features))
    """

    # stds
    if isinstance(in_features, int):
        in_features = [in_features]

    stds = [np.sqrt((1.0 - dropout) / in_feature) for in_feature in in_features]
    if relu:
        stds = [std * np.sqrt(2.0) for std in stds]

    weight_inits = [
        fluid.initializer.NormalInitializer(scale=std) for std in stds
    ]
    bias_init = fluid.initializer.ConstantInitializer(0.0)

    # param attrs
    weight_attrs = [fluid.ParamAttr(initializer=init) for init in weight_inits]
    bias_attr = fluid.ParamAttr(initializer=bias_init)

    layer = weight_norm.FC(name_scope,
                           size,
                           num_flatten_dims=num_flatten_dims,
                           param_attr=weight_attrs,
                           bias_attr=bias_attr,
                           act=act,
                           dtype=dtype)
    return layer


def Conv1D(name_scope,
           in_channels,
           num_filters,
           filter_size=2,
           dilation=1,
           groups=None,
           causal=False,
           std_mul=1.0,
           dropout=0.0,
           use_cudnn=True,
           act=None,
           dtype="float32"):
    """
    A special Conv1D Layer, when it is used with dropout, the weight is
    initialized as
    normal(0, std=np.sqrt(std_mul * (1-dropout) / (filter_size * in_channels)))
    """
    # std
    std = np.sqrt((std_mul * (1.0 - dropout)) / (filter_size * in_channels))
    weight_init = fluid.initializer.NormalInitializer(loc=0.0, scale=std)
    bias_init = fluid.initializer.ConstantInitializer(0.0)

    # param attrs
    weight_attr = fluid.ParamAttr(initializer=weight_init)
    bias_attr = fluid.ParamAttr(initializer=bias_init)

    layer = weight_norm.Conv1D(
        name_scope,
        num_filters,
        filter_size,
        dilation,
        groups=groups,
        causal=causal,
        param_attr=weight_attr,
        bias_attr=bias_attr,
        use_cudnn=use_cudnn,
        act=act,
        dtype=dtype)
    return layer


class Conv1D_GU(dg.Layer):
    def __init__(self,
                 name_scope,
                 conditioner_dim,
                 in_channels,
                 num_filters,
                 filter_size,
                 dilation,
                 causal=False,
                 residual=True,
                 dtype="float32"):
        super(Conv1D_GU, self).__init__(name_scope, dtype=dtype)

        self.conditioner_dim = conditioner_dim
        self.in_channels = in_channels
        self.num_filters = num_filters
        self.filter_size = filter_size
        self.dilation = dilation
        self.causal = causal
        self.residual = residual

        if residual:
            assert (
                in_channels == num_filters
            ), "this block uses residual connection"\
                "the input_channels should equals num_filters"

        self.conv = Conv1D(
            self.full_name(),
            in_channels,
            2 * num_filters,
            filter_size,
            dilation,
            causal=causal,
            dtype=dtype)

        self.fc = Conv1D(
            self.full_name(),
            conditioner_dim,
            2 * num_filters,
            filter_size=1,
            dilation=1,
            causal=False,
            dtype=dtype)

    def forward(self, x, skip=None, conditioner=None):
        """
        Args:
            x (Variable): Shape(B, C_in, 1, T), the input of Conv1DGLU
                layer, where B means batch_size, C_in means the input channels
                T means input time steps.
            conditioner (Variable): Shape(B, C_con, 1, T), expanded mel
                conditioner, where C_con is conditioner hidden dim which
                equals the num of mel bands. Note that when using residual
                connection, the Conv1DGLU does not change the number of
                channels, so out channels equals input channels.
        Returns:
            x (Variable): Shape(B, C_out, 1, T), the output of Conv1DGLU, where
                C_out means the output channels of Conv1DGLU.
        """
        residual = x
        x = self.conv(x)

        if conditioner is not None:
            cond_bias = self.fc(conditioner)
            x += cond_bias

        content, gate = fluid.layers.split(x, num_or_sections=2, dim=1)

        # Gated Unit.
        x = fluid.layers.elementwise_mul(fluid.layers.sigmoid(gate),
                                         fluid.layers.tanh(content))

        if skip is None:
            skip = x
        else:
            skip = fluid.layers.scale(skip + x, np.sqrt(0.5))

        if self.residual:
            x = fluid.layers.scale(residual + x, np.sqrt(0.5))

        return x, skip

    def add_input(self, x, skip=None, conditioner=None):
        """
        Inputs:
            x: shape(B, num_filters, 1, time_steps)
            conditioner: shape(B, conditioner_dim, 1, time_steps)
        Outputs:
            out: shape(B, num_filters, 1, time_steps), where time_steps = 1
        """
        residual = x

        # add step input and produce step output
        x = self.conv.add_input(x)

        if conditioner is not None:
            cond_bias = self.fc(conditioner)
            x += cond_bias

        content, gate = fluid.layers.split(x, num_or_sections=2, dim=1)

        # Gated Unit.
        x = fluid.layers.elementwise_mul(fluid.layers.sigmoid(gate),
                                         fluid.layers.tanh(content))

        if skip is None:
            skip = x
        else:
            skip = fluid.layers.scale(skip + x, np.sqrt(0.5))

        if self.residual:
            x = fluid.layers.scale(residual + x, np.sqrt(0.5))

        return x, skip


def Conv2DTranspose(name_scope,
                    num_filters,
                    filter_size,
                    padding=0,
                    stride=1,
                    dilation=1,
                    use_cudnn=True,
                    act=None,
                    dtype="float32"):
    val = 1.0 / (filter_size[0] * filter_size[1])
    weight_init = fluid.initializer.ConstantInitializer(val)
    weight_attr = fluid.ParamAttr(initializer=weight_init)

    layer = weight_norm.Conv2DTranspose(
        name_scope,
        num_filters,
        filter_size=filter_size,
        padding=padding,
        stride=stride,
        dilation=dilation,
        param_attr=weight_attr,
        use_cudnn=use_cudnn,
        act=act,
        dtype=dtype)

    return layer
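Note (reviewer sketch, not part of the commit): the gated unit in Conv1D_GU above, which the model keeps using via parakeet.modules, is tanh(content) * sigmoid(gate) applied to the two halves of the channel dimension. A minimal NumPy version of just that activation:

    import numpy as np

    def gated_unit(x):
        # x: (batch, 2 * num_filters, 1, time); split channels into content and gate.
        content, gate = np.split(x, 2, axis=1)
        return np.tanh(content) * (1.0 / (1.0 + np.exp(-gate)))

    x = np.random.randn(2, 8, 1, 5).astype("float32")
    print(gated_unit(x).shape)  # (2, 4, 1, 5)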
@@ -4,12 +4,12 @@ import time
 
 import librosa
 import numpy as np
-from paddle import fluid
 import paddle.fluid.dygraph as dg
+from paddle import fluid
 
 import utils
 from data import LJSpeech
-from wavenet_modules import WaveNetModule, debug
+from wavenet_modules import WaveNetModule
 
 
 class WaveNet():
@@ -33,18 +33,6 @@ class WaveNet():
         self.trainloader = dataset.trainloader
         self.validloader = dataset.validloader
 
-        # if self.rank == 0:
-        #     for i, (audios, mels, ids) in enumerate(self.validloader()):
-        #         print("audios {}, mels {}, ids {}".format(audios.dtype, mels.dtype, ids.dtype))
-        #         print("{}: rank {}, audios {}, mels {}, indices {} / {}".format(
-        #             i, self.rank, audios.shape, mels.shape, ids.shape,
-        #             ids.numpy()))
-        #
-        #     for i, (audios, mels, ids) in enumerate(self.trainloader):
-        #         print("{}: rank {}, audios {}, mels {}, indices {} / {}".format(
-        #             i, self.rank, audios.shape, mels.shape, ids.shape,
-        #             ids.numpy()))
-
         wavenet = WaveNetModule("wavenet", config, self.rank)
 
         # Dry run once to create and initalize all necessary parameters.
@@ -139,8 +127,8 @@ class WaveNet():
         self.wavenet.eval()
 
         total_loss = []
-        start_time = time.time()
         sample_audios = []
+        start_time = time.time()
         for audios, mels, audio_starts in self.validloader():
             loss, sample_audio = self.wavenet(audios, mels, audio_starts, True)
             total_loss.append(float(loss.numpy()))
@@ -160,11 +148,6 @@ class WaveNet():
             tb.add_audio("Teacher-Forced-Audio-1", sample_audios[1].numpy(),
                          iteration, sample_rate=self.config.sample_rate)
 
-    def save(self, iteration):
-        utils.save_latest_parameters(self.checkpoint_dir, iteration,
-                                     self.wavenet, self.optimizer)
-        utils.save_latest_checkpoint(self.checkpoint_dir, iteration)
-
     @dg.no_grad
     def infer(self, iteration):
         self.wavenet.eval()
@@ -186,3 +169,8 @@ class WaveNet():
             syn_audio.shape, syn_time))
         librosa.output.write_wav(filename, syn_audio,
                                  sr=config.sample_rate)
+
+    def save(self, iteration):
+        utils.save_latest_parameters(self.checkpoint_dir, iteration,
+                                     self.wavenet, self.optimizer)
+        utils.save_latest_checkpoint(self.checkpoint_dir, iteration)
@@ -1,11 +1,9 @@
 import itertools
-import math
 
 import numpy as np
-from paddle import fluid
 import paddle.fluid.dygraph as dg
-import ops
+from paddle import fluid
-import weight_norm
+from parakeet.modules import conv, modules
 
 
 def get_padding(filter_size, stride, padding_type='same'):
@@ -16,22 +14,6 @@ def get_padding(filter_size, stride, padding_type='same'):
     return padding
 
 
-def debug(x, var_name, rank, verbose=False):
-    if not verbose and rank != 0:
-        return
-    dim = len(x.shape)
-    if not isinstance(x, np.ndarray):
-        x = x.numpy()
-    if dim == 1:
-        print("Rank {}".format(rank), var_name, "shape {}, value {}".format(x.shape, x))
-    elif dim == 2:
-        print("Rank {}".format(rank), var_name, "shape {}, value {}".format(x.shape, x[:, :5]))
-    elif dim == 3:
-        print("Rank {}".format(rank), var_name, "shape {}, value {}".format(x.shape, x[:, :5, 0]))
-    else:
-        print("Rank", rank, var_name, "shape", x.shape)
-
-
 def extract_slices(x, audio_starts, audio_length, rank):
     slices = []
     for i in range(x.shape[0]):
def extract_slices(x, audio_starts, audio_length, rank):
|
def extract_slices(x, audio_starts, audio_length, rank):
|
||||||
slices = []
|
slices = []
|
||||||
for i in range(x.shape[0]):
|
for i in range(x.shape[0]):
|
||||||
|
@ -58,7 +40,7 @@ class Conditioner(dg.Layer):
|
||||||
stride = (up_scale, 1)
|
stride = (up_scale, 1)
|
||||||
padding = get_padding(filter_sizes[i], stride)
|
padding = get_padding(filter_sizes[i], stride)
|
||||||
self.deconvs.append(
|
self.deconvs.append(
|
||||||
ops.Conv2DTranspose(
|
modules.Conv2DTranspose(
|
||||||
self.full_name(),
|
self.full_name(),
|
||||||
num_filters=1,
|
num_filters=1,
|
||||||
filter_size=filter_sizes[i],
|
filter_size=filter_sizes[i],
|
||||||
|
@@ -94,12 +76,13 @@ class WaveNetModule(dg.Layer):
         print("context_size", self.context_size)
 
         if config.loss_type == "softmax":
-            self.embedding_fc = ops.Embedding(
+            self.embedding_fc = modules.Embedding(
                 self.full_name(),
                 num_embeddings=config.num_channels,
-                embed_dim=config.residual_channels)
+                embed_dim=config.residual_channels,
+                std=0.1)
         elif config.loss_type == "mix-gaussian-pdf":
-            self.embedding_fc = ops.FC(
+            self.embedding_fc = modules.FC(
                 self.full_name(),
                 in_features=1,
                 size=config.residual_channels,
@@ -112,7 +95,7 @@ class WaveNetModule(dg.Layer):
         self.dilated_causal_convs = []
         for dilation in self.dilations:
            self.dilated_causal_convs.append(
-                ops.Conv1D_GU(
+                modules.Conv1D_GU(
                     self.full_name(),
                     conditioner_dim=config.mel_bands,
                     in_channels=config.residual_channels,
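Note (reviewer sketch, not part of the commit; it assumes self.dilations is the configured dilation_block cycled until `layers` entries exist): the receptive field of the dilated causal stack is (kernel_width - 1) * sum(dilations) + 1.

    kernel_width = 2
    layers = 30
    dilation_block = [1, 2, 4, 8, 16, 32, 64, 128, 256, 512]
    dilations = [dilation_block[i % len(dilation_block)] for i in range(layers)]
    receptive_field = (kernel_width - 1) * sum(dilations) + 1
    print(receptive_field)  # 3070 samples with these values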
@@ -126,7 +109,7 @@ class WaveNetModule(dg.Layer):
         for i, layer in enumerate(self.dilated_causal_convs):
             self.add_sublayer("dilated_causal_conv_{}".format(i), layer)
 
-        self.fc1 = ops.FC(
+        self.fc1 = modules.FC(
             self.full_name(),
             in_features=config.residual_channels,
             size=config.skip_channels,
@@ -134,7 +117,7 @@ class WaveNetModule(dg.Layer):
             relu=True,
             act="relu")
 
-        self.fc2 = ops.FC(
+        self.fc2 = modules.FC(
             self.full_name(),
             in_features=config.skip_channels,
             size=config.skip_channels,
@@ -143,14 +126,14 @@ class WaveNetModule(dg.Layer):
             act="relu")
 
         if config.loss_type == "softmax":
-            self.fc3 = ops.FC(
+            self.fc3 = modules.FC(
                 self.full_name(),
                 in_features=config.skip_channels,
                 size=config.num_channels,
                 num_flatten_dims=2,
                 relu=False)
         elif config.loss_type == "mix-gaussian-pdf":
-            self.fc3 = ops.FC(
+            self.fc3 = modules.FC(
                 self.full_name(),
                 in_features=config.skip_channels,
                 size=3 * config.num_mixtures,
@@ -175,8 +158,8 @@ class WaveNetModule(dg.Layer):
         return samples
 
     def sample_mix_gaussian(self, mix_parameters):
-        # mix_parameters reshape from [bs, 13799, 3 * num_mixtures]
-        # to [bs * 13799, 3 * num_mixtures].
+        # mix_parameters reshape from [bs, len, 3 * num_mixtures]
+        # to [bs * len, 3 * num_mixtures].
         batch, length, hidden = mix_parameters.shape
         mix_param_2d = fluid.layers.reshape(mix_parameters,
                                             [batch * length, hidden])
@@ -197,7 +180,7 @@ class WaveNetModule(dg.Layer):
         mu_comp = fluid.layers.gather_nd(mu, comp_samples)
         s_comp = fluid.layers.gather_nd(s, comp_samples)
 
-        # N(0, 1) Normal Sample.
+        # N(0, 1) normal sample.
         u = fluid.layers.gaussian_random(shape=[batch * length])
         samples = mu_comp + u * s_comp
         samples = fluid.layers.clip(samples, min=-1.0, max=1.0)
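Note (reviewer sketch, not part of the commit): sampling from the mixture amounts to drawing a component index according to pi and then sampling that component's Gaussian, which is what the gather_nd / gaussian_random code above does. The same idea in NumPy:

    import numpy as np

    def sample_mix_gaussian(pi, mu, s, rng=np.random.default_rng(0)):
        # pi, mu, s: (num_points, num_mixtures); returns one sample per row.
        comp = np.array([rng.choice(pi.shape[1], p=row) for row in pi])
        rows = np.arange(pi.shape[0])
        u = rng.standard_normal(pi.shape[0])
        return np.clip(mu[rows, comp] + u * s[rows, comp], -1.0, 1.0)

    pi = np.array([[0.7, 0.3], [0.2, 0.8]])
    mu = np.array([[-0.5, 0.5], [-0.1, 0.1]])
    s = np.array([[0.05, 0.05], [0.02, 0.02]])
    print(sample_mix_gaussian(pi, mu, s))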
@@ -205,8 +188,6 @@ class WaveNetModule(dg.Layer):
         return samples
 
     def softmax_loss(self, targets, mix_parameters):
-        # targets: [bs, 13799] -> [bs, 11752]
-        # mix_params: [bs, 13799, 3] -> [bs, 11752, 3]
         targets = targets[:, self.context_size:]
         mix_parameters = mix_parameters[:, self.context_size:, :]
 
@@ -216,22 +197,22 @@ class WaveNetModule(dg.Layer):
         quantized = fluid.layers.cast(
             (targets + 1.0) / 2.0 * num_channels, dtype="int64")
 
-        # per_sample_loss shape: [bs, 17952, 1]
+        # per_sample_loss shape: [bs, len, 1]
         per_sample_loss = fluid.layers.softmax_with_cross_entropy(
             logits=mix_parameters, label=fluid.layers.unsqueeze(quantized, 2))
         loss = fluid.layers.reduce_mean(per_sample_loss)
-        #debug(loss, "softmax loss", self.rank)
 
         return loss
 
     def mixture_density_loss(self, targets, mix_parameters, log_scale_min):
-        # targets: [bs, 13799] -> [bs, 11752]
-        # mix_params: [bs, 13799, 3] -> [bs, 11752, 3]
+        # targets: [bs, len]
+        # mix_params: [bs, len, 3 * num_mixture]
         targets = targets[:, self.context_size:]
         mix_parameters = mix_parameters[:, self.context_size:, :]
 
-        # log_s: [bs, 11752, num_mixture]
-        logits_pi, mu, log_s = fluid.layers.split(mix_parameters, num_or_sections=3, dim=-1)
+        # log_s: [bs, len, num_mixture]
+        logits_pi, mu, log_s = fluid.layers.split(
+            mix_parameters, num_or_sections=3, dim=-1)
 
         pi = fluid.layers.softmax(logits_pi, axis=-1)
         log_s = fluid.layers.clip(log_s, min=log_scale_min, max=100.0)
@@ -242,10 +223,9 @@ class WaveNetModule(dg.Layer):
         targets = fluid.layers.expand(targets, [1, 1, self.config.num_mixtures])
         x_std = inv_s * (targets - mu)
         exponent = fluid.layers.exp(-0.5 * x_std * x_std)
-        # pdf_x: [bs, 11752, 1]
         pdf_x = 1.0 / np.sqrt(2.0 * np.pi) * inv_s * exponent
         pdf_x = pi * pdf_x
-        # pdf_x: [bs, 11752]
+        # pdf_x: [bs, len]
         pdf_x = fluid.layers.reduce_sum(pdf_x, dim=-1)
         per_sample_loss = 0.0 - fluid.layers.log(pdf_x + 1e-9)
 
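Note (not part of the commit): the per-sample loss above is the negative log of a mixture-of-Gaussians density evaluated at each target sample. The same computation in NumPy, for reference:

    import numpy as np

    def mixture_density_nll(targets, pi, mu, log_s, log_scale_min=-9.0):
        # targets: (n,); pi, mu, log_s: (n, num_mixtures)
        log_s = np.clip(log_s, log_scale_min, 100.0)
        inv_s = np.exp(-log_s)
        x_std = inv_s * (targets[:, None] - mu)
        pdf_x = pi / np.sqrt(2.0 * np.pi) * inv_s * np.exp(-0.5 * x_std * x_std)
        return -np.log(pdf_x.sum(axis=-1) + 1e-9).mean()

    rng = np.random.default_rng(0)
    targets = rng.uniform(-1, 1, size=100)
    pi = np.full((100, 10), 0.1)
    mu = np.zeros((100, 10))
    log_s = np.zeros((100, 10))
    print(mixture_density_nll(targets, pi, mu, log_s))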
@@ -254,8 +234,6 @@ class WaveNetModule(dg.Layer):
         return loss
 
     def forward(self, audios, mels, audio_starts, sample=False):
-        # audios: [bs, 13800], mels: [bs, full_frame_length, 80]
-        # audio_starts: [bs]
         # Build conditioner based on mels.
         full_conditioner = self.conditioner(mels)
 
@@ -264,15 +242,14 @@ class WaveNetModule(dg.Layer):
         conditioner = extract_slices(full_conditioner,
                                      audio_starts, audio_length, self.rank)
 
-        # input_audio, target_audio: [bs, 13799]
+        # input_audio, target_audio: [bs, len]
         input_audios = audios[:, :-1]
         target_audios = audios[:, 1:]
-        # conditioner: [bs, 13799, 80]
+        # conditioner: [bs, len, mel_bands]
         conditioner = conditioner[:, 1:, :]
 
         loss_type = self.config.loss_type
 
-        # layer_input: [bs, 13799, 128]
         if loss_type == "softmax":
             input_audios = fluid.layers.clip(
                 input_audios, min=-1.0, max=0.99999)
@@ -280,31 +257,31 @@ class WaveNetModule(dg.Layer):
             quantized = fluid.layers.cast(
                 (input_audios + 1.0) / 2.0 * self.config.num_channels,
                 dtype="int64")
-            layer_input = self.embedding_fc(fluid.layers.unsqueeze(quantized, 2))
+            layer_input = self.embedding_fc(
+                fluid.layers.unsqueeze(quantized, 2))
         elif loss_type == "mix-gaussian-pdf":
-            layer_input = self.embedding_fc(fluid.layers.unsqueeze(input_audios, 2))
+            layer_input = self.embedding_fc(
+                fluid.layers.unsqueeze(input_audios, 2))
         else:
             raise ValueError(
                 "loss_type {} is unsupported!".format(loss_type))
 
-        # layer_input: [bs, res_channel, 1, 13799]
-        layer_input = fluid.layers.unsqueeze(fluid.layers.transpose(layer_input, perm=[0, 2, 1]), 2)
-        # conditioner: [bs, mel_bands, 1, 13799]
-        conditioner = fluid.layers.unsqueeze(fluid.layers.transpose(conditioner, perm=[0, 2, 1]), 2)
+        # layer_input: [bs, res_channel, 1, len]
+        layer_input = fluid.layers.unsqueeze(
+            fluid.layers.transpose(layer_input, perm=[0, 2, 1]), 2)
+        # conditioner: [bs, mel_bands, 1, len]
+        conditioner = fluid.layers.unsqueeze(
+            fluid.layers.transpose(conditioner, perm=[0, 2, 1]), 2)
 
-        # layer_input: [bs, res_channel, 1, 13799]
-        # skip: [bs, res_channel, 1, 13799]
         skip = None
         for i, layer in enumerate(self.dilated_causal_convs):
+            # layer_input: [bs, res_channel, 1, len]
+            # skip: [bs, res_channel, 1, len]
             layer_input, skip = layer(layer_input, skip, conditioner)
-            #debug(layer_input, "layer_input_" + str(i), self.rank)
-            #debug(skip, "skip_" + str(i), self.rank)
 
-        # Reshape skip to [bs, 13799, res_channel]
-        skip = fluid.layers.transpose(fluid.layers.squeeze(skip, [2]), perm=[0, 2, 1])
-        #debug(skip, "skip", self.rank)
+        # Reshape skip to [bs, len, res_channel]
+        skip = fluid.layers.transpose(
+            fluid.layers.squeeze(skip, [2]), perm=[0, 2, 1])
 
-        # mix_param: [bs, 13799, 3 * num_mixtures]
         mix_parameters = self.fc3(self.fc2(self.fc1(skip)))
 
         # Sample teacher-forced audio.
@@ -317,12 +294,7 @@ class WaveNetModule(dg.Layer):
         else:
             raise ValueError(
                 "loss_type {} is unsupported!".format(loss_type))
-        #debug(sample_audios, "sample_audios", self.rank)
 
-        # Calculate mix-gaussian density loss.
-        # padding is all zero.
-        # target_audio: [bs, 13799].
-        # mix_params: [bs, 13799, 3].
         if loss_type == "softmax":
             loss = self.softmax_loss(target_audios, mix_parameters)
         elif loss_type == "mix-gaussian-pdf":
@@ -332,27 +304,16 @@ class WaveNetModule(dg.Layer):
             raise ValueError(
                 "loss_type {} is unsupported!".format(loss_type))
 
-        #print("Rank {}, loss {}".format(self.rank, loss.numpy()))
 
         return loss, sample_audios
 
     def synthesize(self, mels):
         self.start_new_sequence()
-        print("input mels shape", mels.shape)
-        # mels: [bs=1, n_frames, 80]
-        # conditioner: [1, n_frames * samples_per_frame, 80]
-        # Should I move forward by one sample? No difference
-        # Append context frame to mels
         bs, n_frames, mel_bands = mels.shape
-        #num_pad_frames = int(np.ceil(self.context_size / self.config.fft_window_shift))
-        #silence = fluid.layers.zeros(shape=[bs, num_pad_frames, mel_bands], dtype="float32")
-        #inf_mels = fluid.layers.concat([silence, mels], axis=1)
-        #print("padded mels shape", inf_mels.shape)
-
-        #conditioner = self.conditioner(inf_mels)[:, self.context_size:, :]
         conditioner = self.conditioner(mels)
         time_steps = conditioner.shape[1]
-        print("Total steps", time_steps)
+        print("input mels shape", mels.shape)
+        print("Total synthesis steps", time_steps)
 
         loss_type = self.config.loss_type
         audio_samples = []
@@ -361,8 +322,8 @@ class WaveNetModule(dg.Layer):
             if i % 100 == 0:
                 print("Step", i)
 
-            # convert from real value sample to audio embedding.
-            # [bs, 1, 128]
+            # Convert from real value sample to audio embedding.
+            # audio_input: [bs, 1, channel]
             if loss_type == "softmax":
                 current_sample = fluid.layers.clip(
                     current_sample, min=-1.0, max=0.99999)
|
||||||
raise ValueError(
|
raise ValueError(
|
||||||
"loss_type {} is unsupported!".format(loss_type))
|
"loss_type {} is unsupported!".format(loss_type))
|
||||||
|
|
||||||
# [bs, 128, 1, 1]
|
# [bs, channel, 1, 1]
|
||||||
audio_input = fluid.layers.unsqueeze(fluid.layers.transpose(audio_input, perm=[0, 2, 1]), 2)
|
audio_input = fluid.layers.unsqueeze(
|
||||||
# [bs, 80]
|
fluid.layers.transpose(audio_input, perm=[0, 2, 1]), 2)
|
||||||
|
# [bs, mel_bands]
|
||||||
cond_input = conditioner[:, i, :]
|
cond_input = conditioner[:, i, :]
|
||||||
# [bs, 80, 1, 1]
|
# [bs, mel_bands, 1, 1]
|
||||||
cond_input = fluid.layers.reshape(
|
cond_input = fluid.layers.reshape(
|
||||||
cond_input, cond_input.shape + [1, 1])
|
cond_input, cond_input.shape + [1, 1])
|
||||||
|
|
||||||
skip = None
|
skip = None
|
||||||
for layer in self.dilated_causal_convs:
|
for layer in self.dilated_causal_convs:
|
||||||
audio_input, skip = layer.add_input(audio_input, skip, cond_input)
|
audio_input, skip = layer.add_input(
|
||||||
|
audio_input, skip, cond_input)
|
||||||
|
|
||||||
# [bs, 1, 128]
|
# [bs, 1, channel]
|
||||||
skip = fluid.layers.transpose(fluid.layers.squeeze(skip, [2]), perm=[0, 2, 1])
|
skip = fluid.layers.transpose(
|
||||||
# [bs, 1, 3]
|
fluid.layers.squeeze(skip, [2]), perm=[0, 2, 1])
|
||||||
mix_parameters = self.fc3(self.fc2(self.fc1(skip)))
|
mix_parameters = self.fc3(self.fc2(self.fc1(skip)))
|
||||||
if loss_type == "softmax":
|
if loss_type == "softmax":
|
||||||
sample = self.sample_softmax(mix_parameters)
|
sample = self.sample_softmax(mix_parameters)
|
||||||
|
@ -407,17 +370,12 @@ class WaveNetModule(dg.Layer):
|
||||||
current_sample = fluid.layers.reshape(current_sample,
|
current_sample = fluid.layers.reshape(current_sample,
|
||||||
current_sample.shape + [1, 1])
|
current_sample.shape + [1, 1])
|
||||||
|
|
||||||
# syn_audio: (num_samples,)
|
# syn_audio: [num_samples]
|
||||||
syn_audio = fluid.layers.concat(audio_samples, axis=0).numpy()
|
syn_audio = fluid.layers.concat(audio_samples, axis=0).numpy()
|
||||||
|
|
||||||
return syn_audio
|
return syn_audio
|
||||||
|
|
||||||
def start_new_sequence(self):
|
def start_new_sequence(self):
|
||||||
for layer in self.sublayers():
|
for layer in self.sublayers():
|
||||||
if isinstance(layer, weight_norm.Conv1D):
|
if isinstance(layer, conv.Conv1D):
|
||||||
layer.start_new_sequence()
|
layer.start_new_sequence()
|
||||||
|
|
||||||
def save(self, iteration):
|
|
||||||
utils.save_latest_parameters(self.checkpoint_dir, iteration,
|
|
||||||
self.wavenet, self.optimizer)
|
|
||||||
utils.save_latest_checkpoint(self.checkpoint_dir, iteration)
|
|
||||||
|
|
|
@ -1,920 +0,0 @@
|
||||||
import math
|
|
||||||
from copy import deepcopy
|
|
||||||
|
|
||||||
import numpy as np
|
|
||||||
import paddle.fluid.dygraph as dg
|
|
||||||
from paddle import fluid
|
|
||||||
from paddle.fluid import core
|
|
||||||
from paddle.fluid.framework import Variable
|
|
||||||
from paddle.fluid.initializer import Normal, Constant, NumpyArrayInitializer
|
|
||||||
from paddle.fluid.layers import utils
|
|
||||||
from six.moves import reduce
|
|
||||||
|
|
||||||
|
|
||||||
def _norm(p, dim):
|
|
||||||
"""Computes the norm over all dimensions except dim.
|
|
||||||
It differs from pytorch implementation that it does not keep dim.
|
|
||||||
This difference is related with the broadcast mechanism in paddle.
|
|
||||||
Read elementeise_mul for more.
|
|
||||||
"""
|
|
||||||
if dim is None:
|
|
||||||
return np.linalg.norm(p, ord=2, axis=None)
|
|
||||||
elif dim == 0:
|
|
||||||
p = np.reshape(p, newshape=(p.shape[0], -1))
|
|
||||||
return np.linalg.norm(p, ord=2, axis=1)
|
|
||||||
elif dim == p.ndim - 1:
|
|
||||||
p = np.reshape(p, newshape=(-1, p.shape[-1]))
|
|
||||||
return np.linalg.norm(p, ord=2, axis=0)
|
|
||||||
else:
|
|
||||||
perm = list(range(p.ndim))
|
|
||||||
perm[0] = dim
|
|
||||||
perm[dim] = 0
|
|
||||||
return _norm(np.transpose(p, axes=perm))
|
|
||||||
|
|
||||||
|
|
||||||
class Conv1D(dg.Layer):
|
|
||||||
"""
|
|
||||||
A convolution 1D block implemented with Conv2D. Form simplicity and
|
|
||||||
ensuring the output has the same length as the input, it does not allow
|
|
||||||
stride > 1.
|
|
||||||
"""
|
|
||||||
def __init__(self,
|
|
||||||
name_scope,
|
|
||||||
num_filters,
|
|
||||||
filter_size=3,
|
|
||||||
dilation=1,
|
|
||||||
groups=None,
|
|
||||||
causal=False,
|
|
||||||
param_attr=None,
|
|
||||||
bias_attr=None,
|
|
||||||
use_cudnn=True,
|
|
||||||
act=None,
|
|
||||||
dtype="float32"):
|
|
||||||
super(Conv1D, self).__init__(name_scope, dtype=dtype)
|
|
||||||
|
|
||||||
if causal:
|
|
||||||
padding = dilation * (filter_size - 1)
|
|
||||||
else:
|
|
||||||
padding = (dilation * (filter_size - 1)) // 2
|
|
||||||
|
|
||||||
self.num_filters = num_filters
|
|
||||||
self.filter_size = filter_size
|
|
||||||
self.dilation = dilation
|
|
||||||
self.causal = causal
|
|
||||||
self.padding = padding
|
|
||||||
self.act = act
|
|
||||||
|
|
||||||
self.conv = Conv2D(
|
|
||||||
self.full_name(),
|
|
||||||
num_filters=num_filters,
|
|
||||||
filter_size=(1, filter_size),
|
|
||||||
stride=(1, 1),
|
|
||||||
dilation=(1, dilation),
|
|
||||||
padding=(0, padding),
|
|
||||||
groups=groups,
|
|
||||||
param_attr=param_attr,
|
|
||||||
bias_attr=bias_attr,
|
|
||||||
use_cudnn=use_cudnn,
|
|
||||||
act=act,
|
|
||||||
dtype=dtype)
|
|
||||||
|
|
||||||
def forward(self, x):
|
|
||||||
"""
|
|
||||||
Args:
|
|
||||||
x (Variable): Shape(B, C_in, 1, T), the input, where C_in means
|
|
||||||
input channels.
|
|
||||||
Returns:
|
|
||||||
x (Variable): Shape(B, C_out, 1, T), the outputs, where C_out means
|
|
||||||
output channels (num_filters).
|
|
||||||
"""
|
|
||||||
x = self.conv(x)
|
|
||||||
if self.filter_size > 1:
|
|
||||||
if self.causal:
|
|
||||||
x = fluid.layers.slice(
|
|
||||||
x, axes=[3], starts=[0], ends=[-self.padding])
|
|
||||||
elif self.filter_size % 2 == 0:
|
|
||||||
x = fluid.layers.slice(x, axes=[3], starts=[0], ends=[-1])
|
|
||||||
return x
|
|
||||||
|
|
||||||
def start_new_sequence(self):
|
|
||||||
self.temp_weight = None
|
|
||||||
self.input_buffer = None
|
|
||||||
|
|
||||||
def add_input(self, x):
|
|
||||||
"""
|
|
||||||
Adding input for a time step and compute an output for a time step.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
x (Variable): Shape(B, C_in, 1, T), the input, where C_in means
|
|
||||||
input channels, and T = 1.
|
|
||||||
Returns:
|
|
||||||
out (Variable): Shape(B, C_out, 1, T), the outputs, where C_out
|
|
||||||
means output channels (num_filters), and T = 1.
|
|
||||||
|
|
||||||
"""
|
|
||||||
if self.temp_weight is None:
|
|
||||||
self.temp_weight = self._reshaped_weight()
|
|
||||||
|
|
||||||
window_size = 1 + (self.filter_size - 1) * self.dilation
|
|
||||||
batch_size = x.shape[0]
|
|
||||||
in_channels = x.shape[1]
|
|
||||||
|
|
||||||
if self.filter_size > 1:
|
|
||||||
if self.input_buffer is None:
|
|
||||||
self.input_buffer = fluid.layers.fill_constant(
|
|
||||||
[batch_size, in_channels, 1, window_size - 1],
|
|
||||||
dtype=x.dtype,
|
|
||||||
value=0.0)
|
|
||||||
else:
|
|
||||||
self.input_buffer = self.input_buffer[:, :, :, 1:]
|
|
||||||
self.input_buffer = fluid.layers.concat(
|
|
||||||
[self.input_buffer, x], axis=3)
|
|
||||||
x = self.input_buffer
|
|
||||||
if self.dilation > 1:
|
|
||||||
if not hasattr(self, "indices"):
|
|
||||||
self.indices = dg.to_variable(
|
|
||||||
np.arange(0, window_size, self.dilation))
|
|
||||||
tmp = fluid.layers.transpose(
|
|
||||||
self.input_buffer, perm=[3, 1, 2, 0])
|
|
||||||
tmp = fluid.layers.gather(tmp, index=self.indices)
|
|
||||||
tmp = fluid.layers.transpose(tmp, perm=[3, 1, 2, 0])
|
|
||||||
x = tmp
|
|
||||||
inputs = fluid.layers.reshape(
|
|
||||||
x, shape=[batch_size, in_channels * 1 * self.filter_size])
|
|
||||||
out = fluid.layers.matmul(inputs, self.temp_weight, transpose_y=True)
|
|
||||||
out = fluid.layers.elementwise_add(out, self.conv._bias_param, axis=-1)
|
|
||||||
out = fluid.layers.reshape(out, out.shape + [1, 1])
|
|
||||||
out = self._helper.append_activation(out, act=self.act)
|
|
||||||
return out
|
|
||||||
|
|
||||||
def _reshaped_weight(self):
|
|
||||||
"""
|
|
||||||
Get the linearized weight of convolution filter, cause it is by nature
|
|
||||||
a matmul weight. And because the model uses weight norm, compute the
|
|
||||||
weight by weight_v * weight_g to make it faster.
|
|
||||||
Returns:
|
|
||||||
weight_matrix (Variable): Shape(C_out, C_in * 1 * kernel_size)
|
|
||||||
"""
|
|
||||||
shape = self.conv._filter_param_v.shape
|
|
||||||
matrix_shape = [shape[0], np.prod(shape[1:])]
|
|
||||||
weight_matrix = fluid.layers.reshape(
|
|
||||||
self.conv._filter_param_v, shape=matrix_shape)
|
|
||||||
weight_matrix = fluid.layers.elementwise_mul(
|
|
||||||
fluid.layers.l2_normalize(
|
|
||||||
weight_matrix, axis=1),
|
|
||||||
self.conv._filter_param_g,
|
|
||||||
axis=0)
|
|
||||||
return weight_matrix
|
|
||||||
|
|
||||||
|
|
||||||
class FC(dg.Layer):
|
|
||||||
"""
|
|
||||||
**Fully Connected Layer**
|
|
||||||
This function creates a fully connected layer in the network. It can take
|
|
||||||
one or multiple tensors as its inputs(input can be a list of Variable, see
|
|
||||||
Args in detail). It creates a pair of variables called (magnitude(g),
|
|
||||||
direction(V)) for each input tensor. Elementwise_mul(V, g) represents a fully connected
|
|
||||||
weight matrix from each input unit to each output unit.
|
|
||||||
The fully connected layer multiplies each input tensor
|
|
||||||
with its corresponding weight to produce an output Tensor with shape [M, `size`],
|
|
||||||
where M is batch size. If multiple input tensors are given, the results of
|
|
||||||
multiple output tensors with shape [M, `size`] will be summed up. If bias_attr
|
|
||||||
is not None, a bias variable will be created and added to the output.
|
|
||||||
Finally, if activation is not None, it will be applied to the output as well.
|
|
||||||
When the input is single tensor:
|
|
||||||
.. math::
|
|
||||||
Out = Act({X(normalize(V)g) + b})
|
|
||||||
When the input are multiple tensors:
|
|
||||||
.. math::
|
|
||||||
Out = Act({\sum_{i=0}^{N-1}X_i(V_ig_i) + b})
|
|
||||||
In the above equation:
|
|
||||||
* :math:`N`: Number of the input. N equals to len(input) if input is list of Variable.
|
|
||||||
* :math:`X_i`: The i-th input tensor.
|
|
||||||
* :math:`V_i`: The i-th direction matrix corresponding i-th input tensor.
|
|
||||||
* :math:`g_i`: The i-th magnitude vector corresponding i-th input tensor.
|
|
||||||
* :math:`b`: The bias parameter created by this layer (if needed).
|
|
||||||
* :math:`Act`: The activation function.
|
|
||||||
* :math:`Out`: The output tensor.
|
|
||||||
See below for an example.
|
|
||||||
.. code-block:: text
|
|
||||||
Given:
|
|
||||||
data_1.data = [[[0.1, 0.2],
|
|
||||||
[0.3, 0.4]]]
|
|
||||||
data_1.shape = (1, 2, 2) # 1 is batch_size
|
|
||||||
data_2 = [[[0.1, 0.2, 0.3]]]
|
|
||||||
data_2.shape = (1, 1, 3)
|
|
||||||
out = fluid.layers.fc(input=[data_1, data_2], size=2)
|
|
||||||
Then:
|
|
||||||
out.data = [[0.18669507, 0.1893476]]
|
|
||||||
out.shape = (1, 2)
|
|
||||||
Args:
|
|
||||||
name_scope(str): The name of this class.
|
|
||||||
size(int): The number of output units in this layer.
|
|
||||||
num_flatten_dims (int): The fc layer can accept an input tensor with more than
|
|
||||||
two dimensions. If this happens, the multidimensional tensor will first be flattened
|
|
||||||
into a 2-dimensional matrix. The parameter `num_flatten_dims` determines how the input
|
|
||||||
tensor is flattened: the first `num_flatten_dims` (inclusive, index starts from 1)
|
|
||||||
dimensions will be flatten to form the first dimension of the final matrix (height of
|
|
||||||
the matrix), and the rest `rank(X) - num_flatten_dims` dimensions are flattened to
|
|
||||||
form the second dimension of the final matrix (width of the matrix). For example, suppose
|
|
||||||
`X` is a 5-dimensional tensor with a shape [2, 3, 4, 5, 6], and `num_flatten_dims` = 3.
|
|
||||||
Then, the flattened matrix will have a shape [2 x 3 x 4, 5 x 6] = [24, 30]. Default: 1
|
|
||||||
param_attr (ParamAttr|list of ParamAttr|None): The parameter attribute for learnable
|
|
||||||
parameters/weights of this layer.
|
|
||||||
bias_attr (ParamAttr|list of ParamAttr, default None): The parameter attribute for the bias
|
|
||||||
of this layer. If it is set to False, no bias will be added to the output units.
|
|
||||||
If it is set to None, the bias is initialized zero. Default: None.
|
|
||||||
act (str|None): Activation to be applied to the output of this layer.
|
|
||||||
is_test(bool): A flag indicating whether execution is in test phase. Default: False
|
|
||||||
dtype(str): Dtype used for weight
|
|
||||||
Raises:
|
|
||||||
ValueError: If rank of the input tensor is less than 2.
|
|
||||||
Examples:
|
|
||||||
.. code-block:: python
|
|
||||||
from paddle.fluid.dygraph.base import to_variable
|
|
||||||
import paddle.fluid as fluid
|
|
||||||
from paddle.fluid.dygraph import FC
|
|
||||||
import numpy as np
|
|
||||||
data = np.random.uniform( -1, 1, [30, 10, 32] ).astype('float32')
|
|
||||||
with fluid.dygraph.guard():
|
|
||||||
fc = FC( "fc", 64, num_flatten_dims=2)
|
|
||||||
data = to_variable( data )
|
|
||||||
conv = fc( data )
|
|
||||||
"""
|
|
||||||
|
|
||||||
def __init__(self,
|
|
||||||
name_scope,
|
|
||||||
size,
|
|
||||||
num_flatten_dims=1,
|
|
||||||
epsilon=1e-30,
|
|
||||||
param_attr=None,
|
|
||||||
bias_attr=None,
|
|
||||||
act=None,
|
|
||||||
is_test=False,
|
|
||||||
dtype="float32"):
|
|
||||||
super(FC, self).__init__(name_scope, dtype)
|
|
||||||
|
|
||||||
self._size = size
|
|
||||||
self._num_flatten_dims = num_flatten_dims
|
|
||||||
self._epsilon = epsilon
|
|
||||||
self._dtype = dtype
|
|
||||||
self._param_attr = param_attr
|
|
||||||
self._bias_attr = bias_attr
|
|
||||||
self._act = act
|
|
||||||
self.__g = list()
|
|
||||||
self.__v = list()
|
|
||||||
|
|
||||||
@property
|
|
||||||
def _v(self, i=0):
|
|
||||||
return self.__v[i]
|
|
||||||
|
|
||||||
@property
|
|
||||||
def _g(self, i=0):
|
|
||||||
return self.__g[i]
|
|
||||||
|
|
||||||
@_v.setter
|
|
||||||
def _v(self, value, i=0):
|
|
||||||
assert isinstance(value, Parameter)
|
|
||||||
self.__v[i] = value
|
|
||||||
|
|
||||||
@_g.setter
|
|
||||||
def _g(self, value, i=0):
|
|
||||||
assert isinstance(value, Parameter)
|
|
||||||
self.__g[i] = value
|
|
||||||
|
|
||||||
def _build_once(self, input):
|
|
||||||
i = 0
|
|
||||||
for inp, param in self._helper.iter_inputs_and_params(
|
|
||||||
input, self._param_attr):
|
|
||||||
input_shape = inp.shape
|
|
||||||
|
|
||||||
param_shape = [
|
|
||||||
reduce(lambda a, b: a * b,
|
|
||||||
input_shape[self._num_flatten_dims:], 1)
|
|
||||||
] + [self._size]
|
|
||||||
self.__v.append(
|
|
||||||
self.add_parameter(
|
|
||||||
"_v%d" % i,
|
|
||||||
self.create_parameter(
|
|
||||||
attr=param,
|
|
||||||
shape=param_shape,
|
|
||||||
dtype=self._dtype,
|
|
||||||
is_bias=False)))
|
|
||||||
|
|
||||||
magnitude_shape = param_shape[1:]
|
|
||||||
magnitude_value = np.linalg.norm(
|
|
||||||
self.__v[i].numpy(), ord=2, axis=0)
|
|
||||||
|
|
||||||
self.__g.append(
|
|
||||||
self.add_parameter(
|
|
||||||
"_g%d" % i,
|
|
||||||
self.create_parameter(
|
|
||||||
attr=fluid.ParamAttr(initializer=fluid.initializer.
|
|
||||||
NumpyArrayInitializer(
|
|
||||||
magnitude_value)),
|
|
||||||
shape=magnitude_shape,
|
|
||||||
dtype=self._dtype,
|
|
||||||
is_bias=False)))
|
|
||||||
i += 1
|
|
||||||
|
|
||||||
size = list([self._size])
|
|
||||||
self._b = self.create_parameter(
|
|
||||||
attr=self._bias_attr, shape=size, dtype=self._dtype, is_bias=True)
|
|
||||||
|
|
||||||
def forward(self, input):
|
|
||||||
mul_results = list()
|
|
||||||
i = 0
|
|
||||||
for inp, param in self._helper.iter_inputs_and_params(
|
|
||||||
input, self._param_attr):
|
|
||||||
v_norm = self._helper.create_variable_for_type_inference(
|
|
||||||
self._dtype)
|
|
||||||
v_normalized = self._helper.create_variable_for_type_inference(
|
|
||||||
self._dtype)
|
|
||||||
self._helper.append_op(
|
|
||||||
type="norm",
|
|
||||||
inputs={"X": self.__v[i]},
|
|
||||||
outputs={"Out": v_normalized,
|
|
||||||
"Norm": v_norm},
|
|
||||||
attrs={"axis": 0,
|
|
||||||
"epsilon": self._epsilon})
|
|
||||||
weight = self._helper.create_variable_for_type_inference(
|
|
||||||
self._dtype)
|
|
||||||
self._helper.append_op(
|
|
||||||
type="elementwise_mul",
|
|
||||||
inputs={"X": [v_normalized],
|
|
||||||
"Y": [self.__g[i]]},
|
|
||||||
outputs={"Out": [weight]},
|
|
||||||
attrs={"axis": 1})
|
|
||||||
tmp = self._helper.create_variable_for_type_inference(self._dtype)
|
|
||||||
self._helper.append_op(
|
|
||||||
type="mul",
|
|
||||||
inputs={"X": inp,
|
|
||||||
"Y": weight},
|
|
||||||
outputs={"Out": tmp},
|
|
||||||
attrs={
|
|
||||||
"x_num_col_dims": self._num_flatten_dims,
|
|
||||||
"y_num_col_dims": 1
|
|
||||||
})
|
|
||||||
i += 1
|
|
||||||
mul_results.append(tmp)
|
|
||||||
|
|
||||||
if len(mul_results) == 1:
|
|
||||||
pre_bias = mul_results[0]
|
|
||||||
else:
|
|
||||||
pre_bias = self._helper.create_variable_for_type_inference(
|
|
||||||
self._dtype)
|
|
||||||
self._helper.append_op(
|
|
||||||
type="sum",
|
|
||||||
inputs={"X": mul_results},
|
|
||||||
outputs={"Out": pre_bias},
|
|
||||||
attrs={"use_mkldnn": False})
|
|
||||||
|
|
||||||
if self._b:
|
|
||||||
pre_activation = self._helper.create_variable_for_type_inference(
|
|
||||||
dtype=self._dtype)
|
|
||||||
self._helper.append_op(
|
|
||||||
type="elementwise_add",
|
|
||||||
inputs={"X": [pre_bias],
|
|
||||||
"Y": [self._b]},
|
|
||||||
outputs={"Out": [pre_activation]},
|
|
||||||
attrs={"axis": self._num_flatten_dims})
|
|
||||||
else:
|
|
||||||
pre_activation = pre_bias
|
|
||||||
# Currently, we don't support inplace in dygraph mode
|
|
||||||
return self._helper.append_activation(pre_activation, act=self._act)
|
|
||||||
|
|
||||||
|
|
||||||
class Conv2D(dg.Layer):
    """
    The convolution2D layer calculates the output based on the input, filter
    and strides, paddings, dilations, groups parameters. Input and
    Output are in NCHW format, where N is batch size, C is the number of
    channels, H is the height of the feature, and W is the width of the feature.
    Filter is in MCHW format, where M is the number of output image channels,
    C is the number of input image channels, H is the height of the filter,
    and W is the width of the filter. If the groups is greater than 1,
    C will equal the number of input image channels divided by the groups.
    Please refer to UFLDL's `convolution
    <http://ufldl.stanford.edu/tutorial/supervised/FeatureExtractionUsingConvolution/>`_
    for more details.
    If a bias attribute and activation type are provided, bias is added to the
    output of the convolution, and the corresponding activation function is
    applied to the final result.

    For each input :math:`X`, the equation is:

    .. math::

        Out = \\sigma ((Vg) \\ast X + b)

    Where:

    * :math:`X`: Input value, a tensor with NCHW format.
    * :math:`V`: Filter direction value, a tensor with MCHW format.
    * :math:`g`: Filter magnitude value, a tensor with M format.
    * :math:`\\ast`: Convolution operation.
    * :math:`b`: Bias value, a 2-D tensor with shape [M, 1].
    * :math:`\\sigma`: Activation function.
    * :math:`Out`: Output value, the shape of :math:`Out` and :math:`X` may be different.

    Example:

        - Input:

          Input shape: :math:`(N, C_{in}, H_{in}, W_{in})`

          Filter shape: :math:`(C_{out}, C_{in}, H_f, W_f)`

        - Output:

          Output shape: :math:`(N, C_{out}, H_{out}, W_{out})`

        Where

        .. math::

            H_{out} &= \\frac{(H_{in} + 2 * paddings[0] - (dilations[0] * (H_f - 1) + 1))}{strides[0]} + 1 \\\\
            W_{out} &= \\frac{(W_{in} + 2 * paddings[1] - (dilations[1] * (W_f - 1) + 1))}{strides[1]} + 1

    Args:
        name_scope(str): The name for this class.
        num_filters(int): The number of filters. It is the same as the output
            image channels.
        filter_size (int|tuple|None): The filter size. If filter_size is a tuple,
            it must contain two integers, (filter_size_H, filter_size_W).
            Otherwise, the filter will be a square.
        stride (int|tuple): The stride size. If stride is a tuple, it must
            contain two integers, (stride_H, stride_W). Otherwise,
            stride_H = stride_W = stride. Default: stride = 1.
        padding (int|tuple): The padding size. If padding is a tuple, it must
            contain two integers, (padding_H, padding_W). Otherwise,
            padding_H = padding_W = padding. Default: padding = 0.
        dilation (int|tuple): The dilation size. If dilation is a tuple, it must
            contain two integers, (dilation_H, dilation_W). Otherwise,
            dilation_H = dilation_W = dilation. Default: dilation = 1.
        groups (int): The groups number of the Conv2d Layer. According to grouped
            convolution in Alex Krizhevsky's Deep CNN paper: when group=2,
            the first half of the filters is only connected to the first half
            of the input channels, while the second half of the filters is only
            connected to the second half of the input channels. Default: groups=1.
        param_attr (ParamAttr|None): The parameter attribute for learnable parameters/weights
            of conv2d. If it is set to None or one attribute of ParamAttr, conv2d
            will create ParamAttr as param_attr. If the Initializer of the param_attr
            is not set, the parameter is initialized with :math:`Normal(0.0, std)`,
            and the :math:`std` is :math:`(\\frac{2.0 }{filter\\_elem\\_num})^{0.5}`. Default: None.
        bias_attr (ParamAttr|bool|None): The parameter attribute for the bias of conv2d.
            If it is set to False, no bias will be added to the output units.
            If it is set to None or one attribute of ParamAttr, conv2d
            will create ParamAttr as bias_attr. If the Initializer of the bias_attr
            is not set, the bias is initialized zero. Default: None.
        use_cudnn (bool): Use cudnn kernel or not, it is valid only when the cudnn
            library is installed. Default: True
        act (str): Activation type, if it is set to None, activation is not appended.
            Default: None

    Raises:
        ValueError: If the shapes of input, filter_size, stride, padding and
            groups mismatch.

    Examples:
        .. code-block:: python

            from paddle.fluid.dygraph.base import to_variable
            import paddle.fluid as fluid
            from paddle.fluid.dygraph import Conv2D
            import numpy as np

            data = np.random.uniform(-1, 1, [10, 3, 32, 32]).astype('float32')
            with fluid.dygraph.guard():
                conv2d = Conv2D("conv2d", 2, 3)
                data = to_variable(data)
                conv = conv2d(data)
    """

    def __init__(self,
                 name_scope,
                 num_filters,
                 filter_size,
                 stride=1,
                 padding=0,
                 dilation=1,
                 groups=None,
                 param_attr=None,
                 bias_attr=None,
                 use_cudnn=True,
                 act=None,
                 epsilon=1e-30,
                 dtype="float32"):
        assert param_attr is not False, "param_attr should not be False here."
        super(Conv2D, self).__init__(name_scope, dtype)
        self._groups = groups
        self._stride = utils.convert_to_list(stride, 2, "stride")
        self._padding = utils.convert_to_list(padding, 2, "padding")
        self._dilation = utils.convert_to_list(dilation, 2, "dilation")
        self._act = act
        if not isinstance(use_cudnn, bool):
            raise ValueError("use_cudnn should be True or False")
        self._use_cudnn = use_cudnn
        self._filter_size = filter_size
        self._num_filters = num_filters
        self._param_attr = param_attr
        self._bias_attr = bias_attr
        self._epsilon = epsilon
        self._dtype = dtype
        # if (self._num_channels == self._groups and
        #         num_filters % self._num_channels == 0 and not self._use_cudnn):
        #     self._l_type = 'depthwise_conv2d'
        # else:
        # TODO(jiabin): recover the usage of depthwise_conv2d when it's
        # kernel fixed https://github.com/PaddlePaddle/Paddle/issues/17275
        self._l_type = "conv2d"

    def _build_once(self, input):
        self._num_channels = input.shape[1]
        if self._groups is None:
            num_filter_channels = self._num_channels
        else:
            if self._num_channels % self._groups != 0:
                raise ValueError("num_channels must be divisible by groups.")
            num_filter_channels = self._num_channels // self._groups
        filter_size = utils.convert_to_list(self._filter_size, 2,
                                            "filter_size")
        filter_shape = [self._num_filters, int(num_filter_channels)
                        ] + filter_size

        def _get_default_param_initializer():
            filter_elem_num = filter_size[0] * filter_size[
                1] * self._num_channels
            std = (2.0 / filter_elem_num)**0.5
            return Normal(0.0, std, 0)

        # weight_v
        self._filter_param_v = self.create_parameter(
            attr=self._param_attr,
            shape=filter_shape,
            dtype=self._dtype,
            default_initializer=_get_default_param_initializer())

        # weight_g
        norm_value = _norm(
            self._filter_param_v.numpy(), dim=0)  # CAUTION: hard-code
        self._filter_param_g = self.create_parameter(
            attr=fluid.ParamAttr(
                initializer=fluid.initializer.NumpyArrayInitializer(
                    norm_value)),
            shape=norm_value.shape,
            dtype=self._dtype,
            default_initializer=_get_default_param_initializer())

        if self._use_cudnn:
            self.create_variable(
                name="kCUDNNFwdAlgoCache",
                persistable=True,
                type=core.VarDesc.VarType.RAW)
            self.create_variable(
                name="kCUDNNBwdDataAlgoCache",
                persistable=True,
                type=core.VarDesc.VarType.RAW)
            self.create_variable(
                name="kCUDNNBwdFilterAlgoCache",
                persistable=True,
                type=core.VarDesc.VarType.RAW)

        self._bias_param = self.create_parameter(
            attr=self._bias_attr,
            shape=[self._num_filters],
            dtype=self._dtype,
            is_bias=True)

    def forward(self, input):
        matrix = self._helper.create_variable_for_type_inference(self._dtype)
        tmp = self._helper.create_variable_for_type_inference(self._dtype)
        new_shape = [
            self._filter_param_v.shape[0],
            reduce(lambda x, y: x * y, self._filter_param_v.shape[1:], 1),
        ]

        self._helper.append_op(
            type="reshape2",
            inputs={"X": self._filter_param_v},
            attrs={"shape": new_shape},
            outputs={"Out": matrix,
                     "XShape": tmp})

        m_norm = self._helper.create_variable_for_type_inference(self._dtype)
        m_normalized = self._helper.create_variable_for_type_inference(
            self._dtype)
        self._helper.append_op(
            type="norm",
            inputs={"X": matrix},
            outputs={"Out": m_normalized,
                     "Norm": m_norm},
            attrs={"axis": 1,
                   "epsilon": self._epsilon})

        v_normalized = self._helper.create_variable_for_type_inference(
            self._dtype)
        tmp2 = self._helper.create_variable_for_type_inference(self._dtype)
        self._helper.append_op(
            type="reshape2",
            inputs={"X": m_normalized},
            attrs={"shape": self._filter_param_v.shape},
            outputs={"Out": v_normalized,
                     "XShape": tmp2})

        filter_param = self._helper.create_variable_for_type_inference(
            self._dtype)
        self._helper.append_op(
            type="elementwise_mul",
            inputs={"X": [v_normalized],
                    "Y": [self._filter_param_g]},
            outputs={"Out": [filter_param]},
            attrs={"axis": 0},  # CAUTION: hard-code
        )

        pre_bias = self._helper.create_variable_for_type_inference(
            dtype=self._dtype)

        self._helper.append_op(
            type=self._l_type,
            inputs={"Input": input,
                    "Filter": filter_param},
            outputs={"Output": pre_bias},
            attrs={
                "strides": self._stride,
                "paddings": self._padding,
                "dilations": self._dilation,
                "groups": self._groups if self._groups else 1,
                "use_cudnn": self._use_cudnn,
                "use_mkldnn": False,
            })

        if self._bias_param is not None:
            pre_act = self._helper.create_variable_for_type_inference(
                dtype=self._dtype)
            self._helper.append_op(
                type="elementwise_add",
                inputs={"X": [pre_bias],
                        "Y": [self._bias_param]},
                outputs={"Out": [pre_act]},
                attrs={"axis": 1})
        else:
            pre_act = pre_bias

        # Currently, we don't support inplace in dygraph mode
        return self._helper.append_activation(pre_act, act=self._act)


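As a quick check of the output-shape formula quoted in the Conv2D docstring above, here is a small plain-Python sketch; the numbers are illustrative only.

def conv2d_out_dim(in_dim, kernel, stride=1, padding=0, dilation=1):
    # H_out = (H_in + 2*padding - (dilation*(kernel - 1) + 1)) // stride + 1
    return (in_dim + 2 * padding - (dilation * (kernel - 1) + 1)) // stride + 1

assert conv2d_out_dim(32, 3) == 30              # 3x3 kernel, no padding: 32 -> 30
assert conv2d_out_dim(32, 3, padding=1) == 32   # padding=1 keeps the spatial size
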
class Conv2DTranspose(dg.Layer):
    """
    **Convolution2D transpose layer**

    The convolution2D transpose layer calculates the output based on the input,
    filter, and dilations, strides, paddings. Input(Input) and output(Output)
    are in NCHW format, where N is batch size, C is the number of channels,
    H is the height of the feature, and W is the width of the feature.
    Parameters(dilations, strides, paddings) are two elements. These two elements
    represent height and width, respectively. For details of the convolution
    transpose layer, please refer to the following explanation and the references
    `therein <http://www.matthewzeiler.com/wp-content/uploads/2017/07/cvpr2010.pdf>`_.
    If a bias attribute and activation type are provided, bias is added to
    the output of the convolution, and the corresponding activation function
    is applied to the final result.

    For each input :math:`X`, the equation is:

    .. math::

        Out = \\sigma ((Vg) \\ast X + b)

    Where:

    * :math:`X`: Input value, a tensor with NCHW format.
    * :math:`V`: Filter direction value, a tensor with MCHW format.
    * :math:`g`: Filter magnitude value, a tensor with M format.
    * :math:`\\ast`: Convolution operation.
    * :math:`b`: Bias value, a 2-D tensor with shape [M, 1].
    * :math:`\\sigma`: Activation function.
    * :math:`Out`: Output value, the shape of :math:`Out` and :math:`X` may be different.

    Example:

        - Input:

          Input shape: :math:`(N, C_{in}, H_{in}, W_{in})`

          Filter shape: :math:`(C_{in}, C_{out}, H_f, W_f)`

        - Output:

          Output shape: :math:`(N, C_{out}, H_{out}, W_{out})`

        Where

        .. math::

            H^\\prime_{out} &= (H_{in} - 1) * strides[0] - 2 * paddings[0] + dilations[0] * (H_f - 1) + 1 \\\\
            W^\\prime_{out} &= (W_{in} - 1) * strides[1] - 2 * paddings[1] + dilations[1] * (W_f - 1) + 1 \\\\
            H_{out} &\\in [ H^\\prime_{out}, H^\\prime_{out} + strides[0] ) \\\\
            W_{out} &\\in [ W^\\prime_{out}, W^\\prime_{out} + strides[1] )

    Args:
        name_scope(str): The name of this class.
        num_filters(int): The number of filters. It is the same as the output
            image channels.
        output_size(int|tuple|None): The output image size. If output size is a
            tuple, it must contain two integers, (image_H, image_W). None if
            filter_size, padding, and stride are used to calculate output_size.
            If output_size and filter_size are specified at the same time, they
            should follow the formula above. Default: None.
        filter_size(int|tuple|None): The filter size. If filter_size is a tuple,
            it must contain two integers, (filter_size_H, filter_size_W).
            Otherwise, the filter will be a square. None if output_size is used
            to calculate filter_size. Default: None.
        padding(int|tuple): The padding size. If padding is a tuple, it must
            contain two integers, (padding_H, padding_W). Otherwise,
            padding_H = padding_W = padding. Default: padding = 0.
        stride(int|tuple): The stride size. If stride is a tuple, it must
            contain two integers, (stride_H, stride_W). Otherwise,
            stride_H = stride_W = stride. Default: stride = 1.
        dilation(int|tuple): The dilation size. If dilation is a tuple, it must
            contain two integers, (dilation_H, dilation_W). Otherwise,
            dilation_H = dilation_W = dilation. Default: dilation = 1.
        groups(int): The groups number of the Conv2d transpose layer. Inspired by
            grouped convolution in Alex Krizhevsky's Deep CNN paper, in which
            when group=2, the first half of the filters is only connected to the
            first half of the input channels, while the second half of the
            filters is only connected to the second half of the input channels.
            Default: groups = 1.
        param_attr (ParamAttr|None): The parameter attribute for learnable parameters/weights
            of conv2d_transpose. If it is set to None or one attribute of ParamAttr, conv2d_transpose
            will create ParamAttr as param_attr. If the Initializer of the param_attr
            is not set, the parameter is initialized with Xavier. Default: None.
        bias_attr (ParamAttr|bool|None): The parameter attribute for the bias of conv2d_transpose.
            If it is set to False, no bias will be added to the output units.
            If it is set to None or one attribute of ParamAttr, conv2d_transpose
            will create ParamAttr as bias_attr. If the Initializer of the bias_attr
            is not set, the bias is initialized zero. Default: None.
        use_cudnn(bool): Use cudnn kernel or not, it is valid only when the cudnn
            library is installed. Default: True.
        act (str): Activation type, if it is set to None, activation is not appended.
            Default: None.

    Returns:
        Variable: The tensor variable storing the convolution transpose result.

    Raises:
        ValueError: If the shapes of input, filter_size, stride, padding and
            groups mismatch.

    Examples:
        .. code-block:: python

            import paddle.fluid as fluid
            import numpy

            with fluid.dygraph.guard():
                data = numpy.random.random((3, 32, 32)).astype('float32')
                conv2DTranspose = fluid.dygraph.nn.Conv2DTranspose(
                    'Conv2DTranspose', num_filters=2, filter_size=3)
                ret = conv2DTranspose(fluid.dygraph.base.to_variable(data))
    """

    def __init__(self,
                 name_scope,
                 num_filters,
                 output_size=None,
                 filter_size=None,
                 padding=0,
                 stride=1,
                 dilation=1,
                 groups=None,
                 param_attr=None,
                 bias_attr=None,
                 use_cudnn=True,
                 epsilon=1e-30,
                 act=None,
                 dtype="float32"):
        super(Conv2DTranspose, self).__init__(name_scope, dtype)
        assert (param_attr is not False
                ), "param_attr should not be False in conv2d_transpose."
        self._param_attr = param_attr
        self._bias_attr = bias_attr
        self._act = act
        self._groups = groups
        self._num_filters = num_filters
        self._use_cudnn = use_cudnn
        self._padding = padding
        self._stride = stride
        self._dilation = dilation
        self._filter_size = filter_size
        self._output_size = output_size
        self._op_type = "conv2d_transpose"
        self._epsilon = epsilon

    def _build_once(self, input):
        input_channel = input.shape[1]
        if (input_channel == self._groups and
                self._num_filters == input_channel and not self._use_cudnn):
            self._op_type = "depthwise_conv2d_transpose"

        if not isinstance(input, Variable):
            raise TypeError("Input of conv2d_transpose must be Variable")

        self._padding = utils.convert_to_list(self._padding, 2, "padding")
        self._stride = utils.convert_to_list(self._stride, 2, "stride")
        self._dilation = utils.convert_to_list(self._dilation, 2, "dilation")

        if not isinstance(self._use_cudnn, bool):
            raise ValueError("use_cudnn should be True or False")

        if self._filter_size is None:
            if self._output_size is None:
                raise ValueError(
                    "output_size must be set when filter_size is None")
            if isinstance(self._output_size, int):
                self._output_size = [self._output_size, self._output_size]

            h_in = input.shape[2]
            w_in = input.shape[3]

            filter_size_h = (self._output_size[0] -
                             (h_in - 1) * self._stride[0] + 2 *
                             self._padding[0] - 1) // self._dilation[0] + 1
            filter_size_w = (self._output_size[1] -
                             (w_in - 1) * self._stride[1] + 2 *
                             self._padding[1] - 1) // self._dilation[1] + 1
            self._filter_size = [filter_size_h, filter_size_w]
        else:
            self._filter_size = utils.convert_to_list(
                self._filter_size, 2, "conv2d_transpose.filter_size")

        if self._output_size is None:
            self._output_size = []
        elif isinstance(self._output_size, list) or isinstance(
                self._output_size, int):
            self._output_size = utils.convert_to_list(self._output_size, 2,
                                                      "output_size")
        else:
            raise ValueError("output_size should be list or int")
        self._padding = utils.convert_to_list(self._padding, 2, "padding")
        self._groups = 1 if self._groups is None else self._groups
        filter_shape = [
            input_channel,
            self._num_filters // self._groups,
        ] + self._filter_size

        # img filter v (direction)
        self._img_filter_v = self.create_parameter(
            dtype=input.dtype, shape=filter_shape, attr=self._param_attr)

        # img filter g (magnitude)
        img_filter_magnitude = _norm(
            self._img_filter_v.numpy(), dim=0)  # CAUTION: hard-code
        self._img_filter_g = self.create_parameter(
            dtype=input.dtype,
            shape=img_filter_magnitude.shape,
            attr=fluid.ParamAttr(
                initializer=NumpyArrayInitializer(img_filter_magnitude)))

        self._img_bias = self.create_parameter(
            attr=self._bias_attr,
            shape=[self._num_filters],
            dtype=self._dtype,
            is_bias=True)

    def forward(self, input):
        matrix = self._helper.create_variable_for_type_inference(self._dtype)
        tmp = self._helper.create_variable_for_type_inference(self._dtype)
        new_shape = [
            self._img_filter_v.shape[0],
            reduce(lambda x, y: x * y, self._img_filter_v.shape[1:], 1),
        ]

        self._helper.append_op(
            type="reshape2",
            inputs={"X": self._img_filter_v},
            attrs={"shape": new_shape},
            outputs={"Out": matrix,
                     "XShape": tmp})

        m_norm = self._helper.create_variable_for_type_inference(self._dtype)
        m_normalized = self._helper.create_variable_for_type_inference(
            self._dtype)
        self._helper.append_op(
            type="norm",
            inputs={"X": matrix},
            outputs={"Out": m_normalized,
                     "Norm": m_norm},
            attrs={"axis": 1,
                   "epsilon": self._epsilon})

        v_normalized = self._helper.create_variable_for_type_inference(
            self._dtype)
        tmp2 = self._helper.create_variable_for_type_inference(self._dtype)
        self._helper.append_op(
            type="reshape2",
            inputs={"X": m_normalized},
            attrs={"shape": self._img_filter_v.shape},
            outputs={"Out": v_normalized,
                     "XShape": tmp2})

        img_filter = self._helper.create_variable_for_type_inference(
            self._dtype)
        self._helper.append_op(
            type="elementwise_mul",
            inputs={"X": [v_normalized],
                    "Y": [self._img_filter_g]},
            outputs={"Out": [img_filter]},
            attrs={"axis": 0},  # CAUTION: hard-code
        )

        pre_bias = self._helper.create_variable_for_type_inference(
            dtype=input.dtype)
        self._helper.append_op(
            type=self._op_type,
            inputs={"Input": [input],
                    "Filter": [img_filter]},
            outputs={"Output": pre_bias},
            attrs={
                "output_size": self._output_size,
                "strides": self._stride,
                "paddings": self._padding,
                "dilations": self._dilation,
                "groups": self._groups,
                "use_cudnn": self._use_cudnn,
            })

        if self._img_bias is not None:
            pre_act = self._helper.create_variable_for_type_inference(
                dtype=self._dtype)
            self._helper.append_op(
                type="elementwise_add",
                inputs={"X": [pre_bias],
                        "Y": [self._img_bias]},
                outputs={"Out": [pre_act]},
                attrs={"axis": 1})
        else:
            pre_act = pre_bias

        out = self._helper.append_activation(pre_act, act=self._act)
        return out

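The _build_once above back-solves filter_size from the transposed-convolution shape relation when only output_size is given. A small sketch of the forward form of that relation, as quoted in the docstring (illustrative numbers, not part of this module):

def conv2d_transpose_out_dim(in_dim, kernel, stride=1, padding=0, dilation=1):
    # H'_out = (H_in - 1)*stride - 2*padding + dilation*(kernel - 1) + 1
    return (in_dim - 1) * stride - 2 * padding + dilation * (kernel - 1) + 1

# With filter_size == stride == 16, a length-20 feature map is upsampled 16x.
assert conv2d_transpose_out_dim(20, 16, stride=16) == 320
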
@@ -26,6 +26,7 @@ def FC(name_scope,
       in_features,
       size,
       num_flatten_dims=1,
       relu=False,
       dropout=0.0,
       epsilon=1e-30,
       act=None,
@@ -39,7 +40,11 @@ def FC(name_scope,
    # stds
    if isinstance(in_features, int):
        in_features = [in_features]

    stds = [np.sqrt((1 - dropout) / in_feature) for in_feature in in_features]
    if relu:
        stds = [std * np.sqrt(2.0) for std in stds]

    weight_inits = [
        fluid.initializer.NormalInitializer(scale=std) for std in stds
    ]

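The new relu flag scales the base std sqrt((1 - dropout) / fan_in) by sqrt(2), i.e. He-style initialization for ReLU units. A small sketch of the resulting std (the fan-in value is illustrative):

import numpy as np

def fc_weight_std(fan_in, dropout=0.0, relu=False):
    # Base std compensates for fan-in and dropout; the relu branch doubles the
    # variance to keep post-ReLU activations at roughly unit scale.
    std = np.sqrt((1 - dropout) / fan_in)
    if relu:
        std *= np.sqrt(2.0)
    return std

print(fc_weight_std(256))               # 0.0625
print(fc_weight_std(256, relu=True))    # ~0.0884
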
@@ -456,3 +461,152 @@ class PositionEmbedding(dg.Layer):
            return out
        else:
            raise Exception("Then you can just use position rate at init")


class Conv1D_GU(dg.Layer):
    def __init__(self,
                 name_scope,
                 conditioner_dim,
                 in_channels,
                 num_filters,
                 filter_size,
                 dilation,
                 causal=False,
                 residual=True,
                 dtype="float32"):
        super(Conv1D_GU, self).__init__(name_scope, dtype=dtype)

        self.conditioner_dim = conditioner_dim
        self.in_channels = in_channels
        self.num_filters = num_filters
        self.filter_size = filter_size
        self.dilation = dilation
        self.causal = causal
        self.residual = residual

        if residual:
            assert (
                in_channels == num_filters
            ), "this block uses a residual connection, " \
               "so in_channels should equal num_filters"

        self.conv = Conv1D(
            self.full_name(),
            in_channels,
            2 * num_filters,
            filter_size,
            dilation,
            causal=causal,
            dtype=dtype)

        self.fc = Conv1D(
            self.full_name(),
            conditioner_dim,
            2 * num_filters,
            filter_size=1,
            dilation=1,
            causal=False,
            dtype=dtype)

    def forward(self, x, skip=None, conditioner=None):
        """
        Args:
            x (Variable): Shape(B, C_in, 1, T), the input of the Conv1D_GU
                layer, where B means batch_size, C_in means the input
                channels and T means input time steps.
            skip (Variable): Shape(B, C_in, 1, T), skip connection.
            conditioner (Variable): Shape(B, C_con, 1, T), expanded mel
                conditioner, where C_con is the conditioner hidden dim which
                equals the num of mel bands. Note that when using a residual
                connection, the Conv1D_GU does not change the number of
                channels, so out channels equals input channels.
        Returns:
            x (Variable): Shape(B, C_out, 1, T), the output of Conv1D_GU, where
                C_out means the output channels of Conv1D_GU.
            skip (Variable): Shape(B, C_out, 1, T), skip connection.
        """
        residual = x
        x = self.conv(x)

        if conditioner is not None:
            cond_bias = self.fc(conditioner)
            x += cond_bias

        content, gate = fluid.layers.split(x, num_or_sections=2, dim=1)

        # Gated Unit.
        x = fluid.layers.elementwise_mul(fluid.layers.sigmoid(gate),
                                         fluid.layers.tanh(content))

        if skip is None:
            skip = x
        else:
            skip = fluid.layers.scale(skip + x, np.sqrt(0.5))

        if self.residual:
            x = fluid.layers.scale(residual + x, np.sqrt(0.5))

        return x, skip

    def add_input(self, x, skip=None, conditioner=None):
        """
        Inputs:
            x: shape(B, num_filters, 1, time_steps)
            skip: shape(B, num_filters, 1, time_steps), skip connection
            conditioner: shape(B, conditioner_dim, 1, time_steps)
        Outputs:
            x: shape(B, num_filters, 1, time_steps), where time_steps = 1
            skip: skip connection, same shape as x
        """
        residual = x

        # add step input and produce step output
        x = self.conv.add_input(x)

        if conditioner is not None:
            cond_bias = self.fc(conditioner)
            x += cond_bias

        content, gate = fluid.layers.split(x, num_or_sections=2, dim=1)

        # Gated Unit.
        x = fluid.layers.elementwise_mul(fluid.layers.sigmoid(gate),
                                         fluid.layers.tanh(content))

        if skip is None:
            skip = x
        else:
            skip = fluid.layers.scale(skip + x, np.sqrt(0.5))

        if self.residual:
            x = fluid.layers.scale(residual + x, np.sqrt(0.5))

        return x, skip


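The gated unit above splits the doubled-channel convolution output into a content half and a gate half, combines them as sigmoid(gate) * tanh(content), and scales the residual and skip sums by sqrt(0.5) to keep variance roughly constant. A minimal NumPy sketch of that combination (shapes are illustrative):

import numpy as np

def sigmoid(a):
    return 1.0 / (1.0 + np.exp(-a))

B, C, T = 2, 128, 50                          # illustrative batch, channels, time steps
conv_out = np.random.randn(B, 2 * C, 1, T)    # dilated conv output with doubled channels
residual = np.random.randn(B, C, 1, T)        # the block input

content, gate = np.split(conv_out, 2, axis=1)
x = sigmoid(gate) * np.tanh(content)          # gated unit
skip = x                                      # first block: skip starts from x
x = (residual + x) * np.sqrt(0.5)             # residual connection, variance-preserving scale
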
def Conv2DTranspose(name_scope,
                    num_filters,
                    filter_size,
                    padding=0,
                    stride=1,
                    dilation=1,
                    use_cudnn=True,
                    act=None,
                    dtype="float32"):
    val = 1.0 / (filter_size[0] * filter_size[1])
    weight_init = fluid.initializer.ConstantInitializer(val)
    weight_attr = fluid.ParamAttr(initializer=weight_init)

    layer = weight_norm.Conv2DTranspose(
        name_scope,
        num_filters,
        filter_size=filter_size,
        padding=padding,
        stride=stride,
        dilation=dilation,
        param_attr=weight_attr,
        use_cudnn=use_cudnn,
        act=act,
        dtype=dtype)

    return layer
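With every filter weight initialized to the constant 1 / (filter_size[0] * filter_size[1]), the weight-normalized transposed convolution starts out spreading each input value uniformly over its output window, so the overall scale of the upsampled map stays close to that of the input. A toy single-channel sketch of that effect, assuming stride equal to the kernel size (illustrative only, not this layer's full behavior):

import numpy as np

def constant_init_upsample(x, k):
    # Transposed conv with stride == kernel == k and all weights 1/(k*k):
    # each input value is spread evenly over a k x k block, preserving the sum.
    h, w = x.shape
    out = np.zeros((h * k, w * k))
    for i in range(h):
        for j in range(w):
            out[i * k:(i + 1) * k, j * k:(j + 1) * k] = x[i, j] / (k * k)
    return out

x = np.arange(4.0).reshape(2, 2)
y = constant_init_upsample(x, 2)
assert np.isclose(y.sum(), x.sum())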