From 424c16a68da3a4d9d70b365a7ec507dc2cc3599b Mon Sep 17 00:00:00 2001
From: chenfeiyu
Date: Thu, 27 Feb 2020 10:23:05 +0000
Subject: [PATCH] staged clarinet

---
 .../clarinet/configs/clarinet_ljspeech.yaml   |  52 ++++++
 .../configs/wavenet_mixture_of_gaussians.yaml |   1 -
 .../configs/wavenet_single_gaussian.yaml      |   1 -
 examples/wavenet/configs/wavenet_softmax.yaml |   1 -
 examples/wavenet/utils.py                     |   2 +-
 parakeet/data/dataset.py                      |   2 +-
 parakeet/models/clarinet/__init__.py          |  16 ++
 parakeet/models/clarinet/net.py               | 169 ++++++++++++++++++
 parakeet/models/clarinet/utils.py             |  48 +++++
 9 files changed, 287 insertions(+), 5 deletions(-)
 create mode 100644 examples/clarinet/configs/clarinet_ljspeech.yaml
 create mode 100644 parakeet/models/clarinet/__init__.py
 create mode 100644 parakeet/models/clarinet/net.py
 create mode 100644 parakeet/models/clarinet/utils.py

diff --git a/examples/clarinet/configs/clarinet_ljspeech.yaml b/examples/clarinet/configs/clarinet_ljspeech.yaml
new file mode 100644
index 0000000..f689cca
--- /dev/null
+++ b/examples/clarinet/configs/clarinet_ljspeech.yaml
@@ -0,0 +1,52 @@
+data:
+  batch_size: 4
+  train_clip_seconds: 0.5
+  sample_rate: 22050
+  hop_length: 256
+  win_length: 1024
+  n_fft: 2048
+
+  n_mels: 80
+  valid_size: 16
+
+
+conditioner:
+  upsampling_factors: [16, 16]
+
+teacher:
+  n_loop: 10
+  n_layer: 3
+  filter_size: 2
+  residual_channels: 128
+  loss_type: "mog"
+  output_dim: 3
+  log_scale_min: -9
+
+student:
+  n_loops: [10, 10, 10, 10, 10, 10]
+  n_layers: [1, 1, 1, 1, 1, 1]
+  filter_size: 3
+  residual_channels: 64
+  log_scale_min: -7
+
+stft:
+  n_fft: 2048
+  win_length: 1024
+  hop_length: 256
+
+loss:
+  lmd: 4
+
+train:
+  learning_rate: 0.0005
+  anneal_rate: 0.5
+  anneal_interval: 200000
+  gradient_max_norm: 100.0
+
+  checkpoint_interval: 10
+  eval_interval: 10
+
+  max_iterations: 2000000
+
+
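[Note on the config above: a minimal sketch of reading this YAML with PyYAML and pulling out the hyperparameters the new model consumes. The key names come straight from the file added in this patch; the loading code itself is illustrative and is not the entry point Parakeet's example scripts use.]

    import yaml  # PyYAML, assumed available

    with open("examples/clarinet/configs/clarinet_ljspeech.yaml") as f:
        config = yaml.safe_load(f)

    # teacher: an autoregressive WaveNet with a mixture-of-gaussians output
    teacher_cfg = config["teacher"]    # n_loop=10, n_layer=3, loss_type="mog", ...
    # student: six inverse-autoregressive flows, one shallow WaveNet each
    student_cfg = config["student"]    # n_loops=[10]*6, n_layers=[1]*6, ...
    print(teacher_cfg["n_loop"], student_cfg["n_loops"])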
diff --git a/examples/wavenet/configs/wavenet_mixture_of_gaussians.yaml b/examples/wavenet/configs/wavenet_mixture_of_gaussians.yaml
index 427c975..a848a52 100644
--- a/examples/wavenet/configs/wavenet_mixture_of_gaussians.yaml
+++ b/examples/wavenet/configs/wavenet_mixture_of_gaussians.yaml
@@ -1,5 +1,4 @@
 data:
-  root: "/workspace/datasets/LJSpeech-1.1/"
   batch_size: 4
   train_clip_seconds: 0.5
   sample_rate: 22050
diff --git a/examples/wavenet/configs/wavenet_single_gaussian.yaml b/examples/wavenet/configs/wavenet_single_gaussian.yaml
index 8dd8d46..8e33349 100644
--- a/examples/wavenet/configs/wavenet_single_gaussian.yaml
+++ b/examples/wavenet/configs/wavenet_single_gaussian.yaml
@@ -1,5 +1,4 @@
 data:
-  root: "/workspace/datasets/LJSpeech-1.1/"
   batch_size: 4
   train_clip_seconds: 0.5
   sample_rate: 22050
diff --git a/examples/wavenet/configs/wavenet_softmax.yaml b/examples/wavenet/configs/wavenet_softmax.yaml
index 57c36cc..98018ee 100644
--- a/examples/wavenet/configs/wavenet_softmax.yaml
+++ b/examples/wavenet/configs/wavenet_softmax.yaml
@@ -1,5 +1,4 @@
 data:
-  root: "/workspace/datasets/LJSpeech-1.1/"
   batch_size: 4
   train_clip_seconds: 0.5
   sample_rate: 22050
diff --git a/examples/wavenet/utils.py b/examples/wavenet/utils.py
index 82ab553..86c8ebf 100644
--- a/examples/wavenet/utils.py
+++ b/examples/wavenet/utils.py
@@ -56,7 +56,7 @@ def eval_model(model, valid_loader, output_dir, sample_rate):
         audio_clips, mel_specs, audio_starts = batch
         wav_var = model.synthesis(mel_specs)
         wav_np = wav_var.numpy()[0]
-        sf.write(wav_np, path, samplerate=sample_rate)
+        sf.write(path, wav_np, samplerate=sample_rate)
         print("generated {}".format(path))


diff --git a/parakeet/data/dataset.py b/parakeet/data/dataset.py
index d577f9e..16a58bf 100644
--- a/parakeet/data/dataset.py
+++ b/parakeet/data/dataset.py
@@ -134,7 +134,7 @@ class SliceDataset(DatasetMixin):
                        format(len(order), len(dataset)))
         self._order = order

-    def len(self):
+    def __len__(self):
         return self._size

     def get_example(self, i):
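[Note on the two one-line fixes above: soundfile's API is `sf.write(file, data, samplerate=...)`, file path first and samples second, so the old argument order was a genuine bug. And renaming `len` to `__len__` matters because the built-in `len()` only dispatches to the dunder method. A minimal sketch of that protocol, independent of Parakeet (the class name is made up for illustration):]

    class SliceView:
        """Toy stand-in for SliceDataset: a window [start, stop) over a sequence."""

        def __init__(self, data, start, stop):
            self._data = data
            self._start = start
            self._size = stop - start

        def __len__(self):  # a plain `def len(self)` would NOT be found by len()
            return self._size

        def __getitem__(self, i):
            return self._data[self._start + i]

    view = SliceView(list(range(100)), 10, 26)
    assert len(view) == 16  # works only because the method is named __len__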
diff --git a/parakeet/models/clarinet/__init__.py b/parakeet/models/clarinet/__init__.py
new file mode 100644
index 0000000..f3148be
--- /dev/null
+++ b/parakeet/models/clarinet/__init__.py
@@ -0,0 +1,16 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .net import *
+from .parallel_wavenet import *
\ No newline at end of file
diff --git a/parakeet/models/clarinet/net.py b/parakeet/models/clarinet/net.py
new file mode 100644
index 0000000..35f0f03
--- /dev/null
+++ b/parakeet/models/clarinet/net.py
@@ -0,0 +1,169 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import itertools
+import numpy as np
+from scipy import signal
+from tqdm import trange
+
+import paddle.fluid.layers as F
+import paddle.fluid.dygraph as dg
+import paddle.fluid.initializer as I
+import paddle.fluid.layers.distributions as D
+
+from parakeet.modules.weight_norm import Conv2DTranspose
+from parakeet.models.wavenet import crop, WaveNet, UpsampleNet
+from parakeet.models.clarinet.parallel_wavenet import ParallelWaveNet
+from parakeet.models.clarinet.utils import conv2d
+
+
+# Gaussian IAF model
+class Clarinet(dg.Layer):
+    def __init__(self,
+                 encoder,
+                 teacher,
+                 student,
+                 stft,
+                 min_log_scale=-6.0,
+                 lmd=4.0):
+        super(Clarinet, self).__init__()
+        self.lmd = lmd
+        self.encoder = encoder
+        self.teacher = teacher
+        self.student = student
+
+        self.min_log_scale = min_log_scale
+        self.stft = stft
+
+    def forward(self, audio, mel, audio_start, clip_kl=True):
+        """Compute the loss of the distillation model.
+
+        Arguments:
+            audio {Variable} -- shape(batch_size, time_steps), target waveform.
+            mel {Variable} -- shape(batch_size, condition_dim, time_steps // hop_length), original mel spectrogram, not upsampled yet.
+            audio_start {Variable} -- shape(batch_size, ), the index of the start sample.
+            clip_kl (bool) -- whether to clip the KL divergence to at most 10.0.
+
+        Returns:
+            Dict[str, Variable] -- each value has shape(1,); the total loss and its components.
+        """
+        batch_size, audio_length = audio.shape  # audio clip's length
+
+        z = F.gaussian_random(audio.shape)
+        condition = self.encoder(mel)  # (B, C, T)
+        condition_slice = crop(condition, audio_start, audio_length)
+
+        x, s_means, s_scales = self.student(z, condition_slice)  # all [0: T]
+        s_means = s_means[:, 1:]  # (B, T-1), time steps [1: T]
+        s_scales = s_scales[:, 1:]  # (B, T-1), time steps [1: T]
+        s_clipped_scales = F.clip(s_scales, self.min_log_scale, 100.)
+
+        # the teacher outputs a single gaussian per time step
+        y = self.teacher(x[:, :-1], condition_slice[:, :, 1:])
+        _, t_means, t_scales = F.split(y, 3, -1)  # time steps [1: T]
+        t_means = F.squeeze(t_means, [-1])  # (B, T-1), time steps [1: T]
+        t_scales = F.squeeze(t_scales, [-1])  # (B, T-1), time steps [1: T]
+        t_clipped_scales = F.clip(t_scales, self.min_log_scale, 100.)
+
+        s_distribution = D.Normal(s_means, F.exp(s_clipped_scales))
+        t_distribution = D.Normal(t_means, F.exp(t_clipped_scales))
+
+        # the KL divergence between two gaussians has a closed form,
+        # so no Monte Carlo sampling is needed
+        kl = s_distribution.kl_divergence(t_distribution)
+        if clip_kl:
+            kl = F.clip(kl, -100., 10.)
+        # drop the first context_size steps, which lack a full receptive field
+        kl = F.reduce_mean(kl[:, self.teacher.context_size:])
+        # regularize the student's log-scales toward the teacher's
+        regularization = F.mse_loss(t_scales[:, self.teacher.context_size:],
+                                    s_scales[:, self.teacher.context_size:])
+
+        # the spectrogram frame loss brings in information from the real target
+        spectrogram_frame_loss = F.mse_loss(
+            self.stft.magnitude(audio), self.stft.magnitude(x))
+        loss = kl + self.lmd * regularization + spectrogram_frame_loss
+        loss_dict = {
+            "loss": loss,
+            "kl_divergence": kl,
+            "regularization": regularization,
+            "stft_loss": spectrogram_frame_loss
+        }
+        return loss_dict
+
+    @dg.no_grad
+    def synthesis(self, mel):
+        """Synthesize waveform conditioned on the mel spectrogram.
+
+        Arguments:
+            mel {Variable} -- shape(batch_size, frequency_bands, frames)
+
+        Returns:
+            Variable -- shape(batch_size, frames * upsample_factor)
+        """
+        condition = self.encoder(mel)
+        samples_shape = (condition.shape[0], condition.shape[-1])
+        z = F.gaussian_random(samples_shape)
+        x, s_means, s_scales = self.student(z, condition)
+        return x
+
+
+class STFT(dg.Layer):
+    def __init__(self, n_fft, hop_length, win_length, window="hanning"):
+        super(STFT, self).__init__()
+        self.hop_length = hop_length
+        self.n_bin = 1 + n_fft // 2
+        self.n_fft = n_fft
+
+        # calculate window
+        window = signal.get_window(window, win_length)
+        if n_fft != win_length:
+            pad = (n_fft - win_length) // 2
+            window = np.pad(window, ((pad, pad), ), 'constant')
+
+        # calculate weights: windowed cosine / sine basis per frequency bin
+        r = np.arange(0, n_fft)
+        M = np.expand_dims(r, -1) * np.expand_dims(r, 0)
+        w_real = np.reshape(window *
+                            np.cos(2 * np.pi * M / n_fft)[:self.n_bin],
+                            (self.n_bin, 1, 1, self.n_fft)).astype("float32")
+        w_imag = np.reshape(window *
+                            np.sin(-2 * np.pi * M / n_fft)[:self.n_bin],
+                            (self.n_bin, 1, 1, self.n_fft)).astype("float32")
+
+        w = np.concatenate([w_real, w_imag], axis=0)
+        self.weight = dg.to_variable(w)
+
+    def forward(self, x):
+        # x(batch_size, time_steps)
+        # pad it first, in reflect mode
+        pad_start = F.reverse(x[:, 1:1 + self.n_fft // 2], axis=1)
+        pad_stop = F.reverse(x[:, -(1 + self.n_fft // 2):-1], axis=1)
+        x = F.concat([pad_start, x, pad_stop], axis=-1)
+
+        # to BC1T, C=1
+        x = F.unsqueeze(x, axes=[1, 2])
+        out = conv2d(x, self.weight, stride=(1, self.hop_length))
+        real, imag = F.split(out, 2, dim=1)  # BC1T
+        return real, imag
+
+    def power(self, x):
+        real, imag = self(x)
+        power = real**2 + imag**2
+        return power
+
+    def magnitude(self, x):
+        power = self.power(x)
+        magnitude = F.sqrt(power)
+        return magnitude
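[Note on `Clarinet.forward` above: the distillation loss relies on the closed-form KL divergence between two Gaussians, KL(N(m1, s1^2) || N(m2, s2^2)) = log(s2/s1) + (s1^2 + (m1 - m2)^2) / (2 s2^2) - 1/2, which is why no Monte Carlo sampling is needed. A numpy sketch of what `s_distribution.kl_divergence(t_distribution)` computes per time step, with student first and teacher second as in the code (variable names here are mine):]

    import numpy as np

    def gaussian_kl(m1, s1, m2, s2):
        """KL(N(m1, s1**2) || N(m2, s2**2)), elementwise; s1, s2 are std-devs."""
        return np.log(s2 / s1) + (s1**2 + (m1 - m2)**2) / (2 * s2**2) - 0.5

    # student mean/log-scale vs. teacher mean/log-scale for one time step;
    # the model exponentiates clipped log-scales to get std-devs
    m_s, log_s_s = 0.1, -2.0
    m_t, log_s_t = 0.0, -1.5
    kl = gaussian_kl(m_s, np.exp(log_s_s), m_t, np.exp(log_s_t))
    assert kl >= 0.0  # KL is non-negative; forward() additionally clips it to [-100, 10]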
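[Note on the `STFT` layer above: it implements the short-time Fourier transform as a strided convolution with fixed cosine/sine kernels, which keeps the spectrogram loss differentiable. A numpy sketch checking that the same weight construction reproduces `np.fft.rfft` on a single frame (padding and batching stripped out; assumes `win_length == n_fft`):]

    import numpy as np

    n_fft = 8
    n_bin = 1 + n_fft // 2
    window = np.hanning(n_fft)

    frame = np.random.randn(n_fft)
    r = np.arange(n_fft)
    M = np.outer(r, r)
    w_real = window * np.cos(2 * np.pi * M / n_fft)[:n_bin]   # (n_bin, n_fft)
    w_imag = window * np.sin(-2 * np.pi * M / n_fft)[:n_bin]  # (n_bin, n_fft)

    # one convolution step == one matrix-vector product against a frame
    real = w_real @ frame
    imag = w_imag @ frame
    ref = np.fft.rfft(frame * window)
    assert np.allclose(real, ref.real) and np.allclose(imag, ref.imag)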
diff --git a/parakeet/models/clarinet/utils.py b/parakeet/models/clarinet/utils.py
new file mode 100644
index 0000000..c2d3252
--- /dev/null
+++ b/parakeet/models/clarinet/utils.py
@@ -0,0 +1,48 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from paddle import fluid
+from paddle.fluid.core import ops
+
+
+@fluid.framework.dygraph_only
+def conv2d(input,
+           weight,
+           stride=(1, 1),
+           padding=((0, 0), (0, 0)),
+           dilation=(1, 1),
+           groups=1,
+           use_cudnn=True,
+           data_format="NCHW"):
+    padding = tuple(pad for pad_dim in padding for pad in pad_dim)
+
+    inputs = {
+        'Input': [input],
+        'Filter': [weight],
+    }
+    attrs = {
+        'strides': stride,
+        'paddings': padding,
+        'dilations': dilation,
+        'groups': groups,
+        'use_cudnn': use_cudnn,
+        'use_mkldnn': False,
+        'fuse_relu_before_depthwise_conv': False,
+        "padding_algorithm": "EXPLICIT",
+        "data_format": data_format,
+    }
+
+    outputs = ops.conv2d(inputs, attrs)
+    out = outputs["Output"][0]
+    return out
\ No newline at end of file
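[Note on the `conv2d` helper above: it calls the raw `core.ops.conv2d` operator directly rather than going through a fluid layer, which avoids registering the fixed STFT kernels as trainable parameters. One subtlety is the padding argument, which takes per-dimension `(before, after)` pairs and flattens them into the flat list the operator expects. A small illustration of that flattening; the usage line mirrors the call site in `STFT.forward` and assumes a dygraph guard is active:]

    # padding flattening: ((0, 0), (0, 0)) -> (0, 0, 0, 0)
    padding = ((0, 0), (0, 0))
    flat = tuple(pad for pad_dim in padding for pad in pad_dim)
    assert flat == (0, 0, 0, 0)

    # usage as in STFT.forward: input is NCHW with C=1 and H=1, weight is
    # (2 * n_bin, 1, 1, n_fft), and stride (1, hop_length) slides along time:
    #   out = conv2d(x, self.weight, stride=(1, self.hop_length))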