From 424c16a68da3a4d9d70b365a7ec507dc2cc3599b Mon Sep 17 00:00:00 2001
From: chenfeiyu
Date: Thu, 27 Feb 2020 10:23:05 +0000
Subject: [PATCH] staged clarinet

---
 .../clarinet/configs/clarinet_ljspeech.yaml   |  52 ++++++
 .../configs/wavenet_mixture_of_gaussians.yaml |   1 -
 .../configs/wavenet_single_gaussian.yaml      |   1 -
 examples/wavenet/configs/wavenet_softmax.yaml |   1 -
 examples/wavenet/utils.py                     |   2 +-
 parakeet/data/dataset.py                      |   2 +-
 parakeet/models/clarinet/__init__.py          |  16 ++
 parakeet/models/clarinet/net.py               | 169 ++++++++++++++++++
 parakeet/models/clarinet/utils.py             |  48 +++++
 9 files changed, 287 insertions(+), 5 deletions(-)
 create mode 100644 examples/clarinet/configs/clarinet_ljspeech.yaml
 create mode 100644 parakeet/models/clarinet/__init__.py
 create mode 100644 parakeet/models/clarinet/net.py
 create mode 100644 parakeet/models/clarinet/utils.py

diff --git a/examples/clarinet/configs/clarinet_ljspeech.yaml b/examples/clarinet/configs/clarinet_ljspeech.yaml
new file mode 100644
index 0000000..f689cca
--- /dev/null
+++ b/examples/clarinet/configs/clarinet_ljspeech.yaml
@@ -0,0 +1,52 @@
+data:
+  batch_size: 4
+  train_clip_seconds: 0.5
+  sample_rate: 22050
+  hop_length: 256
+  win_length: 1024
+  n_fft: 2048
+
+  n_mels: 80
+  valid_size: 16
+
+
+conditioner:
+  upsampling_factors: [16, 16]
+
+teacher:
+  n_loop: 10
+  n_layer: 3
+  filter_size: 2
+  residual_channels: 128
+  loss_type: "mog"
+  output_dim: 3
+  log_scale_min: -9
+
+student:
+  n_loops: [10, 10, 10, 10, 10, 10]
+  n_layers: [1, 1, 1, 1, 1, 1]
+  filter_size: 3
+  residual_channels: 64
+  log_scale_min: -7
+
+stft:
+  n_fft: 2048
+  win_length: 1024
+  hop_length: 256
+
+loss:
+  lmd: 4
+
+train:
+  learning_rate: 0.0005
+  anneal_rate: 0.5
+  anneal_interval: 200000
+  gradient_max_norm: 100.0
+
+  checkpoint_interval: 10
+  eval_interval: 10
+
+  max_iterations: 2000000
+
+
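[Note on the config above: a minimal sketch of reading this YAML with PyYAML and pulling out the hyperparameters the new model consumes. The key names come straight from the file added in this patch; the loading code itself is illustrative and is not the entry point Parakeet's example scripts use.]

    import yaml  # PyYAML, assumed available

    with open("examples/clarinet/configs/clarinet_ljspeech.yaml") as f:
        config = yaml.safe_load(f)

    # teacher: an autoregressive WaveNet with a mixture-of-gaussians output
    teacher_cfg = config["teacher"]    # n_loop=10, n_layer=3, loss_type="mog", ...
    # student: six inverse-autoregressive flows, one shallow WaveNet each
    student_cfg = config["student"]    # n_loops=[10]*6, n_layers=[1]*6, ...
    print(teacher_cfg["n_loop"], student_cfg["n_loops"])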
diff --git a/examples/wavenet/configs/wavenet_mixture_of_gaussians.yaml b/examples/wavenet/configs/wavenet_mixture_of_gaussians.yaml
index 427c975..a848a52 100644
--- a/examples/wavenet/configs/wavenet_mixture_of_gaussians.yaml
+++ b/examples/wavenet/configs/wavenet_mixture_of_gaussians.yaml
@@ -1,5 +1,4 @@
 data:
-  root: "/workspace/datasets/LJSpeech-1.1/"
   batch_size: 4
   train_clip_seconds: 0.5
   sample_rate: 22050
diff --git a/examples/wavenet/configs/wavenet_single_gaussian.yaml b/examples/wavenet/configs/wavenet_single_gaussian.yaml
index 8dd8d46..8e33349 100644
--- a/examples/wavenet/configs/wavenet_single_gaussian.yaml
+++ b/examples/wavenet/configs/wavenet_single_gaussian.yaml
@@ -1,5 +1,4 @@
 data:
-  root: "/workspace/datasets/LJSpeech-1.1/"
   batch_size: 4
   train_clip_seconds: 0.5
   sample_rate: 22050
diff --git a/examples/wavenet/configs/wavenet_softmax.yaml b/examples/wavenet/configs/wavenet_softmax.yaml
index 57c36cc..98018ee 100644
--- a/examples/wavenet/configs/wavenet_softmax.yaml
+++ b/examples/wavenet/configs/wavenet_softmax.yaml
@@ -1,5 +1,4 @@
 data:
-  root: "/workspace/datasets/LJSpeech-1.1/"
   batch_size: 4
   train_clip_seconds: 0.5
   sample_rate: 22050
diff --git a/examples/wavenet/utils.py b/examples/wavenet/utils.py
index 82ab553..86c8ebf 100644
--- a/examples/wavenet/utils.py
+++ b/examples/wavenet/utils.py
@@ -56,7 +56,7 @@ def eval_model(model, valid_loader, output_dir, sample_rate):
         audio_clips, mel_specs, audio_starts = batch
         wav_var = model.synthesis(mel_specs)
         wav_np = wav_var.numpy()[0]
-        sf.write(wav_np, path, samplerate=sample_rate)
+        sf.write(path, wav_np, samplerate=sample_rate)
         print("generated {}".format(path))


diff --git a/parakeet/data/dataset.py b/parakeet/data/dataset.py
index d577f9e..16a58bf 100644
--- a/parakeet/data/dataset.py
+++ b/parakeet/data/dataset.py
@@ -134,7 +134,7 @@ class SliceDataset(DatasetMixin):
                        format(len(order), len(dataset)))
         self._order = order

-    def len(self):
+    def __len__(self):
         return self._size

     def get_example(self, i):
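[Note on the two one-line fixes above: soundfile's API is `sf.write(file, data, samplerate=...)`, file path first and samples second, so the old argument order was a genuine bug. And renaming `len` to `__len__` matters because the built-in `len()` only dispatches to the dunder method. A minimal sketch of that protocol, independent of Parakeet (the class name is made up for illustration):]

    class SliceView:
        """Toy stand-in for SliceDataset: a window [start, stop) over a sequence."""

        def __init__(self, data, start, stop):
            self._data = data
            self._start = start
            self._size = stop - start

        def __len__(self):  # a plain `def len(self)` would NOT be found by len()
            return self._size

        def __getitem__(self, i):
            return self._data[self._start + i]

    view = SliceView(list(range(100)), 10, 26)
    assert len(view) == 16  # works only because the method is named __len__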
diff --git a/parakeet/models/clarinet/__init__.py b/parakeet/models/clarinet/__init__.py
new file mode 100644
index 0000000..f3148be
--- /dev/null
+++ b/parakeet/models/clarinet/__init__.py
@@ -0,0 +1,16 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .net import *
+from .parallel_wavenet import *
\ No newline at end of file
diff --git a/parakeet/models/clarinet/net.py b/parakeet/models/clarinet/net.py
new file mode 100644
index 0000000..35f0f03
--- /dev/null
+++ b/parakeet/models/clarinet/net.py
@@ -0,0 +1,169 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import itertools
+import numpy as np
+from scipy import signal
+from tqdm import trange
+
+import paddle.fluid.layers as F
+import paddle.fluid.dygraph as dg
+import paddle.fluid.initializer as I
+import paddle.fluid.layers.distributions as D
+
+from parakeet.modules.weight_norm import Conv2DTranspose
+from parakeet.models.wavenet import crop, WaveNet, UpsampleNet
+from parakeet.models.clarinet.parallel_wavenet import ParallelWaveNet
+from parakeet.models.clarinet.utils import conv2d
+
+
+# Gaussian IAF model
+class Clarinet(dg.Layer):
+    def __init__(self,
+                 encoder,
+                 teacher,
+                 student,
+                 stft,
+                 min_log_scale=-6.0,
+                 lmd=4.0):
+        super(Clarinet, self).__init__()
+        self.lmd = lmd
+        self.encoder = encoder
+        self.teacher = teacher
+        self.student = student
+
+        self.min_log_scale = min_log_scale
+        self.stft = stft
+
+    def forward(self, audio, mel, audio_start, clip_kl=True):
+        """Compute the loss of the distillation model.
+
+        Arguments:
+            audio {Variable} -- shape(batch_size, time_steps), target waveform.
+            mel {Variable} -- shape(batch_size, condition_dim, time_steps // hop_length), original mel spectrogram, not upsampled yet.
+            audio_start {Variable} -- shape(batch_size, ), the index of the start sample.
+            clip_kl (bool) -- whether to clip the KL divergence to at most 10.0.
+
+        Returns:
+            Dict[str, Variable] -- each value has shape(1,); the total loss and its components.
+        """
+        batch_size, audio_length = audio.shape  # audio clip's length
+
+        z = F.gaussian_random(audio.shape)
+        condition = self.encoder(mel)  # (B, C, T)
+        condition_slice = crop(condition, audio_start, audio_length)
+
+        x, s_means, s_scales = self.student(z, condition_slice)  # all [0: T]
+        s_means = s_means[:, 1:]  # (B, T-1), time steps [1: T]
+        s_scales = s_scales[:, 1:]  # (B, T-1), time steps [1: T]
+        s_clipped_scales = F.clip(s_scales, self.min_log_scale, 100.)
+
+        # the teacher outputs a single gaussian per time step
+        y = self.teacher(x[:, :-1], condition_slice[:, :, 1:])
+        _, t_means, t_scales = F.split(y, 3, -1)  # time steps [1: T]
+        t_means = F.squeeze(t_means, [-1])  # (B, T-1), time steps [1: T]
+        t_scales = F.squeeze(t_scales, [-1])  # (B, T-1), time steps [1: T]
+        t_clipped_scales = F.clip(t_scales, self.min_log_scale, 100.)
+
+        s_distribution = D.Normal(s_means, F.exp(s_clipped_scales))
+        t_distribution = D.Normal(t_means, F.exp(t_clipped_scales))
+
+        # the KL divergence between two gaussians has a closed form,
+        # so no Monte Carlo sampling is needed
+        kl = s_distribution.kl_divergence(t_distribution)
+        if clip_kl:
+            kl = F.clip(kl, -100., 10.)
+        # drop the first context_size steps, which lack a full receptive field
+        kl = F.reduce_mean(kl[:, self.teacher.context_size:])
+        # regularize the student's log-scales toward the teacher's
+        regularization = F.mse_loss(t_scales[:, self.teacher.context_size:],
+                                    s_scales[:, self.teacher.context_size:])
+
+        # the spectrogram frame loss brings in information from the real target
+        spectrogram_frame_loss = F.mse_loss(
+            self.stft.magnitude(audio), self.stft.magnitude(x))
+        loss = kl + self.lmd * regularization + spectrogram_frame_loss
+        loss_dict = {
+            "loss": loss,
+            "kl_divergence": kl,
+            "regularization": regularization,
+            "stft_loss": spectrogram_frame_loss
+        }
+        return loss_dict
+
+    @dg.no_grad
+    def synthesis(self, mel):
+        """Synthesize waveform conditioned on the mel spectrogram.
+
+        Arguments:
+            mel {Variable} -- shape(batch_size, frequency_bands, frames)
+
+        Returns:
+            Variable -- shape(batch_size, frames * upsample_factor)
+        """
+        condition = self.encoder(mel)
+        samples_shape = (condition.shape[0], condition.shape[-1])
+        z = F.gaussian_random(samples_shape)
+        x, s_means, s_scales = self.student(z, condition)
+        return x
+
+
+class STFT(dg.Layer):
+    def __init__(self, n_fft, hop_length, win_length, window="hanning"):
+        super(STFT, self).__init__()
+        self.hop_length = hop_length
+        self.n_bin = 1 + n_fft // 2
+        self.n_fft = n_fft
+
+        # calculate window
+        window = signal.get_window(window, win_length)
+        if n_fft != win_length:
+            pad = (n_fft - win_length) // 2
+            window = np.pad(window, ((pad, pad), ), 'constant')
+
+        # calculate weights: windowed cosine / sine basis per frequency bin
+        r = np.arange(0, n_fft)
+        M = np.expand_dims(r, -1) * np.expand_dims(r, 0)
+        w_real = np.reshape(window *
+                            np.cos(2 * np.pi * M / n_fft)[:self.n_bin],
+                            (self.n_bin, 1, 1, self.n_fft)).astype("float32")
+        w_imag = np.reshape(window *
+                            np.sin(-2 * np.pi * M / n_fft)[:self.n_bin],
+                            (self.n_bin, 1, 1, self.n_fft)).astype("float32")
+
+        w = np.concatenate([w_real, w_imag], axis=0)
+        self.weight = dg.to_variable(w)
+
+    def forward(self, x):
+        # x(batch_size, time_steps)
+        # pad it first, in reflect mode
+        pad_start = F.reverse(x[:, 1:1 + self.n_fft // 2], axis=1)
+        pad_stop = F.reverse(x[:, -(1 + self.n_fft // 2):-1], axis=1)
+        x = F.concat([pad_start, x, pad_stop], axis=-1)
+
+        # to BC1T, C=1
+        x = F.unsqueeze(x, axes=[1, 2])
+        out = conv2d(x, self.weight, stride=(1, self.hop_length))
+        real, imag = F.split(out, 2, dim=1)  # BC1T
+        return real, imag
+
+    def power(self, x):
+        real, imag = self(x)
+        power = real**2 + imag**2
+        return power
+
+    def magnitude(self, x):
+        power = self.power(x)
+        magnitude = F.sqrt(power)
+        return magnitude
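[Note on `Clarinet.forward` above: the distillation loss relies on the closed-form KL divergence between two Gaussians, KL(N(m1, s1^2) || N(m2, s2^2)) = log(s2/s1) + (s1^2 + (m1 - m2)^2) / (2 s2^2) - 1/2, which is why no Monte Carlo sampling is needed. A numpy sketch of what `s_distribution.kl_divergence(t_distribution)` computes per time step, with student first and teacher second as in the code (variable names here are mine):]

    import numpy as np

    def gaussian_kl(m1, s1, m2, s2):
        """KL(N(m1, s1**2) || N(m2, s2**2)), elementwise; s1, s2 are std-devs."""
        return np.log(s2 / s1) + (s1**2 + (m1 - m2)**2) / (2 * s2**2) - 0.5

    # student mean/log-scale vs. teacher mean/log-scale for one time step;
    # the model exponentiates clipped log-scales to get std-devs
    m_s, log_s_s = 0.1, -2.0
    m_t, log_s_t = 0.0, -1.5
    kl = gaussian_kl(m_s, np.exp(log_s_s), m_t, np.exp(log_s_t))
    assert kl >= 0.0  # KL is non-negative; forward() additionally clips it to [-100, 10]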
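[Note on the `STFT` layer above: it implements the short-time Fourier transform as a strided convolution with fixed cosine/sine kernels, which keeps the spectrogram loss differentiable. A numpy sketch checking that the same weight construction reproduces `np.fft.rfft` on a single frame (padding and batching stripped out; assumes `win_length == n_fft`):]

    import numpy as np

    n_fft = 8
    n_bin = 1 + n_fft // 2
    window = np.hanning(n_fft)

    frame = np.random.randn(n_fft)
    r = np.arange(n_fft)
    M = np.outer(r, r)
    w_real = window * np.cos(2 * np.pi * M / n_fft)[:n_bin]   # (n_bin, n_fft)
    w_imag = window * np.sin(-2 * np.pi * M / n_fft)[:n_bin]  # (n_bin, n_fft)

    # one convolution step == one matrix-vector product against a frame
    real = w_real @ frame
    imag = w_imag @ frame
    ref = np.fft.rfft(frame * window)
    assert np.allclose(real, ref.real) and np.allclose(imag, ref.imag)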
diff --git a/parakeet/models/clarinet/utils.py b/parakeet/models/clarinet/utils.py
new file mode 100644
index 0000000..c2d3252
--- /dev/null
+++ b/parakeet/models/clarinet/utils.py
@@ -0,0 +1,48 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from paddle import fluid
+from paddle.fluid.core import ops
+
+
+@fluid.framework.dygraph_only
+def conv2d(input,
+           weight,
+           stride=(1, 1),
+           padding=((0, 0), (0, 0)),
+           dilation=(1, 1),
+           groups=1,
+           use_cudnn=True,
+           data_format="NCHW"):
+    padding = tuple(pad for pad_dim in padding for pad in pad_dim)
+
+    inputs = {
+        'Input': [input],
+        'Filter': [weight],
+    }
+    attrs = {
+        'strides': stride,
+        'paddings': padding,
+        'dilations': dilation,
+        'groups': groups,
+        'use_cudnn': use_cudnn,
+        'use_mkldnn': False,
+        'fuse_relu_before_depthwise_conv': False,
+        "padding_algorithm": "EXPLICIT",
+        "data_format": data_format,
+    }
+
+    outputs = ops.conv2d(inputs, attrs)
+    out = outputs["Output"][0]
+    return out
\ No newline at end of file
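[Note on the `conv2d` helper above: it calls the raw `core.ops.conv2d` operator directly rather than going through a fluid layer, which avoids registering the fixed STFT kernels as trainable parameters. One subtlety is the padding argument, which takes per-dimension `(before, after)` pairs and flattens them into the flat list the operator expects. A small illustration of that flattening; the usage line mirrors the call site in `STFT.forward` and assumes a dygraph guard is active:]

    # padding flattening: ((0, 0), (0, 0)) -> (0, 0, 0, 0)
    padding = ((0, 0), (0, 0))
    flat = tuple(pad for pad_dim in padding for pad in pad_dim)
    assert flat == (0, 0, 0, 0)

    # usage as in STFT.forward: input is NCHW with C=1 and H=1, weight is
    # (2 * n_bin, 1, 1, n_fft), and stride (1, hop_length) slides along time:
    #   out = conv2d(x, self.weight, stride=(1, self.hop_length))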