From af4da7dd9e46fba8cb523ffeb3b5ade8a569e609 Mon Sep 17 00:00:00 2001
From: chenfeiyu
Date: Wed, 4 Nov 2020 23:22:45 +0800
Subject: [PATCH] 1. update code for waveflow's probability density estimation
 and sampling; 2. add WaveFlowLoss.

---
 parakeet/models/waveflow.py | 136 +++++++++++++++++++++++++++++++++---
 tests/test_waveflow.py      |   8 +--
 2 files changed, 129 insertions(+), 15 deletions(-)

diff --git a/parakeet/models/waveflow.py b/parakeet/models/waveflow.py
index d4c53c2..c921502 100644
--- a/parakeet/models/waveflow.py
+++ b/parakeet/models/waveflow.py
@@ -77,9 +77,9 @@ class UpsampleNet(nn.LayerList):
 
 class ResidualBlock(nn.Layer):
     """
-    ResidualBlock that merges infomation from the condition and outputs residual
-    and skip. It has a conv2d layer, which has causal padding in height dimension
-    and same paddign in width dimension.
+    ResidualBlock, the basic unit of ResidualNet. It has a conv2d layer, which
+    has causal padding in the height dimension and same padding in the width
+    dimension. It also has projections for the condition and the output.
     """
     def __init__(self, channels, cond_channels, kernel_size, dilations):
         super(ResidualBlock, self).__init__()
@@ -114,6 +114,17 @@ class ResidualBlock(nn.Layer):
         self.out_proj = nn.utils.weight_norm(out_proj)
 
     def forward(self, x, condition):
+        """Compute the output for a whole folded sequence.
+
+        Args:
+            x (Tensor): shape(batch_size, channel, height, width), the input.
+            condition (Tensor): shape(batch_size, condition_channel, height, width),
+                the local condition.
+
+        Returns:
+            res (Tensor): shape(batch_size, channel, height, width), the residual output.
+            skip (Tensor): shape(batch_size, channel, height, width), the skip output.
+        """
         x = self.conv(x)
         x += self.condition_proj(condition)
 
@@ -125,11 +136,26 @@ class ResidualBlock(nn.Layer):
         return res, skip
 
     def start_sequence(self):
+        """Prepare the layer for incremental computation of causal convolution by resetting the buffer.
+
+        Raises:
+            ValueError: If not in evaluation mode.
+        """
         if self.training:
             raise ValueError("Only use start sequence at evaluation mode.")
         self._conv_buffer = None
 
     def add_input(self, x_row, condition_row):
+        """Compute the output for a row and update the buffer.
+
+        Args:
+            x_row (Tensor): shape(batch_size, channel, 1, width), a row of the input.
+            condition_row (Tensor): shape(batch_size, condition_channel, 1, width), a row of the local condition.
+
+        Returns:
+            res (Tensor): shape(batch_size, channel, 1, width), the residual output.
+            skip (Tensor): shape(batch_size, channel, 1, width), the skip output.
+        """
         if self._conv_buffer is None:
             self._init_buffer(x_row)
         self._update_buffer(x_row)
@@ -175,6 +201,17 @@ class ResidualNet(nn.LayerList):
             self.append(layer)
 
     def forward(self, x, condition):
+        """Compute the output given the input and the condition.
+
+        Args:
+            x (Tensor): shape(batch_size, channel, height, width), the input.
+            condition (Tensor): shape(batch_size, condition_channel, height, width),
+                the local condition.
+
+        Returns:
+            Tensor: shape(batch_size, channel, height, width), the output, which
+            is an aggregation of all the skip outputs.
+        """
         skip_connections = []
         for layer in self:
             x, skip = layer(x, condition)
@@ -183,11 +220,21 @@ class ResidualNet(nn.LayerList):
         return out
 
     def start_sequence(self):
+        """Prepare the layer for incremental computation."""
         for layer in self:
             layer.start_sequence()
 
     def add_input(self, x_row, condition_row):
-        # in reversed order
+        """Compute the output for a row and update the buffer.
+
+        Args:
+            x_row (Tensor): shape(batch_size, channel, 1, width), a row of the input.
+            condition_row (Tensor): shape(batch_size, condition_channel, 1, width), a row of the local condition.
+
+        Returns:
+            Tensor: shape(batch_size, channel, 1, width), the output, which is
+            an aggregation of all the skip outputs.
+        """
         skip_connections = []
         for layer in self:
             x_row, skip = layer.add_input(x_row, condition_row)
@@ -198,8 +245,11 @@ class ResidualNet(nn.LayerList):
 
 class Flow(nn.Layer):
     """
-    A Layer that merges the condition and predict scale and bias given a random
-    variable X.
+    A bijection (reversible layer) that transforms a density of latent
+    variables p(Z) into a complex data distribution p(X).
+
+    It is an autoregressive flow. The `forward` method implements probability
+    density estimation. The `inverse` method implements sampling.
     """
     dilations_dict = {
         8: [1, 1, 1, 1, 1, 1, 1, 1],
@@ -244,6 +294,19 @@ class Flow(nn.Layer):
         return z_out
 
     def forward(self, x, condition):
+        """Probability density estimation. It is done by inversely transforming a
+        sample from p(X) back into a sample from p(Z).
+
+        Args:
+            x (Tensor): shape(batch, 1, height, width), an input sample of the distribution p(X).
+            condition (Tensor): shape(batch, condition_channel, height, width), the local condition.
+
+        Returns:
+            (z, (logs, b))
+            z (Tensor): shape(batch, 1, height, width), the transformed sample.
+            logs (Tensor): shape(batch, 1, height - 1, width), the log scale of the inverse transformation.
+            b (Tensor): shape(batch, 1, height - 1, width), the shift of the inverse transformation.
+        """
         # (B, C, H-1, W)
         logs, b = self._predict_parameters(
             x[:, :, :-1, :], condition[:, :, 1:, :])
@@ -270,6 +333,19 @@ class Flow(nn.Layer):
         self.resnet.start_sequence()
 
     def inverse(self, z, condition):
+        """Sampling from the distribution p(X). It is done by drawing a sample
+        from p(Z) and transforming it. It is an autoregressive transformation.
+
+        Args:
+            z (Tensor): shape(batch, 1, height, width), an input sample of the distribution p(Z).
+            condition (Tensor): shape(batch, condition_channel, height, width), the local condition.
+
+        Returns:
+            (x, (logs, b))
+            x (Tensor): shape(batch, 1, height, width), the transformed sample.
+            logs (Tensor): shape(batch, 1, height - 1, width), the log scale of the inverse transformation.
+            b (Tensor): shape(batch, 1, height - 1, width), the shift of the inverse transformation.
+        """
         z_0 = z[:, :, :1, :]
         x = []
         logs_list = []
@@ -290,11 +366,11 @@ class Flow(nn.Layer):
         x = paddle.concat(x, 2)
         logs = paddle.concat(logs_list, 2)
         b = paddle.concat(b_list, 2)
-
         return x, (logs, b)
 
 
 class WaveFlow(nn.LayerList):
+    """A deep reversible layer composed of a stack of autoregressive flows."""
     def __init__(self, n_flows, n_layers, n_group, channels, mel_bands, kernel_size):
         if n_group % 2 or n_flows % 2:
             raise ValueError("number of flows and number of group must be even "
@@ -333,6 +409,16 @@ class WaveFlow(nn.LayerList):
         return x, condition
 
     def forward(self, x, condition):
+        """Probability density estimation.
+
+        Args:
+            x (Tensor): shape(batch_size, time_steps), the audio.
+            condition (Tensor): shape(batch_size, condition_channel, time_steps), the local condition.
+
+        Returns:
+            z (Tensor): shape(batch_size, time_steps), the transformed sample.
+            log_det_jacobian (Tensor): shape(1,), the log determinant of the jacobian (dz/dx).
+        """
         # x: (B, T)
         # condition: (B, C, T) upsampled condition
         x, condition = self._trim(x, condition)
@@ -350,14 +436,28 @@ class WaveFlow(nn.LayerList):
             x = geo.shuffle_dim(x, 2, perm=self.perms[i])
             condition = geo.shuffle_dim(condition, 2, perm=self.perms[i])
 
-        z = paddle.squeeze(x, 1)
-        return z, logs_list
+        z = paddle.squeeze(x, 1)  # (B, H, W)
+        batch_size = z.shape[0]
+        z = paddle.reshape(paddle.transpose(z, [0, 2, 1]), [batch_size, -1])
+
+        log_det_jacobian = paddle.sum(paddle.stack(logs_list))
+        return z, log_det_jacobian
 
     def start_sequence(self):
         for layer in self:
             layer.start_sequence()
 
     def inverse(self, z, condition):
+        """Sampling from the distribution p(X). It is done by drawing a sample
+        from p(Z) and transforming it. It is an autoregressive transformation.
+
+        Args:
+            z (Tensor): shape(batch_size, time_steps), a sample from the distribution p(Z).
+            condition (Tensor): shape(batch_size, condition_channel, time_steps), the local condition.
+
+        Returns:
+            x (Tensor): shape(batch_size, time_steps), the transformed sample.
+        """
         self.start_sequence()
 
         z, condition = self._trim(z, condition)
@@ -371,8 +471,22 @@ class WaveFlow(nn.LayerList):
             z = geo.shuffle_dim(z, 2, perm=self.perms[i])
             condition = geo.shuffle_dim(condition, 2, perm=self.perms[i])
             z, (logs, b) = self[i].inverse(z, condition)
-        x = paddle.squeeze(z, 1)
+
+        x = paddle.squeeze(z, 1)  # (B, H, W)
+        batch_size = x.shape[0]
+        x = paddle.reshape(paddle.transpose(x, [0, 2, 1]), [batch_size, -1])
         return x
 
 
-# TODO(chenfeiyu): WaveFlowLoss
+class WaveFlowLoss(nn.Layer):
+    def __init__(self, sigma=1.0):
+        super().__init__()
+        self.sigma = sigma
+        self.const = 0.5 * np.log(2 * np.pi) + np.log(self.sigma)
+
+    def forward(self, model_output):
+        z, log_det_jacobian = model_output
+
+        loss = paddle.sum(z * z) / (2 * self.sigma * self.sigma) - log_det_jacobian
+        loss = loss / np.prod(z.shape)
+        return loss + self.const
\ No newline at end of file
diff --git a/tests/test_waveflow.py b/tests/test_waveflow.py
index e70c65a..849c7d8 100644
--- a/tests/test_waveflow.py
+++ b/tests/test_waveflow.py
@@ -114,10 +114,10 @@ class TestWaveFlow(unittest.TestCase):
         x = paddle.randn([4, 32 * 8 ])
         condition = paddle.randn([4, 7, 32 * 8])
         net = waveflow.WaveFlow(2, 8, 8, 16, 7, (3, 3))
-        z, logs_list = net(x, condition)
+        z, log_det_jacobian = net(x, condition)
 
-        self.assertTupleEqual(z.numpy().shape, (4, 8, 32))
-        self.assertTupleEqual(logs_list[0].numpy().shape, (4, 1, 7, 32))
+        self.assertTupleEqual(z.numpy().shape, (4, 32 * 8))
+        self.assertTupleEqual(log_det_jacobian.numpy().shape, (1,))
 
     def test_inverse(self):
         z = paddle.randn([4, 32 * 8 ])
@@ -128,7 +128,7 @@ class TestWaveFlow(unittest.TestCase):
 
         with paddle.no_grad():
            x = net.inverse(z, condition)
-        self.assertTupleEqual(x.numpy().shape, (4, 8, 32))
+        self.assertTupleEqual(x.numpy().shape, (4, 32 * 8))
\ No newline at end of file
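
Reviewer's note, not part of the patch: a minimal sketch of how the new forward/inverse API and WaveFlowLoss are meant to fit together. The hyperparameters, tensor shapes, and import style below are borrowed from tests/test_waveflow.py and are illustrative only, not prescribed by this change.

    import paddle
    from parakeet.models import waveflow

    # Toy sizes taken from the unit tests: 2 flows, 8 layers per flow, fold size 8,
    # 16 channels, a 7-band condition, and 32 * 8 time steps.
    net = waveflow.WaveFlow(2, 8, 8, 16, 7, (3, 3))
    criterion = waveflow.WaveFlowLoss(sigma=1.0)

    # Density estimation plus the negative log-likelihood loss, as a training step would use it.
    x = paddle.randn([4, 32 * 8])             # (batch_size, time_steps), the audio
    condition = paddle.randn([4, 7, 32 * 8])  # (batch_size, condition_channel, time_steps)
    z, log_det_jacobian = net(x, condition)
    loss = criterion((z, log_det_jacobian))

    # Sampling: draw z ~ p(Z) and run the autoregressive inverse transform
    # (evaluation mode only, since start_sequence() raises in training mode).
    net.eval()
    with paddle.no_grad():
        z = paddle.randn([4, 32 * 8])
        audio = net.inverse(z, condition)     # (batch_size, time_steps)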