From 49f2c4b3fb2e86b62cdf0397fbf3a20c4ce9f0fe Mon Sep 17 00:00:00 2001
From: iclementine
Date: Fri, 16 Apr 2021 14:57:17 +0800
Subject: [PATCH 1/2] change stft to use conv1d

---
 parakeet/modules/audio.py | 80 ++++++++++++++++++++++++++-----------------
 1 file changed, 53 insertions(+), 27 deletions(-)

diff --git a/parakeet/modules/audio.py b/parakeet/modules/audio.py
index 03e42b0..46614ab 100644
--- a/parakeet/modules/audio.py
+++ b/parakeet/modules/audio.py
@@ -16,6 +16,7 @@ import paddle
 from paddle import nn
 from paddle.nn import functional as F
 from scipy import signal
+from librosa.util import pad_center
 import numpy as np
 
 __all__ = ["quantize", "dequantize", "STFT"]
@@ -88,6 +89,18 @@ class STFT(nn.Layer):
         Name of window function, see `scipy.signal.get_window` for more
         details. Defaults to "hanning".
 
+    center : bool
+        If True, the signal y is padded so that frame D[:, t] is centered
+        at y[t * hop_length]. If False, then D[:, t] begins at
+        y[t * hop_length]. Defaults to True.
+
+    pad_mode : string or function
+        If center=True, this argument is passed to np.pad for padding the
+        edges of the signal y. By default (pad_mode="reflect"), y is padded
+        on both sides with its own reflection, mirrored around its first
+        and last sample respectively. If center=False, this argument is
+        ignored.
+
     Notes
     -----------
     It behaves like ``librosa.core.stft``. See ``librosa.core.stft`` for more
@@ -101,29 +114,45 @@
 
     """
 
-    def __init__(self, n_fft, hop_length, win_length, window="hanning"):
+    def __init__(self,
+                 n_fft,
+                 hop_length=None,
+                 win_length=None,
+                 window="hanning",
+                 center=True,
+                 pad_mode="reflect"):
         super(STFT, self).__init__()
+        # By default, use the entire frame.
+        if win_length is None:
+            win_length = n_fft
+
+        # Set the default hop, if it's not already specified.
+        if hop_length is None:
+            hop_length = int(win_length // 4)
+
         self.hop_length = hop_length
         self.n_bin = 1 + n_fft // 2
         self.n_fft = n_fft
+        self.center = center
+        self.pad_mode = pad_mode
 
         # calculate window
-        window = signal.get_window(window, win_length)
+        window = signal.get_window(window, win_length, fftbins=True)
+
+        # pad window to n_fft size
         if n_fft != win_length:
-            pad = (n_fft - win_length) // 2
-            window = np.pad(window, ((pad, pad), ), 'constant')
+            window = pad_center(window, n_fft, mode="constant")
 
         # calculate weights
-        r = np.arange(0, n_fft)
-        M = np.expand_dims(r, -1) * np.expand_dims(r, 0)
-        w_real = np.reshape(window *
-                            np.cos(2 * np.pi * M / n_fft)[:self.n_bin],
-                            (self.n_bin, 1, 1, self.n_fft))
-        w_imag = np.reshape(window *
-                            np.sin(-2 * np.pi * M / n_fft)[:self.n_bin],
-                            (self.n_bin, 1, 1, self.n_fft))
-
+        # The kernels are the first n_bin rows of the DFT matrix (real parts
+        # stacked on top of imaginary parts), windowed and reshaped to
+        # (2 * n_bin, 1, n_fft), so that conv1d computes the STFT directly.
+        weight = np.fft.fft(np.eye(n_fft))[:self.n_bin]
+        w_real = weight.real
+        w_imag = weight.imag
         w = np.concatenate([w_real, w_imag], axis=0)
+        w = w * window
+        w = np.expand_dims(w, 1)
         self.weight = paddle.cast(
             paddle.to_tensor(w), paddle.get_default_dtype())
 
@@ -137,23 +166,20 @@ class STFT(nn.Layer):
 
         Returns
         ------------
-        real : Tensor [shape=(B, C, 1, frames)]
+        real : Tensor [shape=(B, C, frames)]
             The real part of the spectrogram.
 
-        imag : Tensor [shape=(B, C, 1, frames)]
+        imag : Tensor [shape=(B, C, frames)]
             The image part of the spectrogram.
         """
-        # x(batch_size, time_steps)
-        # pad it first with reflect mode
-        # TODO(chenfeiyu): report an issue on paddle.flip
-        pad_start = paddle.reverse(x[:, 1:1 + self.n_fft // 2], axis=[1])
-        pad_stop = paddle.reverse(x[:, -(1 + self.n_fft // 2):-1], axis=[1])
-        x = paddle.concat([pad_start, x, pad_stop], axis=-1)
+        x = paddle.unsqueeze(x, axis=1)
+        if self.center:
+            x = F.pad(x, [self.n_fft // 2, self.n_fft // 2],
+                      data_format='NCL', mode=self.pad_mode)
 
-        # to BC1T, C=1
-        x = paddle.unsqueeze(x, axis=[1, 2])
-        out = F.conv2d(x, self.weight, stride=(1, self.hop_length))
-        real, imag = paddle.chunk(out, 2, axis=1)  # BC1T
+        # conv1d maps (B, 1, T) to (B, 2 * n_bin, frames)
+        out = F.conv1d(x, self.weight, stride=self.hop_length)
+        real, imag = paddle.chunk(out, 2, axis=1)  # each (B, n_bin, frames)
         return real, imag
 
     def power(self, x):
@@ -166,7 +192,7 @@
 
         Returns
        ------------
-        Tensor [shape=(B, C, 1, T)]
+        Tensor [shape=(B, C, T)]
             The power spectrum.
         """
         real, imag = self(x)
@@ -183,7 +209,7 @@
 
         Returns
         ------------
-        Tensor [shape=(B, C, 1, T)]
+        Tensor [shape=(B, C, T)]
             The magnitude of the spectrum.
         """
         power = self.power(x)
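
Note: a quick way to sanity-check the conv1d-based STFT above is to compare it against librosa.stft, which it is intended to mirror. The snippet below is a rough sketch and not part of the patch; it assumes the patched parakeet.modules.audio is importable alongside librosa and paddle, and the signal length, n_fft and hop_length are arbitrary example values.

    # Rough equivalence check between the conv1d STFT and librosa.stft (a sketch).
    import numpy as np
    import librosa
    import paddle
    from parakeet.modules.audio import STFT

    x = np.random.randn(8000).astype("float32")            # arbitrary test signal
    stft = STFT(n_fft=1024, hop_length=256, win_length=1024, window="hann")
    real, imag = stft(paddle.to_tensor(x[None, :]))         # each (1, 513, frames)
    ours = real.numpy()[0] + 1j * imag.numpy()[0]

    ref = librosa.stft(x, n_fft=1024, hop_length=256, win_length=1024,
                       window="hann", center=True, pad_mode="reflect")
    print(np.abs(ours - ref).max())

Both paths use a Hann window with center=True and reflect padding, so the two spectrograms should agree up to float32 rounding error.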

From 6749ce40eab7c5de0ca11fe5d2fd57120b10415c Mon Sep 17 00:00:00 2001
From: iclementine
Date: Mon, 19 Apr 2021 16:16:40 +0800
Subject: [PATCH 2/2] add audio datasets

---
 parakeet/datasets/common.py | 81 +++++++++++++++++++++++++++++--------
 1 file changed, 66 insertions(+), 15 deletions(-)

diff --git a/parakeet/datasets/common.py b/parakeet/datasets/common.py
index a1d16d6..78bfc2b 100644
--- a/parakeet/datasets/common.py
+++ b/parakeet/datasets/common.py
@@ -15,24 +15,75 @@
 from paddle.io import Dataset
 import os
 import librosa
+from pathlib import Path
+import numpy as np
+from typing import List
 
-__all__ = ["AudioFolderDataset"]
+__all__ = ["AudioSegmentDataset", "AudioDataset", "AudioFolderDataset"]
 
 
-class AudioFolderDataset(Dataset):
-    def __init__(self, path, sample_rate, extension="wav"):
-        self.root = os.path.expanduser(path)
-        self.sample_rate = sample_rate
-        self.extension = extension
-        self.file_names = [
-            os.path.join(self.root, x) for x in os.listdir(self.root) \
-                if os.path.splitext(x)[-1] == self.extension]
-        self.length = len(self.file_names)
-
-    def __len__(self):
-        return self.length
+class AudioSegmentDataset(Dataset):
+    """A simple dataset adaptor for audio files to train vocoders.
+    Read -> trim silence -> normalize -> extract a segment
+    """
+
+    def __init__(self, file_paths: List[Path], sample_rate: int, length: int,
+                 top_db: float):
+        self.file_paths = file_paths
+        self.sr = sample_rate
+        self.top_db = top_db
+        self.length = length  # samples in the clip
 
     def __getitem__(self, i):
-        file_name = self.file_names[i]
-        y, _ = librosa.load(file_name, sr=self.sample_rate)  # pylint: disable=unused-variable
+        fpath = self.file_paths[i]
+        y, _ = librosa.load(fpath, sr=self.sr)
+        y, _ = librosa.effects.trim(y, top_db=self.top_db)
+        y = librosa.util.normalize(y)
+        y = y.astype(np.float32)
+
+        # pad or trim
+        if y.size <= self.length:
+            y = np.pad(y, [0, self.length - len(y)], mode='constant')
+        else:
+            start = np.random.randint(0, 1 + len(y) - self.length)
+            y = y[start:start + self.length]
         return y
+
+    def __len__(self):
+        return len(self.file_paths)
+
+
+class AudioDataset(Dataset):
+    """A simple dataset adaptor for the audio files.
+    Read -> trim silence -> normalize
+    """
+
+    def __init__(self,
+                 file_paths: List[Path],
+                 sample_rate: int,
+                 top_db: float = 60):
+        self.file_paths = file_paths
+        self.sr = sample_rate
+        self.top_db = top_db
+
+    def __getitem__(self, i):
+        fpath = self.file_paths[i]
+        y, _ = librosa.load(fpath, sr=self.sr)
+        y, _ = librosa.effects.trim(y, top_db=self.top_db)
+        y = librosa.util.normalize(y)
+        y = y.astype(np.float32)
+        return y
+
+    def __len__(self):
+        return len(self.file_paths)
+
+
+class AudioFolderDataset(AudioDataset):
+    def __init__(self,
+                 root,
+                 sample_rate,
+                 top_db=60,
+                 extension=".wav"):
+        root = Path(root).expanduser()
+        file_paths = sorted(list(root.rglob("*{}".format(extension))))
+        super().__init__(file_paths, sample_rate, top_db)
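
Note: a usage sketch for the new dataset classes, not part of the patch. The folder path, sample rate, clip length and batch size are placeholder example values; AudioFolderDataset recursively collects files with the given extension, and AudioSegmentDataset yields fixed-length segments suitable for batching in a vocoder training loop.

    from paddle.io import DataLoader
    from parakeet.datasets.common import AudioFolderDataset, AudioSegmentDataset

    # Variable-length, trimmed and normalized clips from a folder of wav files.
    folder = AudioFolderDataset("~/datasets/ljspeech/wavs", sample_rate=22050)
    print(len(folder), folder[0].shape)

    # Fixed-size random segments, batched for vocoder training.
    segments = AudioSegmentDataset(folder.file_paths, sample_rate=22050,
                                   length=8192, top_db=60)
    loader = DataLoader(segments, batch_size=8, shuffle=True)
    for batch in loader:
        # Depending on the DataLoader's return_list setting, the batch may be
        # wrapped in a single-element list.
        wav = batch[0] if isinstance(batch, (list, tuple)) else batch
        print(wav.shape)  # [8, 8192]
        break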