From 49f2c4b3fb2e86b62cdf0397fbf3a20c4ce9f0fe Mon Sep 17 00:00:00 2001
From: iclementine
Date: Fri, 16 Apr 2021 14:57:17 +0800
Subject: [PATCH 1/2] change stft to use conv1d

---
 parakeet/modules/audio.py | 80 ++++++++++++++++++++++++++-----------------
 1 file changed, 53 insertions(+), 27 deletions(-)

diff --git a/parakeet/modules/audio.py b/parakeet/modules/audio.py
index 03e42b0..46614ab 100644
--- a/parakeet/modules/audio.py
+++ b/parakeet/modules/audio.py
@@ -16,6 +16,7 @@ import paddle
 from paddle import nn
 from paddle.nn import functional as F
 from scipy import signal
+from librosa.util import pad_center
 import numpy as np
 
 __all__ = ["quantize", "dequantize", "STFT"]
@@ -88,6 +89,18 @@ class STFT(nn.Layer):
         Name of window function, see `scipy.signal.get_window` for more
         details. Defaults to "hanning".
 
+    center : bool
+        If True, the signal y is padded so that frame D[:, t] is centered
+        at y[t * hop_length]. If False, then D[:, t] begins at
+        y[t * hop_length]. Defaults to True.
+
+    pad_mode : string or function
+        If center=True, this argument is passed to np.pad for padding the
+        edges of the signal y. By default (pad_mode="reflect"), y is padded
+        on both sides with its own reflection, mirrored around its first
+        and last sample respectively. If center=False, this argument is
+        ignored.
+
     Notes
     -----------
     It behaves like ``librosa.core.stft``. See ``librosa.core.stft`` for more
@@ -101,29 +114,45 @@
 
     """
 
-    def __init__(self, n_fft, hop_length, win_length, window="hanning"):
+    def __init__(self,
+                 n_fft,
+                 hop_length=None,
+                 win_length=None,
+                 window="hanning",
+                 center=True,
+                 pad_mode="reflect"):
         super(STFT, self).__init__()
+        # By default, use the entire frame.
+        if win_length is None:
+            win_length = n_fft
+
+        # Set the default hop, if it's not already specified.
+        if hop_length is None:
+            hop_length = int(win_length // 4)
+
         self.hop_length = hop_length
         self.n_bin = 1 + n_fft // 2
         self.n_fft = n_fft
+        self.center = center
+        self.pad_mode = pad_mode
 
         # calculate window
-        window = signal.get_window(window, win_length)
+        window = signal.get_window(window, win_length, fftbins=True)
+
+        # pad window to n_fft size
         if n_fft != win_length:
-            pad = (n_fft - win_length) // 2
-            window = np.pad(window, ((pad, pad), ), 'constant')
+            window = pad_center(window, n_fft, mode="constant")
 
         # calculate weights
-        r = np.arange(0, n_fft)
-        M = np.expand_dims(r, -1) * np.expand_dims(r, 0)
-        w_real = np.reshape(window *
-                            np.cos(2 * np.pi * M / n_fft)[:self.n_bin],
-                            (self.n_bin, 1, 1, self.n_fft))
-        w_imag = np.reshape(window *
-                            np.sin(-2 * np.pi * M / n_fft)[:self.n_bin],
-                            (self.n_bin, 1, 1, self.n_fft))
-
+        # The kernels are the first n_bin rows of the DFT matrix (real parts
+        # stacked on top of imaginary parts), windowed and reshaped to
+        # (2 * n_bin, 1, n_fft), so that conv1d computes the STFT directly.
+        weight = np.fft.fft(np.eye(n_fft))[:self.n_bin]
+        w_real = weight.real
+        w_imag = weight.imag
         w = np.concatenate([w_real, w_imag], axis=0)
+        w = w * window
+        w = np.expand_dims(w, 1)
         self.weight = paddle.cast(
             paddle.to_tensor(w), paddle.get_default_dtype())
 
@@ -137,23 +166,20 @@ class STFT(nn.Layer):
 
         Returns
         ------------
-        real : Tensor [shape=(B, C, 1, frames)]
+        real : Tensor [shape=(B, C, frames)]
             The real part of the spectrogram.
 
-        imag : Tensor [shape=(B, C, 1, frames)]
+        imag : Tensor [shape=(B, C, frames)]
             The image part of the spectrogram.
         """
-        # x(batch_size, time_steps)
-        # pad it first with reflect mode
-        # TODO(chenfeiyu): report an issue on paddle.flip
-        pad_start = paddle.reverse(x[:, 1:1 + self.n_fft // 2], axis=[1])
-        pad_stop = paddle.reverse(x[:, -(1 + self.n_fft // 2):-1], axis=[1])
-        x = paddle.concat([pad_start, x, pad_stop], axis=-1)
+        x = paddle.unsqueeze(x, axis=1)
+        if self.center:
+            x = F.pad(x, [self.n_fft // 2, self.n_fft // 2],
+                      data_format='NCL', mode=self.pad_mode)
 
-        # to BC1T, C=1
-        x = paddle.unsqueeze(x, axis=[1, 2])
-        out = F.conv2d(x, self.weight, stride=(1, self.hop_length))
-        real, imag = paddle.chunk(out, 2, axis=1)  # BC1T
+        # conv1d maps (B, 1, T) to (B, 2 * n_bin, frames)
+        out = F.conv1d(x, self.weight, stride=self.hop_length)
+        real, imag = paddle.chunk(out, 2, axis=1)  # each (B, n_bin, frames)
         return real, imag
 
     def power(self, x):
@@ -166,7 +192,7 @@
 
         Returns
        ------------
-        Tensor [shape=(B, C, 1, T)]
+        Tensor [shape=(B, C, T)]
             The power spectrum.
         """
         real, imag = self(x)
@@ -183,7 +209,7 @@
 
         Returns
         ------------
-        Tensor [shape=(B, C, 1, T)]
+        Tensor [shape=(B, C, T)]
             The magnitude of the spectrum.
         """
         power = self.power(x)
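
Note: a quick way to sanity-check the conv1d-based STFT above is to compare it against librosa.stft, which it is intended to mirror. The snippet below is a rough sketch and not part of the patch; it assumes the patched parakeet.modules.audio is importable alongside librosa and paddle, and the signal length, n_fft and hop_length are arbitrary example values.

    # Rough equivalence check between the conv1d STFT and librosa.stft (a sketch).
    import numpy as np
    import librosa
    import paddle
    from parakeet.modules.audio import STFT

    x = np.random.randn(8000).astype("float32")            # arbitrary test signal
    stft = STFT(n_fft=1024, hop_length=256, win_length=1024, window="hann")
    real, imag = stft(paddle.to_tensor(x[None, :]))         # each (1, 513, frames)
    ours = real.numpy()[0] + 1j * imag.numpy()[0]

    ref = librosa.stft(x, n_fft=1024, hop_length=256, win_length=1024,
                       window="hann", center=True, pad_mode="reflect")
    print(np.abs(ours - ref).max())

Both paths use a Hann window with center=True and reflect padding, so the two spectrograms should agree up to float32 rounding error.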

From 6749ce40eab7c5de0ca11fe5d2fd57120b10415c Mon Sep 17 00:00:00 2001
From: iclementine
Date: Mon, 19 Apr 2021 16:16:40 +0800
Subject: [PATCH 2/2] add audio datasets

---
 parakeet/datasets/common.py | 81 +++++++++++++++++++++++++++++--------
 1 file changed, 66 insertions(+), 15 deletions(-)

diff --git a/parakeet/datasets/common.py b/parakeet/datasets/common.py
index a1d16d6..78bfc2b 100644
--- a/parakeet/datasets/common.py
+++ b/parakeet/datasets/common.py
@@ -15,24 +15,75 @@
 from paddle.io import Dataset
 import os
 import librosa
+from pathlib import Path
+import numpy as np
+from typing import List
 
-__all__ = ["AudioFolderDataset"]
+__all__ = ["AudioSegmentDataset", "AudioDataset", "AudioFolderDataset"]
 
 
-class AudioFolderDataset(Dataset):
-    def __init__(self, path, sample_rate, extension="wav"):
-        self.root = os.path.expanduser(path)
-        self.sample_rate = sample_rate
-        self.extension = extension
-        self.file_names = [
-            os.path.join(self.root, x) for x in os.listdir(self.root) \
-                if os.path.splitext(x)[-1] == self.extension]
-        self.length = len(self.file_names)
-
-    def __len__(self):
-        return self.length
+class AudioSegmentDataset(Dataset):
+    """A simple dataset adaptor for audio files to train vocoders.
+    Read -> trim silence -> normalize -> extract a segment
+    """
+
+    def __init__(self, file_paths: List[Path], sample_rate: int, length: int,
+                 top_db: float):
+        self.file_paths = file_paths
+        self.sr = sample_rate
+        self.top_db = top_db
+        self.length = length  # samples in the clip
 
     def __getitem__(self, i):
-        file_name = self.file_names[i]
-        y, _ = librosa.load(file_name, sr=self.sample_rate)  # pylint: disable=unused-variable
+        fpath = self.file_paths[i]
+        y, _ = librosa.load(fpath, sr=self.sr)
+        y, _ = librosa.effects.trim(y, top_db=self.top_db)
+        y = librosa.util.normalize(y)
+        y = y.astype(np.float32)
+
+        # pad or trim
+        if y.size <= self.length:
+            y = np.pad(y, [0, self.length - len(y)], mode='constant')
+        else:
+            start = np.random.randint(0, 1 + len(y) - self.length)
+            y = y[start:start + self.length]
         return y
+
+    def __len__(self):
+        return len(self.file_paths)
+
+
+class AudioDataset(Dataset):
+    """A simple dataset adaptor for the audio files.
+    Read -> trim silence -> normalize
+    """
+
+    def __init__(self,
+                 file_paths: List[Path],
+                 sample_rate: int,
+                 top_db: float = 60):
+        self.file_paths = file_paths
+        self.sr = sample_rate
+        self.top_db = top_db
+
+    def __getitem__(self, i):
+        fpath = self.file_paths[i]
+        y, _ = librosa.load(fpath, sr=self.sr)
+        y, _ = librosa.effects.trim(y, top_db=self.top_db)
+        y = librosa.util.normalize(y)
+        y = y.astype(np.float32)
+        return y
+
+    def __len__(self):
+        return len(self.file_paths)
+
+
+class AudioFolderDataset(AudioDataset):
+    def __init__(self,
+                 root,
+                 sample_rate,
+                 top_db=60,
+                 extension=".wav"):
+        root = Path(root).expanduser()
+        file_paths = sorted(list(root.rglob("*{}".format(extension))))
+        super().__init__(file_paths, sample_rate, top_db)
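
Note: a usage sketch for the new dataset classes, not part of the patch. The folder path, sample rate, clip length and batch size are placeholder example values; AudioFolderDataset recursively collects files with the given extension, and AudioSegmentDataset yields fixed-length segments suitable for batching in a vocoder training loop.

    from paddle.io import DataLoader
    from parakeet.datasets.common import AudioFolderDataset, AudioSegmentDataset

    # Variable-length, trimmed and normalized clips from a folder of wav files.
    folder = AudioFolderDataset("~/datasets/ljspeech/wavs", sample_rate=22050)
    print(len(folder), folder[0].shape)

    # Fixed-size random segments, batched for vocoder training.
    segments = AudioSegmentDataset(folder.file_paths, sample_rate=22050,
                                   length=8192, top_db=60)
    loader = DataLoader(segments, batch_size=8, shuffle=True)
    for batch in loader:
        # Depending on the DataLoader's return_list setting, the batch may be
        # wrapped in a single-element list.
        wav = batch[0] if isinstance(batch, (list, tuple)) else batch
        print(wav.shape)  # [8, 8192]
        break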