change wavenet to use on-the-fly preprocessing

This commit is contained in:
parent e06c6cdfe1
commit 3741cc49ca
@@ -92,8 +92,7 @@ class LJSpeechCollector(object):
         text_lens = np.array(sorted(text_lens, reverse=True), dtype=np.int64)

         # Pad sequence with largest len of the batch
-        texts = batch_text_id(texts, pad_id=self.padding_idx)
-        mels = np.transpose(
-            batch_spec(
-                mels, pad_value=self.padding_value), axes=(0, 2, 1))
+        texts, _ = batch_text_id(texts, pad_id=self.padding_idx)
+        mels, _ = batch_spec(mels, pad_value=self.padding_value)
+        mels = np.transpose(mels, axes=(0, 2, 1))

@@ -44,9 +44,9 @@ def collate_aishell3_examples(examples):
     spec_lengths = np.array([item.shape[1] for item in mel], dtype=np.int64)
     T_dec = np.max(spec_lengths)
     stop_tokens = (np.arange(T_dec) >= np.expand_dims(spec_lengths, -1)).astype(np.float32)
-    phones = batch_text_id(phones)
-    tones = batch_text_id(tones)
-    mel = batch_spec(mel)
+    phones, _ = batch_text_id(phones)
+    tones, _ = batch_text_id(tones)
+    mel, _ = batch_spec(mel)
     mel = np.transpose(mel, (0, 2, 1))
     embed = np.stack(embed)
     # 7 fields
@@ -40,9 +40,9 @@ def collate_baker_examples(examples):
     spec_lengths = np.array([item.shape[1] for item in mel], dtype=np.int64)
     T_dec = np.max(spec_lengths)
     stop_tokens = (np.arange(T_dec) >= np.expand_dims(spec_lengths, -1)).astype(np.float32)
-    phones = batch_text_id(phones)
-    tones = batch_text_id(tones)
-    mel = batch_spec(mel)
+    phones, _ = batch_text_id(phones)
+    tones, _ = batch_text_id(tones)
+    mel, _ = batch_spec(mel)
     mel = np.transpose(mel, (0, 2, 1))

     return phones, tones, mel, text_lengths, spec_lengths, stop_tokens
@@ -53,6 +53,7 @@ def collate_vctk_examples(examples):
     slens = np.array([item.shape[1] for item in mels], dtype=np.int64)
     speaker_ids = np.array(speaker_ids, dtype=np.int64)

-    phonemes = batch_text_id(phonemes, pad_id=0)
-    mels = np.transpose(batch_spec(mels, pad_value=0.), [0, 2, 1])
+    phonemes, _ = batch_text_id(phonemes, pad_id=0)
+    mels, _ = batch_spec(mels, pad_value=0.)
+    mels = np.transpose(mels, [0, 2, 1])
     return phonemes, plens, mels, slens, speaker_ids
@@ -76,9 +76,9 @@ class LJSpeechCollector(object):
         mels = [example[1] for example in examples]
         stop_probs = [example[2] for example in examples]

-        ids = batch_text_id(ids, pad_id=self.padding_idx)
-        mels = batch_spec(mels, pad_value=self.padding_value)
-        stop_probs = batch_text_id(stop_probs, pad_id=self.padding_idx)
+        ids, _ = batch_text_id(ids, pad_id=self.padding_idx)
+        mels, _ = batch_spec(mels, pad_value=self.padding_value)
+        stop_probs, _ = batch_text_id(stop_probs, pad_id=self.padding_idx)
         return ids, np.transpose(mels, [0, 2, 1]), stop_probs

@@ -61,8 +61,8 @@ class LJSpeechCollector(object):
     def __call__(self, examples):
         mels = [example[0] for example in examples]
         wavs = [example[1] for example in examples]
-        mels = batch_spec(mels, pad_value=self.padding_value)
-        wavs = batch_wav(wavs, pad_value=self.padding_value)
+        mels, _ = batch_spec(mels, pad_value=self.padding_value)
+        wavs, _ = batch_wav(wavs, pad_value=self.padding_value)
         return mels, wavs

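Every collate function above changes the same way because the padding helpers in parakeet.data.batch now return the original lengths alongside the padded batch (see the batch.py hunks below). A minimal sketch of the new convention, with the body assumed from the batch_text_id hunk:

    import numpy as np

    def batch_text_id(minibatch, pad_id=0, dtype=np.int64):
        lengths = [len(x) for x in minibatch]
        max_len = max(lengths)
        batch = [np.pad(x, (0, max_len - len(x)), mode='constant',
                        constant_values=pad_id) for x in minibatch]
        return np.array(batch, dtype=dtype), np.array(lengths, dtype=np.int64)

    texts, text_lengths = batch_text_id([[1, 2, 3], [4, 5]])  # lengths: [3, 2]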
@@ -20,10 +20,12 @@ _C.data = CN(
         batch_size=8,  # batch size
         valid_size=16,  # the first N examples are reserved for validation
         sample_rate=22050,  # Hz, sample rate
-        n_fft=2048,  # fft frame size
+        n_fft=1024,  # fft frame size
         win_length=1024,  # window size
         hop_length=256,  # hop size between adjacent frames
-        # f_max=8000, # Hz, max frequency when converting to mel
+        top_db=60,  # dB, used to trim silence
+        fmin=0,  # Hz, min frequency when converting to mel
+        fmax=8000,  # Hz, max frequency when converting to mel
         n_mels=80,  # mel bands
         train_clip_seconds=0.5,  # audio clip length (in seconds)
     ))
@@ -16,136 +16,43 @@ import os
 from pathlib import Path
 import pickle

 import numpy as np
+import librosa
 import pandas
 from paddle.io import Dataset, DataLoader

-from parakeet.data.batch import batch_spec, batch_wav
-from parakeet.data import dataset
-from parakeet.audio import AudioProcessor
-

 class LJSpeech(Dataset):
     """A simple dataset adaptor for the processed ljspeech dataset."""

-    def __init__(self, root):
+    def __init__(self, root, sample_rate, length, top_db):
         self.root = Path(root).expanduser()
-        meta_data = pandas.read_csv(
+        self.metadata = pandas.read_csv(
             str(self.root / "metadata.csv"),
-            sep="\t",
+            sep="|",
             header=None,
-            names=["fname", "frames", "samples"])
-        records = []
-        for row in meta_data.itertuples():
-            mel_path = str(self.root / "mel" / (row.fname + ".npy"))
-            wav_path = str(self.root / "wav" / (row.fname + ".npy"))
-            records.append((mel_path, wav_path))
-        self.records = records
+            names=["fname", "text", "normalized_text"])
+        self.wav_dir = self.root / "wavs"
+        self.sr = sample_rate
+        self.top_db = top_db
+        self.length = length  # samples in the clip

     def __getitem__(self, i):
-        mel_name, wav_name = self.records[i]
-        mel = np.load(mel_name)
-        wav = np.load(wav_name)
-        return mel, wav
+        fname = self.metadata.iloc[i].fname
+        fpath = (self.wav_dir / fname).with_suffix(".wav")
+        y, sr = librosa.load(fpath, self.sr)
+        y, _ = librosa.effects.trim(y, top_db=self.top_db)
+        y = librosa.util.normalize(y)
+        y = y.astype(np.float32)
+
+        # pad or trim
+        if y.size <= self.length:
+            y = np.pad(y, [0, self.length - len(y)], mode='constant')
+        else:
+            start = np.random.randint(0, 1 + len(y) - self.length)
+            y = y[start:start + self.length]
+        return y

     def __len__(self):
-        return len(self.records)
-
-
-class LJSpeechCollector(object):
-    """A simple callable to batch LJSpeech examples."""
-
-    def __init__(self, padding_value=0.):
-        self.padding_value = padding_value
-
-    def __call__(self, examples):
-        batch_size = len(examples)
-        mels = [example[0] for example in examples]
-        wavs = [example[1] for example in examples]
-        mels = batch_spec(mels, pad_value=self.padding_value)
-        wavs = batch_wav(wavs, pad_value=self.padding_value)
-        audio_starts = np.zeros((batch_size, ), dtype=np.int64)
-        return mels, wavs, audio_starts
-
-
-class LJSpeechClipCollector(object):
-    def __init__(self, clip_frames=65, hop_length=256):
-        self.clip_frames = clip_frames
-        self.hop_length = hop_length
-
-    def __call__(self, examples):
-        mels = []
-        wavs = []
-        starts = []
-        for example in examples:
-            mel, wav_clip, start = self.clip(example)
-            mels.append(mel)
-            wavs.append(wav_clip)
-            starts.append(start)
-        mels = batch_spec(mels)
-        wavs = np.stack(wavs)
-        starts = np.array(starts, dtype=np.int64)
-        return mels, wavs, starts
-
-    def clip(self, example):
-        mel, wav = example
-        frames = mel.shape[-1]
-        start = np.random.randint(0, frames - self.clip_frames)
-        wav_clip = wav[start * self.hop_length:(start + self.clip_frames) *
-                       self.hop_length]
-        return mel, wav_clip, start
-
-
-class DataCollector(object):
-    def __init__(self,
-                 context_size,
-                 sample_rate,
-                 hop_length,
-                 train_clip_seconds,
-                 valid=False):
-        frames_per_second = sample_rate // hop_length
-        train_clip_frames = int(
-            np.ceil(train_clip_seconds * frames_per_second))
-        context_frames = context_size // hop_length
-        self.num_frames = train_clip_frames + context_frames
-
-        self.sample_rate = sample_rate
-        self.hop_length = hop_length
-        self.valid = valid
-
-    def random_crop(self, sample):
-        audio, mel_spectrogram = sample
-        audio_frames = int(audio.size) // self.hop_length
-        max_start_frame = audio_frames - self.num_frames
-        assert max_start_frame >= 0, "audio is too short to be cropped"
-
-        frame_start = np.random.randint(0, max_start_frame)
-        # frame_start = 0  # norandom
-        frame_end = frame_start + self.num_frames
-
-        audio_start = frame_start * self.hop_length
-        audio_end = frame_end * self.hop_length
-
-        audio = audio[audio_start:audio_end]
-        return audio, mel_spectrogram, audio_start
-
-    def __call__(self, samples):
-        # transform them first
-        if self.valid:
-            samples = [(audio, mel_spectrogram, 0)
-                       for audio, mel_spectrogram in samples]
-        else:
-            samples = [self.random_crop(sample) for sample in samples]
-        # batch them
-        audios = [sample[0] for sample in samples]
-        audio_starts = [sample[2] for sample in samples]
-        mels = [sample[1] for sample in samples]
-
-        mels = batch_spec(mels)
-
-        if self.valid:
-            audios = batch_wav(audios, dtype=np.float32)
-        else:
-            audios = np.array(audios, dtype=np.float32)
-        audio_starts = np.array(audio_starts, dtype=np.int64)
-        return audios, mels, audio_starts
+        return len(self.metadata)
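With preprocessing moved into __getitem__, the dataset can be pointed directly at the raw LJSpeech folder. A usage sketch (the path and sizes are assumptions, not part of the commit):

    ds = LJSpeech("~/datasets/LJSpeech-1.1", sample_rate=22050,
                  length=12800, top_db=60)
    clip = ds[0]  # float32 waveform: loaded, silence-trimmed, peak-normalized,
                  # then padded or randomly cropped to exactly `length` samples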
@@ -30,9 +30,13 @@ from parakeet.utils import scheduler, mp_tools
 from parakeet.training.cli import default_argument_parser
 from parakeet.training.experiment import ExperimentBase
 from parakeet.utils.mp_tools import rank_zero_only
+from parakeet.datasets import AudioDataset, AudioSegmentDataset
+from parakeet.data import batch_wav
+
+from parakeet.modules.audio import STFT, MelScale

 from config import get_cfg_defaults
-from ljspeech import LJSpeech, LJSpeechClipCollector, LJSpeechCollector
+from ljspeech import LJSpeech


 class Experiment(ExperimentBase):
@@ -61,38 +65,47 @@ class Experiment(ExperimentBase):
             grad_clip=paddle.nn.ClipGradByGlobalNorm(
                 config.training.gradient_max_norm))

+        self.stft = STFT(config.data.n_fft, config.data.hop_length, config.data.win_length)
+        self.mel_scale = MelScale(config.data.sample_rate, config.data.n_fft, config.data.n_mels, config.data.fmin, config.data.fmax)
+
         self.model = model
         self.model_core = model._layers if self.parallel else model
         self.optimizer = optimizer

     def setup_dataloader(self):
         config = self.config
         args = self.args

-        ljspeech_dataset = LJSpeech(args.data)
-        valid_set, train_set = dataset.split(ljspeech_dataset,
-                                             config.data.valid_size)
-
         # convolutional net's causal padding size
         context_size = config.model.n_stack \
             * sum([(config.model.filter_size - 1) * 2**i for i in range(config.model.n_loop)]) \
             + 1
-        context_frames = context_size // config.data.hop_length

         # frames used to compute loss
-        frames_per_second = config.data.sample_rate // config.data.hop_length
-        train_clip_frames = math.ceil(config.data.train_clip_seconds *
-                                      frames_per_second)
+        train_clip_size = int(config.data.train_clip_seconds * config.data.sample_rate)
+        length = context_size + train_clip_size

-        num_frames = train_clip_frames + context_frames
-        batch_fn = LJSpeechClipCollector(num_frames, config.data.hop_length)
+        root = Path(args.data).expanduser()
+        file_paths = sorted(list((root / "wavs").rglob("*.wav")))
+        train_set = AudioSegmentDataset(
+            file_paths[config.data.valid_size:],
+            config.data.sample_rate,
+            length,
+            top_db=config.data.top_db)
+        valid_set = AudioDataset(
+            file_paths[:config.data.valid_size],
+            config.data.sample_rate,
+            top_db=config.data.top_db)
+
         if not self.parallel:
             train_loader = DataLoader(
                 train_set,
                 batch_size=config.data.batch_size,
                 shuffle=True,
                 drop_last=True,
-                collate_fn=batch_fn)
+                num_workers=1, )
         else:
             sampler = DistributedBatchSampler(
                 train_set,
@@ -100,25 +113,36 @@ class Experiment(ExperimentBase):
                 shuffle=True,
                 drop_last=True)
             train_loader = DataLoader(
-                train_set, batch_sampler=sampler, collate_fn=batch_fn)
+                train_set, batch_sampler=sampler, num_workers=1)

-        valid_batch_fn = LJSpeechCollector()
         valid_loader = DataLoader(
-            valid_set, batch_size=1, collate_fn=valid_batch_fn)
+            valid_set,
+            batch_size=config.data.batch_size,
+            num_workers=1,
+            collate_fn=batch_wav)

         self.train_loader = train_loader
         self.valid_loader = valid_loader

     def train_batch(self):
+        # load data
         start = time.time()
         batch = self.read_batch()
         data_loader_time = time.time() - start

         self.model.train()
         self.optimizer.clear_grad()
-        mel, wav, audio_starts = batch
+        wav = batch

-        y = self.model(wav, mel, audio_starts)
+        # data preprocessing
+        S = self.stft.magnitude(wav)
+        mel = self.mel_scale(S)
+        logmel = 20 * paddle.log10(paddle.clip(mel, min=1e-5))
+        logmel = paddle.clip((logmel + 80) / 100, min=0.0, max=1.0)
+
+        # forward & backward
+        y = self.model(wav, logmel)
         loss = self.model_core.loss(y, wav)
         loss.backward()
         self.optimizer.step()
@@ -129,24 +153,43 @@ class Experiment(ExperimentBase):
         msg += "step: {}, ".format(self.iteration)
         msg += "time: {:>.3f}s/{:>.3f}s, ".format(data_loader_time,
                                                   iteration_time)
-        msg += "loss: {:>.6f}".format(loss_value)
+        msg += "train/loss: {:>.6f}, ".format(loss_value)
+        msg += "lr: {:>.6f}".format(self.optimizer.get_lr())
         self.logger.info(msg)
         if dist.get_rank() == 0:
             self.visualizer.add_scalar(
-                "train/loss", loss_value, global_step=self.iteration)
+                "train/loss", loss_value, self.iteration)
+            self.visualizer.add_scalar(
+                "train/lr", self.optimizer.get_lr(), self.iteration)
+
+        # now we have to call learning rate scheduler.step() manually
+        self.optimizer._learning_rate.step()

     @mp_tools.rank_zero_only
     @paddle.no_grad()
     def valid(self):
-        valid_iterator = iter(self.valid_loader)
         valid_losses = []
-        mel, wav, audio_starts = next(valid_iterator)
-        y = self.model(wav, mel, audio_starts)
-        loss = self.model_core.loss(y, wav)
-        valid_losses.append(float(loss))
-        valid_loss = np.mean(valid_losses)
+        for batch in self.valid_loader:
+            wav, length = batch
+            # data preprocessing
+            S = self.stft.magnitude(wav)
+            mel = self.mel_scale(S)
+            logmel = 20 * paddle.log10(paddle.clip(mel, min=1e-5))
+            logmel = paddle.clip((logmel + 80) / 100, min=0.0, max=1.0)
+
+            y = self.model(wav, logmel)
+            loss = self.model_core.loss(y, wav)
+            valid_losses.append(float(loss))
+        valid_loss = np.mean(valid_losses)
+
+        msg = "Rank: {}, ".format(dist.get_rank())
+        msg += "step: {}, ".format(self.iteration)
+        msg += "valid/loss: {:>.6f}".format(valid_loss)
+        self.logger.info(msg)

         self.visualizer.add_scalar(
-            "valid/loss", valid_loss, global_step=self.iteration)
+            "valid/loss", valid_loss, self.iteration)


 def main_sp(config, args):
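The new preprocessing compresses the mel magnitudes into [0, 1]: flooring the input at 1e-5 caps the log term at -100 dB, and (logmel + 80) / 100 maps the usable -80..+20 dB range onto the unit interval. A numpy sketch of the same arithmetic (shapes assumed):

    import numpy as np
    mel = np.abs(np.random.randn(80, 100))            # linear mel magnitudes
    logmel = 20 * np.log10(np.clip(mel, 1e-5, None))  # in dB, floored at -100 dB
    logmel = np.clip((logmel + 80) / 100, 0.0, 1.0)   # -80 dB -> 0.0, +20 dB -> 1.0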
@@ -65,7 +65,7 @@ def batch_text_id(minibatch, pad_id=0, dtype=np.int64):
                    mode='constant',
                    constant_values=pad_id))

-    return np.array(batch, dtype=dtype)
+    return np.array(batch, dtype=dtype), np.array(lengths, dtype=np.int64)


 class WavBatcher(object):
@@ -106,7 +106,7 @@ def batch_wav(minibatch, pad_value=0., dtype=np.float32):
             np.pad(example, [(0, pad_len)],
                    mode='constant',
                    constant_values=pad_value))
-    return np.array(batch, dtype=dtype)
+    return np.array(batch, dtype=dtype), np.array(lengths, dtype=np.int64)


 class SpecBatcher(object):
@@ -160,4 +160,4 @@ def batch_spec(minibatch, pad_value=0., time_major=False, dtype=np.float32):
             np.pad(example, [(0, 0), (0, pad_len)],
                    mode='constant',
                    constant_values=pad_value))
-    return np.array(batch, dtype=dtype)
+    return np.array(batch, dtype=dtype), np.array(lengths, dtype=np.int64)
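Callers now unpack a (batch, lengths) pair from all three helpers; a quick shape sketch:

    import numpy as np
    specs = [np.zeros((80, 10)), np.zeros((80, 7))]
    mels, lengths = batch_spec(specs)  # mels.shape == (2, 80, 10), lengths == [10, 7]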
@@ -15,24 +15,75 @@
 from paddle.io import Dataset
 import os
 import librosa
+from pathlib import Path
+import numpy as np
+from typing import List

-__all__ = ["AudioFolderDataset"]
+__all__ = ["AudioSegmentDataset", "AudioDataset", "AudioFolderDataset"]


-class AudioFolderDataset(Dataset):
-    def __init__(self, path, sample_rate, extension="wav"):
-        self.root = os.path.expanduser(path)
-        self.sample_rate = sample_rate
-        self.extension = extension
-        self.file_names = [
-            os.path.join(self.root, x) for x in os.listdir(self.root) \
-            if os.path.splitext(x)[-1] == self.extension]
-        self.length = len(self.file_names)
-
-    def __len__(self):
-        return self.length
+class AudioSegmentDataset(Dataset):
+    """A simple dataset adaptor for audio files to train vocoders.
+    Read -> trim silence -> normalize -> extract a segment
+    """
+    def __init__(self, file_paths: List[Path], sample_rate: int, length: int,
+                 top_db: float):
+        self.file_paths = file_paths
+        self.sr = sample_rate
+        self.top_db = top_db
+        self.length = length  # samples in the clip

     def __getitem__(self, i):
-        file_name = self.file_names[i]
-        y, _ = librosa.load(file_name, sr=self.sample_rate)  # pylint: disable=unused-variable
+        fpath = self.file_paths[i]
+        y, sr = librosa.load(fpath, self.sr)
+        y, _ = librosa.effects.trim(y, top_db=self.top_db)
+        y = librosa.util.normalize(y)
+        y = y.astype(np.float32)
+
+        # pad or trim
+        if y.size <= self.length:
+            y = np.pad(y, [0, self.length - len(y)], mode='constant')
+        else:
+            start = np.random.randint(0, 1 + len(y) - self.length)
+            y = y[start:start + self.length]
         return y
+
+    def __len__(self):
+        return len(self.file_paths)
+
+
+class AudioDataset(Dataset):
+    """A simple dataset adaptor for the audio files.
+    Read -> trim silence -> normalize
+    """
+    def __init__(self,
+                 file_paths: List[Path],
+                 sample_rate: int,
+                 top_db: float=60):
+        self.file_paths = file_paths
+        self.sr = sample_rate
+        self.top_db = top_db
+
+    def __getitem__(self, i):
+        fpath = self.file_paths[i]
+        y, sr = librosa.load(fpath, self.sr)
+        y, _ = librosa.effects.trim(y, top_db=self.top_db)
+        y = librosa.util.normalize(y)
+        y = y.astype(np.float32)
+        return y
+
+    def __len__(self):
+        return len(self.file_paths)
+
+
+class AudioFolderDataset(AudioDataset):
+    def __init__(
+            self,
+            root,
+            sample_rate,
+            top_db=60,
+            extension=".wav",
+    ):
+        root = Path(root).expanduser()
+        file_paths = sorted(list(root.rglob("*{}".format(extension))))
+        super().__init__(file_paths, sample_rate, top_db)
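A usage sketch of the reworked dataset classes, mirroring setup_dataloader above (the directory layout and values are assumptions): AudioFolderDataset now just collects paths and delegates to AudioDataset.

    from pathlib import Path
    root = Path("~/datasets/LJSpeech-1.1").expanduser()
    file_paths = sorted((root / "wavs").rglob("*.wav"))
    train_set = AudioSegmentDataset(file_paths[16:], 22050, length=12800, top_db=60)
    valid_set = AudioDataset(file_paths[:16], 22050, top_db=60)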
@@ -101,9 +101,7 @@ class UpsampleNet(nn.LayerList):
     def __init__(self, upscale_factors=[16, 16]):
         super(UpsampleNet, self).__init__()
         self.upscale_factors = list(upscale_factors)
-        self.upscale_factor = 1
-        for item in upscale_factors:
-            self.upscale_factor *= item
+        self.upscale_factor = np.prod(upscale_factors)

         for factor in self.upscale_factors:
             self.append(
@@ -224,13 +222,15 @@ class ResidualBlock(nn.Layer):
             other ResidualBlocks.
         """
         h = x
+        length = x.shape[-1]

         # dilated conv
         h = self.conv(h)

         # condition
+        # NOTE: the expanded condition may have more timesteps than x
         if condition is not None:
-            h += self.condition_proj(condition)
+            h += self.condition_proj(condition)[:, :, :length]

         # gated tanh
         content, gate = paddle.split(h, 2, axis=1)
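Why the trim is needed (a sketch of the arithmetic; values assumed): with center=True the STFT yields T // hop_length + 1 frames, so after the 16 x 16 upsampling the expanded condition is roughly one hop longer than the audio, which is itself shortened by the one-step teacher-forcing shift.

    T, hop = 16384, 256
    frames = T // hop + 1        # centered STFT: 65 frames
    cond_len = frames * hop      # 16640 > T after upsampling
    # condition[:, :, :x.shape[-1]] trims the overshoot inside each ResidualBlock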
@@ -822,7 +822,7 @@ class ConditionalWaveNet(nn.Layer):
             loss_type=loss_type,
             log_scale_min=log_scale_min)

-    def forward(self, audio, mel, audio_start):
+    def forward(self, audio, mel):
         """Compute the output distribution given the mel spectrogram and the input(for teacher force training).

         Parameters
@@ -845,13 +845,13 @@ class ConditionalWaveNet(nn.Layer):
         """
         audio_length = audio.shape[1]  # audio clip's length
         condition = self.encoder(mel)
-        condition_slice = crop(condition, audio_start, audio_length)

         # shifting 1 step
         audio = audio[:, :-1]
-        condition_slice = condition_slice[:, :, 1:]
+        condition = condition[:, :, 1:]

-        y = self.decoder(audio, condition_slice)
+        y = self.decoder(audio, condition)
         return y

     def loss(self, y, t):
@@ -16,6 +16,8 @@ import paddle
 from paddle import nn
 from paddle.nn import functional as F
 from scipy import signal
+import librosa
+from librosa.util import pad_center
 import numpy as np

 __all__ = ["quantize", "dequantize", "STFT"]
@@ -88,6 +90,19 @@ class STFT(nn.Layer):
         Name of window function, see `scipy.signal.get_window` for more
         details. Defaults to "hanning".

+    center : bool
+        If True, the signal y is padded so that frame D[:, t] is centered
+        at y[t * hop_length]. If False, then D[:, t] begins at y[t * hop_length].
+        Defaults to True.
+
+    pad_mode : string or function
+        If center=True, this argument is passed to np.pad for padding the edges
+        of the signal y. By default (pad_mode="reflect"), y is padded on both
+        sides with its own reflection, mirrored around its first and last
+        sample respectively. If center=False, this argument is ignored.
+
+
     Notes
     -----------
     It behaves like ``librosa.core.stft``. See ``librosa.core.stft`` for more
@@ -101,29 +116,47 @@ class STFT(nn.Layer):
     """

-    def __init__(self, n_fft, hop_length, win_length, window="hanning"):
-        super(STFT, self).__init__()
+    def __init__(self, n_fft, hop_length=None, win_length=None, window="hanning", center=True, pad_mode="reflect"):
+        super().__init__()
+        # By default, use the entire frame
+        if win_length is None:
+            win_length = n_fft
+
+        # Set the default hop, if it's not already specified
+        if hop_length is None:
+            hop_length = int(win_length // 4)
+
         self.hop_length = hop_length
         self.n_bin = 1 + n_fft // 2
         self.n_fft = n_fft
+        self.center = center
+        self.pad_mode = pad_mode

         # calculate window
-        window = signal.get_window(window, win_length)
+        window = signal.get_window(window, win_length, fftbins=True)
+
+        # pad window to n_fft size
         if n_fft != win_length:
-            pad = (n_fft - win_length) // 2
-            window = np.pad(window, ((pad, pad), ), 'constant')
+            window = pad_center(window, n_fft, mode="constant")
+            #lpad = (n_fft - win_length) // 2
+            #rpad = n_fft - win_length - lpad
+            #window = np.pad(window, ((lpad, pad), ), 'constant')

         # calculate weights
-        r = np.arange(0, n_fft)
-        M = np.expand_dims(r, -1) * np.expand_dims(r, 0)
-        w_real = np.reshape(window *
-                            np.cos(2 * np.pi * M / n_fft)[:self.n_bin],
-                            (self.n_bin, 1, 1, self.n_fft))
-        w_imag = np.reshape(window *
-                            np.sin(-2 * np.pi * M / n_fft)[:self.n_bin],
-                            (self.n_bin, 1, 1, self.n_fft))
+        #r = np.arange(0, n_fft)
+        #M = np.expand_dims(r, -1) * np.expand_dims(r, 0)
+        #w_real = np.reshape(window *
+        #                    np.cos(2 * np.pi * M / n_fft)[:self.n_bin],
+        #                    (self.n_bin, 1, self.n_fft))
+        #w_imag = np.reshape(window *
+        #                    np.sin(-2 * np.pi * M / n_fft)[:self.n_bin],
+        #                    (self.n_bin, 1, self.n_fft))
+        weight = np.fft.fft(np.eye(n_fft))[:self.n_bin]
+        w_real = weight.real
+        w_imag = weight.imag
         w = np.concatenate([w_real, w_imag], axis=0)
+        w = w * window
+        w = np.expand_dims(w, 1)
         self.weight = paddle.cast(
             paddle.to_tensor(w), paddle.get_default_dtype())
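The rewritten weight construction takes the first n_fft // 2 + 1 rows of the DFT matrix instead of building cosine and sine kernels by hand; stacked as real and imaginary halves, they become conv1d kernels. A standalone numpy sketch of the idea:

    import numpy as np
    n_fft = 8
    n_bin = n_fft // 2 + 1
    weight = np.fft.fft(np.eye(n_fft))[:n_bin]       # analysis basis, one row per bin
    w = np.concatenate([weight.real, weight.imag])   # (2 * n_bin, n_fft)
    w = np.expand_dims(w * np.hanning(n_fft), 1)     # windowed, (2 * n_bin, 1, n_fft)
    # sliding this kernel over a (B, 1, T) signal with stride hop_length yields
    # one STFT frame per step: real rows first, imaginary rows second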
@@ -137,23 +170,20 @@ class STFT(nn.Layer):

         Returns
         ------------
-        real : Tensor [shape=(B, C, 1, frames)]
+        real : Tensor [shape=(B, C, frames)]
             The real part of the spectrogram.

-        imag : Tensor [shape=(B, C, 1, frames)]
+        imag : Tensor [shape=(B, C, frames)]
             The imaginary part of the spectrogram.
         """
-        # x(batch_size, time_steps)
-        # pad it first with reflect mode
-        # TODO(chenfeiyu): report an issue on paddle.flip
-        pad_start = paddle.reverse(x[:, 1:1 + self.n_fft // 2], axis=[1])
-        pad_stop = paddle.reverse(x[:, -(1 + self.n_fft // 2):-1], axis=[1])
-        x = paddle.concat([pad_start, x, pad_stop], axis=-1)
+        # to BCT, C=1
+        x = paddle.unsqueeze(x, axis=1)
+        if self.center:
+            x = F.pad(x, [self.n_fft // 2, self.n_fft // 2],
+                      data_format='NCL', mode=self.pad_mode)

-        # to BC1T, C=1
-        x = paddle.unsqueeze(x, axis=[1, 2])
-        out = F.conv2d(x, self.weight, stride=(1, self.hop_length))
-        real, imag = paddle.chunk(out, 2, axis=1)  # BC1T
+        out = F.conv1d(x, self.weight, stride=self.hop_length)
+        real, imag = paddle.chunk(out, 2, axis=1)  # BCT
         return real, imag

     def power(self, x):
@@ -166,7 +196,7 @@ class STFT(nn.Layer):

         Returns
         ------------
-        Tensor [shape=(B, C, 1, T)]
+        Tensor [shape=(B, C, T)]
             The power spectrum.
         """
         real, imag = self(x)
@@ -183,9 +213,22 @@ class STFT(nn.Layer):

         Returns
         ------------
-        Tensor [shape=(B, C, 1, T)]
+        Tensor [shape=(B, C, T)]
             The magnitude of the spectrum.
         """
         power = self.power(x)
         magnitude = paddle.sqrt(power)
         return magnitude
+
+
+class MelScale(nn.Layer):
+    def __init__(self, sr, n_fft, n_mels, fmin, fmax):
+        super().__init__()
+        mel_basis = librosa.filters.mel(sr, n_fft, n_mels, fmin, fmax)
+        print(mel_basis.shape)
+        self.weight = paddle.to_tensor(mel_basis)
+
+    def forward(self, spec):
+        # (n_mels, n_freq) * (batch_size, n_freq, n_frames)
+        mel = paddle.matmul(self.weight, spec)
+        return mel
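Together, STFT and MelScale replace the offline feature extraction; a usage sketch (config values assumed, mirroring the training config above):

    import paddle
    stft = STFT(n_fft=1024, hop_length=256, win_length=1024)
    mel_scale = MelScale(sr=22050, n_fft=1024, n_mels=80, fmin=0, fmax=8000)
    wav = paddle.randn([4, 22050])   # (batch, samples)
    S = stft.magnitude(wav)          # (batch, 1 + n_fft // 2, frames)
    mel = mel_scale(S)               # (batch, n_mels, frames)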