Merge pull request #115 from iclementine/pwg

Add Parallel WaveGAN and example
Hui Zhang 2021-07-01 04:04:52 -05:00 committed by GitHub
commit 68e9a84ada
41 changed files with 4243 additions and 338 deletions

.gitignore

@@ -142,3 +142,5 @@ dmypy.json
 *.swp
 runs
 syn_audios
+exp/
+dump/


@@ -1,10 +1,11 @@
+repos:
 - repo: https://github.com/PaddlePaddle/mirrors-yapf.git
-    sha: 0d79c0c469bab64f7229c9aca2b1186ef47f0e37
+    rev: 0d79c0c469bab64f7229c9aca2b1186ef47f0e37
     hooks:
     - id: yapf
       files: \.py$
 - repo: https://github.com/pre-commit/pre-commit-hooks
-    sha: a11d9314b22d8f8c7556443875b731ef05965464
+    rev: a11d9314b22d8f8c7556443875b731ef05965464
     hooks:
     - id: check-merge-conflict
     - id: check-symlinks
@@ -15,7 +16,7 @@
     - id: trailing-whitespace
       files: \.md$
 - repo: https://github.com/Lucas-C/pre-commit-hooks
-    sha: v1.0.1
+    rev: v1.0.1
     hooks:
     - id: forbid-crlf
       files: \.md$


@@ -0,0 +1,110 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
import paddle
class Clip(object):
"""Collate functor for training vocoders.
"""
def __init__(
self,
batch_max_steps=20480,
hop_size=256,
aux_context_window=0, ):
"""Initialize customized collater for DataLoader.
Args:
batch_max_steps (int): The maximum length of input signal in batch.
hop_size (int): Hop size of auxiliary features.
aux_context_window (int): Context window size for auxiliary feature conv.
"""
if batch_max_steps % hop_size != 0:
batch_max_steps += -(batch_max_steps % hop_size)
assert batch_max_steps % hop_size == 0
self.batch_max_steps = batch_max_steps
self.batch_max_frames = batch_max_steps // hop_size
self.hop_size = hop_size
self.aux_context_window = aux_context_window
# set useful values in random cutting
self.start_offset = aux_context_window
self.end_offset = -(self.batch_max_frames + aux_context_window)
self.mel_threshold = self.batch_max_frames + 2 * aux_context_window
def __call__(self, examples):
"""Convert into batch tensors.
Args:
            examples (list): List of examples, each a dict with an audio array
                'wave' of shape (T,) and a feature array 'feats' of shape (T', C).
Returns:
Tensor: Auxiliary feature batch (B, C, T'), where
T = (T' - 2 * aux_context_window) * hop_size.
Tensor: Target signal batch (B, 1, T).
"""
# check length
examples = [
self._adjust_length(b['wave'], b['feats']) for b in examples
if b['feats'].shape[0] > self.mel_threshold
]
xs, cs = [b[0] for b in examples], [b[1] for b in examples]
# make batch with random cut
c_lengths = [c.shape[0] for c in cs]
start_frames = np.array([
np.random.randint(self.start_offset, cl + self.end_offset)
for cl in c_lengths
])
x_starts = start_frames * self.hop_size
x_ends = x_starts + self.batch_max_steps
c_starts = start_frames - self.aux_context_window
c_ends = start_frames + self.batch_max_frames + self.aux_context_window
y_batch = np.stack(
[x[start:end] for x, start, end in zip(xs, x_starts, x_ends)])
c_batch = np.stack(
[c[start:end] for c, start, end in zip(cs, c_starts, c_ends)])
        # convert each batch to tensor; assume that each item in the batch has the same length
y_batch = paddle.to_tensor(
y_batch, dtype=paddle.float32).unsqueeze(1) # (B, 1, T)
c_batch = paddle.to_tensor(
c_batch, dtype=paddle.float32).transpose([0, 2, 1]) # (B, C, T')
return y_batch, c_batch
def _adjust_length(self, x, c):
"""Adjust the audio and feature lengths.
Note:
Basically we assume that the length of x and c are adjusted
through preprocessing stage, but if we use other library processed
features, this process will be needed.
"""
        if len(x) < c.shape[0] * self.hop_size:
            x = np.pad(x, (0, c.shape[0] * self.hop_size - len(x)),
                       mode="edge")
        # check that the length is valid
        assert len(x) == c.shape[
            0] * self.hop_size, f"wave length: ({len(x)}), mel length: ({c.shape[0]})"
return x, c
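A minimal sketch of how this collate function pairs audio and mel windows, using the Clip defaults above (hop_size=256, batch_max_steps=20480, aux_context_window=0) and hypothetical toy shapes:

import numpy as np

# Hypothetical example: 100 mel frames with 80 bins and the matching waveform.
hop_size, batch_max_steps = 256, 20480               # 20480 / 256 = 80 frames per clip
mel = np.random.randn(100, 80).astype(np.float32)    # (T', C)
wave = np.random.randn(100 * hop_size).astype(np.float32)   # (T,)

# Clip picks a random start frame, then cuts matching windows from both arrays.
start_frame = np.random.randint(0, 100 - 80)         # start_offset=0, end_offset=-80
mel_window = mel[start_frame:start_frame + 80]                         # (80, 80)
wav_window = wave[start_frame * hop_size:
                  start_frame * hop_size + batch_max_steps]            # (20480,)
assert wav_window.size == mel_window.shape[0] * hop_size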


@@ -0,0 +1,110 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Calculate statistics of feature files."""
import argparse
import logging
import os
import numpy as np
import yaml
import json
import jsonlines
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm
from parakeet.datasets.data_table import DataTable
from parakeet.utils.h5_utils import read_hdf5
from parakeet.utils.h5_utils import write_hdf5
from config import get_cfg_default
def main():
"""Run preprocessing process."""
parser = argparse.ArgumentParser(
description="Compute mean and variance of dumped raw features.")
parser.add_argument(
"--metadata", type=str, help="json file with id and file paths ")
parser.add_argument(
"--field-name",
type=str,
help="name of the field to compute statistics for.")
parser.add_argument(
"--config", type=str, help="yaml format configuration file.")
parser.add_argument(
"--dumpdir",
type=str,
help="directory to save statistics. if not provided, "
"stats will be saved in the above root directory. (default=None)")
parser.add_argument(
"--verbose",
type=int,
default=1,
help="logging level. higher is more logging. (default=1)")
args = parser.parse_args()
# set logger
if args.verbose > 1:
logging.basicConfig(
level=logging.DEBUG,
format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s"
)
elif args.verbose > 0:
logging.basicConfig(
level=logging.INFO,
format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s"
)
else:
logging.basicConfig(
level=logging.WARN,
format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s"
)
logging.warning('Skip DEBUG/INFO messages')
config = get_cfg_default()
# load config
if args.config:
config.merge_from_file(args.config)
# check directory existence
if args.dumpdir is None:
args.dumpdir = os.path.dirname(args.metadata)
if not os.path.exists(args.dumpdir):
os.makedirs(args.dumpdir)
with jsonlines.open(args.metadata, 'r') as reader:
metadata = list(reader)
dataset = DataTable(
metadata,
fields=[args.field_name],
converters={args.field_name: np.load}, )
logging.info(f"The number of files = {len(dataset)}.")
# calculate statistics
scaler = StandardScaler()
for datum in tqdm(dataset):
        # StandardScaler supports (*, num_features) by default
scaler.partial_fit(datum[args.field_name])
stats = np.stack([scaler.mean_, scaler.scale_], axis=0)
np.save(
os.path.join(args.dumpdir, "stats.npy"),
stats.astype(np.float32),
allow_pickle=False)
if __name__ == "__main__":
main()
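stats.npy stacks the feature mean and scale along axis 0. A hedged sketch of how a downstream step (the normalize script below does exactly this) can rebuild a StandardScaler from it; the mel array here is a hypothetical stand-in:

import numpy as np
from sklearn.preprocessing import StandardScaler

stats = np.load("stats.npy")                    # shape (2, n_mels): row 0 = mean, row 1 = scale
scaler = StandardScaler()
scaler.mean_, scaler.scale_ = stats[0], stats[1]
scaler.n_features_in_ = scaler.mean_.shape[0]   # required from sklearn 0.23 on
mel = np.random.randn(100, stats.shape[1]).astype(np.float32)   # hypothetical (num_frames, n_mels)
mel_norm = scaler.transform(mel)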


@@ -0,0 +1,128 @@
# This is the hyperparameter configuration file for Parallel WaveGAN.
# Please make sure this is adjusted for the CSMSC dataset. If you want to
# apply it to another dataset, you might need to carefully change some parameters.
# This configuration requires 12 GB GPU memory and takes ~3 days on RTX TITAN.
###########################################################
# FEATURE EXTRACTION SETTING #
###########################################################
sr: 24000 # Sampling rate.
n_fft: 2048 # FFT size.
hop_length: 300 # Hop size.
win_length: 1200 # Window length.
# If set to null, it will be the same as n_fft.
window: "hann" # Window function.
n_mels: 80 # Number of mel basis.
fmin: 80 # Minimum freq in mel basis calculation.
fmax: 7600 # Maximum frequency in mel basis calculation.
# global_gain_scale: 1.0 # Will be multiplied to all of waveform.
trim_silence: false # Whether to trim the start and end of silence.
top_db: 60 # Need to tune carefully if the recording is not good.
trim_frame_length: 2048 # Frame size in trimming.(in samples)
trim_hop_length: 512 # Hop size in trimming.(in samples)
###########################################################
# GENERATOR NETWORK ARCHITECTURE SETTING #
###########################################################
generator_params:
in_channels: 1 # Number of input channels.
out_channels: 1 # Number of output channels.
kernel_size: 3 # Kernel size of dilated convolution.
layers: 30 # Number of residual block layers.
stacks: 3 # Number of stacks i.e., dilation cycles.
residual_channels: 64 # Number of channels in residual conv.
gate_channels: 128 # Number of channels in gated conv.
skip_channels: 64 # Number of channels in skip conv.
aux_channels: 80 # Number of channels for auxiliary feature conv.
    # Must be the same as n_mels.
aux_context_window: 2 # Context window size for auxiliary feature.
# If set to 2, previous 2 and future 2 frames will be considered.
dropout: 0.0 # Dropout rate. 0.0 means no dropout applied.
bias: true # use bias in residual blocks
use_weight_norm: true # Whether to use weight norm.
# If set to true, it will be applied to all of the conv layers.
use_causal_conv: false # use causal conv in residual blocks and upsample layers
# upsample_net: "ConvInUpsampleNetwork" # Upsampling network architecture.
    upsample_scales: [4, 5, 3, 5] # Upsampling scales. Product of these must equal hop_length.
interpolate_mode: "nearest" # upsample net interpolate mode
    freq_axis_kernel_size: 1 # upsampling net: convolution kernel size along the frequency axis
nonlinear_activation: null
nonlinear_activation_params: {}
###########################################################
# DISCRIMINATOR NETWORK ARCHITECTURE SETTING #
###########################################################
discriminator_params:
in_channels: 1 # Number of input channels.
out_channels: 1 # Number of output channels.
    kernel_size: 3 # Kernel size of conv layers.
layers: 10 # Number of conv layers.
    conv_channels: 64 # Number of channels in conv layers.
bias: true # Whether to use bias parameter in conv.
use_weight_norm: true # Whether to use weight norm.
# If set to true, it will be applied to all of the conv layers.
nonlinear_activation: "LeakyReLU" # Nonlinear function after each conv.
nonlinear_activation_params: # Nonlinear function parameters
negative_slope: 0.2 # Alpha in LeakyReLU.
###########################################################
# STFT LOSS SETTING #
###########################################################
stft_loss_params:
fft_sizes: [1024, 2048, 512] # List of FFT size for STFT-based loss.
hop_sizes: [120, 240, 50] # List of hop size for STFT-based loss
win_lengths: [600, 1200, 240] # List of window length for STFT-based loss.
window: "hann" # Window function for STFT-based loss
###########################################################
# ADVERSARIAL LOSS SETTING #
###########################################################
lambda_adv: 4.0 # Loss balancing coefficient.
###########################################################
# DATA LOADER SETTING #
###########################################################
batch_size: 6 # Batch size.
batch_max_steps: 25500 # Length of each audio clip in the batch. Must be divisible by hop_length.
pin_memory: true # Whether to pin memory in the Paddle DataLoader.
num_workers: 4 # Number of workers in the Paddle DataLoader.
remove_short_samples: true # Whether to remove samples whose length is less than batch_max_steps.
allow_cache: true # Whether to allow caching in the dataset. If true, it requires extra CPU memory.
###########################################################
# OPTIMIZER & SCHEDULER SETTING #
###########################################################
generator_optimizer_params:
epsilon: 1.0e-6 # Generator's epsilon.
weight_decay: 0.0 # Generator's weight decay coefficient.
generator_scheduler_params:
learning_rate: 0.0001 # Generator's learning rate.
step_size: 200000 # Generator's scheduler step size.
gamma: 0.5 # Generator's scheduler gamma.
# At each step size, lr will be multiplied by this parameter.
generator_grad_norm: 10 # Generator's gradient norm.
discriminator_optimizer_params:
epsilon: 1.0e-6 # Discriminator's epsilon.
weight_decay: 0.0 # Discriminator's weight decay coefficient.
discriminator_scheduler_params:
learning_rate: 0.00005 # Discriminator's learning rate.
step_size: 200000 # Discriminator's scheduler step size.
gamma: 0.5 # Discriminator's scheduler gamma.
# At each step size, lr will be multiplied by this parameter.
discriminator_grad_norm: 1 # Discriminator's gradient norm.
###########################################################
# INTERVAL SETTING #
###########################################################
discriminator_train_start_steps: 100000 # Number of steps before starting to train the discriminator.
train_max_steps: 400000 # Number of training steps.
save_interval_steps: 5000 # Interval steps to save checkpoint.
eval_interval_steps: 1000 # Interval steps to evaluate the network.
###########################################################
# OTHER SETTING #
###########################################################
num_save_intermediate_results: 4 # Number of results to be saved as intermediate results.
num_snapshots: 10 # max number of snapshots to keep while training
seed: 42 # random seed for paddle, random, and np.random
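Two of the constraints noted in the comments above can be checked with a quick sketch using only values from this file: the product of upsample_scales must equal hop_length, and batch_max_steps must be divisible by hop_length.

import numpy as np

hop_length, batch_max_steps = 300, 25500
upsample_scales = [4, 5, 3, 5]

assert int(np.prod(upsample_scales)) == hop_length   # 4 * 5 * 3 * 5 = 300
assert batch_max_steps % hop_length == 0             # 25500 / 300 = 85 frames per clip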


@@ -0,0 +1,25 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import yaml
from yacs.config import CfgNode as Configuration
with open("conf/default.yaml", 'rt') as f:
_C = yaml.safe_load(f)
_C = Configuration(_C)
def get_cfg_default():
config = _C.clone()
return config
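A short usage sketch of get_cfg_default; the override file name is hypothetical, and merge_from_file / freeze come from yacs:

from config import get_cfg_default

config = get_cfg_default()                       # clone of the defaults in conf/default.yaml
# config.merge_from_file("my_override.yaml")     # hypothetical file overriding selected keys
print(config.sr, config.hop_length)              # 24000 300
config.freeze()                                  # make the config read-only before training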


@@ -0,0 +1,145 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Normalize feature files and dump them."""
import argparse
import logging
import os
from operator import itemgetter
from pathlib import Path
import numpy as np
import yaml
import jsonlines
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm
from parakeet.datasets.data_table import DataTable
from config import get_cfg_default
def main():
"""Run preprocessing process."""
parser = argparse.ArgumentParser(
description="Normalize dumped raw features (See detail in parallel_wavegan/bin/normalize.py)."
)
parser.add_argument(
"--metadata",
type=str,
required=True,
help="directory including feature files to be normalized. "
"you need to specify either *-scp or rootdir.")
parser.add_argument(
"--dumpdir",
type=str,
required=True,
help="directory to dump normalized feature files.")
parser.add_argument(
"--stats", type=str, required=True, help="statistics file.")
parser.add_argument(
"--skip-wav-copy",
default=False,
action="store_true",
help="whether to skip the copy of wav files.")
parser.add_argument(
"--config", type=str, help="yaml format configuration file.")
parser.add_argument(
"--verbose",
type=int,
default=1,
help="logging level. higher is more logging. (default=1)")
args = parser.parse_args()
# set logger
if args.verbose > 1:
logging.basicConfig(
level=logging.DEBUG,
format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s"
)
elif args.verbose > 0:
logging.basicConfig(
level=logging.INFO,
format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s"
)
else:
logging.basicConfig(
level=logging.WARN,
format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s"
)
logging.warning('Skip DEBUG/INFO messages')
# load config
config = get_cfg_default()
if args.config:
config.merge_from_file(args.config)
# check directory existence
dumpdir = Path(args.dumpdir).resolve()
dumpdir.mkdir(parents=True, exist_ok=True)
# get dataset
with jsonlines.open(args.metadata, 'r') as reader:
metadata = list(reader)
dataset = DataTable(
metadata,
fields=["utt_id", "wave", "feats"],
converters={
'utt_id': None,
'wave': None if args.skip_wav_copy else np.load,
'feats': np.load,
})
logging.info(f"The number of files = {len(dataset)}.")
# restore scaler
scaler = StandardScaler()
scaler.mean_ = np.load(args.stats)[0]
scaler.scale_ = np.load(args.stats)[1]
# from version 0.23.0, this information is needed
scaler.n_features_in_ = scaler.mean_.shape[0]
# process each file
output_metadata = []
for item in tqdm(dataset):
utt_id = item['utt_id']
wave = item['wave']
mel = item['feats']
# normalize
mel = scaler.transform(mel)
# save
mel_path = dumpdir / f"{utt_id}-feats.npy"
np.save(mel_path, mel.astype(np.float32), allow_pickle=False)
if not args.skip_wav_copy:
wav_path = dumpdir / f"{utt_id}-wave.npy"
np.save(wav_path, wave.astype(np.float32), allow_pickle=False)
else:
wav_path = wave
output_metadata.append({
'utt_id': utt_id,
'wave': str(wav_path),
'feats': str(mel_path),
})
output_metadata.sort(key=itemgetter('utt_id'))
output_metadata_path = Path(args.dumpdir) / "metadata.jsonl"
with jsonlines.open(output_metadata_path, 'w') as writer:
for item in output_metadata:
writer.write(item)
logging.info(f"metadata dumped into {output_metadata_path}")
if __name__ == "__main__":
main()


@@ -0,0 +1,287 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import List, Dict, Any
import soundfile as sf
import librosa
import numpy as np
import argparse
import yaml
import json
import jsonlines
import concurrent.futures
from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor
from pathlib import Path
import tqdm
from operator import itemgetter
from praatio import tgio
import logging
from config import get_cfg_default
def logmelfilterbank(audio,
sr,
n_fft=1024,
hop_length=256,
win_length=None,
window="hann",
n_mels=80,
fmin=None,
fmax=None,
eps=1e-10):
"""Compute log-Mel filterbank feature.
Parameters
----------
audio : ndarray
Audio signal (T,).
sr : int
Sampling rate.
n_fft : int
FFT size. (Default value = 1024)
hop_length : int
Hop size. (Default value = 256)
win_length : int
Window length. If set to None, it will be the same as fft_size. (Default value = None)
window : str
Window function type. (Default value = "hann")
n_mels : int
Number of mel basis. (Default value = 80)
fmin : int
Minimum frequency in mel basis calculation. (Default value = None)
fmax : int
Maximum frequency in mel basis calculation. (Default value = None)
eps : float
Epsilon value to avoid inf in log calculation. (Default value = 1e-10)
Returns
-------
np.ndarray
        Log Mel filterbank feature (num_mels, #frames).
"""
# get amplitude spectrogram
x_stft = librosa.stft(
audio,
n_fft=n_fft,
hop_length=hop_length,
win_length=win_length,
window=window,
pad_mode="reflect")
spc = np.abs(x_stft) # (#bins, #frames,)
# get mel basis
fmin = 0 if fmin is None else fmin
fmax = sr / 2 if fmax is None else fmax
mel_basis = librosa.filters.mel(sr, n_fft, n_mels, fmin, fmax)
return np.log10(np.maximum(eps, np.dot(mel_basis, spc)))
def process_sentence(config: Dict[str, Any],
fp: Path,
alignment_fp: Path,
output_dir: Path):
utt_id = fp.stem
# reading
y, sr = librosa.load(fp, sr=config.sr) # resampling may occur
assert len(y.shape) == 1, f"{utt_id} is not a mono-channel audio."
    assert np.abs(y).max(
    ) <= 1.0, f"{utt_id} seems to be different from 16 bit PCM."
duration = librosa.get_duration(y, sr=sr)
# trim according to the alignment file
alignment = tgio.openTextgrid(alignment_fp)
intervals = alignment.tierDict[alignment.tierNameList[0]].entryList
first, last = intervals[0], intervals[-1]
start = 0
end = last.end
if first.label == "sil" and first.end < duration:
start = first.end
else:
logging.warning(
f" There is something wrong with the fisrt interval {first} in utterance: {utt_id}"
)
if last.label == "sil" and last.start < duration:
end = last.start
else:
end = duration
logging.warning(
f" There is something wrong with the last interval {last} in utterance: {utt_id}"
)
    # silence trimmed
    start, end = librosa.time_to_samples([start, end], sr=sr)
y = y[start:end]
# energy based silence trimming
if config.trim_silence:
y, _ = librosa.effects.trim(
y,
top_db=config.top_db,
frame_length=config.trim_frame_length,
hop_length=config.trim_hop_length)
logmel = logmelfilterbank(
y,
sr=sr,
n_fft=config.n_fft,
window=config.window,
win_length=config.win_length,
hop_length=config.hop_length,
n_mels=config.n_mels,
fmin=config.fmin,
fmax=config.fmax)
# adjust time to make num_samples == num_frames * hop_length
num_frames = logmel.shape[1]
if y.size < num_frames * config.hop_length:
y = np.pad(y, (0, num_frames * config.hop_length - y.size),
mode="reflect")
else:
y = y[:num_frames * config.hop_length]
num_sample = y.shape[0]
mel_path = output_dir / (utt_id + "_feats.npy")
wav_path = output_dir / (utt_id + "_wave.npy")
np.save(wav_path, y) # (num_samples, )
np.save(mel_path, logmel.T) # (num_frames, n_mels)
record = {
"utt_id": utt_id,
"num_samples": num_sample,
"num_frames": num_frames,
"feats": str(mel_path.resolve()),
"wave": str(wav_path.resolve()),
}
return record
def process_sentences(config,
fps: List[Path],
alignment_fps: List[Path],
output_dir: Path,
nprocs: int=1):
if nprocs == 1:
results = []
for fp, alignment_fp in tqdm.tqdm(zip(fps, alignment_fps)):
results.append(
process_sentence(config, fp, alignment_fp, output_dir))
else:
with ThreadPoolExecutor(nprocs) as pool:
futures = []
with tqdm.tqdm(total=len(fps)) as progress:
for fp, alignment_fp in zip(fps, alignment_fps):
future = pool.submit(process_sentence, config, fp,
alignment_fp, output_dir)
future.add_done_callback(lambda p: progress.update())
futures.append(future)
results = []
for ft in futures:
results.append(ft.result())
results.sort(key=itemgetter("utt_id"))
with jsonlines.open(output_dir / "metadata.jsonl", 'w') as writer:
for item in results:
writer.write(item)
print("Done")
def main():
# parse config and args
parser = argparse.ArgumentParser(
description="Preprocess audio and then extract features (See detail in parallel_wavegan/bin/preprocess.py)."
)
parser.add_argument(
"--rootdir",
default=None,
type=str,
help="directory including wav files. you need to specify either scp or rootdir."
)
parser.add_argument(
"--dumpdir",
type=str,
required=True,
help="directory to dump feature files.")
parser.add_argument(
"--config", type=str, help="yaml format configuration file.")
parser.add_argument(
"--verbose",
type=int,
default=1,
help="logging level. higher is more logging. (default=1)")
parser.add_argument(
"--num_cpu", type=int, default=1, help="number of process.")
args = parser.parse_args()
C = get_cfg_default()
if args.config:
C.merge_from_file(args.config)
C.freeze()
if args.verbose > 1:
print(vars(args))
print(C)
root_dir = Path(args.rootdir).expanduser()
dumpdir = Path(args.dumpdir).expanduser()
dumpdir.mkdir(parents=True, exist_ok=True)
wav_files = sorted(list((root_dir / "Wave").rglob("*.wav")))
alignment_files = sorted(
list((root_dir / "PhoneLabeling").rglob("*.interval")))
# split data into 3 sections
num_train = 9800
num_dev = 100
train_wav_files = wav_files[:num_train]
dev_wav_files = wav_files[num_train:num_train + num_dev]
test_wav_files = wav_files[num_train + num_dev:]
train_alignment_files = alignment_files[:num_train]
dev_alignment_files = alignment_files[num_train:num_train + num_dev]
test_alignment_files = alignment_files[num_train + num_dev:]
train_dump_dir = dumpdir / "train" / "raw"
train_dump_dir.mkdir(parents=True, exist_ok=True)
dev_dump_dir = dumpdir / "dev" / "raw"
dev_dump_dir.mkdir(parents=True, exist_ok=True)
test_dump_dir = dumpdir / "test" / "raw"
test_dump_dir.mkdir(parents=True, exist_ok=True)
# process for the 3 sections
process_sentences(
C,
train_wav_files,
train_alignment_files,
train_dump_dir,
nprocs=args.num_cpu)
process_sentences(
C,
dev_wav_files,
dev_alignment_files,
dev_dump_dir,
nprocs=args.num_cpu)
process_sentences(
C,
test_wav_files,
test_alignment_files,
test_dump_dir,
nprocs=args.num_cpu)
if __name__ == "__main__":
main()
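A hedged sanity check of logmelfilterbank on a synthetic tone (assuming the function above is in scope); it verifies that the returned array is (n_mels, num_frames) before the .T applied in process_sentence, with num_frames roughly len(y) / hop_length:

import numpy as np

sr, hop_length, n_fft, n_mels = 24000, 300, 2048, 80
t = np.arange(sr) / sr                              # one second of audio
y = (0.5 * np.sin(2 * np.pi * 220.0 * t)).astype(np.float32)

logmel = logmelfilterbank(y, sr, n_fft=n_fft, hop_length=hop_length,
                          win_length=1200, n_mels=n_mels, fmin=80, fmax=7600)
print(logmel.shape)   # (80, 81): n_mels rows, about len(y) / hop_length + 1 frames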


@@ -0,0 +1,184 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import logging
from typing import Dict
import paddle
from paddle.nn import Layer
from paddle.optimizer import Optimizer
from paddle.optimizer.lr import LRScheduler
from paddle.io import DataLoader
from paddle.io import DistributedBatchSampler
from timer import timer
from parakeet.datasets.data_table import DataTable
from parakeet.training.updaters.standard_updater import StandardUpdater, UpdaterState
from parakeet.training.extensions.evaluator import StandardEvaluator
from parakeet.training.trainer import Trainer
from parakeet.training.reporter import report
from parakeet.models.parallel_wavegan import PWGGenerator, PWGDiscriminator
from parakeet.modules.stft_loss import MultiResolutionSTFTLoss
from parakeet.utils.profile import synchronize
class PWGUpdater(StandardUpdater):
def __init__(
self,
models: Dict[str, Layer],
optimizers: Dict[str, Optimizer],
criterions: Dict[str, Layer],
schedulers: Dict[str, LRScheduler],
dataloader: DataLoader,
discriminator_train_start_steps: int,
lambda_adv: float, ):
self.models = models
self.generator: Layer = models['generator']
self.discriminator: Layer = models['discriminator']
self.optimizers = optimizers
self.optimizer_g: Optimizer = optimizers['generator']
self.optimizer_d: Optimizer = optimizers['discriminator']
self.criterions = criterions
self.criterion_stft = criterions['stft']
self.criterion_mse = criterions['mse']
self.schedulers = schedulers
self.scheduler_g = schedulers['generator']
self.scheduler_d = schedulers['discriminator']
self.dataloader = dataloader
self.discriminator_train_start_steps = discriminator_train_start_steps
self.lambda_adv = lambda_adv
self.state = UpdaterState(iteration=0, epoch=0)
self.train_iterator = iter(self.dataloader)
def update_core(self, batch):
# parse batch
wav, mel = batch
# Generator
noise = paddle.randn(wav.shape)
with timer() as t:
wav_ = self.generator(noise, mel)
logging.debug(f"Generator takes {t.elapse}s.")
## Multi-resolution stft loss
with timer() as t:
sc_loss, mag_loss = self.criterion_stft(
wav_.squeeze(1), wav.squeeze(1))
logging.debug(f"Multi-resolution STFT loss takes {t.elapse}s.")
report("train/spectral_convergence_loss", float(sc_loss))
report("train/log_stft_magnitude_loss", float(mag_loss))
gen_loss = sc_loss + mag_loss
## Adversarial loss
if self.state.iteration > self.discriminator_train_start_steps:
with timer() as t:
p_ = self.discriminator(wav_)
adv_loss = self.criterion_mse(p_, paddle.ones_like(p_))
logging.debug(
f"Discriminator and adversarial loss takes {t.elapse}s")
report("train/adversarial_loss", float(adv_loss))
gen_loss += self.lambda_adv * adv_loss
report("train/generator_loss", float(gen_loss))
with timer() as t:
self.optimizer_g.clear_grad()
gen_loss.backward()
logging.debug(f"Backward takes {t.elapse}s.")
with timer() as t:
self.optimizer_g.step()
self.scheduler_g.step()
logging.debug(f"Update takes {t.elapse}s.")
        # Discriminator
if self.state.iteration > self.discriminator_train_start_steps:
with paddle.no_grad():
wav_ = self.generator(noise, mel)
p = self.discriminator(wav)
p_ = self.discriminator(wav_.detach())
real_loss = self.criterion_mse(p, paddle.ones_like(p))
fake_loss = self.criterion_mse(p_, paddle.zeros_like(p_))
report("train/real_loss", float(real_loss))
report("train/fake_loss", float(fake_loss))
dis_loss = real_loss + fake_loss
report("train/discriminator_loss", float(dis_loss))
self.optimizer_d.clear_grad()
dis_loss.backward()
self.optimizer_d.step()
self.scheduler_d.step()
class PWGEvaluator(StandardEvaluator):
def __init__(self, models, criterions, dataloader, lambda_adv):
self.models = models
self.generator = models['generator']
self.discriminator = models['discriminator']
self.criterions = criterions
self.criterion_stft = criterions['stft']
self.criterion_mse = criterions['mse']
self.dataloader = dataloader
self.lambda_adv = lambda_adv
def evaluate_core(self, batch):
logging.debug("Evaluate: ")
wav, mel = batch
noise = paddle.randn(wav.shape)
with timer() as t:
wav_ = self.generator(noise, mel)
logging.debug(f"Generator takes {t.elapse}s")
## Adversarial loss
with timer() as t:
p_ = self.discriminator(wav_)
adv_loss = self.criterion_mse(p_, paddle.ones_like(p_))
logging.debug(
f"Discriminator and adversarial loss takes {t.elapse}s")
report("eval/adversarial_loss", float(adv_loss))
gen_loss = self.lambda_adv * adv_loss
# stft loss
with timer() as t:
sc_loss, mag_loss = self.criterion_stft(
wav_.squeeze(1), wav.squeeze(1))
logging.debug(f"Multi-resolution STFT loss takes {t.elapse}s")
report("eval/spectral_convergence_loss", float(sc_loss))
report("eval/log_stft_magnitude_loss", float(mag_loss))
gen_loss += sc_loss + mag_loss
report("eval/generator_loss", float(gen_loss))
        # Discriminator
p = self.discriminator(wav)
real_loss = self.criterion_mse(p, paddle.ones_like(p))
fake_loss = self.criterion_mse(p_, paddle.zeros_like(p_))
report("eval/real_loss", float(real_loss))
report("eval/fake_loss", float(fake_loss))
dis_loss = real_loss + fake_loss
report("eval/discriminator_loss", float(dis_loss))


@@ -0,0 +1,93 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import sys
from timer import timer
import logging
import argparse
from pathlib import Path
import yaml
import jsonlines
import paddle
import numpy as np
import soundfile as sf
from paddle import distributed as dist
from parakeet.datasets.data_table import DataTable
from parakeet.models.parallel_wavegan import PWGGenerator
from config import get_cfg_default
parser = argparse.ArgumentParser(
description="synthesize with parallel wavegan.")
parser.add_argument(
"--config", type=str, help="config file to overwrite default config")
parser.add_argument("--checkpoint", type=str, help="snapshot to load")
parser.add_argument("--test-metadata", type=str, help="dev data")
parser.add_argument("--output-dir", type=str, help="output dir")
parser.add_argument("--device", type=str, default="gpu", help="device to run")
parser.add_argument("--verbose", type=int, default=1, help="verbose")
args = parser.parse_args()
config = get_cfg_default()
if args.config:
config.merge_from_file(args.config)
print("========Args========")
print(yaml.safe_dump(vars(args)))
print("========Config========")
print(config)
print(
f"master see the word size: {dist.get_world_size()}, from pid: {os.getpid()}"
)
paddle.set_device(args.device)
generator = PWGGenerator(**config["generator_params"])
state_dict = paddle.load(args.checkpoint)
generator.set_state_dict(state_dict["generator_params"])
generator.remove_weight_norm()
generator.eval()
with jsonlines.open(args.test_metadata, 'r') as reader:
metadata = list(reader)
test_dataset = DataTable(
metadata,
fields=['utt_id', 'feats'],
converters={
'utt_id': None,
'feats': np.load,
})
output_dir = Path(args.output_dir)
output_dir.mkdir(parents=True, exist_ok=True)
N = 0
T = 0
for example in test_dataset:
utt_id = example['utt_id']
mel = example['feats']
mel = paddle.to_tensor(mel) # (T, C)
with timer() as t:
wav = generator.inference(c=mel)
wav = wav.numpy()
N += wav.size
T += t.elapse
speed = wav.size / t.elapse
print(
f"{utt_id}, mel: {mel.shape}, wave: {wav.shape}, time: {t.elapse}s, Hz: {speed}, RTF: {config.sr / speed}."
)
sf.write(output_dir / (utt_id + ".wav"), wav, samplerate=config.sr)
print(f"generation speed: {N / T}Hz, RTF: {config.sr / (N / T) }")


@@ -0,0 +1,246 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import sys
import logging
import argparse
import dataclasses
from pathlib import Path
import yaml
import jsonlines
import paddle
import numpy as np
from paddle import nn
from paddle.nn import functional as F
from paddle import distributed as dist
from paddle.io import DataLoader, DistributedBatchSampler
from paddle.optimizer import Adam  # No RAdam in paddle; use Adam instead
from paddle.optimizer.lr import StepDecay
from paddle import DataParallel
from visualdl import LogWriter
from parakeet.datasets.data_table import DataTable
from parakeet.training.updater import UpdaterBase
from parakeet.training.trainer import Trainer
from parakeet.training.reporter import report
from parakeet.training import extension
from parakeet.training.extensions.snapshot import Snapshot
from parakeet.training.extensions.visualizer import VisualDL
from parakeet.models.parallel_wavegan import PWGGenerator, PWGDiscriminator
from parakeet.modules.stft_loss import MultiResolutionSTFTLoss
from parakeet.training.seeding import seed_everything
from batch_fn import Clip
from config import get_cfg_default
from pwg_updater import PWGUpdater, PWGEvaluator
def train_sp(args, config):
# decides device type and whether to run in parallel
# setup running environment correctly
    if not paddle.is_compiled_with_cuda():
paddle.set_device("cpu")
else:
paddle.set_device("gpu")
world_size = paddle.distributed.get_world_size()
if world_size > 1:
paddle.distributed.init_parallel_env()
# set the random seed, it is a must for multiprocess training
seed_everything(config.seed)
print(
f"rank: {dist.get_rank()}, pid: {os.getpid()}, parent_pid: {os.getppid()}",
)
    # the DataLoader logger is too verbose, disable it
logging.getLogger("DataLoader").disabled = True
# construct dataset for training and validation
with jsonlines.open(args.train_metadata, 'r') as reader:
train_metadata = list(reader)
train_dataset = DataTable(
data=train_metadata,
fields=["wave", "feats"],
converters={
"wave": np.load,
"feats": np.load,
}, )
with jsonlines.open(args.dev_metadata, 'r') as reader:
dev_metadata = list(reader)
dev_dataset = DataTable(
data=dev_metadata,
fields=["wave", "feats"],
converters={
"wave": np.load,
"feats": np.load,
}, )
# collate function and dataloader
train_sampler = DistributedBatchSampler(
train_dataset,
batch_size=config.batch_size,
shuffle=True,
drop_last=True)
dev_sampler = DistributedBatchSampler(
dev_dataset,
batch_size=config.batch_size,
shuffle=False,
drop_last=False)
print("samplers done!")
train_batch_fn = Clip(
batch_max_steps=config.batch_max_steps,
hop_size=config.hop_length,
aux_context_window=config.generator_params.aux_context_window)
train_dataloader = DataLoader(
train_dataset,
batch_sampler=train_sampler,
collate_fn=train_batch_fn,
num_workers=config.num_workers)
dev_dataloader = DataLoader(
dev_dataset,
batch_sampler=dev_sampler,
collate_fn=train_batch_fn,
num_workers=config.num_workers)
print("dataloaders done!")
generator = PWGGenerator(**config["generator_params"])
discriminator = PWGDiscriminator(**config["discriminator_params"])
if world_size > 1:
generator = DataParallel(generator)
discriminator = DataParallel(discriminator)
print("models done!")
criterion_stft = MultiResolutionSTFTLoss(**config["stft_loss_params"])
criterion_mse = nn.MSELoss()
print("criterions done!")
lr_schedule_g = StepDecay(**config["generator_scheduler_params"])
gradient_clip_g = nn.ClipGradByGlobalNorm(config["generator_grad_norm"])
optimizer_g = Adam(
learning_rate=lr_schedule_g,
grad_clip=gradient_clip_g,
parameters=generator.parameters(),
**config["generator_optimizer_params"])
lr_schedule_d = StepDecay(**config["discriminator_scheduler_params"])
gradient_clip_d = nn.ClipGradByGlobalNorm(config[
"discriminator_grad_norm"])
optimizer_d = Adam(
learning_rate=lr_schedule_d,
grad_clip=gradient_clip_d,
parameters=discriminator.parameters(),
**config["discriminator_optimizer_params"])
print("optimizers done!")
output_dir = Path(args.output_dir)
checkpoint_dir = output_dir / "checkpoints"
if dist.get_rank() == 0:
output_dir.mkdir(parents=True, exist_ok=True)
checkpoint_dir.mkdir(parents=True, exist_ok=True)
with open(output_dir / "config.yaml", 'wt') as f:
f.write(config.dump(default_flow_style=None))
updater = PWGUpdater(
models={
"generator": generator,
"discriminator": discriminator,
},
optimizers={
"generator": optimizer_g,
"discriminator": optimizer_d,
},
criterions={
"stft": criterion_stft,
"mse": criterion_mse,
},
schedulers={
"generator": lr_schedule_g,
"discriminator": lr_schedule_d,
},
dataloader=train_dataloader,
discriminator_train_start_steps=config.discriminator_train_start_steps,
lambda_adv=config.lambda_adv, )
evaluator = PWGEvaluator(
models={
"generator": generator,
"discriminator": discriminator,
},
criterions={
"stft": criterion_stft,
"mse": criterion_mse,
},
dataloader=dev_dataloader,
lambda_adv=config.lambda_adv, )
trainer = Trainer(
updater,
stop_trigger=(config.train_max_steps, "iteration"),
out=output_dir, )
trainer.extend(
evaluator, trigger=(config.eval_interval_steps, 'iteration'))
if dist.get_rank() == 0:
writer = LogWriter(str(trainer.out))
trainer.extend(VisualDL(writer), trigger=(1, 'iteration'))
trainer.extend(
Snapshot(max_size=config.num_snapshots),
trigger=(config.save_interval_steps, 'iteration'))
print(trainer.extensions.keys())
print("Trainer Done!")
trainer.run()
def main():
# parse args and config and redirect to train_sp
parser = argparse.ArgumentParser(description="Train a ParallelWaveGAN "
"model with Baker Mandrin TTS dataset.")
parser.add_argument(
"--config", type=str, help="config file to overwrite default config")
parser.add_argument("--train-metadata", type=str, help="training data")
parser.add_argument("--dev-metadata", type=str, help="dev data")
parser.add_argument("--output-dir", type=str, help="output dir")
parser.add_argument(
"--device", type=str, default="gpu", help="device type to use")
parser.add_argument(
"--nprocs", type=int, default=1, help="number of processes")
parser.add_argument("--verbose", type=int, default=1, help="verbose")
args = parser.parse_args()
if args.device == "cpu" and args.nprocs > 1:
raise RuntimeError("Multiprocess training on CPU is not supported.")
config = get_cfg_default()
if args.config:
config.merge_from_file(args.config)
print("========Args========")
print(yaml.safe_dump(vars(args)))
print("========Config========")
print(config)
print(
f"master see the word size: {dist.get_world_size()}, from pid: {os.getpid()}"
)
# dispatch
if args.nprocs > 1:
dist.spawn(train_sp, (args, config), nprocs=args.nprocs)
else:
train_sp(args, config)
if __name__ == "__main__":
main()
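A small sketch of the StepDecay schedule configured in generator_scheduler_params: PWGUpdater calls scheduler.step() once per iteration, so the learning rate is multiplied by gamma every step_size iterations.

from paddle.optimizer.lr import StepDecay

lr_schedule = StepDecay(learning_rate=1e-4, step_size=200000, gamma=0.5)
for _ in range(3):
    lr_schedule.step()          # called once per training iteration in PWGUpdater
print(lr_schedule.get_lr())     # still 0.0001; drops to 5e-5 only after 200000 steps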


@@ -12,6 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
-__version__ = "0.2.0-beta.0"
+__version__ = "0.0.0"
+import logging
 from parakeet import audio, data, datasets, frontend, models, modules, training, utils


@@ -0,0 +1,151 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import Union, Optional, Callable, Tuple, List, Dict, Any
from pathlib import Path
from multiprocessing import Manager
import numpy as np
from paddle.io import Dataset
class DataTable(Dataset):
"""Dataset to load and convert data for general purpose.
Parameters
----------
data : List[Dict[str, Any]]
Metadata, a list of meta datum, each of which is composed of
several fields
fields : List[str], optional
Fields to use, if not specified, all the fields in the data are
used, by default None
converters : Dict[str, Callable], optional
Converters used to process each field, by default None
use_cache : bool, optional
Whether to use cache, by default False
Raises
------
ValueError
If there is some field that does not exist in data.
ValueError
If there is some field in converters that does not exist in fields.
"""
def __init__(self,
data: List[Dict[str, Any]],
fields: List[str]=None,
converters: Dict[str, Callable]=None,
use_cache: bool=False):
# metadata
self.data = data
assert len(data) > 0, "This dataset has no examples"
        # peek at an example to get the existing fields.
first_example = self.data[0]
fields_in_data = first_example.keys()
# check all the requested fields exist
if fields is None:
self.fields = fields_in_data
else:
for field in fields:
if field not in fields_in_data:
raise ValueError(
f"The requested field ({field}) is not found"
f"in the data. Fields in the data is {fields_in_data}")
self.fields = fields
# check converters
if converters is None:
self.converters = {}
else:
for field in converters.keys():
if field not in self.fields:
raise ValueError(
f"The converter has a non existing field ({field})")
self.converters = converters
self.use_cache = use_cache
if use_cache:
self._initialize_cache()
def _initialize_cache(self):
self.manager = Manager()
self.caches = self.manager.list()
self.caches += [None for _ in range(len(self))]
def _get_metadata(self, idx: int) -> Dict[str, Any]:
"""Return a meta-datum given an index."""
return self.data[idx]
def _convert(self, meta_datum: Dict[str, Any]) -> Dict[str, Any]:
"""Convert a meta datum to an example by applying the corresponding
        converters to each requested field.
Parameters
----------
meta_datum : Dict[str, Any]
Meta datum
Returns
-------
Dict[str, Any]
Converted example
"""
example = {}
for field in self.fields:
converter = self.converters.get(field, None)
meta_datum_field = meta_datum[field]
if converter is not None:
converted_field = converter(meta_datum_field)
else:
converted_field = meta_datum_field
example[field] = converted_field
return example
def __getitem__(self, idx: int) -> Dict[str, Any]:
"""Get an example given an index.
Parameters
----------
idx : int
Index of the example to get
Returns
-------
Dict[str, Any]
A converted example
"""
if self.use_cache and self.caches[idx] is not None:
return self.caches[idx]
meta_datum = self._get_metadata(idx)
example = self._convert(meta_datum)
if self.use_cache:
self.caches[idx] = example
return example
def __len__(self) -> int:
"""Returns the size of the dataset.
Returns
-------
int
The length of the dataset
"""
return len(self.data)
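A small usage sketch of DataTable with a hypothetical two-item metadata list; it first writes two tiny feature files so the np.load converter has something to read (assumes parakeet is installed):

import numpy as np
from parakeet.datasets.data_table import DataTable

np.save("000001_feats.npy", np.zeros((10, 80), dtype=np.float32))
np.save("000002_feats.npy", np.zeros((12, 80), dtype=np.float32))
metadata = [
    {"utt_id": "000001", "feats": "000001_feats.npy"},
    {"utt_id": "000002", "feats": "000002_feats.npy"},
]
dataset = DataTable(
    metadata,
    fields=["utt_id", "feats"],
    converters={"utt_id": None, "feats": np.load})   # None keeps the field as-is
print(len(dataset), dataset[0]["feats"].shape)       # 2 (10, 80)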


@@ -0,0 +1,770 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import math
from typing import List, Dict, Any, Union, Optional, Tuple
import numpy as np
import paddle
from paddle import Tensor
from paddle import nn
from paddle.nn import functional as F
class Stretch2D(nn.Layer):
def __init__(self, w_scale: int, h_scale: int, mode: str="nearest"):
"""Strech an image (or image-like object) with some interpolation.
Parameters
----------
        w_scale : int
            Scaling factor of the width.
        h_scale : int
            Scaling factor of the height.
mode : str, optional
            Interpolation mode; supported modes are "nearest", "bilinear",
            "trilinear", "bicubic", "linear" and "area", by default "nearest".
For more details about interpolation, see
`paddle.nn.functional.interpolate <https://www.paddlepaddle.org.cn/documentation/docs/en/api/paddle/nn/functional/interpolate_en.html>`_.
"""
super().__init__()
self.w_scale = w_scale
self.h_scale = h_scale
self.mode = mode
def forward(self, x: Tensor) -> Tensor:
"""
Parameters
----------
x : Tensor
Shape (N, C, H, W)
Returns
-------
Tensor
Shape (N, C, H', W'), where ``H'=h_scale * H``, ``W'=w_scale * W``.
The stretched image.
"""
out = F.interpolate(
x, scale_factor=(self.h_scale, self.w_scale), mode=self.mode)
return out
class UpsampleNet(nn.Layer):
"""A Layer to upsample spectrogram by applying consecutive stretch and
convolutions.
Parameters
----------
upsample_scales : List[int]
        Upsampling factors for each stretch.
nonlinear_activation : Optional[str], optional
Activation after each convolution, by default None
nonlinear_activation_params : Dict[str, Any], optional
Parameters passed to construct the activation, by default {}
interpolate_mode : str, optional
        Interpolation mode of the stretch, by default "nearest"
freq_axis_kernel_size : int, optional
Convolution kernel size along the frequency axis, by default 1
use_causal_conv : bool, optional
Whether to use causal padding before convolution, by default False
If True, Causal padding is used along the time axis, i.e. padding
amount is ``receptive field - 1`` and 0 for before and after,
respectively.
If False, "same" padding is used along the time axis.
"""
def __init__(self,
upsample_scales: List[int],
nonlinear_activation: Optional[str]=None,
nonlinear_activation_params: Dict[str, Any]={},
interpolate_mode: str="nearest",
freq_axis_kernel_size: int=1,
use_causal_conv: bool=False):
super().__init__()
self.use_causal_conv = use_causal_conv
self.up_layers = nn.LayerList()
for scale in upsample_scales:
stretch = Stretch2D(scale, 1, interpolate_mode)
assert freq_axis_kernel_size % 2 == 1
freq_axis_padding = (freq_axis_kernel_size - 1) // 2
kernel_size = (freq_axis_kernel_size, scale * 2 + 1)
if use_causal_conv:
padding = (freq_axis_padding, scale * 2)
else:
padding = (freq_axis_padding, scale)
conv = nn.Conv2D(
1, 1, kernel_size, padding=padding, bias_attr=False)
self.up_layers.extend([stretch, conv])
if nonlinear_activation is not None:
nonlinear = getattr(
nn, nonlinear_activation)(**nonlinear_activation_params)
self.up_layers.append(nonlinear)
def forward(self, c: Tensor) -> Tensor:
"""
Parameters
----------
c : Tensor
Shape (N, F, T), spectrogram
Returns
-------
Tensor
Shape (N, F, T'), where ``T' = upsample_factor * T``, upsampled
spectrogram
"""
c = c.unsqueeze(1)
for f in self.up_layers:
if self.use_causal_conv and isinstance(f, nn.Conv2D):
                c = f(c)[:, :, :, :c.shape[-1]]
else:
c = f(c)
return c.squeeze(1)
class ConvInUpsampleNet(nn.Layer):
"""A Layer to upsample spectrogram composed of a convolution and an
UpsampleNet.
Parameters
----------
upsample_scales : List[int]
        Upsampling factors for each stretch.
nonlinear_activation : Optional[str], optional
Activation after each convolution, by default None
nonlinear_activation_params : Dict[str, Any], optional
Parameters passed to construct the activation, by default {}
interpolate_mode : str, optional
        Interpolation mode of the stretch, by default "nearest"
freq_axis_kernel_size : int, optional
Convolution kernel size along the frequency axis, by default 1
aux_channels : int, optional
Feature size of the input, by default 80
aux_context_window : int, optional
Context window of the first 1D convolution applied to the input. It
        is related to the kernel size of the convolution, by default 0
If use causal convolution, the kernel size is ``window + 1``, else
the kernel size is ``2 * window + 1``.
use_causal_conv : bool, optional
Whether to use causal padding before convolution, by default False
If True, Causal padding is used along the time axis, i.e. padding
amount is ``receptive field - 1`` and 0 for before and after,
respectively.
If False, "same" padding is used along the time axis.
"""
def __init__(self,
upsample_scales: List[int],
nonlinear_activation: Optional[str]=None,
nonlinear_activation_params: Dict[str, Any]={},
interpolate_mode: str="nearest",
freq_axis_kernel_size: int=1,
aux_channels: int=80,
aux_context_window: int=0,
use_causal_conv: bool=False):
super().__init__()
self.aux_context_window = aux_context_window
self.use_causal_conv = use_causal_conv and aux_context_window > 0
kernel_size = aux_context_window + 1 if use_causal_conv else 2 * aux_context_window + 1
self.conv_in = nn.Conv1D(
aux_channels,
aux_channels,
kernel_size=kernel_size,
bias_attr=False)
self.upsample = UpsampleNet(
upsample_scales=upsample_scales,
nonlinear_activation=nonlinear_activation,
nonlinear_activation_params=nonlinear_activation_params,
interpolate_mode=interpolate_mode,
freq_axis_kernel_size=freq_axis_kernel_size,
use_causal_conv=use_causal_conv)
def forward(self, c: Tensor) -> Tensor:
"""
Parameters
----------
c : Tensor
Shape (N, F, T), spectrogram
Returns
-------
        Tensor
Shape (N, F, T'), where ``T' = upsample_factor * T``, upsampled
spectrogram
"""
c_ = self.conv_in(c)
c = c_[:, :, :-self.aux_context_window] if self.use_causal_conv else c_
return self.upsample(c)
class ResidualBlock(nn.Layer):
"""A gated activation unit composed of an 1D convolution, a gated tanh
unit and parametric redidual and skip connections. For more details,
refer to `WaveNet: A Generative Model for Raw Audio <https://arxiv.org/abs/1609.03499>`_.
Parameters
----------
kernel_size : int, optional
Kernel size of the 1D convolution, by default 3
residual_channels : int, optional
        Feature size of the residual output (and also the input), by default 64
gate_channels : int, optional
Output feature size of the 1D convolution, by default 128
skip_channels : int, optional
Feature size of the skip output, by default 64
aux_channels : int, optional
Feature size of the auxiliary input (e.g. spectrogram), by default 80
dropout : float, optional
Probability of the dropout before the 1D convolution, by default 0.
dilation : int, optional
Dilation of the 1D convolution, by default 1
bias : bool, optional
Whether to use bias in the 1D convolution, by default True
use_causal_conv : bool, optional
Whether to use causal padding for the 1D convolution, by default False
"""
def __init__(self,
kernel_size: int=3,
residual_channels: int=64,
gate_channels: int=128,
skip_channels: int=64,
aux_channels: int=80,
dropout: float=0.,
dilation: int=1,
bias: bool=True,
use_causal_conv: bool=False):
super().__init__()
self.dropout = dropout
if use_causal_conv:
padding = (kernel_size - 1) * dilation
else:
assert kernel_size % 2 == 1
padding = (kernel_size - 1) // 2 * dilation
self.use_causal_conv = use_causal_conv
self.conv = nn.Conv1D(
residual_channels,
gate_channels,
kernel_size,
padding=padding,
dilation=dilation,
bias_attr=bias)
if aux_channels is not None:
self.conv1x1_aux = nn.Conv1D(
aux_channels, gate_channels, kernel_size=1, bias_attr=False)
else:
self.conv1x1_aux = None
gate_out_channels = gate_channels // 2
self.conv1x1_out = nn.Conv1D(
gate_out_channels,
residual_channels,
kernel_size=1,
bias_attr=bias)
self.conv1x1_skip = nn.Conv1D(
gate_out_channels, skip_channels, kernel_size=1, bias_attr=bias)
def forward(self, x: Tensor, c: Tensor) -> Tuple[Tensor, Tensor]:
"""
Parameters
----------
x : Tensor
Shape (N, C_res, T), the input features.
c : Tensor
Shape (N, C_aux, T), the auxiliary input.
Returns
-------
res : Tensor
Shape (N, C_res, T), the residual output, which is used as the
input of the next ResidualBlock in a stack of ResidualBlocks.
skip : Tensor
Shape (N, C_skip, T), the skip output, which is collected among
each layer in a stack of ResidualBlocks.
"""
x_input = x
x = F.dropout(x, self.dropout, training=self.training)
x = self.conv(x)
        x = x[:, :, :x_input.shape[-1]] if self.use_causal_conv else x
if c is not None:
c = self.conv1x1_aux(c)
x += c
a, b = paddle.chunk(x, 2, axis=1)
x = paddle.tanh(a) * F.sigmoid(b)
skip = self.conv1x1_skip(x)
res = (self.conv1x1_out(x) + x_input) * math.sqrt(0.5)
return res, skip
class PWGGenerator(nn.Layer):
"""Wave Generator for Parallel WaveGAN
Parameters
----------
in_channels : int, optional
Number of channels of the input waveform, by default 1
out_channels : int, optional
Number of channels of the output waveform, by default 1
kernel_size : int, optional
Kernel size of the residual blocks inside, by default 3
layers : int, optional
Number of residual blocks inside, by default 30
stacks : int, optional
The number of groups to split the residual blocks into, by default 3
Within each group, the dilation of the residual block grows
exponentially.
residual_channels : int, optional
Residual channel of the residual blocks, by default 64
gate_channels : int, optional
Gate channel of the residual blocks, by default 128
skip_channels : int, optional
Skip channel of the residual blocks, by default 64
aux_channels : int, optional
Auxiliary channel of the residual blocks, by default 80
aux_context_window : int, optional
The context window size of the first convolution applied to the
auxiliary input, by default 2
dropout : float, optional
Dropout of the residual blocks, by default 0.
bias : bool, optional
Whether to use bias in residual blocks, by default True
use_weight_norm : bool, optional
Whether to use weight norm in all convolutions, by default True
use_causal_conv : bool, optional
Whether to use causal padding in the upsample network and residual
blocks, by default False
upsample_scales : List[int], optional
Upsample scales of the upsample network, by default [4, 4, 4, 4]
nonlinear_activation : Optional[str], optional
Non linear activation in upsample network, by default None
nonlinear_activation_params : Dict[str, Any], optional
        Parameters passed to the nonlinear activation in the upsample
        network, by default {}
interpolate_mode : str, optional
Interpolation mode of the upsample network, by default "nearest"
freq_axis_kernel_size : int, optional
Kernel size along the frequency axis of the upsample network, by default 1
"""
def __init__(self,
in_channels: int=1,
out_channels: int=1,
kernel_size: int=3,
layers: int=30,
stacks: int=3,
residual_channels: int=64,
gate_channels: int=128,
skip_channels: int=64,
aux_channels: int=80,
aux_context_window: int=2,
dropout: float=0.,
bias: bool=True,
use_weight_norm: bool=True,
use_causal_conv: bool=False,
upsample_scales: List[int]=[4, 4, 4, 4],
nonlinear_activation: Optional[str]=None,
nonlinear_activation_params: Dict[str, Any]={},
interpolate_mode: str="nearest",
freq_axis_kernel_size: int=1):
super().__init__()
self.in_channels = in_channels
self.out_channels = out_channels
self.aux_channels = aux_channels
self.aux_context_window = aux_context_window
self.layers = layers
self.stacks = stacks
self.kernel_size = kernel_size
assert layers % stacks == 0
layers_per_stack = layers // stacks
self.first_conv = nn.Conv1D(
in_channels, residual_channels, 1, bias_attr=True)
self.upsample_net = ConvInUpsampleNet(
upsample_scales=upsample_scales,
nonlinear_activation=nonlinear_activation,
nonlinear_activation_params=nonlinear_activation_params,
interpolate_mode=interpolate_mode,
freq_axis_kernel_size=freq_axis_kernel_size,
aux_channels=aux_channels,
aux_context_window=aux_context_window,
use_causal_conv=use_causal_conv)
self.upsample_factor = np.prod(upsample_scales)
self.conv_layers = nn.LayerList()
for layer in range(layers):
dilation = 2**(layer % layers_per_stack)
conv = ResidualBlock(
kernel_size=kernel_size,
residual_channels=residual_channels,
gate_channels=gate_channels,
skip_channels=skip_channels,
aux_channels=aux_channels,
dilation=dilation,
dropout=dropout,
bias=bias,
use_causal_conv=use_causal_conv)
self.conv_layers.append(conv)
self.last_conv_layers = nn.Sequential(
nn.ReLU(),
nn.Conv1D(
skip_channels, skip_channels, 1, bias_attr=True),
nn.ReLU(),
nn.Conv1D(
skip_channels, out_channels, 1, bias_attr=True))
if use_weight_norm:
self.apply_weight_norm()
def forward(self, x: Tensor, c: Tensor) -> Tensor:
"""Generate waveform.
Parameters
----------
x : Tensor
            Shape (N, C_in, T), the input waveform.
c : Tensor
Shape (N, C_aux, T'). The auxiliary input (e.g. spectrogram). It
is upsampled to match the time resolution of the input.
Returns
-------
Tensor
Shape (N, C_out, T), the generated waveform.
"""
c = self.upsample_net(c)
assert c.shape[-1] == x.shape[-1]
x = self.first_conv(x)
skips = 0
for f in self.conv_layers:
x, s = f(x, c)
skips += s
skips *= math.sqrt(1.0 / len(self.conv_layers))
x = self.last_conv_layers(skips)
return x
def apply_weight_norm(self):
"""Recursively apply weight normalization to all the Convolution layers
in the sublayers.
"""
def _apply_weight_norm(layer):
if isinstance(layer, (nn.Conv1D, nn.Conv2D)):
nn.utils.weight_norm(layer)
self.apply(_apply_weight_norm)
def remove_weight_norm(self):
"""Recursively remove weight normalization from all the Convolution
layers in the sublayers.
"""
def _remove_weight_norm(layer):
try:
nn.utils.remove_weight_norm(layer)
except ValueError:
pass
self.apply(_remove_weight_norm)
def inference(self, c: Optional[Tensor]=None,
x: Optional[Tensor]=None) -> Tensor:
"""Waveform generation. This function is used for single instance
inference.
Parameters
----------
c : Tensor, optional
Shape (T', C_aux), the auxiliary input, by default None
x : Tensor, optional
Shape (T, C_in), the noise waveform, by default None
If not provided, a sample is drawn from a gaussian distribution.
Returns
-------
Tensor
Shape (T, C_out), the generated waveform
"""
if x is not None:
x = paddle.transpose(x, [1, 0]).unsqueeze(0) # pseudo batch
else:
assert c is not None
x = paddle.randn(
[1, self.in_channels, c.shape[0] * self.upsample_factor])
if c is not None:
c = paddle.transpose(c, [1, 0]).unsqueeze(0) # pseudo batch
c = nn.Pad1D(self.aux_context_window, mode='replicate')(c)
out = self.forward(x, c).squeeze(0).transpose([1, 0])
return out
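Below is a hedged usage sketch of PWGGenerator.inference with an untrained model and random mel frames; the frame count is an arbitrary assumption, and shapes follow the docstring above:
# Hypothetical copy-synthesis style inference; 50 random mel frames, 80 bins.
import paddle

generator = PWGGenerator()          # upsample_scales [4, 4, 4, 4] -> hop size 256
generator.eval()
generator.remove_weight_norm()      # typically done before inference
mel = paddle.randn([50, 80])        # (T', C_aux)
with paddle.no_grad():
    wav = generator.inference(c=mel)
print(wav.shape)                    # expected: [50 * 256, 1]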
class PWGDiscriminator(nn.Layer):
"""A convolutional discriminator for audio.
Parameters
----------
in_channels : int, optional
Number of channels of the input audio, by default 1
out_channels : int, optional
Output feature size, by default 1
kernel_size : int, optional
Kernel size of convolutional sublayers, by default 3
layers : int, optional
Number of layers, by default 10
conv_channels : int, optional
Feature size of the convolutional sublayers, by default 64
    dilation_factor : int, optional
        The factor by which the dilation of each convolutional sublayer grows
        exponentially if it is greater than 1; otherwise the dilation of each
        convolutional sublayer grows linearly, by default 1
nonlinear_activation : str, optional
The activation after each convolutional sublayer, by default "LeakyReLU"
nonlinear_activation_params : Dict[str, Any], optional
The parameters passed to the activation's initializer, by default
{"negative_slope": 0.2}
bias : bool, optional
Whether to use bias in convolutional sublayers, by default True
use_weight_norm : bool, optional
Whether to use weight normalization at all convolutional sublayers,
by default True
"""
def __init__(self,
in_channels: int=1,
out_channels: int=1,
kernel_size: int=3,
layers: int=10,
conv_channels: int=64,
dilation_factor: int=1,
nonlinear_activation: str="LeakyReLU",
nonlinear_activation_params: Dict[
str, Any]={"negative_slope": 0.2},
bias: bool=True,
use_weight_norm: bool=True):
super().__init__()
assert kernel_size % 2 == 1
assert dilation_factor > 0
conv_layers = []
conv_in_channels = in_channels
for i in range(layers - 1):
if i == 0:
dilation = 1
else:
dilation = i if dilation_factor == 1 else dilation_factor**i
conv_in_channels = conv_channels
padding = (kernel_size - 1) // 2 * dilation
conv_layer = nn.Conv1D(
conv_in_channels,
conv_channels,
kernel_size,
padding=padding,
dilation=dilation,
bias_attr=bias)
nonlinear = getattr(
nn, nonlinear_activation)(**nonlinear_activation_params)
conv_layers.append(conv_layer)
conv_layers.append(nonlinear)
padding = (kernel_size - 1) // 2
last_conv = nn.Conv1D(
conv_in_channels,
out_channels,
kernel_size,
padding=padding,
bias_attr=bias)
conv_layers.append(last_conv)
self.conv_layers = nn.Sequential(*conv_layers)
if use_weight_norm:
self.apply_weight_norm()
def forward(self, x: Tensor) -> Tensor:
"""
Parameters
----------
x : Tensor
Shape (N, in_channels, num_samples), the input audio.
Returns
-------
Tensor
Shape (N, out_channels, num_samples), the predicted logits.
"""
return self.conv_layers(x)
def apply_weight_norm(self):
def _apply_weight_norm(layer):
if isinstance(layer, (nn.Conv1D, nn.Conv2D)):
nn.utils.weight_norm(layer)
self.apply(_apply_weight_norm)
def remove_weight_norm(self):
def _remove_weight_norm(layer):
try:
nn.utils.remove_weight_norm(layer)
except ValueError:
pass
self.apply(_remove_weight_norm)
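A small sketch (not part of this PR) of scoring a batch of audio with PWGDiscriminator; the batch size and sample count are arbitrary:
# Hypothetical forward pass through the discriminator.
import paddle

discriminator = PWGDiscriminator()
audio = paddle.randn([2, 1, 8192])      # (N, in_channels, num_samples)
logits = discriminator(audio)
print(logits.shape)                     # expected: [2, 1, 8192]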
class ResidualPWGDiscriminator(nn.Layer):
"""A wavenet-style discriminator for audio.
Parameters
----------
in_channels : int, optional
Number of channels of the input audio, by default 1
out_channels : int, optional
Output feature size, by default 1
kernel_size : int, optional
Kernel size of residual blocks, by default 3
layers : int, optional
Number of residual blocks, by default 30
stacks : int, optional
Number of groups of residual blocks, within which the dilation
        of each residual block grows exponentially, by default 3
residual_channels : int, optional
Residual channels of residual blocks, by default 64
gate_channels : int, optional
Gate channels of residual blocks, by default 128
skip_channels : int, optional
Skip channels of residual blocks, by default 64
dropout : float, optional
Dropout probability of residual blocks, by default 0.
bias : bool, optional
Whether to use bias in residual blocks, by default True
use_weight_norm : bool, optional
Whether to use weight normalization in all convolutional layers,
by default True
use_causal_conv : bool, optional
Whether to use causal convolution in residual blocks, by default False
nonlinear_activation : str, optional
Activation after convolutions other than those in residual blocks,
by default "LeakyReLU"
nonlinear_activation_params : Dict[str, Any], optional
Parameters to pass to the activation, by default {"negative_slope": 0.2}
"""
def __init__(self,
in_channels: int=1,
out_channels: int=1,
kernel_size: int=3,
layers: int=30,
stacks: int=3,
residual_channels: int=64,
gate_channels: int=128,
skip_channels: int=64,
dropout: float=0.,
bias: bool=True,
use_weight_norm: bool=True,
use_causal_conv: bool=False,
nonlinear_activation: str="LeakyReLU",
nonlinear_activation_params: Dict[
str, Any]={"negative_slope": 0.2}):
super().__init__()
assert kernel_size % 2 == 1
self.in_channels = in_channels
self.out_channels = out_channels
self.layers = layers
self.stacks = stacks
self.kernel_size = kernel_size
assert layers % stacks == 0
layers_per_stack = layers // stacks
self.first_conv = nn.Sequential(
nn.Conv1D(
in_channels, residual_channels, 1, bias_attr=True),
getattr(nn, nonlinear_activation)(**nonlinear_activation_params))
self.conv_layers = nn.LayerList()
for layer in range(layers):
dilation = 2**(layer % layers_per_stack)
conv = ResidualBlock(
kernel_size=kernel_size,
residual_channels=residual_channels,
gate_channels=gate_channels,
skip_channels=skip_channels,
aux_channels=None, # no auxiliary input
dropout=dropout,
dilation=dilation,
bias=bias,
use_causal_conv=use_causal_conv)
self.conv_layers.append(conv)
self.last_conv_layers = nn.Sequential(
getattr(nn, nonlinear_activation)(**nonlinear_activation_params),
nn.Conv1D(
skip_channels, skip_channels, 1, bias_attr=True),
getattr(nn, nonlinear_activation)(**nonlinear_activation_params),
nn.Conv1D(
skip_channels, out_channels, 1, bias_attr=True))
if use_weight_norm:
self.apply_weight_norm()
def forward(self, x: Tensor) -> Tensor:
"""
Parameters
----------
x : Tensor
Shape (N, in_channels, num_samples), the input audio.
Returns
-------
Tensor
Shape (N, out_channels, num_samples), the predicted logits.
"""
x = self.first_conv(x)
skip = 0
for f in self.conv_layers:
x, h = f(x, None)
skip += h
skip *= math.sqrt(1 / len(self.conv_layers))
x = skip
x = self.last_conv_layers(x)
return x
def apply_weight_norm(self):
def _apply_weight_norm(layer):
if isinstance(layer, (nn.Conv1D, nn.Conv2D)):
nn.utils.weight_norm(layer)
self.apply(_apply_weight_norm)
def remove_weight_norm(self):
def _remove_weight_norm(layer):
try:
nn.utils.remove_weight_norm(layer)
except ValueError:
pass
self.apply(_remove_weight_norm)
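The sketch below couples PWGGenerator with ResidualPWGDiscriminator for one adversarial step. It is an illustrative assumption, not the exact recipe of the example in this PR: the real training also uses the MultiResolutionSTFTLoss added later in this diff, and the learning rates and least-squares adversarial loss here are placeholders. Note that the mel input carries 2 * aux_context_window extra frames, matching the collate function in this PR.
# Hedged sketch of one GAN step; all hyperparameters below are assumptions.
import paddle
import paddle.nn.functional as F

generator = PWGGenerator()
discriminator = ResidualPWGDiscriminator()
g_opt = paddle.optimizer.Adam(learning_rate=1e-4, parameters=generator.parameters())
d_opt = paddle.optimizer.Adam(learning_rate=5e-5, parameters=discriminator.parameters())

wav = paddle.randn([2, 1, 32 * 256])   # ground-truth audio (N, 1, T), hop size 256
mel = paddle.randn([2, 80, 32 + 4])    # (N, C_aux, T' + 2 * aux_context_window)
noise = paddle.randn(wav.shape)

# generator step: try to fool the discriminator
wav_fake = generator(noise, mel)
d_fake = discriminator(wav_fake)
adv_loss = F.mse_loss(d_fake, paddle.ones_like(d_fake))
g_opt.clear_grad()
adv_loss.backward()
g_opt.step()

# discriminator step: separate real from generated audio
d_real = discriminator(wav)
d_fake = discriminator(wav_fake.detach())
d_loss = F.mse_loss(d_real, paddle.ones_like(d_real)) + \
         F.mse_loss(d_fake, paddle.zeros_like(d_fake))
d_opt.clear_grad()
d_loss.backward()
d_opt.step()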

View File

@@ -20,7 +20,7 @@ import librosa
 from librosa.util import pad_center
 import numpy as np
-__all__ = ["quantize", "dequantize", "STFT"]
+__all__ = ["quantize", "dequantize", "STFT", "MelScale"]
 def quantize(values, n_bands):
@@ -96,10 +96,10 @@ class STFT(nn.Layer):
         Defaults to True.
     pad_mode : string or function
         If center=True, this argument is passed to np.pad for padding the edges
         of the signal y. By default (pad_mode="reflect"), y is padded on both
         sides with its own reflection, mirrored around its first and last
         sample respectively. If center=False, this argument is ignored.
@@ -163,17 +163,15 @@ class STFT(nn.Layer):
         w = np.concatenate([w_real, w_imag], axis=0)
         w = w * window
         w = np.expand_dims(w, 1)
-        self.weight = paddle.cast(
-            paddle.to_tensor(w), paddle.get_default_dtype())
+        weight = paddle.cast(paddle.to_tensor(w), paddle.get_default_dtype())
+        self.register_buffer("weight", weight)
     def forward(self, x):
         """Compute the stft transform.
         Parameters
         ------------
         x : Tensor [shape=(B, T)]
             The input waveform.
         Returns
         ------------
         real : Tensor [shape=(B, C, frames)]
@@ -195,36 +193,32 @@ class STFT(nn.Layer):
     def power(self, x):
         """Compute the power spectrum.
         Parameters
         ------------
         x : Tensor [shape=(B, T)]
             The input waveform.
         Returns
         ------------
         Tensor [shape=(B, C, T)]
             The power spectrum.
         """
-        real, imag = self(x)
+        real, imag = self.forward(x)
         power = real**2 + imag**2
         return power
     def magnitude(self, x):
         """Compute the magnitude of the spectrum.
         Parameters
         ------------
         x : Tensor [shape=(B, T)]
             The input waveform.
         Returns
         ------------
         Tensor [shape=(B, C, T)]
             The magnitude of the spectrum.
         """
         power = self.power(x)
-        magnitude = paddle.sqrt(power)
+        magnitude = paddle.sqrt(power)  # TODO(chenfeiyu): maybe clipping
         return magnitude
@@ -232,7 +226,9 @@ class MelScale(nn.Layer):
     def __init__(self, sr, n_fft, n_mels, fmin, fmax):
         super().__init__()
         mel_basis = librosa.filters.mel(sr, n_fft, n_mels, fmin, fmax)
-        self.weight = paddle.to_tensor(mel_basis)
+        # self.weight = paddle.to_tensor(mel_basis)
+        weight = paddle.to_tensor(mel_basis, dtype=paddle.get_default_dtype())
+        self.register_buffer("weight", weight)
     def forward(self, spec):
         # (n_mels, n_freq) * (batch_size, n_freq, n_frames)

View File

@ -0,0 +1,144 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle
from paddle import nn
from paddle.nn import functional as F
from parakeet.modules.audio import STFT
class SpectralConvergenceLoss(nn.Layer):
"""Spectral convergence loss module."""
def __init__(self):
"""Initilize spectral convergence loss module."""
super().__init__()
def forward(self, x_mag, y_mag):
"""Calculate forward propagation.
Args:
x_mag (Tensor): Magnitude spectrogram of predicted signal (B, C, T).
y_mag (Tensor): Magnitude spectrogram of groundtruth signal (B, C, T).
Returns:
Tensor: Spectral convergence loss value.
"""
return paddle.norm(
y_mag - x_mag, p="fro") / paddle.clip(
paddle.norm(
y_mag, p="fro"), min=1e-10)
class LogSTFTMagnitudeLoss(nn.Layer):
"""Log STFT magnitude loss module."""
def __init__(self, epsilon=1e-10):
"""Initilize los STFT magnitude loss module."""
super().__init__()
self.epsilon = epsilon
def forward(self, x_mag, y_mag):
"""Calculate forward propagation.
Args:
x_mag (Tensor): Magnitude spectrogram of predicted signal (B, #frames, #freq_bins).
y_mag (Tensor): Magnitude spectrogram of groundtruth signal (B, #frames, #freq_bins).
Returns:
Tensor: Log STFT magnitude loss value.
"""
return F.l1_loss(
paddle.log(paddle.clip(
y_mag, min=self.epsilon)),
paddle.log(paddle.clip(
x_mag, min=self.epsilon)))
class STFTLoss(nn.Layer):
"""STFT loss module."""
def __init__(self,
fft_size=1024,
shift_size=120,
win_length=600,
window="hann"):
"""Initialize STFT loss module."""
super().__init__()
self.fft_size = fft_size
self.shift_size = shift_size
self.win_length = win_length
self.stft = STFT(
n_fft=fft_size,
hop_length=shift_size,
win_length=win_length,
window=window)
self.spectral_convergence_loss = SpectralConvergenceLoss()
self.log_stft_magnitude_loss = LogSTFTMagnitudeLoss()
def forward(self, x, y):
"""Calculate forward propagation.
Args:
x (Tensor): Predicted signal (B, T).
y (Tensor): Groundtruth signal (B, T).
Returns:
Tensor: Spectral convergence loss value.
Tensor: Log STFT magnitude loss value.
"""
x_mag = self.stft.magnitude(x)
y_mag = self.stft.magnitude(y)
sc_loss = self.spectral_convergence_loss(x_mag, y_mag)
mag_loss = self.log_stft_magnitude_loss(x_mag, y_mag)
return sc_loss, mag_loss
class MultiResolutionSTFTLoss(nn.Layer):
"""Multi resolution STFT loss module."""
def __init__(
self,
fft_sizes=[1024, 2048, 512],
hop_sizes=[120, 240, 50],
win_lengths=[600, 1200, 240],
window="hann", ):
"""Initialize Multi resolution STFT loss module.
Args:
fft_sizes (list): List of FFT sizes.
hop_sizes (list): List of hop sizes.
win_lengths (list): List of window lengths.
window (str): Window function type.
"""
super().__init__()
assert len(fft_sizes) == len(hop_sizes) == len(win_lengths)
self.stft_losses = nn.LayerList()
for fs, ss, wl in zip(fft_sizes, hop_sizes, win_lengths):
self.stft_losses.append(STFTLoss(fs, ss, wl, window))
def forward(self, x, y):
"""Calculate forward propagation.
Args:
x (Tensor): Predicted signal (B, T).
y (Tensor): Groundtruth signal (B, T).
Returns:
Tensor: Multi resolution spectral convergence loss value.
Tensor: Multi resolution log STFT magnitude loss value.
"""
sc_loss = 0.0
mag_loss = 0.0
for f in self.stft_losses:
sc_l, mag_l = f(x, y)
sc_loss += sc_l
mag_loss += mag_l
sc_loss /= len(self.stft_losses)
mag_loss /= len(self.stft_losses)
return sc_loss, mag_loss
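A hedged usage sketch of MultiResolutionSTFTLoss with random stand-in signals (the batch size, signal length, and weighting are assumptions, not values from this PR):
# Hypothetical multi-resolution STFT loss between generated and reference audio.
import paddle

criterion = MultiResolutionSTFTLoss()        # default FFT/hop/window sizes above
y_hat = paddle.randn([4, 16000])             # predicted audio (B, T)
y = paddle.randn([4, 16000])                 # reference audio (B, T)
sc_loss, mag_loss = criterion(y_hat, y)
loss = sc_loss + mag_loss                    # often weighted in a full recipe
print(float(sc_loss), float(mag_loss))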

View File

@ -1,162 +0,0 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import Callable, Mapping, List
from pathlib import Path
class KBest(object):
"""
A utility class to help save the hard drive by only keeping K best
checkpoints.
To be as modularized as possible, this class does not assume anything like
a Trainer class or anything like a checkpoint directory, it does not know
about the model or the optimizer, etc.
    It is basically a dynamically maintained K-best Mapping. When a new item is
    added to the map, save_fn is called, and when an item is removed from the
    map, del_fn is called. `save_fn` and `del_fn` take a Path object as input
    and return nothing.
Though it is designed to control checkpointing behaviors, it can be used
to do something else if you pass some save_fn and del_fn.
Example
--------
>>> from pathlib import Path
>>> import shutil
>>> import paddle
>>> from paddle import nn
>>> model = nn.Linear(2, 3)
>>> def save_model(path):
... paddle.save(model.state_dict(), path)
>>> kbest_manager = KBest(max_size=5, save_fn=save_model)
>>> checkpoint_dir = Path("checkpoints")
>>> shutil.rmtree(checkpoint_dir)
>>> checkpoint_dir.mkdir(parents=True)
>>> a = np.random.rand(20)
>>> for i, score in enumerate(a):
... path = checkpoint_dir / f"step_{i}"
... kbest_manager.add_checkpoint(score, path)
>>> assert len(list(checkpoint_dir.glob("step_*"))) == 5
"""
def __init__(self,
max_size: int=5,
save_fn: Callable[[Path], None]=None,
del_fn: Callable[[Path], None]=lambda f: f.unlink()):
self.best_records: Mapping[Path, float] = {}
self.save_fn = save_fn
self.del_fn = del_fn
self.max_size = max_size
self._save_all = (max_size == -1)
def should_save(self, metric: float) -> bool:
if not self.full():
return True
# already full
worst_record_path = max(self.best_records, key=self.best_records.get)
worst_metric = self.best_records[worst_record_path]
return metric < worst_metric
def full(self):
return (not self._save_all) and len(self.best_records) == self.max_size
def add_checkpoint(self, metric, path):
if self.should_save(metric):
self.save_checkpoint_and_update(metric, path)
def save_checkpoint_and_update(self, metric, path):
# remove the worst
if self.full():
worst_record_path = max(self.best_records,
key=self.best_records.get)
self.best_records.pop(worst_record_path)
self.del_fn(worst_record_path)
# add the new one
self.save_fn(path)
self.best_records[path] = metric
class KLatest(object):
"""
A utility class to help save the hard drive by only keeping K latest
checkpoints.
To be as modularized as possible, this class does not assume anything like
a Trainer class or anything like a checkpoint directory, it does not know
about the model or the optimizer, etc.
    It is basically a dynamically maintained Queue. When a new item is
    added to the queue, save_fn is called, and when an item is removed from the
    queue, del_fn is called. `save_fn` and `del_fn` take a Path object as input
    and return nothing.
Though it is designed to control checkpointing behaviors, it can be used
to do something else if you pass some save_fn and del_fn.
Example
--------
>>> from pathlib import Path
>>> import shutil
>>> import paddle
>>> from paddle import nn
>>> model = nn.Linear(2, 3)
>>> def save_model(path):
... paddle.save(model.state_dict(), path)
>>> klatest_manager = KLatest(max_size=5, save_fn=save_model)
>>> checkpoint_dir = Path("checkpoints")
>>> shutil.rmtree(checkpoint_dir)
>>> checkpoint_dir.mkdir(parents=True)
>>> for i in range(20):
... path = checkpoint_dir / f"step_{i}"
... klatest_manager.add_checkpoint(path)
>>> assert len(list(checkpoint_dir.glob("step_*"))) == 5
"""
def __init__(self,
max_size: int=5,
save_fn: Callable[[Path], None]=None,
del_fn: Callable[[Path], None]=lambda f: f.unlink()):
self.latest_records: List[Path] = []
self.save_fn = save_fn
self.del_fn = del_fn
self.max_size = max_size
self._save_all = (max_size == -1)
def full(self):
return (
not self._save_all) and len(self.latest_records) == self.max_size
def add_checkpoint(self, path):
self.save_checkpoint_and_update(path)
def save_checkpoint_and_update(self, path):
        # remove the earliest
if self.full():
eariest_record_path = self.latest_records.pop(0)
self.del_fn(eariest_record_path)
# add the new one
self.save_fn(path)
self.latest_records.append(path)

View File

@ -0,0 +1,80 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import Callable
PRIORITY_WRITER = 300
PRIORITY_EDITOR = 200
PRIORITY_READER = 100
class Extension(object):
"""Extension to customize the behavior of Trainer."""
trigger = (1, 'iteration')
priority = PRIORITY_READER
name = None
@property
def default_name(self):
"""Default name of the extension, class name by default."""
return type(self).__name__
def __call__(self, trainer):
"""Main action of the extention. After each update, it is executed
when the trigger fires."""
raise NotImplementedError(
'Extension implementation must override __call__.')
def initialize(self, trainer):
"""Action that is executed once to get the corect trainer state.
It is called before training normally, but if the trainer restores
states with an Snapshot extension, this method should also be called.g
"""
pass
def on_error(self, trainer, exc, tb):
"""Handles the error raised during training before finalization.
"""
pass
def finalize(self, trainer):
"""Action that is executed when training is done.
For example, visualizers would need to be closed.
"""
pass
def make_extension(trigger: Callable=None,
default_name: str=None,
priority: int=None,
finalizer: Callable=None,
initializer: Callable=None,
on_error: Callable=None):
"""Make an Extension-like object by injecting required attributes to it.
"""
if trigger is None:
trigger = Extension.trigger
if priority is None:
priority = Extension.priority
def decorator(ext):
ext.trigger = trigger
ext.default_name = default_name or ext.__name__
ext.priority = priority
ext.finalize = finalizer
ext.on_error = on_error
ext.initialize = initializer
return ext
return decorator
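A small sketch (not part of this PR) of turning a plain function into an extension with make_extension; the trigger interval and the commented-out registration are illustrative assumptions:
# Hypothetical function-style extension that fires every 100 iterations.
@make_extension(trigger=(100, 'iteration'), priority=PRIORITY_READER)
def print_iteration(trainer):
    print("iteration:", trainer.updater.state.iteration)

# trainer.extend(print_iteration)   # registered like any other Extension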

View File

@ -0,0 +1,73 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from collections import defaultdict
from typing import Optional, Callable, Dict
from tqdm import tqdm
import paddle
from paddle import Tensor
from paddle.nn import Layer
from paddle.io import DataLoader
from parakeet.training.reporter import scope, report, DictSummary
from parakeet.training import extension
class StandardEvaluator(extension.Extension):
trigger = (1, 'epoch')
default_name = 'validation'
priority = extension.PRIORITY_WRITER
name = None
def __init__(self, model: Layer, dataloader: DataLoader):
# it is designed to hold multiple models
models = {"main": model}
self.models: Dict[str, Layer] = models
self.model = model
# dataloaders
self.dataloader = dataloader
def evaluate_core(self, batch):
# compute
self.model(batch) # you may report here
def evaluate(self):
# switch to eval mode
for layer in self.models.values():
layer.eval()
# to average evaluation metrics
summary = DictSummary()
for batch in self.dataloader:
observation = {}
with scope(observation):
# main evaluation computation here.
with paddle.no_grad():
self.evaluate_core(batch)
summary.add(observation)
summary = summary.compute_mean()
return summary
def __call__(self, trainer=None):
        # evaluate and report the averaged metrics to the current observation;
        # if it is used to extend a trainer, the metrics are reported to
        # the observation of the trainer,
        # otherwise you can use your own observation
summary = self.evaluate()
for k, v in summary.items():
report(k, v)
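A hedged sketch of customizing StandardEvaluator: override evaluate_core and report a per-batch validation loss, which DictSummary then averages over the epoch. The model, criterion, and dataloader names below are placeholders, not part of this PR:
# Hypothetical evaluator reporting a validation loss.
import paddle.nn.functional as F

class MyEvaluator(StandardEvaluator):
    def evaluate_core(self, batch):
        x, y = batch
        loss = F.mse_loss(self.model(x), y)
        report("eval/loss", float(loss))

# evaluator = MyEvaluator(model, valid_loader)
# trainer.extend(evaluator, trigger=(1, 'epoch'))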

View File

@ -0,0 +1,110 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import logging
from pathlib import Path
from datetime import datetime
from typing import List, Dict, Any
import jsonlines
from parakeet.utils.mp_tools import rank_zero_only
from parakeet.training.trainer import Trainer
from parakeet.training import extension
def load_records(records_fp):
"""Load record files (json lines.)"""
with jsonlines.open(records_fp, 'r') as reader:
records = list(reader)
return records
class Snapshot(extension.Extension):
"""An extension to make snapshot of the updater object inside
the trainer. It is done by calling the updater's `save` method.
An Updater save its state_dict by default, which contains the
updater state, (i.e. epoch and iteration) and all the model
parameters and optimizer states. If the updater inside the trainer
subclasses StandardUpdater, everything is good to go.
Parameters
----------
checkpoint_dir : Union[str, Path]
The directory to save checkpoints into.
"""
trigger = (1, 'epoch')
priority = -100
default_name = "snapshot"
def __init__(self, max_size: int=5, snapshot_on_error: bool=False):
self.records: List[Dict[str, Any]] = []
self.max_size = max_size
self._snapshot_on_error = snapshot_on_error
self._save_all = (max_size == -1)
self.checkpoint_dir =...
def initialize(self, trainer: Trainer):
"""Setting up this extention."""
self.checkpoint_dir = trainer.out / "checkpoints"
# load existing records
record_path: Path = self.checkpoint_dir / "records.jsonl"
if record_path.exists():
logging.debug("Loading from an existing checkpoint dir")
self.records = load_records(record_path)
trainer.updater.load(self.records[-1]['path'])
def on_error(self, trainer, exc, tb):
if self._snapshot_on_error:
self.save_checkpoint_and_update(trainer)
def __call__(self, trainer: Trainer):
self.save_checkpoint_and_update(trainer)
def full(self):
"""Whether the number of snapshots it keeps track of is greater
than the max_size."""
return (not self._save_all) and len(self.records) > self.max_size
@rank_zero_only
def save_checkpoint_and_update(self, trainer: Trainer):
"""Saving new snapshot and remove the oldest snapshot if needed."""
iteration = trainer.updater.state.iteration
path = self.checkpoint_dir / f"snapshot_iter_{iteration}.pdz"
# add the new one
trainer.updater.save(path)
record = {
"time": str(datetime.now()),
'path': str(path.resolve()), # use absolute path
'iteration': iteration
}
self.records.append(record)
        # remove the earliest
        if self.full():
            earliest_record = self.records[0]
            os.remove(earliest_record["path"])
self.records.pop(0)
# update the record file
record_path = self.checkpoint_dir / "records.jsonl"
with jsonlines.open(record_path, 'w') as writer:
for record in self.records:
# jsonlines.open may return a Writer or a Reader
writer.write(record) # pylint: disable=no-member

View File

@ -0,0 +1,40 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from visualdl import LogWriter
from parakeet.training.trainer import Trainer
from parakeet.training import extension
class VisualDL(extension.Extension):
"""A wrapper of visualdl log writer. It assumes that the metrics to be visualized
are all scalars which are recorded into the `.observation` dictionary of the
trainer object. The dictionary is created for each step, thus the visualdl log
writer uses the iteration from the updater's `iteration` as the global step to
add records.
"""
trigger = (1, 'iteration')
default_name = 'visualdl'
priority = extension.PRIORITY_READER
def __init__(self, writer):
self.writer = writer
def __call__(self, trainer: Trainer):
for k, v in trainer.observation.items():
self.writer.add_scalar(k, v, step=trainer.updater.state.iteration)
def finalize(self, trainer):
self.writer.close()
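A hedged registration sketch; the log directory and the commented-out trainer call are assumptions:
# Hypothetical wiring of the VisualDL extension into a trainer.
from visualdl import LogWriter

writer = LogWriter(logdir="exp/default/log")
# trainer.extend(VisualDL(writer), trigger=(1, 'iteration'))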

View File

@@ -12,7 +12,9 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
+import math
 import contextlib
+from collections import defaultdict
 OBSERVATIONS = None
@@ -45,3 +47,113 @@ def report(name, value):
         return
     else:
         observations[name] = value
class Summary(object):
"""Online summarization of a sequence of scalars.
Summary computes the statistics of given scalars online.
"""
def __init__(self):
self._x = 0.0
self._x2 = 0.0
self._n = 0
def add(self, value, weight=1):
"""Adds a scalar value.
Args:
value: Scalar value to accumulate. It is either a NumPy scalar or
a zero-dimensional array (on CPU or GPU).
weight: An optional weight for the value. It is a NumPy scalar or
a zero-dimensional array (on CPU or GPU).
Default is 1 (integer).
"""
self._x += weight * value
self._x2 += weight * value * value
self._n += weight
def compute_mean(self):
"""Computes the mean."""
x, n = self._x, self._n
return x / n
def make_statistics(self):
"""Computes and returns the mean and standard deviation values.
Returns:
tuple: Mean and standard deviation values.
"""
x, n = self._x, self._n
mean = x / n
var = self._x2 / n - mean * mean
std = math.sqrt(var)
return mean, std
class DictSummary(object):
"""Online summarization of a sequence of dictionaries.
``DictSummary`` computes the statistics of a given set of scalars online.
It only computes the statistics for scalar values and variables of scalar
values in the dictionaries.
"""
def __init__(self):
self._summaries = defaultdict(Summary)
def add(self, d):
"""Adds a dictionary of scalars.
Args:
d (dict): Dictionary of scalars to accumulate. Only elements of
scalars, zero-dimensional arrays, and variables of
zero-dimensional arrays are accumulated. When the value
is a tuple, the second element is interpreted as a weight.
"""
summaries = self._summaries
for k, v in d.items():
w = 1
if isinstance(v, tuple):
w = v[1]
v = v[0]
summaries[k].add(v, weight=w)
def compute_mean(self):
"""Creates a dictionary of mean values.
It returns a single dictionary that holds a mean value for each entry
added to the summary.
Returns:
dict: Dictionary of mean values.
"""
return {
name: summary.compute_mean()
for name, summary in self._summaries.items()
}
def make_statistics(self):
"""Creates a dictionary of statistics.
It returns a single dictionary that holds mean and standard deviation
values for every entry added to the summary. For an entry of name
``'key'``, these values are added to the dictionary by names ``'key'``
and ``'key.std'``, respectively.
Returns:
dict: Dictionary of statistics of all entries.
"""
stats = {}
for name, summary in self._summaries.items():
mean, std = summary.make_statistics()
stats[name] = mean
stats[name + '.std'] = std
return stats
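A tiny sketch (not part of this PR) of averaging per-batch observations with DictSummary, mirroring what StandardEvaluator does internally:
# Hypothetical use of DictSummary; the metric names and values are made up.
summary = DictSummary()
summary.add({"loss": 1.0, "acc": 0.50})
summary.add({"loss": 3.0, "acc": 0.75})
print(summary.compute_mean())       # {'loss': 2.0, 'acc': 0.625}
print(summary.make_statistics())    # adds 'loss.std' and 'acc.std' entries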

View File

@ -0,0 +1,27 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import random
import logging
import paddle
import numpy as np
def seed_everything(seed: int):
"""Seed paddle, random and np.random to help reproductivity."""
paddle.seed(seed)
random.seed(seed)
np.random.seed(seed)
logging.debug(f"Set the seed of paddle, random, np.random to {seed}.")

View File

@ -12,16 +12,22 @@
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
import sys
import six
import traceback
from pathlib import Path from pathlib import Path
import tqdm from collections import OrderedDict
from dataclasses import dataclass from typing import Callable, Union, List
from parakeet.training.trigger import get_trigger, IntervalTrigger import tqdm
from parakeet.training.trigger import get_trigger, IntervalTrigger, LimitTrigger
from parakeet.training.updater import UpdaterBase from parakeet.training.updater import UpdaterBase
from parakeet.training.reporter import scope from parakeet.training.reporter import scope
from parakeet.training.extension import Extension, PRIORITY_READER
class ExtensionEntry(object): class _ExtensionEntry(object):
def __init__(self, extension, trigger, priority): def __init__(self, extension, trigger, priority):
self.extension = extension self.extension = extension
self.trigger = trigger self.trigger = trigger
@ -31,31 +37,76 @@ class ExtensionEntry(object):
class Trainer(object): class Trainer(object):
def __init__(self, def __init__(self,
updater: UpdaterBase, updater: UpdaterBase,
stop_trigger=None, stop_trigger: Callable=None,
out='result', out: Union[str, Path]='result',
extensions=None): extensions: List[Extension]=None):
self.updater = updater self.updater = updater
self.extensions = {} self.extensions = OrderedDict()
self.stop_trigger = get_trigger(stop_trigger) self.stop_trigger = LimitTrigger(*stop_trigger)
self.out = Path(out) self.out = Path(out)
self.observation = {} self.observation =...
def setup(self): self._done = False
pass if extensions:
for ext in extensions:
self.extend(ext)
@property
def is_before_training(self):
return self.updater.state.iteration == 0
def extend(self, extension, name=None, trigger=None, priority=None): def extend(self, extension, name=None, trigger=None, priority=None):
# get name for the extension
# argument \
# -> extention's name \
# -> default_name (class name, when it is an object) \
# -> function name when it is a function \
# -> error
if name is None:
name = getattr(extension, 'name', None)
if name is None:
name = getattr(extension, 'default_name', None)
if name is None:
name = getattr(extension, '__name__', None)
if name is None:
raise ValueError(
"Name is not given for the extension.")
if name == 'training':
raise ValueError("training is a reserved name.")
if trigger is None:
trigger = getattr(extension, 'trigger', (1, 'iteration'))
trigger = get_trigger(trigger) trigger = get_trigger(trigger)
if priority is None:
priority = getattr(extension, 'priority', PRIORITY_READER)
# add suffix to avoid nameing conflict
ordinal = 0 ordinal = 0
modified_name = name modified_name = name
while name in self.extensions: while modified_name in self.extensions:
ordinal += 1 ordinal += 1
modified_name = f"{name}_{ordinal}" modified_name = f"{name}_{ordinal}"
extension.name = modified_name
self.extensions[modified_name] = ExtensionEntry(extension, trigger, self.extensions[modified_name] = _ExtensionEntry(extension, trigger,
priority) priority)
def get_extension(self, name):
"""get extension by name."""
extensions = self.extensions
if name in extensions:
return extensions[name].extension
else:
raise ValueError(f'extension {name} not found')
def run(self): def run(self):
if self._done:
raise RuntimeError("Training is already done!.")
self.out.mkdir(parents=True, exist_ok=True)
# sort extensions by priorities once # sort extensions by priorities once
extension_order = sorted( extension_order = sorted(
self.extensions.keys(), self.extensions.keys(),
@ -64,28 +115,72 @@ class Trainer(object):
extensions = [(name, self.extensions[name]) extensions = [(name, self.extensions[name])
for name in extension_order] for name in extension_order]
update = self.updater.update # initializing all extensions
for name, entry in extensions:
if hasattr(entry.extension, "initialize"):
entry.extension.initialize(self)
update = self.updater.update # training step
stop_trigger = self.stop_trigger stop_trigger = self.stop_trigger
# TODO(chenfeiyu): display progress bar correctly print(self.updater.state)
# if the trainer is controlled by epoch: use 2 progressbars
# if the trainer is controlled by iteration: use 1 progressbar # display only one progress bar
if isinstance(stop_trigger, IntervalTrigger): max_iteration = None
if isinstance(stop_trigger, LimitTrigger):
if stop_trigger.unit is 'epoch': if stop_trigger.unit is 'epoch':
max_epoch = self.stop_trigger.period max_epoch = self.stop_trigger.limit
updates_per_epoch = getattr(self.updater, "updates_per_epoch",
None)
max_iteration = max_epoch * updates_per_epoch if updates_per_epoch else None
else: else:
max_iteration = self.stop_trigger.period max_iteration = self.stop_trigger.limit
while not stop_trigger(self): p = tqdm.tqdm(
self.observation = {} initial=self.updater.state.iteration, total=max_iteration)
# set observation as the report target
# you can use report freely in Updater.update()
# updating parameters and state try:
with scope(self.observation): while not stop_trigger(self):
update() self.observation = {}
# set observation as the report target
# you can use report freely in Updater.update()
# execute extension when necessary # updating parameters and state
with scope(self.observation):
update()
p.update()
# execute extension when necessary
for name, entry in extensions:
if entry.trigger(self):
entry.extension(self)
# print("###", self.observation)
except Exception as e:
f = sys.stderr
f.write(f"Exception in main training loop: {e}\n")
f.write("Traceback (most recent call last):\n")
traceback.print_tb(sys.exc_info()[2])
f.write(
"Trainer extensions will try to handle the extension. Then all extensions will finalize."
)
# capture the exception in the mian training loop
exc_info = sys.exc_info()
# try to handle it
for name, entry in extensions: for name, entry in extensions:
if entry.trigger(self): if hasattr(entry.extension, "on_error"):
entry.extension(self) try:
entry.extension.on_error(self, e, sys.exc_info()[2])
except Exception as ee:
f.write(f"Exception in error handler: {ee}\n")
f.write('Traceback (most recent call last):\n')
traceback.print_tb(sys.exc_info()[2])
# raise exception in main training loop
six.reraise(*exc_info)
finally:
for name, entry in extensions:
if hasattr(entry.extension, "finalize"):
entry.extension.finalize(self)

View File

@@ -12,21 +12,9 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
-class IntervalTrigger(object):
-    def __init__(self, period: int, unit: str):
-        if unit not in ("iteration", "epoch"):
-            raise ValueError("unit should be 'iteration' or 'epoch'")
-        self.period = period
-        self.unit = unit
-    def __call__(self, trainer):
-        state = trainer.updater.state
-        if self.unit == "epoch":
-            fire = not (state.epoch % self.period)
-        else:
-            fire = not (state.iteration % self.iteration)
-        return fire
+from parakeet.training.triggers.interval_trigger import IntervalTrigger
+from parakeet.training.triggers.limit_trigger import LimitTrigger
+from parakeet.training.triggers.time_trigger import TimeTrigger
 def never_file_trigger(trainer):

View File

@ -0,0 +1,31 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
class IntervalTrigger(object):
"""A Predicate to do something every N cycle."""
def __init__(self, period: int, unit: str):
if unit not in ("iteration", "epoch"):
raise ValueError("unit should be 'iteration' or 'epoch'")
if period <= 0:
raise ValueError("period should be a positive integer.")
self.period = period
self.unit = unit
def __call__(self, trainer):
state = trainer.updater.state
index = getattr(state, self.unit)
fire = index % self.period == 0
return fire

View File

@ -0,0 +1,31 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
class LimitTrigger(object):
"""A Predicate to decide whether to stop."""
def __init__(self, limit: int, unit: str):
if unit not in ("iteration", "epoch"):
raise ValueError("unit should be 'iteration' or 'epoch'")
if limit <= 0:
raise ValueError("limit should be a positive integer.")
self.limit = limit
self.unit = unit
def __call__(self, trainer):
state = trainer.updater.state
index = getattr(state, self.unit)
fire = index >= self.limit
return fire
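A tiny sketch contrasting IntervalTrigger (fire every N) with LimitTrigger (stop once N is reached), using a stub trainer state; the import relies on the re-exports in parakeet.training.trigger shown earlier in this diff, and the stub values are arbitrary:
# Hypothetical trigger check against a fake trainer state.
from types import SimpleNamespace
from parakeet.training.trigger import IntervalTrigger, LimitTrigger

trainer = SimpleNamespace(updater=SimpleNamespace(
    state=SimpleNamespace(iteration=1000, epoch=10)))

every_100 = IntervalTrigger(100, 'iteration')
stop_at_1000 = LimitTrigger(1000, 'iteration')
print(every_100(trainer), stop_at_1000(trainer))   # True True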

View File

@ -0,0 +1,35 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
class TimeTrigger(object):
"""Trigger based on a fixed time interval.
This trigger accepts iterations with a given interval time.
Args:
period (float): Interval time. It is given in seconds.
"""
def __init__(self, period):
self._period = period
self._next_time = self._period
def __call__(self, trainer):
if self._next_time < trainer.elapsed_time:
self._next_time += self._period
return True
else:
return False

View File

@ -12,12 +12,21 @@
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
import logging
from dataclasses import dataclass from dataclasses import dataclass
from typing import Optional from typing import Optional
from typing import Dict
from typing import Union
from timer import timer
import paddle
from paddle import Tensor
from paddle.nn import Layer from paddle.nn import Layer
from paddle.optimizer import Optimizer from paddle.optimizer import Optimizer
from paddle.io import DataLoader from paddle.io import DataLoader
from paddle.io import DistributedBatchSampler
from parakeet.training.reporter import report
@dataclass @dataclass
@ -56,68 +65,33 @@ class UpdaterBase(object):
So the best practice is to define a model and define a updater for it. So the best practice is to define a model and define a updater for it.
""" """
def update(self): def __init__(self, init_state=None):
pass
def update_core(self):
pass
class StandardUpdater(UpdaterBase):
"""An example of over-simplification. Things may not be that simple, but
you can subclass it to fit your need.
"""
def __init__(self,
model: Layer,
dataloader: DataLoader,
optimizer: Optimizer,
loss_func=None,
auto_new_epoch: bool=True,
init_state: Optional[UpdaterState]=None):
self.model = model
self.dataloader = dataloader
self.optimizer = optimizer
self.loss_func = loss_func
self.auto_new_epoch = auto_new_epoch
self.iterator = iter(dataloader)
if init_state is None: if init_state is None:
self.state = UpdaterState() self.state = UpdaterState()
else: else:
self.state = init_state self.state = init_state
def update(self): def update(self, batch):
self.update_core() raise NotImplementedError(
self.state.iteration += 1 "Implement your own `update` method for training a step.")
def new_epoch(self): def state_dict(self):
self.iterator = iter(self.dataloader) state_dict = {
self.state.epoch += 1 "epoch": self.state.epoch,
"iteration": self.state.iteration,
}
return state_dict
def update_core(self): def set_state_dict(self, state_dict):
model = self.model self.state.epoch = state_dict["epoch"]
optimizer = self.optimizer self.state.iteration = state_dict["iteration"]
loss_func = self.loss_func
model.train() def save(self, path):
optimizer.clear_grad() logging.debug(f"Saving to {path}.")
archive = self.state_dict()
paddle.save(archive, str(path))
# fetch a batch def load(self, path):
try: logging.debug(f"Loading from {path}.")
batch = next(self.iterator) archive = paddle.load(str(path))
except StopIteration as e: self.set_state_dict(archive)
if self.auto_new_epoch:
self.new_epoch()
# forward
if self.loss_func is not None:
loss = loss_func(batch)
else:
loss = model(batch)
# backward
loss.backward()
# update parameters
optimizer.step()

View File

@ -0,0 +1,190 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import logging
from dataclasses import dataclass
from typing import Optional
from typing import Dict
from typing import Union
from timer import timer
import paddle
from paddle import Tensor
from paddle.nn import Layer
from paddle.optimizer import Optimizer
from paddle.io import DataLoader
from paddle.io import DistributedBatchSampler
from parakeet.training.reporter import report
from parakeet.training.updater import UpdaterBase, UpdaterState
class StandardUpdater(UpdaterBase):
"""An example of over-simplification. Things may not be that simple, but
you can subclass it to fit your need.
"""
def __init__(self,
model: Layer,
optimizer: Optimizer,
dataloader: DataLoader,
init_state: Optional[UpdaterState]=None):
# it is designed to hold multiple models
models = {"main": model}
self.models: Dict[str, Layer] = models
self.model = model
# it is designed to hold multiple optimizers
optimizers = {"main": optimizer}
self.optimizer = optimizer
self.optimizers: Dict[str, Optimizer] = optimizers
# dataloaders
self.dataloader = dataloader
# init state
if init_state is None:
self.state = UpdaterState()
else:
self.state = init_state
self.train_iterator = iter(dataloader)
def update(self):
        # We increase the iteration index after updating and before running
        # extensions. Here are the reasons.
        # 1. Snapshotting (as well as other extensions, e.g. the visualizer)
        #    is executed after a step of updating.
        # 2. We decide to increase the iteration index after updating and
        #    before any extension is executed.
        # 3. We do not increase the iteration index after the extensions
        #    because we prefer a consistent resume behavior: when loading
        #    from a `snapshot_iter_100.pdz`, the next step to train is `101`,
        #    naturally. But if the iteration index were increased after the
        #    extensions (including snapshotting), a `snapshot_iter_99` would
        #    be loaded, and you would need an extra increment of the iteration
        #    index before training to avoid repeating iteration `99`, which
        #    had already been done before snapshotting.
        # 4. Thus the iteration index represents "how many iterations have
        #    been done so far."
        # NOTE: use report to capture the correct value. If you want to
        # report the learning rate used for a step, you must report it before
        # the learning rate scheduler's step() has been called. In paddle's
        # convention, we do not use an extension to change the learning rate,
        # so if you want to report it, do it in the updater.
        # Then here comes the next question: when is the proper time to
        # increase the epoch index? Since all extensions are executed after
        # updating, right after updating is also the proper time to increase
        # the epoch index.
        # 1. If we increased the epoch index before updating, an extension
        #    triggered by epoch would miss the correct timing; it could only
        #    fire after an extra update.
        # 2. Theoretically, when an epoch is done, the epoch index should be
        #    increased, so it is increased after updating.
        # 3. Thus the epoch index represents "how many epochs have been done
        #    so far." It starts from 0.
# switch to training mode
for layer in self.models.values():
layer.train()
# training for a step is implemented here
batch = self.read_batch()
self.update_core(batch)
self.state.iteration += 1
if self.updaters_per_epoch is not None:
if self.state.iteration % self.updaters_per_epoch == 0:
self.state.epoch += 1
def update_core(self, batch):
"""A simple case for a training step. Basic assumptions are:
Single model;
Single optimizer;
A batch from the dataloader is just the input of the model;
The model return a single loss, or a dict containing serval losses.
Parameters updates at every batch, no gradient accumulation.
"""
loss = self.model(*batch)
if isinstance(loss, Tensor):
loss_dict = {"main": loss}
else:
# Dict[str, Tensor]
loss_dict = loss
if "main" not in loss_dict:
main_loss = 0
for loss_item in loss.values():
main_loss += loss_item
loss_dict["main"] = main_loss
for name, loss_item in loss_dict.items():
report(name, float(loss_item))
        self.optimizer.clear_grad()
        loss_dict["main"].backward()
        self.optimizer.step()
@property
def updaters_per_epoch(self):
"""Number of updater per epoch, determined by the length of the
dataloader."""
length_of_dataloader = None
try:
length_of_dataloader = len(self.dataloader)
except TypeError:
logging.debug("This dataloader has no __len__.")
finally:
return length_of_dataloader
def new_epoch(self):
"""Start a new epoch."""
# NOTE: all batch sampler for distributed training should
# subclass DistributedBatchSampler and implement `set_epoch` method
batch_sampler = self.dataloader.batch_sampler
if isinstance(batch_sampler, DistributedBatchSampler):
batch_sampler.set_epoch(self.state.epoch)
self.train_iterator = iter(self.dataloader)
def read_batch(self):
"""Read a batch from the data loader, auto renew when data is exhausted."""
with timer() as t:
try:
batch = next(self.train_iterator)
except StopIteration:
self.new_epoch()
batch = next(self.train_iterator)
logging.debug(
f"Read a batch takes {t.elapse}s.") # replace it with logging
return batch
def state_dict(self):
"""State dict of a Updater, model, optimizer and updater state are included."""
state_dict = super().state_dict()
for name, layer in self.models.items():
state_dict[f"{name}_params"] = layer.state_dict()
for name, optim in self.optimizers.items():
state_dict[f"{name}_optimizer"] = optim.state_dict()
return state_dict
def set_state_dict(self, state_dict):
"""Set state dict for a Updater. Parameters of models, states for
optimizers and UpdaterState are restored."""
for name, layer in self.models.items():
layer.set_state_dict(state_dict[f"{name}_params"])
for name, optim in self.optimizers.items():
optim.set_state_dict(state_dict[f"{name}_optimizer"])
super().set_state_dict(state_dict)
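A hedged end-to-end sketch of StandardUpdater with a toy model whose forward returns the loss; the dataset, sizes, and learning rate are made-up assumptions, not part of this PR:
# Hypothetical single training step driven by StandardUpdater.
import paddle
from paddle import nn
import paddle.nn.functional as F
from paddle.io import Dataset, DataLoader

class ToyDataset(Dataset):
    def __init__(self, n=32):
        self.x = paddle.randn([n, 4])
        self.y = paddle.randn([n, 1])
    def __getitem__(self, i):
        return self.x[i], self.y[i]
    def __len__(self):
        return self.x.shape[0]

class ToyRegressor(nn.Layer):
    def __init__(self):
        super().__init__()
        self.linear = nn.Linear(4, 1)
    def forward(self, x, y):
        return F.mse_loss(self.linear(x), y)   # a single loss tensor

loader = DataLoader(ToyDataset(), batch_size=8, shuffle=True)
model = ToyRegressor()
optimizer = paddle.optimizer.Adam(1e-3, parameters=model.parameters())
updater = StandardUpdater(model, optimizer, dataloader=loader)
updater.update()   # one step: reads a batch, reports the "main" loss, steps the optimizer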

parakeet/utils/h5_utils.py (new file, 105 lines)
View File

@ -0,0 +1,105 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from pathlib import Path
from typing import Union, Any
import sys
import logging
import h5py
import numpy as np
def read_hdf5(filename: Union[Path, str], dataset_name: str) -> Any:
"""Read a dataset from a HDF5 file.
Parameters
----------
filename : Union[Path, str]
Path of the HDF5 file.
dataset_name : str
Name of the dataset to read.
Returns
-------
Any
The retrieved dataset.
"""
filename = Path(filename)
if not filename.exists():
logging.error(f"There is no such a hdf5 file ({filename}).")
sys.exit(1)
hdf5_file = h5py.File(filename, "r")
if dataset_name not in hdf5_file:
        logging.error(
            f"There is no such dataset in the HDF5 file: {dataset_name}.")
sys.exit(1)
# [()]: a special syntax of h5py to get the dataset as-is
hdf5_data = hdf5_file[dataset_name][()]
hdf5_file.close()
return hdf5_data
def write_hdf5(filename: Union[Path, str],
dataset_name: str,
write_data: np.ndarray,
is_overwrite: bool=True) -> None:
"""Write dataset to HDF5 file.
Parameters
----------
filename : Union[Path, str]
Path of the HDF5 file.
dataset_name : str
Name of the dataset to write to.
    write_data : np.ndarray
The data to write.
is_overwrite : bool, optional
        Whether to overwrite an existing dataset, by default True.
"""
# convert to numpy array
filename = Path(filename)
write_data = np.array(write_data)
# check folder existence
filename.parent.mkdir(parents=True, exist_ok=True)
# check hdf5 existence
if filename.exists():
# if already exists, open with r+ mode
hdf5_file = h5py.File(filename, "r+")
# check dataset existence
if dataset_name in hdf5_file:
if is_overwrite:
logging.warning("Dataset in hdf5 file already exists. "
"recreate dataset in hdf5.")
hdf5_file.__delitem__(dataset_name)
else:
                logging.error(
                    "Dataset in the HDF5 file already exists. "
                    "If you want to overwrite it, please set is_overwrite=True.")
hdf5_file.close()
sys.exit(1)
else:
# if not exists, open with w mode
hdf5_file = h5py.File(filename, "w")
# write data to hdf5
hdf5_file.create_dataset(dataset_name, data=write_data)
hdf5_file.flush()
hdf5_file.close()
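
A small round-trip sketch for the two helpers above; the file name "dump/demo.h5" and the dataset name "feats" are made up for illustration.

import numpy as np
from parakeet.utils.h5_utils import read_hdf5, write_hdf5

feats = np.random.randn(200, 80).astype(np.float32)
# creates dump/demo.h5 (and its parent directory) if needed; with
# is_overwrite=True an existing dataset of the same name is replaced
write_hdf5("dump/demo.h5", "feats", feats, is_overwrite=True)
loaded = read_hdf5("dump/demo.h5", "feats")
assert loaded.shape == (200, 80)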

34
parakeet/utils/profile.py Normal file
View File

@ -0,0 +1,34 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle
from paddle.framework import core
from paddle.framework import CUDAPlace
from contextlib import contextmanager
def synchronize():
"""Trigger cuda synchronization for better timing."""
place = paddle.fluid.framework._current_expected_place()
if isinstance(place, CUDAPlace):
paddle.fluid.core._cuda_synchronize(place)
@contextmanager
def nvtx_span(name):
try:
core.nvprof_nvtx_push(name)
yield
finally:
core.nvprof_nvtx_pop()
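
A hedged sketch of how these helpers might be combined with the third-party timer package (used in tests/test_pwg.py further down and added to setup.py in this diff) to time a forward pass; the Linear layer and tensor shapes are placeholders.

import paddle
from timer import timer
from parakeet.utils.profile import synchronize, nvtx_span

layer = paddle.nn.Linear(256, 256)   # placeholder model
x = paddle.randn([32, 256])

synchronize()                        # drain pending GPU work before timing
with timer(unit='s') as t:
    with nvtx_span("toy_forward"):   # shows up as an NVTX range when profiling with nsight/nvprof
        y = layer(x)
    synchronize()                    # make sure the kernels have finished
print(f"forward takes {t.elapse}s.")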

319
parakeet/utils/timeline.py Normal file
View File

@ -0,0 +1,319 @@
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import json
import six
import sys
import unittest
import google.protobuf.text_format as text_format
import paddle.fluid.proto.profiler.profiler_pb2 as profiler_pb2
parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument(
'--profile_path',
type=str,
default='',
    help='Input profile file name. If there are multiple files, the format '
'should be trainer1=file1,trainer2=file2,ps=file3')
parser.add_argument(
'--timeline_path', type=str, default='', help='Output timeline file name.')
args = parser.parse_args()
class _ChromeTraceFormatter(object):
def __init__(self):
self._events = []
self._metadata = []
def _create_event(self, ph, category, name, pid, tid, timestamp):
"""Creates a new Chrome Trace event.
For details of the file format, see:
https://github.com/catapult-project/catapult/blob/master/tracing/README.md
Args:
ph: The type of event - usually a single character.
category: The event category as a string.
name: The event name as a string.
pid: Identifier of the process generating this event as an integer.
tid: Identifier of the thread generating this event as an integer.
timestamp: The timestamp of this event as a long integer.
Returns:
A JSON compatible event object.
"""
event = {}
event['ph'] = ph
event['cat'] = category
event['name'] = name.replace("ParallelExecutor::Run/", "")
event['pid'] = pid
event['tid'] = tid
event['ts'] = timestamp
return event
def emit_pid(self, name, pid):
"""Adds a process metadata event to the trace.
Args:
name: The process name as a string.
pid: Identifier of the process as an integer.
"""
event = {}
event['name'] = 'process_name'
event['ph'] = 'M'
event['pid'] = pid
event['args'] = {'name': name}
self._metadata.append(event)
def emit_region(self, timestamp, duration, pid, tid, category, name, args):
"""Adds a region event to the trace.
Args:
timestamp: The start timestamp of this region as a long integer.
duration: The duration of this region as a long integer.
pid: Identifier of the process generating this event as an integer.
tid: Identifier of the thread generating this event as an integer.
category: The event category as a string.
name: The event name as a string.
args: A JSON-compatible dictionary of event arguments.
"""
event = self._create_event('X', category, name, pid, tid, timestamp)
event['dur'] = duration
event['args'] = args
self._events.append(event)
def emit_counter(self, category, name, pid, timestamp, counter, value):
"""Emits a record for a single counter.
Args:
category: The event category as string
name: The event name as string
pid: Identifier of the process generating this event as integer
timestamp: The timestamps of this event as long integer
counter: Name of the counter as string
value: Value of the counter as integer
"""
event = self._create_event('C', category, name, pid, 0, timestamp)
event['args'] = {counter: value}
self._events.append(event)
def format_to_string(self, pretty=False):
"""Formats the chrome trace to a string.
Args:
pretty: (Optional.) If True, produce human-readable JSON output.
Returns:
A JSON-formatted string in Chrome Trace format.
"""
trace = {}
trace['traceEvents'] = self._metadata + self._events
if pretty:
return json.dumps(trace, indent=4, separators=(',', ': '))
else:
return json.dumps(trace, separators=(',', ':'))
class Timeline(object):
def __init__(self, profile_dict):
self._profile_dict = profile_dict
self._pid = 0
self._devices = dict()
self._mem_devices = dict()
self._chrome_trace = _ChromeTraceFormatter()
def _allocate_pid(self):
cur_pid = self._pid
self._pid += 1
return cur_pid
def _allocate_pids(self):
for k, profile_pb in six.iteritems(self._profile_dict):
for event in profile_pb.events:
if event.type == profiler_pb2.Event.CPU:
if (k, event.device_id, "CPU") not in self._devices:
pid = self._allocate_pid()
self._devices[(k, event.device_id, "CPU")] = pid
                        # a device id of -1 represents a CUDA runtime API call (e.g. cudaLaunch, cudaMemcpy)
if event.device_id == -1:
self._chrome_trace.emit_pid("%s:cuda_api" % k, pid)
else:
self._chrome_trace.emit_pid(
"%s:cpu:block:%d" % (k, event.device_id), pid)
elif event.type == profiler_pb2.Event.GPUKernel:
if (k, event.device_id, "GPUKernel") not in self._devices:
pid = self._allocate_pid()
self._devices[(k, event.device_id, "GPUKernel")] = pid
self._chrome_trace.emit_pid("%s:gpu:%d" %
(k, event.device_id), pid)
if not hasattr(profile_pb, "mem_events"):
continue
for mevent in profile_pb.mem_events:
if mevent.place == profiler_pb2.MemEvent.CUDAPlace:
if (k, mevent.device_id, "GPU") not in self._mem_devices:
pid = self._allocate_pid()
self._mem_devices[(k, mevent.device_id, "GPU")] = pid
self._chrome_trace.emit_pid(
"memory usage on %s:gpu:%d" % (k, mevent.device_id),
pid)
elif mevent.place == profiler_pb2.MemEvent.CPUPlace:
if (k, mevent.device_id, "CPU") not in self._mem_devices:
pid = self._allocate_pid()
self._mem_devices[(k, mevent.device_id, "CPU")] = pid
self._chrome_trace.emit_pid(
"memory usage on %s:cpu:%d" % (k, mevent.device_id),
pid)
elif mevent.place == profiler_pb2.MemEvent.CUDAPinnedPlace:
if (k, mevent.device_id, "CUDAPinnedPlace"
) not in self._mem_devices:
pid = self._allocate_pid()
self._mem_devices[(k, mevent.device_id,
"CUDAPinnedPlace")] = pid
self._chrome_trace.emit_pid(
"memory usage on %s:cudapinnedplace:%d" %
(k, mevent.device_id), pid)
elif mevent.place == profiler_pb2.MemEvent.NPUPlace:
if (k, mevent.device_id, "NPU") not in self._mem_devices:
pid = self._allocate_pid()
self._mem_devices[(k, mevent.device_id, "NPU")] = pid
self._chrome_trace.emit_pid(
"memory usage on %s:npu:%d" % (k, mevent.device_id),
pid)
if (k, 0, "CPU") not in self._mem_devices:
pid = self._allocate_pid()
self._mem_devices[(k, 0, "CPU")] = pid
self._chrome_trace.emit_pid("memory usage on %s:cpu:%d" %
(k, 0), pid)
if (k, 0, "GPU") not in self._mem_devices:
pid = self._allocate_pid()
self._mem_devices[(k, 0, "GPU")] = pid
self._chrome_trace.emit_pid("memory usage on %s:gpu:%d" %
(k, 0), pid)
if (k, 0, "CUDAPinnedPlace") not in self._mem_devices:
pid = self._allocate_pid()
self._mem_devices[(k, 0, "CUDAPinnedPlace")] = pid
self._chrome_trace.emit_pid(
"memory usage on %s:cudapinnedplace:%d" % (k, 0), pid)
if (k, 0, "NPU") not in self._mem_devices:
pid = self._allocate_pid()
self._mem_devices[(k, 0, "NPU")] = pid
self._chrome_trace.emit_pid("memory usage on %s:npu:%d" %
(k, 0), pid)
def _allocate_events(self):
for k, profile_pb in six.iteritems(self._profile_dict):
for event in profile_pb.events:
if event.type == profiler_pb2.Event.CPU:
type = "CPU"
elif event.type == profiler_pb2.Event.GPUKernel:
type = "GPUKernel"
pid = self._devices[(k, event.device_id, type)]
args = {'name': event.name}
if event.memcopy.bytes > 0:
args['mem_bytes'] = event.memcopy.bytes
if hasattr(event, "detail_info") and event.detail_info:
args['detail_info'] = event.detail_info
                # TODO(panyx0718): Chrome tracing only handles ms. However, some
                # ops take microseconds, so we keep ns resolution here.
self._chrome_trace.emit_region(
event.start_ns, (event.end_ns - event.start_ns) / 1.0, pid,
event.sub_device_id, 'Op', event.name, args)
def _allocate_memory_event(self):
if not hasattr(profiler_pb2, "MemEvent"):
return
place_to_str = {
profiler_pb2.MemEvent.CPUPlace: "CPU",
profiler_pb2.MemEvent.CUDAPlace: "GPU",
profiler_pb2.MemEvent.CUDAPinnedPlace: "CUDAPinnedPlace",
profiler_pb2.MemEvent.NPUPlace: "NPU"
}
for k, profile_pb in six.iteritems(self._profile_dict):
mem_list = []
end_profiler = 0
for mevent in profile_pb.mem_events:
crt_info = dict()
crt_info['time'] = mevent.start_ns
crt_info['size'] = mevent.bytes
if mevent.place in place_to_str:
place = place_to_str[mevent.place]
else:
place = "UnDefine"
crt_info['place'] = place
pid = self._mem_devices[(k, mevent.device_id, place)]
crt_info['pid'] = pid
crt_info['thread_id'] = mevent.thread_id
crt_info['device_id'] = mevent.device_id
mem_list.append(crt_info)
crt_info = dict()
crt_info['place'] = place
crt_info['pid'] = pid
crt_info['thread_id'] = mevent.thread_id
crt_info['device_id'] = mevent.device_id
crt_info['time'] = mevent.end_ns
crt_info['size'] = -mevent.bytes
mem_list.append(crt_info)
end_profiler = max(end_profiler, crt_info['time'])
mem_list.sort(key=lambda tmp: (tmp.get('time', 0)))
i = 0
total_size = 0
while i < len(mem_list):
total_size += mem_list[i]['size']
while i < len(mem_list) - 1 and mem_list[i]['time'] == mem_list[
i + 1]['time']:
total_size += mem_list[i + 1]['size']
i += 1
self._chrome_trace.emit_counter(
"Memory", "Memory", mem_list[i]['pid'], mem_list[i]['time'],
0, total_size)
i += 1
def generate_chrome_trace(self):
self._allocate_pids()
self._allocate_events()
self._allocate_memory_event()
return self._chrome_trace.format_to_string()
profile_path = '/tmp/profile'
if args.profile_path:
profile_path = args.profile_path
timeline_path = '/tmp/timeline'
if args.timeline_path:
timeline_path = args.timeline_path
profile_paths = profile_path.split(',')
profile_dict = dict()
if len(profile_paths) == 1:
with open(profile_path, 'rb') as f:
profile_s = f.read()
profile_pb = profiler_pb2.Profile()
profile_pb.ParseFromString(profile_s)
profile_dict['trainer'] = profile_pb
else:
for profile_path in profile_paths:
k, v = profile_path.split('=')
with open(v, 'rb') as f:
profile_s = f.read()
profile_pb = profiler_pb2.Profile()
profile_pb.ParseFromString(profile_s)
profile_dict[k] = profile_pb
tl = Timeline(profile_dict)
with open(timeline_path, 'w') as f:
f.write(tl.generate_chrome_trace())
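
The block above is the intended command-line entry point of this script. As an alternative, a hedged sketch of driving the Timeline class programmatically, assuming /tmp/profile was produced by paddle's profiler beforehand; note that importing this module parses --profile_path/--timeline_path from sys.argv at import time.

import paddle.fluid.proto.profiler.profiler_pb2 as profiler_pb2
from parakeet.utils.timeline import Timeline

with open("/tmp/profile", "rb") as f:
    profile_pb = profiler_pb2.Profile()
    profile_pb.ParseFromString(f.read())

tl = Timeline({"trainer": profile_pb})   # the key becomes the pid prefix in the trace
with open("/tmp/timeline", "w") as f:
    f.write(tl.generate_chrome_trace())
# open chrome://tracing in a Chromium-based browser and load /tmp/timeline to inspect it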

View File

@ -64,7 +64,6 @@ setup_info = dict(
'scipy', 'scipy',
'pandas', 'pandas',
'sox', 'sox',
# 'opencc',
'soundfile', 'soundfile',
'g2p_en', 'g2p_en',
'yacs', 'yacs',
@ -73,6 +72,9 @@ setup_info = dict(
'webrtcvad', 'webrtcvad',
'g2pM', 'g2pM',
'praatio', 'praatio',
"h5py",
"timer",
'jsonlines',
], ],
extras_require={'doc': ["sphinx", "sphinx-rtd-theme", "numpydoc"], }, extras_require={'doc': ["sphinx", "sphinx-rtd-theme", "numpydoc"], },

View File

@ -1,52 +0,0 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from pathlib import Path
import shutil
import numpy as np
from parakeet.training.checkpoint import KBest, KLatest
def test_kbest():
def save_fn(path):
with open(path, 'wt') as f:
f.write(f"My path is {str(path)}\n")
K = 1
kbest_manager = KBest(max_size=K, save_fn=save_fn)
checkpoint_dir = Path("checkpoints")
shutil.rmtree(checkpoint_dir)
checkpoint_dir.mkdir(parents=True)
a = np.random.rand(20)
for i, score in enumerate(a):
path = checkpoint_dir / f"step_{i}"
kbest_manager.add_checkpoint(score, path)
assert len(list(checkpoint_dir.glob("step_*"))) == K
def test_klatest():
def save_fn(path):
with open(path, 'wt') as f:
f.write(f"My path is {str(path)}\n")
K = 5
klatest_manager = KLatest(max_size=K, save_fn=save_fn)
checkpoint_dir = Path("checkpoints")
shutil.rmtree(checkpoint_dir)
checkpoint_dir.mkdir(parents=True)
for i in range(20):
path = checkpoint_dir / f"step_{i}"
klatest_manager.add_checkpoint(path)
assert len(list(checkpoint_dir.glob("step_*"))) == K

22
tests/test_data_table.py Normal file
View File

@ -0,0 +1,22 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from parakeet.datasets.data_table import DataTable
def test_audio_dataset():
metadata = [{'name': 'Sonic', 'v': 1000}, {'name': 'Prestol', 'v': 2000}]
converters = {'v': lambda x: x / 1000}
dataset = DataTable(metadata, fields=['v'], converters=converters)
assert dataset[0] == {'v': 1.0}

39
tests/test_optimizer.py Normal file
View File

@ -0,0 +1,39 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import shutil
from pathlib import Path
import paddle
from paddle import nn
from paddle.optimizer import Adam
from paddle.optimizer.lr import StepDecay
def test_optimizer():
model1 = nn.Linear(3, 4)
optim1 = Adam(
parameters=model1.parameters(), learning_rate=StepDecay(0.1, 100))
output_dir = Path("temp_test_optimizer")
shutil.rmtree(output_dir, ignore_errors=True)
output_dir.mkdir(exist_ok=True, parents=True)
# model1.set_state_dict(model1.state_dict())
optim1.set_state_dict(optim1.state_dict())
x = paddle.randn([6, 3])
y = model1(x).sum()
y.backward()
optim1.step()

240
tests/test_pwg.py Normal file
View File

@ -0,0 +1,240 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle
import torch
from timer import timer
from parallel_wavegan.layers import upsample, residual_block
from parallel_wavegan.models import parallel_wavegan as pwgan
from parakeet.utils.layer_tools import summary
from parakeet.utils.profile import synchronize
from parakeet.models.parallel_wavegan import ConvInUpsampleNet, ResidualBlock
from parakeet.models.parallel_wavegan import PWGGenerator, PWGDiscriminator, ResidualPWGDiscriminator
paddle.set_device("gpu:0")
device = torch.device("cuda:0")
def test_convin_upsample_net():
net = ConvInUpsampleNet(
[4, 4, 4, 4],
"LeakyReLU", {"negative_slope": 0.2},
freq_axis_kernel_size=3,
aux_context_window=0)
net2 = upsample.ConvInUpsampleNetwork(
[4, 4, 4, 4],
nonlinear_activation="LeakyReLU",
nonlinear_activation_params={"negative_slope": 0.2},
freq_axis_kernel_size=3,
aux_context_window=0).to(device)
summary(net)
for k, v in net2.named_parameters():
print(k, v.shape)
net.state_dict()[k].set_value(v.data.cpu().numpy())
c = paddle.randn([4, 80, 180])
synchronize()
with timer(unit='s') as t:
out = net(c)
synchronize()
print(f"paddle conv_in_upsample_net forward takes {t.elapse}s.")
with timer(unit='s') as t:
out.sum().backward()
synchronize()
print(f"paddle conv_in_upsample_net backward takes {t.elapse}s.")
c_torch = torch.as_tensor(c.numpy()).to(device)
torch.cuda.synchronize()
with timer(unit='s') as t:
out2 = net2(c_torch)
print(f"torch conv_in_upsample_net forward takes {t.elapse}s.")
with timer(unit='s') as t:
out2.sum().backward()
print(f"torch conv_in_upsample_net backward takes {t.elapse}s.")
print("forward check")
print(out.numpy()[0])
print(out2.data.cpu().numpy()[0])
print("backward check")
print(net.conv_in.weight.grad.numpy()[0])
print(net2.conv_in.weight.grad.data.cpu().numpy()[0])
def test_residual_block():
net = ResidualBlock(dilation=9)
net2 = residual_block.ResidualBlock(dilation=9)
summary(net)
summary(net2)
for k, v in net2.named_parameters():
net.state_dict()[k].set_value(v.data.cpu().numpy())
x = paddle.randn([4, 64, 180])
c = paddle.randn([4, 80, 180])
res, skip = net(x, c)
res2, skip2 = net2(torch.as_tensor(x.numpy()), torch.as_tensor(c.numpy()))
print("forward:")
print(res.numpy()[0])
print(res2.data.cpu().numpy()[0])
print(skip.numpy()[0])
print(skip2.data.cpu().numpy()[0])
(res.sum() + skip.sum()).backward()
(res2.sum() + skip2.sum()).backward()
print("backward:")
print(net.conv.weight.grad.numpy().squeeze()[0])
print(net2.conv.weight.grad.data.cpu().numpy().squeeze()[0])
def test_pwg_generator():
net = PWGGenerator(
layers=9,
stacks=3,
upsample_scales=[4, 4, 4, 4],
nonlinear_activation="LeakyReLU",
nonlinear_activation_params={"negative_slope": 0.5},
use_weight_norm=True)
net2 = pwgan.ParallelWaveGANGenerator(
layers=9,
stacks=3,
upsample_params={
"upsample_scales": [4, 4, 4, 4],
"nonlinear_activation": "LeakyReLU",
"nonlinear_activation_params": {
"negative_slope": 0.5
}
},
use_weight_norm=True).to(device)
summary(net)
summary(net2)
for k, v in net2.named_parameters():
p = net.state_dict()[k]
if k.endswith("_g"):
p.set_value(v.data.cpu().numpy().reshape([-1]))
else:
p.set_value(v.data.cpu().numpy())
x = paddle.randn([4, 1, 80 * 256])
c = paddle.randn([4, 80, 80 + 4])
synchronize()
with timer(unit='s') as t:
out = net(x, c)
synchronize()
print(f"paddle generator forward takes {t.elapse}s.")
synchronize()
with timer(unit='s') as t:
out.sum().backward()
synchronize()
print(f"paddle generator backward takes {t.elapse}s.")
x_torch = torch.as_tensor(x.numpy()).to(device)
c_torch = torch.as_tensor(c.numpy()).to(device)
torch.cuda.synchronize()
with timer(unit='s') as t:
out2 = net2(x_torch, c_torch)
torch.cuda.synchronize()
print(f"torch generator forward takes {t.elapse}s.")
torch.cuda.synchronize()
with timer(unit='s') as t:
out2.sum().backward()
torch.cuda.synchronize()
print(f"torch generator backward takes {t.elapse}s.")
print("test forward:")
print(out.numpy()[0])
print(out2.data.cpu().numpy()[0])
print("test backward:")
print("wv")
print(net.first_conv.weight_v.grad.numpy().squeeze())
print(net2.first_conv.weight_v.grad.data.cpu().numpy().squeeze())
print("wg")
print(net.first_conv.weight_g.grad.numpy().squeeze())
print(net2.first_conv.weight_g.grad.data.cpu().numpy().squeeze())
# print(out.shape)
def test_pwg_discriminator():
net = PWGDiscriminator()
net2 = pwgan.ParallelWaveGANDiscriminator().to(device)
summary(net)
summary(net2)
for k, v in net2.named_parameters():
p = net.state_dict()[k]
if k.endswith("_g"):
p.set_value(v.data.cpu().numpy().reshape([-1]))
else:
p.set_value(v.data.cpu().numpy())
x = paddle.randn([4, 1, 180 * 256])
synchronize()
with timer() as t:
y = net(x)
synchronize()
print(f"forward takes {t.elapse}s.")
synchronize()
with timer() as t:
y.sum().backward()
synchronize()
print(f"backward takes {t.elapse}s.")
x_torch = torch.as_tensor(x.numpy()).to(device)
torch.cuda.synchronize()
with timer() as t:
y2 = net2(x_torch)
torch.cuda.synchronize()
print(f"forward takes {t.elapse}s.")
torch.cuda.synchronize()
with timer() as t:
y2.sum().backward()
torch.cuda.synchronize()
print(f"backward takes {t.elapse}s.")
print("test forward:")
print(y.numpy()[0])
print(y2.data.cpu().numpy()[0])
print("test backward:")
print(net.conv_layers[0].weight_v.grad.numpy().squeeze())
print(net2.conv_layers[0].weight_v.grad.data.cpu().numpy().squeeze())
def test_residual_pwg_discriminator():
net = ResidualPWGDiscriminator()
net2 = pwgan.ResidualParallelWaveGANDiscriminator()
summary(net)
summary(net2)
for k, v in net2.named_parameters():
p = net.state_dict()[k]
if k.endswith("_g"):
p.set_value(v.data.cpu().numpy().reshape([-1]))
else:
p.set_value(v.data.cpu().numpy())
x = paddle.randn([4, 1, 180 * 256])
y = net(x)
y2 = net2(torch.as_tensor(x.numpy()))
print(y.numpy()[0])
print(y2.data.cpu().numpy()[0])
print(y.shape)
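
The parameter-copy loops in the tests above hinge on one detail: with weight normalization enabled, the torch modules keep weight_g with trailing singleton dimensions while paddle's weight_norm stores it flat, hence the reshape([-1]) for keys ending in "_g". A hedged sketch of the same idea on a bare convolution; the layer sizes are made up and exact parameter shapes may differ across framework versions.

import paddle
import torch
from paddle.nn.utils import weight_norm as paddle_weight_norm
from torch.nn.utils import weight_norm as torch_weight_norm

t_conv = torch_weight_norm(torch.nn.Conv1d(4, 8, 3))
p_conv = paddle_weight_norm(paddle.nn.Conv1D(4, 8, 3))

state = p_conv.state_dict()
for k, v in t_conv.named_parameters():
    value = v.data.cpu().numpy()
    if k.endswith("_g"):
        # torch: shape (8, 1, 1); paddle expects a flat vector here (mirrors the tests above)
        value = value.reshape([-1])
    state[k].set_value(value)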

51
tests/test_reporter.py Normal file
View File

@ -0,0 +1,51 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
from parakeet.training.reporter import report, scope
from parakeet.training.reporter import Summary, DictSummary
def test_reporter_scope():
first = {}
second = {}
third = {}
with scope(first):
report("first_begin", 1)
with scope(second):
report("second_begin", 2)
with scope(third):
report("third_begin", 3)
report("third_end", 4)
report("seconf_end", 5)
report("first_end", 6)
assert first == {'first_begin': 1, 'first_end': 6}
    assert second == {'second_begin': 2, 'second_end': 5}
assert third == {'third_begin': 3, 'third_end': 4}
print(first)
print(second)
print(third)
def test_summary():
summary = Summary()
summary.add(1)
summary.add(2)
summary.add(3)
state = summary.make_statistics()
print(state)
np.testing.assert_allclose(
np.array(list(state)), np.array([2.0, np.std([1, 2, 3])]))

55
tests/test_snapshot.py Normal file
View File

@ -0,0 +1,55 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from pathlib import Path
import shutil
import numpy as np
import paddle
from paddle import nn
from paddle.optimizer import Adam
from itertools import count
from parakeet.training.updater import StandardUpdater
from parakeet.training.trainer import Trainer
from parakeet.training.extensions.snapshot import Snapshot
def test_snapshot():
model = nn.Linear(3, 4)
optimizer = Adam(parameters=model.parameters())
    # use the simplest possible iterable object as the dataloader
dataloader = count()
    # hack the training process: training does nothing except increase the iteration count
updater = StandardUpdater(model, optimizer, dataloader=dataloader)
updater.update_core = lambda x: None
trainer = Trainer(
updater, stop_trigger=(1000, 'iteration'), out='temp_test_snapshot')
shutil.rmtree(trainer.out, ignore_errors=True)
snap = Snapshot(max_size=5)
trigger = (10, 'iteration')
trainer.extend(snap, name='snapshot', trigger=trigger, priority=0)
trainer.run()
checkpoint_dir = trainer.out / "checkpoints"
snapshots = sorted(list(checkpoint_dir.glob("snapshot_iter_*.pdz")))
for snap in snapshots:
print(snap)
assert len(snapshots) == 5
shutil.rmtree(trainer.out)

73
tests/test_stft.py Normal file
View File

@ -0,0 +1,73 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle
import torch
import librosa
import numpy as np
from parakeet.modules.stft_loss import STFT, MultiResolutionSTFTLoss
from parallel_wavegan.losses import stft_loss as sl
from scipy import signal
def test_stft():
stft = STFT(n_fft=1024, hop_length=256, win_length=1024)
x = paddle.uniform([4, 46080])
S = stft.magnitude(x)
window = signal.get_window('hann', 1024, fftbins=True)
D2 = torch.stft(
torch.as_tensor(x.numpy()),
n_fft=1024,
hop_length=256,
win_length=1024,
window=torch.as_tensor(window))
S2 = (D2**2).sum(-1).sqrt()
S3 = np.abs(
librosa.stft(
x.numpy()[0], n_fft=1024, hop_length=256, win_length=1024))
print(S2.shape)
print(S.numpy()[0])
print(S2.data.cpu().numpy()[0])
print(S3)
def test_torch_stft():
    # NOTE: torch.stft uses no window by default
x = np.random.uniform(-1.0, 1.0, size=(46080, ))
window = signal.get_window('hann', 1024, fftbins=True)
D2 = torch.stft(
torch.as_tensor(x),
n_fft=1024,
hop_length=256,
win_length=1024,
window=torch.as_tensor(window))
D3 = librosa.stft(
x, n_fft=1024, hop_length=256, win_length=1024, window='hann')
print(D2[:, :, 0].data.cpu().numpy()[:, 30:60])
print(D3.real[:, 30:60])
# print(D3.imag[:, 30:60])
def test_multi_resolution_stft_loss():
net = MultiResolutionSTFTLoss()
net2 = sl.MultiResolutionSTFTLoss()
x = paddle.uniform([4, 46080])
y = paddle.uniform([4, 46080])
sc, m = net(x, y)
sc2, m2 = net2(torch.as_tensor(x.numpy()), torch.as_tensor(y.numpy()))
print(sc.numpy())
print(sc2.data.cpu().numpy())
print(m.numpy())
print(m2.data.cpu().numpy())
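
As a usage note, this multi-resolution STFT loss is what serves as the auxiliary spectral term when training a Parallel WaveGAN generator. A hedged sketch with placeholder tensors, mirroring the shapes and call signature used in the test above:

import paddle
from parakeet.modules.stft_loss import MultiResolutionSTFTLoss

stft_loss = MultiResolutionSTFTLoss()
wav_fake = paddle.uniform([4, 46080])   # generator output (placeholder)
wav_real = paddle.uniform([4, 46080])   # reference audio (placeholder)

sc_loss, mag_loss = stft_loss(wav_fake, wav_real)
aux_loss = sc_loss + mag_loss           # spectral convergence + log STFT magnitude terms
print(float(aux_loss))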