WIP: training setup done

2021-06-13 17:00:44 +08:00 · 2021-06-13 17:00:44 +08:00 · 54c7905f40
parent 0067851950
commit 54c7905f40
7 changed files with 707 additions and 1 deletions
--- a/.gitignore
+++ b/.gitignore
@ -142,3 +142,5 @@ dmypy.json
 *.swp
 runs
 syn_audios
 exp/
 dump/
--- a/examples/parallelwave_gan/baker/batch_fn.py
+++ b/examples/parallelwave_gan/baker/batch_fn.py
@ -0,0 +1,105 @@
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import numpy as np
 import paddle
 class Clip(object):
    """Collate functor that cuts random, aligned segments for vocoder training.
    Each call drops examples whose feature sequence is too short, draws one
    random start frame per remaining example and slices a fixed-size window
    out of both the waveform and the features.
    """
    def __init__(self, batch_max_steps=20480, hop_size=256,
                 aux_context_window=0):
        """Precompute the segment sizes used when cutting.
        Args:
            batch_max_steps (int): Requested waveform segment length in
                samples; rounded down to a multiple of ``hop_size``.
            hop_size (int): Number of waveform samples per feature frame.
            aux_context_window (int): Extra feature frames kept on each side
                of the segment.
        """
        remainder = batch_max_steps % hop_size
        if remainder != 0:
            # round down so the segment covers a whole number of frames
            batch_max_steps -= remainder
        assert batch_max_steps % hop_size == 0
        frames_per_segment = batch_max_steps // hop_size
        self.batch_max_steps = batch_max_steps
        self.batch_max_frames = frames_per_segment
        self.hop_size = hop_size
        self.aux_context_window = aux_context_window
        # bounds of the random start frame, and the minimum usable length
        self.start_offset = aux_context_window
        self.end_offset = -(frames_per_segment + aux_context_window)
        self.mel_threshold = frames_per_segment + 2 * aux_context_window
    def __call__(self, examples):
        """Build one training batch from a list of examples.
        Args:
            examples (list): Items unpackable as ``(audio, features)``.
                Assumes audio is 1-D and features are indexed
                ``(frames, channels)`` -- TODO confirm against the dataset.
        Returns:
            tuple: ``((c_batch, ), y_batch)``. Under the assumption above,
                ``c_batch`` is (B, C, T') and ``y_batch`` is (B, 1, T) with
                T = (T' - 2 * aux_context_window) * hop_size.
        """
        # keep only examples that are long enough, fixing their audio length
        pairs = []
        for item in examples:
            if len(item[1]) > self.mel_threshold:
                pairs.append(self._adjust_length(*item))
        waves = [pair[0] for pair in pairs]
        feats = [pair[1] for pair in pairs]
        # one random start frame per example
        first_frames = np.array([
            np.random.randint(self.start_offset, len(feat) + self.end_offset)
            for feat in feats
        ])
        wave_from = first_frames * self.hop_size
        wave_to = wave_from + self.batch_max_steps
        feat_from = first_frames - self.aux_context_window
        feat_to = (first_frames + self.batch_max_frames +
                   self.aux_context_window)
        y_batch = [w[lo:hi] for w, lo, hi in zip(waves, wave_from, wave_to)]
        c_batch = [f[lo:hi] for f, lo, hi in zip(feats, feat_from, feat_to)]
        # every slice has the same length, so the lists stack into tensors
        y_batch = paddle.to_tensor(y_batch, dtype=paddle.float32)
        y_batch = y_batch.unsqueeze(1)
        c_batch = paddle.to_tensor(c_batch, dtype=paddle.float32)
        c_batch = c_batch.transpose([0, 2, 1])
        return (c_batch, ), y_batch
    def _adjust_length(self, x, c):
        """Pad the audio so that ``len(x) == len(c) * hop_size``.
        Audio shorter than expected is padded at the end by repeating its
        last value. Audio longer than expected is not trimmed and fails the
        assertion.
        """
        expected = len(c) * self.hop_size
        missing = expected - len(x)
        if missing > 0:
            x = np.pad(x, (0, missing), mode="edge")
        # the lengths must agree exactly from here on
        assert len(x) == expected
        return x, c
--- a/examples/parallelwave_gan/baker/conf/default.yaml
+++ b/examples/parallelwave_gan/baker/conf/default.yaml
@ -0,0 +1,127 @@
 # This is the hyperparameter configuration file for Parallel WaveGAN.
 # Please make sure this is adjusted for the CSMSC dataset. If you want to
 # apply to the other dataset, you might need to carefully change some parameters.
 # This configuration requires 12 GB GPU memory and takes ~3 days on RTX TITAN.
 ###########################################################
 #                FEATURE EXTRACTION SETTING               #
 ###########################################################
 sr: 24000     # Sampling rate.
 n_fft: 2048           # FFT size.
 hop_length: 300            # Hop size.
 win_length: 1200         # Window length.
                         # If set to null, it will be the same as fft_size.
 window: "hann"           # Window function.
 n_mels: 80             # Number of mel basis.
 fmin: 80                 # Minimum freq in mel basis calculation.
 fmax: 7600               # Maximum frequency in mel basis calculation.
 # global_gain_scale: 1.0   # Will be multiplied to all of waveform.
 trim_silence: false      # Whether to trim the start and end of silence.
 top_db: 60 # Need to tune carefully if the recording is not good.
 trim_frame_length: 2048    # Frame size in trimming.
 trim_hop_length: 512       # Hop size in trimming.
 # format: "npy"           # Feature file format. "npy" or "hdf5" is supported.
 ###########################################################
 #         GENERATOR NETWORK ARCHITECTURE SETTING          #
 ###########################################################
 generator_params:
    in_channels: 1        # Number of input channels.
    out_channels: 1       # Number of output channels.
    kernel_size: 3        # Kernel size of dilated convolution.
    layers: 30            # Number of residual block layers.
    stacks: 3             # Number of stacks i.e., dilation cycles.
    residual_channels: 64 # Number of channels in residual conv.
    gate_channels: 128    # Number of channels in gated conv.
    skip_channels: 64     # Number of channels in skip conv.
    aux_channels: 80      # Number of channels for auxiliary feature conv.
                          # Must be the same as num_mels.
    aux_context_window: 2 # Context window size for auxiliary feature.
                          # If set to 2, previous 2 and future 2 frames will be considered.
    dropout: 0.0          # Dropout rate. 0.0 means no dropout applied.
    bias: true            # use bias in residual blocks
    use_weight_norm: true # Whether to use weight norm.
                          # If set to true, it will be applied to all of the conv layers.
    use_causal_conv: false               # use causal conv in residual blocks and upsample layers
    # upsample_net: "ConvInUpsampleNetwork" # Upsampling network architecture.
    upsample_scales: [4, 5, 3, 5]     # Upsampling scales. Product of these must be the same as hop size.
    interpolate_mode: "nearest" # upsample net interpolate mode
    freq_axis_kernel_size: 1 # upsampling net: convolution kernel size along the frequency axis
    nonlinear_activation: null
    nonlinear_activation_params: {}
 ###########################################################
 #       DISCRIMINATOR NETWORK ARCHITECTURE SETTING        #
 ###########################################################
 discriminator_params:
    in_channels: 1        # Number of input channels.
    out_channels: 1       # Number of output channels.
    kernel_size: 3        # Kernel size of conv layers.
    layers: 10            # Number of conv layers.
    conv_channels: 64     # Number of channels in conv layers.
    bias: true            # Whether to use bias parameter in conv.
    use_weight_norm: true # Whether to use weight norm.
                          # If set to true, it will be applied to all of the conv layers.
    nonlinear_activation: "LeakyReLU" # Nonlinear function after each conv.
    nonlinear_activation_params:      # Nonlinear function parameters
        negative_slope: 0.2           # Alpha in LeakyReLU.
 ###########################################################
 #                   STFT LOSS SETTING                     #
 ###########################################################
 stft_loss_params:
    fft_sizes: [1024, 2048, 512]  # List of FFT size for STFT-based loss.
    hop_sizes: [120, 240, 50]     # List of hop size for STFT-based loss
    win_lengths: [600, 1200, 240] # List of window length for STFT-based loss.
    window: "hann"         # Window function for STFT-based loss
 ###########################################################
 #               ADVERSARIAL LOSS SETTING                  #
 ###########################################################
 lambda_adv: 4.0  # Loss balancing coefficient.
 ###########################################################
 #                  DATA LOADER SETTING                    #
 ###########################################################
 batch_size: 6              # Batch size.
 batch_max_steps: 25500     # Length of each audio in batch. Make sure it is divisible by hop_length.
 pin_memory: true           # Whether to pin memory in Pytorch DataLoader.
 num_workers: 2             # Number of workers in Pytorch DataLoader.
 remove_short_samples: true # Whether to remove samples the length of which are less than batch_max_steps.
 allow_cache: true          # Whether to allow cache in dataset. If true, it requires cpu memory.
 ###########################################################
 #             OPTIMIZER & SCHEDULER SETTING               #
 ###########################################################
 generator_optimizer_params:
    epsilon: 1.0e-6            # Generator's epsilon.
    weight_decay: 0.0      # Generator's weight decay coefficient.
 generator_scheduler_params:
    learning_rate: 0.0001             # Generator's learning rate.
    step_size: 200000      # Generator's scheduler step size.
    gamma: 0.5             # Generator's scheduler gamma.
                           # At each step size, lr will be multiplied by this parameter.
 generator_grad_norm: 10    # Generator's gradient norm.
 discriminator_optimizer_params:
    epsilon: 1.0e-6            # Discriminator's epsilon.
    weight_decay: 0.0      # Discriminator's weight decay coefficient.
 discriminator_scheduler_params:
    learning_rate: 0.00005            # Discriminator's learning rate.
    step_size: 200000      # Discriminator's scheduler step size.
    gamma: 0.5             # Discriminator's scheduler gamma.
                           # At each step size, lr will be multiplied by this parameter.
 discriminator_grad_norm: 1 # Discriminator's gradient norm.
 ###########################################################
 #                    INTERVAL SETTING                     #
 ###########################################################
 discriminator_train_start_steps: 100000 # Number of steps to start to train discriminator.
 train_max_steps: 400000                 # Number of training steps.
 save_interval_steps: 5000               # Interval steps to save checkpoint.
 eval_interval_steps: 1000               # Interval steps to evaluate the network.
 log_interval_steps: 100                 # Interval steps to record the training log.
 ###########################################################
 #                     OTHER SETTING                       #
 ###########################################################
 num_save_intermediate_results: 4  # Number of results to be saved as intermediate results.
--- a/examples/parallelwave_gan/baker/config.py
+++ b/examples/parallelwave_gan/baker/config.py
@ -0,0 +1,25 @@
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 from pathlib import Path
 import yaml
 from yacs.config import CfgNode as Configuration
 # Resolve the default config relative to this file rather than the current
 # working directory, so importing this module works from any directory.
 _DEFAULT_CONFIG_PATH = Path(__file__).resolve().parent / "conf" / "default.yaml"
 with open(_DEFAULT_CONFIG_PATH, 'rt') as f:
    # safe_load yields plain containers; wrap them in a yacs config node
    _C = Configuration(yaml.safe_load(f))
 def get_cfg_default():
    """Return a clone of the module-level default configuration ``_C``."""
    return _C.clone()
--- a/examples/parallelwave_gan/baker/preprocess.py
+++ b/examples/parallelwave_gan/baker/preprocess.py
@ -0,0 +1,280 @@
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 from typing import List, Dict, Any
 import soundfile as sf
 import librosa
 import numpy as np
 from config import get_cfg_default
 import argparse
 import yaml
 import json
 import dacite
 import dataclasses
 import concurrent.futures
 from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor
 from pathlib import Path
 import tqdm
 from operator import itemgetter
 from praatio import tgio
 import logging
 def logmelfilterbank(audio,
                     sr,
                     n_fft=1024,
                     hop_length=256,
                     win_length=None,
                     window="hann",
                     n_mels=80,
                     fmin=None,
                     fmax=None,
                     eps=1e-10):
    """Compute log-Mel filterbank feature.
    Parameters
    ----------
    audio : ndarray
        Audio signal (T,).
    sr : int
        Sampling rate.
    n_fft : int
        FFT size. (Default value = 1024)
    hop_length : int
        Hop size. (Default value = 256)
    win_length : int
        Window length, passed through to ``librosa.stft``.
        (Default value = None)
    window : str
        Window function type. (Default value = "hann")
    n_mels : int
        Number of mel basis. (Default value = 80)
    fmin : int
        Minimum frequency in mel basis calculation. None means 0.
        (Default value = None)
    fmax : int
        Maximum frequency in mel basis calculation. None means ``sr / 2``.
        (Default value = None)
    eps : float
        Floor applied before the log to avoid -inf. (Default value = 1e-10)
    Returns
    -------
    np.ndarray
        Base-10 log Mel filterbank feature (num_mels, #frames).
    """
    # get amplitude spectrogram
    x_stft = librosa.stft(
        audio,
        n_fft=n_fft,
        hop_length=hop_length,
        win_length=win_length,
        window=window,
        pad_mode="reflect")
    spc = np.abs(x_stft)  # (#bins, #frames,)
    # get mel basis
    fmin = 0 if fmin is None else fmin
    fmax = sr / 2 if fmax is None else fmax
    # pass by keyword: newer librosa releases reject positional arguments here
    mel_basis = librosa.filters.mel(
        sr=sr, n_fft=n_fft, n_mels=n_mels, fmin=fmin, fmax=fmax)
    # (num_mels, #bins) x (#bins, #frames) -> (num_mels, #frames)
    return np.log10(np.maximum(eps, np.dot(mel_basis, spc)))
 def process_sentence(config: Dict[str, Any],
                     fp: Path,
                     alignment_fp: Path,
                     output_dir: Path):
    """Trim one utterance, extract its log-mel features and save both.
    The leading and trailing "sil" intervals from the alignment file are cut
    off. When an interval is not a usable silence, the audio start or end is
    used instead and a warning is logged.
    Parameters
    ----------
    config
        Configuration object read by attribute (``sr``, ``n_fft``,
        ``hop_length``, ...).
    fp : Path
        Path of the audio file; its stem is used as the utterance id.
    alignment_fp : Path
        Path of the alignment file; only its first tier is used.
    output_dir : Path
        Directory that receives ``<utt_id>_wave.npy`` and
        ``<utt_id>_feats.npy``.
    Returns
    -------
    dict
        Record with keys ``utt_id``, ``num_samples``, ``num_frames``,
        ``feats_path`` and ``wave_path``.
    """
    utt_id = fp.stem
    # reading
    y, sr = librosa.load(fp, sr=config.sr)  # resampling may occur
    assert len(y.shape) == 1, f"{utt_id} is not a mono-channel audio."
    assert np.abs(y).max(
    ) <= 1.0, f"{utt_id} is seems to be different that 16 bit PCM."
    duration = librosa.get_duration(y=y, sr=sr)
    # trim according to the alignment file
    alignment = tgio.openTextgrid(alignment_fp)
    intervals = alignment.tierDict[alignment.tierNameList[0]].entryList
    first, last = intervals[0], intervals[-1]
    if first.label == "sil" and first.end < duration:
        start = first.end
    else:
        # no usable leading silence: keep the audio from its beginning
        start = 0
        logging.warning(
            f" There is something wrong with the fisrt interval {first} in utterance: {utt_id}"
        )
    if last.label == "sil" and last.start < duration:
        end = last.start
    else:
        # no usable trailing silence: keep the audio up to its end
        end = duration
        logging.warning(
            f" There is something wrong with the last interval {last} in utterance: {utt_id}"
        )
    # silence trimmed, using the validated boundaries computed above
    start, end = librosa.time_to_samples([start, end], sr=sr)
    y = y[start:end]
    # energy based silence trimming
    if config.trim_silence:
        y, _ = librosa.effects.trim(
            y,
            top_db=config.top_db,
            frame_length=config.trim_frame_length,
            hop_length=config.trim_hop_length)
    logmel = logmelfilterbank(
        y,
        sr=sr,
        n_fft=config.n_fft,
        window=config.window,
        win_length=config.win_length,
        hop_length=config.hop_length,
        n_mels=config.n_mels,
        fmin=config.fmin,
        fmax=config.fmax)
    # adjust time to make num_samples == num_frames * hop_length:
    # pad the tail first so the truncation below never runs out of samples
    num_frames = logmel.shape[1]
    y = np.pad(y, (0, config.n_fft), mode="reflect")
    y = y[:num_frames * config.hop_length]
    num_sample = y.shape[0]
    mel_path = output_dir / (utt_id + "_feats.npy")
    wav_path = output_dir / (utt_id + "_wave.npy")
    np.save(wav_path, y)
    np.save(mel_path, logmel)
    record = {
        "utt_id": utt_id,
        "num_samples": num_sample,
        "num_frames": num_frames,
        "feats_path": str(mel_path.resolve()),
        "wave_path": str(wav_path.resolve()),
    }
    return record
 def process_sentences(config,
                       fps: List[Path],
                       alignment_fps: List[Path],
                       output_dir: Path,
                       nprocs: int=1):
    """Process paired audio/alignment files and write ``metadata.json``.
    Files are paired positionally. With ``nprocs == 1`` they are processed in
    this process, otherwise in a process pool of ``nprocs`` workers. The
    resulting records are sorted by ``utt_id`` and dumped as JSON into
    ``output_dir / "metadata.json"``.
    """
    if nprocs != 1:
        with ProcessPoolExecutor(nprocs) as pool:
            with tqdm.tqdm(total=len(fps)) as progress:
                jobs = []
                for wav_fp, ali_fp in zip(fps, alignment_fps):
                    job = pool.submit(process_sentence, config, wav_fp,
                                      ali_fp, output_dir)
                    # advance the bar whenever a worker finishes
                    job.add_done_callback(lambda p: progress.update())
                    jobs.append(job)
                # collect in submission order; blocks until each job is done
                results = [job.result() for job in jobs]
    else:
        results = [
            process_sentence(config, wav_fp, ali_fp, output_dir)
            for wav_fp, ali_fp in tqdm.tqdm(zip(fps, alignment_fps))
        ]
    results.sort(key=itemgetter("utt_id"))
    metadata_path = output_dir / "metadata.json"
    with open(metadata_path, 'wt') as f:
        json.dump(results, f)
    print("Done")
 def main():
    """Command-line entry point: preprocess a dataset into train/dev/test.
    Collects ``*.wav`` files under ``<rootdir>/Wave`` and ``*.interval`` files
    under ``<rootdir>/PhoneLabeling``, both sorted, splits them positionally
    into the first 9800, the next 100 and the remainder, and processes each
    split into ``<dumpdir>/train``, ``<dumpdir>/dev`` and ``<dumpdir>/test``.
    """
    # parse config and args
    parser = argparse.ArgumentParser(
        description="Preprocess audio and then extract features (See detail in parallel_wavegan/bin/preprocess.py)."
    )
    parser.add_argument(
        "--rootdir",
        default=None,
        type=str,
        help="directory including wav files. you need to specify either scp or rootdir."
    )
    parser.add_argument(
        "--dumpdir",
        type=str,
        required=True,
        help="directory to dump feature files.")
    parser.add_argument(
        "--config", type=str, help="yaml format configuration file.")
    parser.add_argument(
        "--verbose",
        type=int,
        default=1,
        help="logging level. higher is more logging. (default=1)")
    parser.add_argument(
        "--num_cpu", type=int, default=1, help="number of process.")
    args = parser.parse_args()
    if args.rootdir is None:
        # fail with a usage message instead of a TypeError from Path(None)
        parser.error("--rootdir is required.")
    C = get_cfg_default()
    if args.config:
        C.merge_from_file(args.config)
    # freeze whether or not an override file was merged
    C.freeze()
    if args.verbose > 1:
        print(vars(args))
        # the config is not a dataclass, so print it directly
        print(C)
    root_dir = Path(args.rootdir)
    dumpdir = Path(args.dumpdir)
    dumpdir.mkdir(parents=True, exist_ok=True)
    wav_files = sorted((root_dir / "Wave").rglob("*.wav"))
    alignment_files = sorted((root_dir / "PhoneLabeling").rglob("*.interval"))
    # split data into 3 sections; both sorted lists are sliced identically
    sections = (
        ("train", slice(None, 9800)),
        ("dev", slice(9800, 9900)),
        ("test", slice(9900, None)), )
    # process for the 3 sections
    for name, part in sections:
        section_dump_dir = dumpdir / name
        section_dump_dir.mkdir(parents=True, exist_ok=True)
        process_sentences(
            C,
            wav_files[part],
            alignment_files[part],
            section_dump_dir,
            nprocs=args.num_cpu)
 if __name__ == "__main__":
    main()
--- a/examples/parallelwave_gan/baker/train.py
+++ b/examples/parallelwave_gan/baker/train.py
@ -0,0 +1,166 @@
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import os
 import sys
 import logging
 import argparse
 import dataclasses
 from pathlib import Path
 import yaml
 import dacite
 import json
 import paddle
 import numpy as np
 from paddle import nn
 from paddle.nn import functional as F
 from paddle import distributed as dist
 from paddle.io import DataLoader, DistributedBatchSampler
 from paddle.optimizer import Adam  # No RAdaom
 from paddle.optimizer.lr import StepDecay
 from paddle import DataParallel
 from visualdl import LogWriter
 from parakeet.datasets.data_table import DataTable
 from parakeet.training.updater import UpdaterBase
 from parakeet.training.trainer import Trainer
 from parakeet.training.reporter import report
 from parakeet.training.checkpoint import KBest, KLatest
 from parakeet.models.parallel_wavegan import PWGGenerator, PWGDiscriminator
 from parakeet.modules.stft_loss import MultiResolutionSTFTLoss
 from config import get_cfg_default
 def train_sp(args, config):
    """Set up training in the current process (work in progress).
    Selects the device, initialises the parallel environment when more than
    one process takes part, and builds the datasets, data loaders, models,
    losses, optimizers and (on rank 0) the log writer. The training loop
    itself is not implemented yet.
    Args:
        args: Parsed command-line arguments; reads ``train_metadata``,
            ``dev_metadata`` and ``output_dir``.
        config: Configuration; reads ``batch_size`` and the
            ``*_params`` sections.
    """
    # decides device type and whether to run in parallel
    # setup running environment correctly
    world_size = 1  # stays 1 on CPU, so the models are never wrapped below
    if not paddle.is_compiled_with_cuda():
        paddle.set_device("cpu")
    else:
        paddle.set_device("gpu")
        world_size = paddle.distributed.get_world_size()
        if world_size > 1:
            paddle.distributed.init_parallel_env()
    print(
        f"rank: {dist.get_rank()}, pid: {os.getpid()}, parent_pid: {os.getppid()}",
    )
    def load_table(metadata_path):
        # load a metadata JSON file into a table whose path fields are
        # read with np.load
        with open(metadata_path) as f:
            metadata = json.load(f)
        return DataTable(
            data=metadata,
            fields=["wave_path", "feats_path"],
            converters={
                "wave_path": np.load,
                "feats_path": np.load,
            }, )
    # construct dataset for training and validation
    train_dataset = load_table(args.train_metadata)
    dev_dataset = load_table(args.dev_metadata)
    # collate function and dataloader
    train_sampler = DistributedBatchSampler(
        train_dataset,
        batch_size=config.batch_size,
        shuffle=True,
        drop_last=True)
    dev_sampler = DistributedBatchSampler(
        dev_dataset,
        batch_size=config.batch_size,
        shuffle=False,
        drop_last=False)
    train_dataloader = DataLoader(
        train_dataset,
        batch_sampler=train_sampler,
        collate_fn=None,  # TODO(define collate fn)
        num_workers=4)
    dev_dataloader = DataLoader(
        dev_dataset,
        batch_sampler=dev_sampler,
        collate_fn=None,  # TODO(define collate fn)
        num_workers=4)
    generator = PWGGenerator(**config["generator_params"])
    discriminator = PWGDiscriminator(**config["discriminator_params"])
    if world_size > 1:
        generator = DataParallel(generator)
        discriminator = DataParallel(discriminator)
    criterion_stft = MultiResolutionSTFTLoss(**config["stft_loss_params"])
    criterion_mse = nn.MSELoss()
    lr_schedule_g = StepDecay(**config["generator_scheduler_params"])
    optimizer_g = Adam(
        lr_schedule_g,
        parameters=generator.parameters(),
        **config["generator_optimizer_params"])
    lr_schedule_d = StepDecay(**config["discriminator_scheduler_params"])
    optimizer_d = Adam(
        lr_schedule_d,
        parameters=discriminator.parameters(),
        **config["discriminator_optimizer_params"])
    output_dir = Path(args.output_dir)
    log_writer = None
    if dist.get_rank() == 0:
        # only rank 0 creates the output directory and the log writer
        output_dir.mkdir(parents=True, exist_ok=True)
        log_writer = LogWriter(output_dir)
    # training loop
    # TODO: not implemented yet
 def main():
    """Parse the command line, build the config and start training.
    The default config is optionally overridden by ``--config``. Training
    runs via ``dist.spawn`` when ``--nprocs`` is greater than 1, otherwise
    directly in this process.
    """
    # parse args and config and redirect to train_sp
    parser = argparse.ArgumentParser(
        description="Train a ParallelWaveGAN model with Baker Mandrin TTS dataset."
    )
    parser.add_argument(
        "--config", type=str, help="config file to overwrite default config")
    # plain string options, registered in a fixed order
    string_options = (
        ("--train-metadata", "training data"),
        ("--dev-metadata", "dev data"),
        ("--output-dir", "output dir"), )
    for flag, description in string_options:
        parser.add_argument(flag, type=str, help=description)
    parser.add_argument(
        "--nprocs", type=int, default=1, help="number of processes")
    parser.add_argument("--verbose", type=int, default=1, help="verbose")
    args = parser.parse_args()
    cfg = get_cfg_default()
    if args.config:
        cfg.merge_from_file(args.config)
    print("========Args========")
    print(yaml.safe_dump(vars(args)))
    print("========Config========")
    print(cfg)
    world = dist.get_world_size()
    pid = os.getpid()
    print(f"master see the word size: {world}, from pid: {pid}")
    # dispatch
    if args.nprocs <= 1:
        train_sp(args, cfg)
        return
    dist.spawn(train_sp, (args, cfg), nprocs=args.nprocs)
 if __name__ == "__main__":
    main()
--- a/parakeet/models/parallel_wavegan.py
+++ b/parakeet/models/parallel_wavegan.py
@ -109,10 +109,11 @@ class UpsampleNet(nn.Layer):
                padding = (freq_axis_padding, scale)
            conv = nn.Conv2D(
                1, 1, kernel_size, padding=padding, bias_attr=False)
            self.up_layers.extend([stretch, conv])
            if nonlinear_activation is not None:
                nonlinear = getattr(
                    nn, nonlinear_activation)(**nonlinear_activation_params)
-            self.up_layers.extend([stretch, conv, nonlinear])
+                self.up_layers.append(nonlinear)
    def forward(self, c: Tensor) -> Tensor:
        """