refined training module
commit ef51e1ab13 (parent 29b8b8b0ea)
@@ -125,4 +125,5 @@ log_interval_steps: 100 # Interval steps to record the training
 # OTHER SETTING #
 ###########################################################
 num_save_intermediate_results: 4 # Number of results to be saved as intermediate results.
 num_snapshots: 10
+seed: 42
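The new `seed` key is consumed by the training entry point further down (`seed_everything(config.seed)`). For reference, a minimal sketch of reading such a YAML config, assuming a yacs-style `CfgNode` wrapper (the actual loader is not part of this diff, and the path is hypothetical):

```python
import yaml
from yacs.config import CfgNode

with open("conf/default.yaml") as f:
    config = CfgNode(yaml.safe_load(f))

print(config.seed)            # 42
print(config.num_snapshots)   # 10
```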
@@ -74,18 +74,15 @@ class PWGUpdater(StandardUpdater):
         # Generator
         noise = paddle.randn(wav.shape)
 
-        synchronize()
         with timer() as t:
             wav_ = self.generator(noise, mel)
-        synchronize()
         logging.debug(f"Generator takes {t.elapse}s.")
 
         ## Multi-resolution stft loss
-        synchronize()
         with timer() as t:
             sc_loss, mag_loss = self.criterion_stft(
                 wav_.squeeze(1), wav.squeeze(1))
-        synchronize()
         logging.debug(f"Multi-resolution STFT loss takes {t.elapse}s.")
 
         report("train/spectral_convergence_loss", float(sc_loss))
@@ -94,11 +91,9 @@ class PWGUpdater(StandardUpdater):
 
         ## Adversarial loss
         if self.state.iteration > self.discriminator_train_start_steps:
-            synchronize()
             with timer() as t:
                 p_ = self.discriminator(wav_)
                 adv_loss = self.criterion_mse(p_, paddle.ones_like(p_))
-            synchronize()
             logging.debug(
                 f"Discriminator and adversarial loss takes {t.elapse}s")
             report("train/adversarial_loss", float(adv_loss))
@@ -106,18 +101,14 @@ class PWGUpdater(StandardUpdater):
 
         report("train/generator_loss", float(gen_loss))
 
-        synchronize()
         with timer() as t:
             self.optimizer_g.clear_grad()
             gen_loss.backward()
-        synchronize()
         logging.debug(f"Backward takes {t.elapse}s.")
 
-        synchronize()
         with timer() as t:
             self.optimizer_g.step()
             self.scheduler_g.step()
-        synchronize()
         logging.debug(f"Update takes {t.elapse}s.")
 
         # Disctiminator
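The dropped `synchronize()` calls existed so the timer measured completed GPU work rather than just asynchronous kernel launches; removing them simplifies the hot path at the cost of coarser timings. A sketch of the original pattern, assuming the helper wrapped something like `paddle.device.cuda.synchronize()` (its definition is not shown in this diff):

```python
import paddle
from paddle import nn
from timer import timer


def synchronize():
    # Block until queued GPU kernels finish so the timing below is meaningful.
    # Assumed implementation; recent Paddle exposes paddle.device.cuda.synchronize().
    if paddle.is_compiled_with_cuda():
        paddle.device.cuda.synchronize()


layer = nn.Linear(80, 80)
x = paddle.randn([4, 80])

synchronize()
with timer() as t:
    y = layer(x)
synchronize()
print(f"forward takes {t.elapse}s")
```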
@@ -158,18 +149,15 @@ class PWGEvaluator(StandardEvaluator):
         wav, mel = batch
         noise = paddle.randn(wav.shape)
 
-        synchronize()
         with timer() as t:
             wav_ = self.generator(noise, mel)
-        synchronize()
         logging.debug(f"Generator takes {t.elapse}s")
 
         ## Multi-resolution stft loss
-        synchronize()
         with timer() as t:
             sc_loss, mag_loss = self.criterion_stft(
                 wav_.squeeze(1), wav.squeeze(1))
-        synchronize()
         logging.debug(f"Multi-resolution STFT loss takes {t.elapse}s")
 
         report("eval/spectral_convergence_loss", float(sc_loss))
@@ -177,11 +165,9 @@ class PWGEvaluator(StandardEvaluator):
         gen_loss = sc_loss + mag_loss
 
         ## Adversarial loss
-        synchronize()
         with timer() as t:
             p_ = self.discriminator(wav_)
             adv_loss = self.criterion_mse(p_, paddle.ones_like(p_))
-        synchronize()
         logging.debug(
             f"Discriminator and adversarial loss takes {t.elapse}s")
         report("eval/adversarial_loss", float(adv_loss))
@@ -14,6 +14,7 @@
 
 import os
 import sys
+from timer import timer
 import logging
 import argparse
 from pathlib import Path
@@ -25,6 +26,8 @@ import numpy as np
 import soundfile as sf
 from paddle import distributed as dist
 
+paddle.set_device("cpu")
+
 from parakeet.datasets.data_table import DataTable
 from parakeet.models.parallel_wavegan import PWGGenerator
 
@@ -71,11 +74,20 @@ test_dataset = DataTable(
 output_dir = Path(args.output_dir)
 output_dir.mkdir(parents=True, exist_ok=True)
 
+N = 0
+T = 0
 for example in test_dataset:
     utt_id = example['utt_id']
     mel = example['feats']
     mel = paddle.to_tensor(mel) # (T, C)
-    wav = generator.inference(c=mel)
-    wav = wav.numpy()
-    print(f"{utt_id}, mel: {mel.shape}, wave: {wav.shape}")
-    sf.write(output_dir / (utt_id + ".wav"), wav, samplerate=24000)
+    with timer() as t:
+        wav = generator.inference(c=mel)
+    wav = wav.numpy()
+    N += wav.size
+    T += t.elapse
+    speed = wav.size / t.elapse
+    print(
+        f"{utt_id}, mel: {mel.shape}, wave: {wav.shape}, time: {t.elapse}s, Hz: {speed}, RTF: {config.sr / speed}."
+    )
+    sf.write(output_dir / (utt_id + ".wav"), wav, samplerate=config.sr)
+print(f"generation speed: {N / T}Hz, RTF: {config.sr / (N / T) }")
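The printed numbers relate as follows: `speed` is samples generated per wall-clock second, and RTF (real-time factor) is synthesis time divided by audio duration, i.e. `config.sr / speed`. A worked example with made-up timings:

```python
sr = 24000          # sample rate from the config
elapse = 4.0        # hypothetical seconds spent in generator.inference
n_samples = 48000   # samples produced, i.e. 2 seconds of audio

speed = n_samples / elapse   # 12000 Hz of audio per second of compute
rtf = sr / speed             # 2.0 -> generation runs 2x slower than real time
# equivalently: rtf == elapse / (n_samples / sr)
```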
@@ -60,7 +60,7 @@ def train_sp(args, config):
     paddle.distributed.init_parallel_env()
 
     # set the random seed, it is a must for multiprocess training
-    seed_everything(42)
+    seed_everything(config.seed)
 
     print(
         f"rank: {dist.get_rank()}, pid: {os.getpid()}, parent_pid: {os.getppid()}",
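`seed_everything` is defined elsewhere in this training script; a minimal sketch of what such a helper typically does (assumed, not shown in the diff), so every rank starts from the same initial parameters:

```python
import random

import numpy as np
import paddle


def seed_everything(seed: int):
    """Seed the Python, NumPy and Paddle RNGs."""
    random.seed(seed)
    np.random.seed(seed)
    paddle.seed(seed)
```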
@@ -149,6 +149,8 @@ def train_sp(args, config):
     output_dir = Path(args.output_dir)
     checkpoint_dir = output_dir / "checkpoints"
     if dist.get_rank() == 0:
+        with open(output_dir / "config.yaml", 'wt') as f:
+            f.write(config.dump(default_flow_style=None))
         output_dir.mkdir(parents=True, exist_ok=True)
         checkpoint_dir.mkdir(parents=True, exist_ok=True)
 
@@ -12,9 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-__version__ = "0.2.0-beta.0"
+__version__ = "0.0.0"
 
 import logging
 from parakeet import audio, data, datasets, frontend, models, modules, training, utils
-
-logging.getLogger('parakeet').addHandler(logging.NullHandler())
@@ -43,9 +43,10 @@ class SpectralConvergenceLoss(nn.Layer):
 class LogSTFTMagnitudeLoss(nn.Layer):
     """Log STFT magnitude loss module."""
 
-    def __init__(self):
+    def __init__(self, epsilon=1e-10):
         """Initilize los STFT magnitude loss module."""
         super().__init__()
+        self.epsilon = epsilon
 
     def forward(self, x_mag, y_mag):
         """Calculate forward propagation.
@@ -57,9 +58,9 @@ class LogSTFTMagnitudeLoss(nn.Layer):
         """
         return F.l1_loss(
             paddle.log(paddle.clip(
-                y_mag, min=1e-10)),
+                y_mag, min=self.epsilon)),
             paddle.log(paddle.clip(
-                x_mag, min=1e-10)))
+                x_mag, min=self.epsilon)))
 
 
 class STFTLoss(nn.Layer):
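The change only threads the clipping floor through as a constructor argument; the loss itself is unchanged: an L1 distance between log magnitudes, with a floor to avoid log(0). A standalone sketch of the same computation (tensor shapes are illustrative):

```python
import paddle
import paddle.nn.functional as F


def log_stft_magnitude_loss(x_mag, y_mag, epsilon=1e-10):
    # mean(|log y_mag - log x_mag|), clipping both inputs at epsilon
    return F.l1_loss(
        paddle.log(paddle.clip(y_mag, min=epsilon)),
        paddle.log(paddle.clip(x_mag, min=epsilon)))


x_mag = paddle.rand([2, 100, 513])   # predicted magnitudes (B, frames, bins)
y_mag = paddle.rand([2, 100, 513])   # target magnitudes
print(float(log_stft_magnitude_loss(x_mag, y_mag)))
```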
@@ -106,4 +106,5 @@ class Snapshot(extension.Extension):
         record_path = self.checkpoint_dir / "records.jsonl"
         with jsonlines.open(record_path, 'w') as writer:
             for record in self.records:
-                writer.write(record)
+                # jsonlines.open may return a Writer or a Reader
+                writer.write(record)  # pylint: disable=no-member
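For context, `records.jsonl` holds one JSON object per line. A small sketch of writing and reading such a file with the `jsonlines` package (the field names are illustrative, not taken from the Snapshot code):

```python
import jsonlines

records = [
    {"path": "snapshot_iter_100.pdz", "iteration": 100},
    {"path": "snapshot_iter_200.pdz", "iteration": 200},
]

with jsonlines.open("records.jsonl", "w") as writer:
    for record in records:
        writer.write(record)

with jsonlines.open("records.jsonl") as reader:
    for record in reader:
        print(record["path"], record["iteration"])
```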
@@ -21,7 +21,7 @@ from typing import Callable, Union, List
 
 import tqdm
 
-from parakeet.training.trigger import get_trigger, IntervalTrigger
+from parakeet.training.trigger import get_trigger, IntervalTrigger, LimitTrigger
 from parakeet.training.updater import UpdaterBase
 from parakeet.training.reporter import scope
 from parakeet.training.extension import Extension, PRIORITY_READER
@@ -42,7 +42,7 @@ class Trainer(object):
                  extensions: List[Extension]=None):
         self.updater = updater
         self.extensions = OrderedDict()
-        self.stop_trigger = get_trigger(stop_trigger)
+        self.stop_trigger = LimitTrigger(*stop_trigger)
         self.out = Path(out)
         self.observation =...
 
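With this change `stop_trigger` is expected to be a `(limit, unit)` tuple that the trainer unpacks into a `LimitTrigger`, rather than an arbitrary trigger passed through `get_trigger`. A hedged usage sketch (the `updater` and `output_dir` objects are assumed to be built as elsewhere in the training scripts, and `trainer.run()` refers to the loop shown in the next hunk):

```python
trainer = Trainer(
    updater,                             # an UpdaterBase instance built earlier
    stop_trigger=(400000, "iteration"),  # unpacked into LimitTrigger(400000, "iteration")
    out=output_dir,
    extensions=None)
trainer.run()   # loops until the trigger fires, i.e. iteration >= 400000
```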
@@ -125,16 +125,19 @@ class Trainer(object):
 
         print(self.updater.state)
 
-        # TODO(chenfeiyu): display progress bar correctly
-        # if the trainer is controlled by epoch: use 2 progressbars
-        # if the trainer is controlled by iteration: use 1 progressbar
-        if isinstance(stop_trigger, IntervalTrigger):
+        # display only one progress bar
+        max_iteration = None
+        if isinstance(stop_trigger, LimitTrigger):
             if stop_trigger.unit is 'epoch':
-                max_epoch = self.stop_trigger.period
+                max_epoch = self.stop_trigger.limit
+                updates_per_epoch = getattr(self.updater, "updates_per_epoch",
+                                            None)
+                max_iteration = max_epoch * updates_per_epoch if updates_per_epoch else None
             else:
-                max_iteration = self.stop_trigger.period
+                max_iteration = self.stop_trigger.limit
 
-        p = tqdm.tqdm(initial=self.updater.state.iteration)
+        p = tqdm.tqdm(
+            initial=self.updater.state.iteration, total=max_iteration)
 
         try:
             while not stop_trigger(self):
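A quick worked example of the new `total` computation (numbers made up):

```python
# stop_trigger = (100, "epoch") and a sized dataloader with 500 batches:
max_epoch = 100
updates_per_epoch = 500            # len(dataloader) when __len__ is defined
max_iteration = max_epoch * updates_per_epoch if updates_per_epoch else None
print(max_iteration)               # 50000, used as tqdm's total
# with an unsized dataloader updates_per_epoch is None, so total stays None
```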
@@ -13,6 +13,8 @@
 # limitations under the License.
 
 from parakeet.training.triggers.interval_trigger import IntervalTrigger
+from parakeet.training.triggers.limit_trigger import LimitTrigger
+from parakeet.training.triggers.time_trigger import TimeTrigger
 
 
 def never_file_trigger(trainer):
@@ -19,17 +19,13 @@ class IntervalTrigger(object):
     def __init__(self, period: int, unit: str):
         if unit not in ("iteration", "epoch"):
             raise ValueError("unit should be 'iteration' or 'epoch'")
+        if period <= 0:
+            raise ValueError("period should be a positive integer.")
         self.period = period
         self.unit = unit
 
     def __call__(self, trainer):
         state = trainer.updater.state
-        # we use a special scheme so we can use iteration % period == 0 as
-        # the predicate
-        # increase the iteration then update parameters
-        # instead of updating then increase iteration
-        if self.unit == "epoch":
-            fire = state.epoch % self.period == 0
-        else:
-            fire = state.iteration % self.period == 0
+        index = getattr(state, self.unit)
+        fire = index % self.period == 0
         return fire
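A small usage sketch of the simplified predicate: `IntervalTrigger(1000, "iteration")` fires whenever the current iteration index is a multiple of 1000. The trainer stub below is just enough structure to call it:

```python
from types import SimpleNamespace

trigger = IntervalTrigger(period=1000, unit="iteration")
fake_trainer = SimpleNamespace(
    updater=SimpleNamespace(state=SimpleNamespace(iteration=2000, epoch=4)))
print(trigger(fake_trainer))   # True: 2000 % 1000 == 0
fake_trainer.updater.state.iteration = 2001
print(trigger(fake_trainer))   # False
```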
@@ -0,0 +1,31 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+class LimitTrigger(object):
+    """A Predicate to decide whether to stop."""
+
+    def __init__(self, limit: int, unit: str):
+        if unit not in ("iteration", "epoch"):
+            raise ValueError("unit should be 'iteration' or 'epoch'")
+        if limit <= 0:
+            raise ValueError("limit should be a positive integer.")
+        self.limit = limit
+        self.unit = unit
+
+    def __call__(self, trainer):
+        state = trainer.updater.state
+        index = getattr(state, self.unit)
+        fire = index >= self.limit
+        return fire
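Unlike `IntervalTrigger`, which fires periodically, `LimitTrigger` fires once the index reaches the limit and keeps firing afterwards, which is exactly what a stop condition needs. An illustrative check with a stub trainer:

```python
from types import SimpleNamespace

stop = LimitTrigger(limit=10000, unit="iteration")
fake_trainer = SimpleNamespace(
    updater=SimpleNamespace(state=SimpleNamespace(iteration=9999, epoch=3)))
print(stop(fake_trainer))   # False: keep training
fake_trainer.updater.state.iteration = 10000
print(stop(fake_trainer))   # True: the run loop exits
```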
@@ -62,7 +62,40 @@ class StandardUpdater(UpdaterBase):
         self.train_iterator = iter(dataloader)
 
     def update(self):
-        self.state.iteration += 1
+        # We increase the iteration index after updating and before extension.
+        # Here are the reasons.
+
+        # 0. Snapshotting (as well as other extensions, like visualizer) is
+        # executed after a step of updating;
+        # 1. We decide to increase the iteration index after updating and
+        # before any extension is executed.
+        # 3. We do not increase the iteration after extension because we
+        # prefer a consistent resume behavior: when loading from
+        # `snapshot_iter_100.pdz`, the next step to train is `101`,
+        # naturally. But if the iteration were increased after
+        # extension (including snapshot), then `snapshot_iter_99` would be
+        # loaded. You would need an extra increase of the iteration index
+        # before training to avoid another iteration `99`, which has been
+        # done before snapshotting.
+        # 4. Thus the iteration index represents "how many iterations have
+        # been done so far".
+        # NOTE: use report to capture the correct value. If you want to
+        # report the learning rate used for a step, you must report it before
+        # the learning rate scheduler's step() has been called. In paddle's
+        # convention, we do not use an extension to change the learning rate,
+        # so if you want to report it, do it in the updater.
+
+        # Then here comes the next question. When is the proper time to
+        # increase the epoch index? Since all extensions are executed after
+        # updating, after updating is also the proper time to
+        # increase the epoch index.
+        # 1. If we increase the epoch index before updating, then an extension
+        # based on epoch would miss the correct timing. It could only be
+        # triggered after an extra update.
+        # 2. Theoretically, when an epoch is done, the epoch index should be
+        # increased, so it is increased after updating.
+        # 3. Thus, the epoch index represents "how many epochs have been
+        # done so far". So it starts from 0.
 
         # switch to training mode
         for layer in self.models.values():
@@ -72,6 +105,11 @@ class StandardUpdater(UpdaterBase):
         batch = self.read_batch()
         self.update_core(batch)
 
+        self.state.iteration += 1
+        if self.updaters_per_epoch is not None:
+            if self.state.iteration % self.updaters_per_epoch == 0:
+                self.state.epoch += 1
+
     def update_core(self, batch):
         """A simple case for a training step. Basic assumptions are:
         Single model;
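A short worked trace of the new bookkeeping (numbers hypothetical, 500 updates per epoch): after the 500th call to `update()` the iteration counter reads 500 and the epoch counter ticks from 0 to 1, so a snapshot taken at that point is named for iteration 500 and resuming from it continues with iteration 501.

```python
updaters_per_epoch = 500   # hypothetical len(dataloader)
epoch = 0
for iteration in range(499, 502):
    if iteration % updaters_per_epoch == 0:
        epoch += 1
    print(iteration, epoch)   # 499 0, 500 1, 501 1
```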
@@ -100,10 +138,20 @@ class StandardUpdater(UpdaterBase):
             loss_dict["main"].backward()
             self.optimizer.update()
 
+    @property
+    def updaters_per_epoch(self):
+        """Number of updates per epoch, determined by the length of the
+        dataloader."""
+        length_of_dataloader = None
+        try:
+            length_of_dataloader = len(self.dataloader)
+        except TypeError:
+            logging.debug("This dataloader has no __len__.")
+        finally:
+            return length_of_dataloader
+
     def new_epoch(self):
         """Start a new epoch."""
-        self.state.epoch += 1
-
         # NOTE: all batch sampler for distributed training should
         # subclass DistributedBatchSampler and implement `set_epoch` method
         batch_sampler = self.dataloader.batch_sampler
@@ -140,13 +188,3 @@ class StandardUpdater(UpdaterBase):
         for name, optim in self.optimizers.items():
             optim.set_state_dict(state_dict[f"{name}_optimizer"])
         super().set_state_dict(state_dict)
-
-    def save(self, path):
-        """Save Updater state dict."""
-        archive = self.state_dict()
-        paddle.save(archive, path)
-
-    def load(self, path):
-        """Load Updater state dict."""
-        archive = paddle.load(path)
-        self.set_state_dict(archive)
@@ -0,0 +1,39 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import shutil
+from pathlib import Path
+
+import paddle
+from paddle import nn
+from paddle.optimizer import Adam
+from paddle.optimizer.lr import StepDecay
+
+
+def test_optimizer():
+    model1 = nn.Linear(3, 4)
+    optim1 = Adam(
+        parameters=model1.parameters(), learning_rate=StepDecay(0.1, 100))
+
+    output_dir = Path("temp_test_optimizer")
+    shutil.rmtree(output_dir, ignore_errors=True)
+    output_dir.mkdir(exist_ok=True, parents=True)
+
+    # model1.set_state_dict(model1.state_dict())
+    optim1.set_state_dict(optim1.state_dict())
+
+    x = paddle.randn([6, 3])
+    y = model1(x).sum()
+    y.backward()
+    optim1.step()