From 618eb31ab4918af41e03be582f0ca4dec6046ad8 Mon Sep 17 00:00:00 2001
From: liuyibing01
Date: Thu, 26 Mar 2020 09:27:22 +0000
Subject: [PATCH] Adapt to the change in save & load

---
 examples/waveflow/README.md          | 10 +++----
 examples/waveflow/benchmark.py       |  2 +-
 examples/waveflow/synthesis.py       |  4 +--
 examples/waveflow/train.py           | 24 ++++------------
 examples/waveflow/waveflow.py        | 34 +++++++++++------------
 parakeet/models/waveflow/__init__.py |  2 +-
 parakeet/utils/io.py                 | 41 +++++++++++++---------------
 7 files changed, 49 insertions(+), 68 deletions(-)

diff --git a/examples/waveflow/README.md b/examples/waveflow/README.md
index 34e6908..16364f6 100644
--- a/examples/waveflow/README.md
+++ b/examples/waveflow/README.md
@@ -13,8 +13,8 @@ PaddlePaddle dynamic graph implementation of [WaveFlow: A Compact Flow-based Mod
 ├── synthesis.py # script for speech synthesis
 ├── train.py # script for model training
 ├── utils.py # helper functions, e.g., model checkpointing
-├── parakeet/models/waveflow/data.py # dataset and dataloader settings for LJSpeech
-├── parakeet/models/waveflow/waveflow.py # WaveFlow model high level APIs
+├── data.py # dataset and dataloader settings for LJSpeech
+├── waveflow.py # WaveFlow model high level APIs
 └── parakeet/models/waveflow/waveflow_modules.py # WaveFlow model implementation
 ```

@@ -48,12 +48,12 @@ python -u train.py \
     --config=./configs/waveflow_ljspeech.yaml \
     --root=./data/LJSpeech-1.1 \
     --name=${ModelName} --batch_size=4 \
-    --parallel=false --use_gpu=true
+    --use_gpu=true
 ```

 #### Save and Load checkpoints

 Our model saves model parameters as checkpoints in `./runs/waveflow/${ModelName}/checkpoint/` every 10000 iterations by default, where `${ModelName}` is the name you choose for a single experiment. A saved checkpoint consists of `step-${iteration_number}.pdparams` for model parameters and `step-${iteration_number}.pdopt` for optimizer state.

 There are three ways to load a checkpoint and resume training (for example, suppose you want to load a 500000-iteration checkpoint):

@@ -68,7 +68,7 @@ export CUDA_VISIBLE_DEVICES=0,1,2,3
 python -u -m paddle.distributed.launch train.py \
     --config=./configs/waveflow_ljspeech.yaml \
     --root=./data/LJSpeech-1.1 \
-    --name=${ModelName} --parallel=true --use_gpu=true
+    --name=${ModelName} --use_gpu=true
 ```

 Use `export CUDA_VISIBLE_DEVICES=0,1,2,3` to make the GPUs you want to use visible. The `paddle.distributed.launch` module then runs data-parallel training on those visible GPUs in multiprocessing mode.
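To make the checkpoint naming above concrete, here is a minimal sketch of loading such a pair directly with `dg.load_dygraph`, the same call `parakeet/utils/io.py` uses below. The run name and iteration are illustrative placeholders, not values from this patch:

```python
import os
import paddle.fluid.dygraph as dg

# Hypothetical values standing in for ${ModelName} and ${iteration_number}.
model_name = "my_experiment"
iteration = 500000
checkpoint_dir = os.path.join("runs", "waveflow", model_name, "checkpoint")

# load_dygraph takes the path prefix (no extension) and reads the
# step-500000.pdparams / step-500000.pdopt pair saved by the trainer.
prefix = os.path.join(checkpoint_dir, "step-{}".format(iteration))
model_dict, optimizer_dict = dg.load_dygraph(prefix)
```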
diff --git a/examples/waveflow/benchmark.py b/examples/waveflow/benchmark.py
index 0581471..222e732 100644
--- a/examples/waveflow/benchmark.py
+++ b/examples/waveflow/benchmark.py
@@ -23,7 +23,7 @@ from paddle import fluid
 
 import utils
 from parakeet.utils import io
-from parakeet.models.waveflow import WaveFlow
+from waveflow import WaveFlow
 
 
 def add_options_to_parser(parser):
diff --git a/examples/waveflow/synthesis.py b/examples/waveflow/synthesis.py
index 5f3dd5a..15c4d3b 100644
--- a/examples/waveflow/synthesis.py
+++ b/examples/waveflow/synthesis.py
@@ -21,9 +21,9 @@ import numpy as np
 import paddle.fluid.dygraph as dg
 from paddle import fluid
 
-import utils
-from parakeet.models.waveflow import WaveFlow
 from parakeet.utils import io
+import utils
+from waveflow import WaveFlow
 
 
 def add_options_to_parser(parser):
diff --git a/examples/waveflow/train.py b/examples/waveflow/train.py
index 548c5da..a033369 100644
--- a/examples/waveflow/train.py
+++ b/examples/waveflow/train.py
@@ -26,7 +26,7 @@ from tensorboardX import SummaryWriter
 
 import utils
 from parakeet.utils import io
-from parakeet.models.waveflow import WaveFlow
+from waveflow import WaveFlow
 
 
 def add_options_to_parser(parser):
@@ -40,11 +40,6 @@ def add_options_to_parser(parser):
     parser.add_argument(
         '--root', type=str, help="root path of the LJSpeech dataset")
 
-    parser.add_argument(
-        '--parallel',
-        type=utils.str2bool,
-        default=True,
-        help="option to use data parallel training")
     parser.add_argument(
         '--use_gpu',
         type=utils.str2bool,
@@ -66,11 +61,11 @@ def add_options_to_parser(parser):
 
 def train(config):
     use_gpu = config.use_gpu
-    parallel = config.parallel if use_gpu else False
 
     # Get the rank of the current training process.
-    rank = dg.parallel.Env().local_rank if parallel else 0
-    nranks = dg.parallel.Env().nranks if parallel else 1
+    rank = dg.parallel.Env().local_rank
+    nranks = dg.parallel.Env().nranks
+    parallel = nranks > 1
 
     if rank == 0:
         # Print the whole config setting.
@@ -100,16 +95,7 @@ def train(config):
 
     # Build model.
     model = WaveFlow(config, checkpoint_dir, parallel, rank, nranks, tb)
-    model.build()
-
-    # Obtain the current iteration.
-    if config.checkpoint is None:
-        if config.iteration is None:
-            iteration = io.load_latest_checkpoint(checkpoint_dir, rank)
-        else:
-            iteration = config.iteration
-    else:
-        iteration = int(config.checkpoint.split('/')[-1].split('-')[-1])
+    iteration = model.build()
 
     while iteration < config.max_iterations:
         # Run one single training step.
diff --git a/examples/waveflow/waveflow.py b/examples/waveflow/waveflow.py
index faf2fb6..700116b 100644
--- a/examples/waveflow/waveflow.py
+++ b/examples/waveflow/waveflow.py
@@ -21,11 +21,11 @@ import paddle.fluid.dygraph as dg
 from paddle import fluid
 from scipy.io.wavfile import write
 
-import utils
 from parakeet.utils import io
 from parakeet.modules import weight_norm
-from .data import LJSpeech
-from .waveflow_modules import WaveFlowLoss, WaveFlowModule
+from parakeet.models.waveflow import WaveFlowLoss, WaveFlowModule
+from data import LJSpeech
+import utils
 
 
 class WaveFlow():
@@ -93,13 +93,12 @@ class WaveFlow():
             parameter_list=waveflow.parameters())
 
         # Load parameters.
-        io.load_parameters(
-            self.checkpoint_dir,
-            self.rank,
-            waveflow,
-            optimizer,
+        iteration = io.load_parameters(
+            model=waveflow,
+            optimizer=optimizer,
+            checkpoint_dir=self.checkpoint_dir,
             iteration=config.iteration,
-            file_path=config.checkpoint)
+            checkpoint_path=config.checkpoint)
         print("Rank {}: checkpoint loaded.".format(self.rank))
 
         # Data parallelism.
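The hunk above switches `io.load_parameters` to keyword arguments and uses its return value as the starting iteration. A hedged sketch of how the three resume modes from the README map onto these arguments; the `resume` wrapper is hypothetical, added only for illustration:

```python
from parakeet.utils import io

def resume(model, optimizer, checkpoint_dir,
           iteration=None, checkpoint_path=None):
    # Three ways to pick a checkpoint, matching the README:
    #   1) latest checkpoint:    leave iteration and checkpoint_path as None
    #   2) specific iteration:   iteration=500000
    #   3) explicit path prefix: checkpoint_path=".../step-500000"
    # The call returns the iteration the checkpoint was trained to
    # (0 when nothing is loaded), which the training loop resumes from.
    return io.load_parameters(
        model=model,
        optimizer=optimizer,
        checkpoint_dir=checkpoint_dir,
        iteration=iteration,
        checkpoint_path=checkpoint_path)
```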
@@ -113,13 +112,11 @@ class WaveFlow():
 
         else:
             # Load parameters.
-            io.load_parameters(
-                self.checkpoint_dir,
-                self.rank,
-                waveflow,
+            iteration = io.load_parameters(
+                model=waveflow,
+                checkpoint_dir=self.checkpoint_dir,
                 iteration=config.iteration,
-                file_path=config.checkpoint,
-                dtype=self.dtype)
+                checkpoint_path=config.checkpoint)
             print("Rank {}: checkpoint loaded.".format(self.rank))
 
             for layer in waveflow.sublayers():
@@ -128,6 +125,8 @@ class WaveFlow():
 
         self.waveflow = waveflow
 
+        return iteration
+
     def train_step(self, iteration):
         """Train the model for one step.
 
@@ -293,6 +292,5 @@ class WaveFlow():
         Returns:
             None
         """
-        io.save_latest_parameters(self.checkpoint_dir, iteration,
-                                  self.waveflow, self.optimizer)
-        io.save_latest_checkpoint(self.checkpoint_dir, iteration)
+        io.save_parameters(self.checkpoint_dir, iteration, self.waveflow,
+                           self.optimizer)
diff --git a/parakeet/models/waveflow/__init__.py b/parakeet/models/waveflow/__init__.py
index 73a7914..b068b59 100644
--- a/parakeet/models/waveflow/__init__.py
+++ b/parakeet/models/waveflow/__init__.py
@@ -12,4 +12,4 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from parakeet.models.waveflow.waveflow import WaveFlow
+from parakeet.models.waveflow.waveflow_modules import WaveFlowLoss, WaveFlowModule
diff --git a/parakeet/utils/io.py b/parakeet/utils/io.py
index e9e1240..8c0e5aa 100644
--- a/parakeet/utils/io.py
+++ b/parakeet/utils/io.py
@@ -18,6 +18,7 @@ import time
 import ruamel.yaml
 import numpy as np
 import paddle.fluid.dygraph as dg
+from paddle.fluid.framework import convert_np_dtype_to_dtype_ as convert_np_dtype
 
 
 def is_main_process():
@@ -90,9 +91,8 @@ def load_parameters(model,
                     optimizer=None,
                     checkpoint_dir=None,
                     iteration=None,
-                    checkpoint_path=None,
-                    dtype="float32"):
-    """Load a specific model checkpoint from disk.
+                    checkpoint_path=None):
+    """Load a specific model checkpoint from disk.
 
     Args:
         model (obj): model to load parameters into.
@@ -102,40 +102,37 @@ def load_parameters(model,
         iteration (int, optional): if specified, load that specific checkpoint;
             if not specified, load the latest one. Defaults to None.
         checkpoint_path (str, optional): if specified, load the checkpoint
-            stored in the checkpoint_path. Defaults to None.
-        dtype (str, optional): precision of the model parameters.
-            Defaults to float32.
+            stored at checkpoint_path; the argument 'checkpoint_dir' is then
+            ignored. Defaults to None.
 
     Returns:
         iteration (int): number of iterations the loaded checkpoint has
             been trained for.
     """
-    if checkpoint_dir is not None and checkpoint_path is not None:
-        raise ValueError(
-            "Load from either from (checkpoint_dir and iteration) \n"
-            "or checkpoint_path. Do not pass both.")
-    if iteration is not None and checkpoint_dir is None:
-        raise ValueError(
-            "When iteration is specified, checkpoint_dir should not be None")
-
-    if checkpoint_dir is not None:
+    if checkpoint_path is not None:
+        iteration = int(os.path.basename(checkpoint_path).split("-")[-1])
+    elif checkpoint_dir is not None:
         if iteration is None:
             iteration = _load_latest_checkpoint(checkpoint_dir)
-        checkpoint_path = os.path.join(checkpoint_dir,
-                                       "step-{}".format(iteration))
-        if iteration == 0 and not os.path.exists(checkpoint_path):
+        if iteration == 0:
+            # an iteration of 0 means no checkpoint to load; train from scratch
             return iteration
+        checkpoint_path = os.path.join(checkpoint_dir,
+                                       "step-{}".format(iteration))
     else:
-        # checkpoint is not None
-        iteration = int(os.path.basename(checkpoint_path).split("-")[-1])
+        raise ValueError(
+            "At least one of 'checkpoint_dir' and 'checkpoint_path' should be specified!"
+        )
 
     local_rank = dg.parallel.Env().local_rank
     model_dict, optimizer_dict = dg.load_dygraph(checkpoint_path)
 
-    # cast to desired data type
+    state_dict = model.state_dict()
+    # Cast each loaded array to the dtype of the corresponding live parameter,
+    # for mixed-precision training/inference.
     for k, v in model_dict.items():
-        model_dict[k] = v.astype(dtype)
+        if k in state_dict and convert_np_dtype(v.dtype) != state_dict[
+                k].dtype:
+            model_dict[k] = v.astype(state_dict[k].numpy().dtype)
 
     model.set_dict(model_dict)
 
     print("[checkpoint] Rank {}: loaded model from {}.pdparams".format(
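The dtype handling above replaces the old blanket `dtype="float32"` cast: each loaded array is now cast to the dtype of the live parameter it overwrites. A framework-free sketch of that idea, a minimal illustration using plain numpy with made-up parameter names:

```python
import numpy as np

def cast_to_model_dtypes(model_dict, param_dtypes):
    """Cast loaded arrays to the dtypes of the live parameters.

    model_dict: {name: numpy array loaded from a checkpoint}
    param_dtypes: {name: numpy dtype of the in-memory parameter}
    """
    for k, v in model_dict.items():
        # Only cast entries the model actually has, and only on mismatch,
        # so e.g. a float32 checkpoint can initialize a float16 model.
        if k in param_dtypes and v.dtype != param_dtypes[k]:
            model_dict[k] = v.astype(param_dtypes[k])
    return model_dict

# Illustrative round trip with fake data.
loaded = {"w": np.zeros((2, 2), dtype=np.float32)}
casted = cast_to_model_dtypes(loaded, {"w": np.dtype(np.float16)})
assert casted["w"].dtype == np.float16
```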