Adapt the change in save & load

This commit is contained in:
liuyibing01 2020-03-26 09:27:22 +00:00
parent fccbf6d797
commit 618eb31ab4
7 changed files with 49 additions and 68 deletions

View File

@ -13,8 +13,8 @@ PaddlePaddle dynamic graph implementation of [WaveFlow: A Compact Flow-based Mod
├── synthesis.py # script for speech synthesis ├── synthesis.py # script for speech synthesis
├── train.py # script for model training ├── train.py # script for model training
├── utils.py # helper functions for e.g., model checkpointing ├── utils.py # helper functions for e.g., model checkpointing
├── parakeet/models/waveflow/data.py # dataset and dataloader settings for LJSpeech ├── data.py # dataset and dataloader settings for LJSpeech
├── parakeet/models/waveflow/waveflow.py # WaveFlow model high level APIs ├── waveflow.py # WaveFlow model high level APIs
└── parakeet/models/waveflow/waveflow_modules.py # WaveFlow model implementation └── parakeet/models/waveflow/waveflow_modules.py # WaveFlow model implementation
``` ```
@ -48,12 +48,12 @@ python -u train.py \
--config=./configs/waveflow_ljspeech.yaml \ --config=./configs/waveflow_ljspeech.yaml \
--root=./data/LJSpeech-1.1 \ --root=./data/LJSpeech-1.1 \
--name=${ModelName} --batch_size=4 \ --name=${ModelName} --batch_size=4 \
--parallel=false --use_gpu=true --use_gpu=true
``` ```
#### Save and Load checkpoints #### Save and Load checkpoints
Our model will save model parameters as checkpoints in `./runs/waveflow/${ModelName}/checkpoint/` every 10000 iterations by default. Our model will save model parameters as checkpoints in `./runs/waveflow/${ModelName}/checkpoint/` every 10000 iterations by default, where `${ModelName}` is the model name for one single experiment and it could be whatever you like.
The saved checkpoint will have the format of `step-${iteration_number}.pdparams` for model parameters and `step-${iteration_number}.pdopt` for optimizer parameters. The saved checkpoint will have the format of `step-${iteration_number}.pdparams` for model parameters and `step-${iteration_number}.pdopt` for optimizer parameters.
There are three ways to load a checkpoint and resume training (for example, suppose you want to load the 500000-iteration checkpoint): There are three ways to load a checkpoint and resume training (for example, suppose you want to load the 500000-iteration checkpoint):
@ -68,7 +68,7 @@ export CUDA_VISIBLE_DEVICES=0,1,2,3
python -u -m paddle.distributed.launch train.py \ python -u -m paddle.distributed.launch train.py \
--config=./configs/waveflow_ljspeech.yaml \ --config=./configs/waveflow_ljspeech.yaml \
--root=./data/LJSpeech-1.1 \ --root=./data/LJSpeech-1.1 \
--name=${ModelName} --parallel=true --use_gpu=true --name=${ModelName} --use_gpu=true
``` ```
Use `export CUDA_VISIBLE_DEVICES=0,1,2,3` to make the GPUs that you want to use visible. The `paddle.distributed.launch` module will then use these visible GPUs to do data parallel training in multiprocessing mode. Use `export CUDA_VISIBLE_DEVICES=0,1,2,3` to make the GPUs that you want to use visible. The `paddle.distributed.launch` module will then use these visible GPUs to do data parallel training in multiprocessing mode.

View File

@ -23,7 +23,7 @@ from paddle import fluid
import utils import utils
from parakeet.utils import io from parakeet.utils import io
from parakeet.models.waveflow import WaveFlow from waveflow import WaveFlow
def add_options_to_parser(parser): def add_options_to_parser(parser):

View File

@ -21,9 +21,9 @@ import numpy as np
import paddle.fluid.dygraph as dg import paddle.fluid.dygraph as dg
from paddle import fluid from paddle import fluid
import utils
from parakeet.models.waveflow import WaveFlow
from parakeet.utils import io from parakeet.utils import io
import utils
from waveflow import WaveFlow
def add_options_to_parser(parser): def add_options_to_parser(parser):

View File

@ -26,7 +26,7 @@ from tensorboardX import SummaryWriter
import utils import utils
from parakeet.utils import io from parakeet.utils import io
from parakeet.models.waveflow import WaveFlow from waveflow import WaveFlow
def add_options_to_parser(parser): def add_options_to_parser(parser):
@ -40,11 +40,6 @@ def add_options_to_parser(parser):
parser.add_argument( parser.add_argument(
'--root', type=str, help="root path of the LJSpeech dataset") '--root', type=str, help="root path of the LJSpeech dataset")
parser.add_argument(
'--parallel',
type=utils.str2bool,
default=True,
help="option to use data parallel training")
parser.add_argument( parser.add_argument(
'--use_gpu', '--use_gpu',
type=utils.str2bool, type=utils.str2bool,
@ -66,11 +61,11 @@ def add_options_to_parser(parser):
def train(config): def train(config):
use_gpu = config.use_gpu use_gpu = config.use_gpu
parallel = config.parallel if use_gpu else False
# Get the rank of the current training process. # Get the rank of the current training process.
rank = dg.parallel.Env().local_rank if parallel else 0 rank = dg.parallel.Env().local_rank
nranks = dg.parallel.Env().nranks if parallel else 1 nranks = dg.parallel.Env().nranks
parallel = nranks > 1
if rank == 0: if rank == 0:
# Print the whole config setting. # Print the whole config setting.
@ -100,16 +95,7 @@ def train(config):
# Build model. # Build model.
model = WaveFlow(config, checkpoint_dir, parallel, rank, nranks, tb) model = WaveFlow(config, checkpoint_dir, parallel, rank, nranks, tb)
model.build() iteration = model.build()
# Obtain the current iteration.
if config.checkpoint is None:
if config.iteration is None:
iteration = io.load_latest_checkpoint(checkpoint_dir, rank)
else:
iteration = config.iteration
else:
iteration = int(config.checkpoint.split('/')[-1].split('-')[-1])
while iteration < config.max_iterations: while iteration < config.max_iterations:
# Run one single training step. # Run one single training step.

View File

@ -21,11 +21,11 @@ import paddle.fluid.dygraph as dg
from paddle import fluid from paddle import fluid
from scipy.io.wavfile import write from scipy.io.wavfile import write
import utils
from parakeet.utils import io from parakeet.utils import io
from parakeet.modules import weight_norm from parakeet.modules import weight_norm
from .data import LJSpeech from parakeet.models.waveflow import WaveFlowLoss, WaveFlowModule
from .waveflow_modules import WaveFlowLoss, WaveFlowModule from data import LJSpeech
import utils
class WaveFlow(): class WaveFlow():
@ -93,13 +93,12 @@ class WaveFlow():
parameter_list=waveflow.parameters()) parameter_list=waveflow.parameters())
# Load parameters. # Load parameters.
io.load_parameters( iteration = io.load_parameters(
self.checkpoint_dir, model=waveflow,
self.rank, optimizer=optimizer,
waveflow, checkpoint_dir=self.checkpoint_dir,
optimizer,
iteration=config.iteration, iteration=config.iteration,
file_path=config.checkpoint) checkpoint_path=config.checkpoint)
print("Rank {}: checkpoint loaded.".format(self.rank)) print("Rank {}: checkpoint loaded.".format(self.rank))
# Data parallelism. # Data parallelism.
@ -113,13 +112,11 @@ class WaveFlow():
else: else:
# Load parameters. # Load parameters.
io.load_parameters( iteration = io.load_parameters(
self.checkpoint_dir, model=waveflow,
self.rank, checkpoint_dir=self.checkpoint_dir,
waveflow,
iteration=config.iteration, iteration=config.iteration,
file_path=config.checkpoint, checkpoint_path=config.checkpoint)
dtype=self.dtype)
print("Rank {}: checkpoint loaded.".format(self.rank)) print("Rank {}: checkpoint loaded.".format(self.rank))
for layer in waveflow.sublayers(): for layer in waveflow.sublayers():
@ -128,6 +125,8 @@ class WaveFlow():
self.waveflow = waveflow self.waveflow = waveflow
return iteration
def train_step(self, iteration): def train_step(self, iteration):
"""Train the model for one step. """Train the model for one step.
@ -293,6 +292,5 @@ class WaveFlow():
Returns: Returns:
None None
""" """
io.save_latest_parameters(self.checkpoint_dir, iteration, io.save_parameters(self.checkpoint_dir, iteration, self.waveflow,
self.waveflow, self.optimizer) self.optimizer)
io.save_latest_checkpoint(self.checkpoint_dir, iteration)

View File

@ -12,4 +12,4 @@
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
from parakeet.models.waveflow.waveflow import WaveFlow from parakeet.models.waveflow.waveflow_modules import WaveFlowLoss, WaveFlowModule

View File

@ -18,6 +18,7 @@ import time
import ruamel.yaml import ruamel.yaml
import numpy as np import numpy as np
import paddle.fluid.dygraph as dg import paddle.fluid.dygraph as dg
from paddle.fluid.framework import convert_np_dtype_to_dtype_ as convert_np_dtype
def is_main_process(): def is_main_process():
@ -90,8 +91,7 @@ def load_parameters(model,
optimizer=None, optimizer=None,
checkpoint_dir=None, checkpoint_dir=None,
iteration=None, iteration=None,
checkpoint_path=None, checkpoint_path=None):
dtype="float32"):
"""Load a specific model checkpoint from disk. """Load a specific model checkpoint from disk.
Args: Args:
@ -102,40 +102,37 @@ def load_parameters(model,
iteration (int, optional): if specified, load the specific checkpoint, iteration (int, optional): if specified, load the specific checkpoint,
if not specified, load the latest one. Defaults to None. if not specified, load the latest one. Defaults to None.
checkpoint_path (str, optional): if specified, load the checkpoint checkpoint_path (str, optional): if specified, load the checkpoint
stored in the checkpoint_path. Defaults to None. stored in the checkpoint_path and the argument 'checkpoint_dir' will
dtype (str, optional): precision of the model parameters. be ignored. Defaults to None.
Defaults to float32.
Returns: Returns:
iteration (int): number of iterations that the loaded checkpoint has iteration (int): number of iterations that the loaded checkpoint has
been trained. been trained.
""" """
if checkpoint_dir is not None and checkpoint_path is not None: if checkpoint_path is not None:
raise ValueError( iteration = int(os.path.basename(checkpoint_path).split("-")[-1])
"Load from either from (checkpoint_dir and iteration) \n" elif checkpoint_dir is not None:
"or checkpoint_path. Do not pass both.")
if iteration is not None and checkpoint_dir is None:
raise ValueError(
"When iteration is specified, checkpoint_dir should not be None")
if checkpoint_dir is not None:
if iteration is None: if iteration is None:
iteration = _load_latest_checkpoint(checkpoint_dir) iteration = _load_latest_checkpoint(checkpoint_dir)
checkpoint_path = os.path.join(checkpoint_dir, if iteration == 0:
"step-{}".format(iteration))
if iteration == 0 and not os.path.exists(checkpoint_path):
# if step-0 exist, it is also loaded # if step-0 exist, it is also loaded
return iteration return iteration
checkpoint_path = os.path.join(checkpoint_dir,
"step-{}".format(iteration))
else: else:
# checkpoint is not None raise ValueError(
iteration = int(os.path.basename(checkpoint_path).split("-")[-1]) "At least one of 'checkpoint_dir' and 'checkpoint_path' should be specified!"
)
local_rank = dg.parallel.Env().local_rank local_rank = dg.parallel.Env().local_rank
model_dict, optimizer_dict = dg.load_dygraph(checkpoint_path) model_dict, optimizer_dict = dg.load_dygraph(checkpoint_path)
# cast to desired data type state_dict = model.state_dict()
# cast to desired data type, for mixed-precision training/inference.
for k, v in model_dict.items(): for k, v in model_dict.items():
model_dict[k] = v.astype(dtype) if k in state_dict and convert_np_dtype(v.dtype) != state_dict[
k].dtype:
model_dict[k] = v.astype(state_dict[k].numpy().dtype)
model.set_dict(model_dict) model.set_dict(model_dict)
print("[checkpoint] Rank {}: loaded model from {}.pdparams".format( print("[checkpoint] Rank {}: loaded model from {}.pdparams".format(