change interface for io.py

2020-03-26 09:48:47 +08:00 · 2020-03-26 09:48:47 +08:00 · c845fbd51d
parent 64790853e5
commit c845fbd51d
1 changed files with 63 additions and 45 deletions
--- a/parakeet/utils/io.py
+++ b/parakeet/utils/io.py
@ -20,6 +20,11 @@ import numpy as np
 import paddle.fluid.dygraph as dg
 def is_main_process():
    local_rank = dg.parallel.Env().local_rank
    return local_rank == 0
 def add_yaml_config_to_args(config):
    """ Add args in yaml config to the args parsed by argparse. The argument in 
        yaml config will be overwritten by the same argument in argparse if they 
@ -41,7 +46,7 @@ def add_yaml_config_to_args(config):
    return config
-def load_latest_checkpoint(checkpoint_dir, rank=0):
+def _load_latest_checkpoint(checkpoint_dir):
    """Get the iteration number corresponding to the latest saved checkpoint
    Args:
@ -52,26 +57,20 @@ def load_latest_checkpoint(checkpoint_dir, rank=0):
    Returns:
        int: the latest iteration number.
    """
-    checkpoint_path = os.path.join(checkpoint_dir, "checkpoint")
+    checkpoint_record = os.path.join(checkpoint_dir, "checkpoint")
    # Create checkpoint index file if not exist.
-    if (not os.path.isfile(checkpoint_path)) and rank == 0:
+    if (not os.path.isfile(checkpoint_record)):
-        with open(checkpoint_path, "w") as handle:
+        return 0
            handle.write("model_checkpoint_path: step-0")
    # Make sure that other process waits until checkpoint file is created
    # by process 0.
    while not os.path.isfile(checkpoint_path):
        time.sleep(1)
    # Fetch the latest checkpoint index.
-    with open(checkpoint_path, "r") as handle:
+    with open(checkpoint_record, "r") as handle:
        latest_checkpoint = handle.readline().split()[-1]
        iteration = int(latest_checkpoint.split("-")[-1])
    return iteration
-def save_latest_checkpoint(checkpoint_dir, iteration):
+def _save_checkpoint(checkpoint_dir, iteration):
    """Save the iteration number of the latest model to be checkpointed.
    Args:
@ -81,60 +80,76 @@ def save_latest_checkpoint(checkpoint_dir, iteration):
    Returns:
        None
    """
-    checkpoint_path = os.path.join(checkpoint_dir, "checkpoint")
+    checkpoint_record = os.path.join(checkpoint_dir, "checkpoint")
    # Update the latest checkpoint index.
-    with open(checkpoint_path, "w") as handle:
+    with open(checkpoint_record, "w") as handle:
        handle.write("model_checkpoint_path: step-{}".format(iteration))
-def load_parameters(checkpoint_dir,
+def load_parameters(model,
                    rank,
                    model,
                    optimizer=None,
                    checkpoint_dir=None,
                    iteration=None,
-                    file_path=None,
+                    checkpoint_path=None,
                    dtype="float32"):
    """Load a specific model checkpoint from disk.
    Args:
        checkpoint_dir (str): the directory where checkpoint is saved.
        rank (int): the rank of the process in multi-process setting.
        model (obj): model to load parameters.
        optimizer (obj, optional): optimizer to load states if needed.
            Defaults to None.
        checkpoint_dir (str, optional): the directory where checkpoint is saved.
        iteration (int, optional): if specified, load the specific checkpoint,
            if not specified, load the latest one. Defaults to None.
-        file_path (str, optional): if specified, load the checkpoint
+        checkpoint_path (str, optional): if specified, load the checkpoint
-            stored in the file_path. Defaults to None.
+            stored in the checkpoint_path. Defaults to None. 
        dtype (str, optional): precision of the model parameters.
            Defaults to float32.
    Returns:
-        None
+        iteration (int): number of iterations that the loaded checkpoint has 
            been trained.
    """
-    if file_path is None:
+    if checkpoint_dir is not None and checkpoint_path is not None:
-        if iteration is None:
+        raise ValueError(
-            iteration = load_latest_checkpoint(checkpoint_dir, rank)
+            "Load from either from (checkpoint_dir and iteration) \n"
-        if iteration == 0:
+            "or checkpoint_path. Do not pass both.")
-            return
+    if iteration is not None and checkpoint_dir is None:
-        file_path = "{}/step-{}".format(checkpoint_dir, iteration)
+        raise ValueError(
            "When iteration is specified, checkpoint_dir should not be None")
    if checkpoint_dir is not None:
        if iteration is None:
            iteration = _load_latest_checkpoint(checkpoint_dir)
        checkpoint_path = os.path.join(checkpoint_dir,
                                       "step-{}".format(iteration))
        if iteration == 0 and not os.path.exists(checkpoint_path):
            # if step-0 exist, it is also loaded
            return iteration
    else:
        # checkpoint is not None
        iteration = int(os.path.basename(checkpoint_path).split("-")[-1])
    local_rank = dg.parallel.Env().local_rank
    model_dict, optimizer_dict = dg.load_dygraph(checkpoint_path)
    # cast to desired data type
    for k, v in model_dict.items():
        model_dict[k] = v.astype(dtype)
    model_dict, optimizer_dict = dg.load_dygraph(file_path)
    if dtype == "float16":
        for k, v in model_dict.items():
            if "conv2d_transpose" in k:
                model_dict[k] = v.astype("float32")
            else:
                model_dict[k] = v.astype(dtype)
    model.set_dict(model_dict)
-    print("[checkpoint] Rank {}: loaded model from {}".format(rank, file_path))
+    print("[checkpoint] Rank {}: loaded model from {}.pdparams".format(
        local_rank, checkpoint_path))
    if optimizer and optimizer_dict:
        optimizer.set_dict(optimizer_dict)
-        print("[checkpoint] Rank {}: loaded optimizer state from {}".format(
+        print("[checkpoint] Rank {}: loaded optimizer state from {}.pdopt".
-            rank, file_path))
+              format(local_rank, checkpoint_path))
    return iteration
-def save_latest_parameters(checkpoint_dir, iteration, model, optimizer=None):
+def save_parameters(checkpoint_dir, iteration, model, optimizer=None):
    """Checkpoint the latest trained model parameters.
    Args:
@ -147,12 +162,15 @@ def save_latest_parameters(checkpoint_dir, iteration, model, optimizer=None):
    Returns:
        None
    """
-    file_path = "{}/step-{}".format(checkpoint_dir, iteration)
+    checkpoint_path = os.path.join(checkpoint_dir, "step-{}".format(iteration))
    model_dict = model.state_dict()
-    dg.save_dygraph(model_dict, file_path)
+    dg.save_dygraph(model_dict, checkpoint_path)
-    print("[checkpoint] Saved model to {}".format(file_path))
+    print("[checkpoint] Saved model to {}.pdparams".format(checkpoint_path))
    if optimizer:
        opt_dict = optimizer.state_dict()
-        dg.save_dygraph(opt_dict, file_path)
+        dg.save_dygraph(opt_dict, checkpoint_path)
-        print("[checkpoint] Saved optimzier state to {}".format(file_path))
+        print("[checkpoint] Saved optimzier state to {}.pdopt".format(
            checkpoint_path))
    _save_checkpoint(checkpoint_dir, iteration)