From e69ab88fe61a9464a2832e5f6c685ed8f95505f9 Mon Sep 17 00:00:00 2001 From: chenfeiyu Date: Tue, 23 Feb 2021 10:40:14 +0800 Subject: [PATCH] fix multiprocessing training: other processes have to wait until the output directory is created. --- parakeet/training/experiment.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/parakeet/training/experiment.py b/parakeet/training/experiment.py index 5f9c588..69e5ef3 100644 --- a/parakeet/training/experiment.py +++ b/parakeet/training/experiment.py @@ -21,6 +21,7 @@ from paddle import distributed as dist from paddle.io import DataLoader, DistributedBatchSampler from tensorboardX import SummaryWriter from collections import defaultdict +import time import parakeet from parakeet.utils import checkpoint, mp_tools @@ -205,6 +206,8 @@ class ExperimentBase(object): output_dir = Path(self.args.output).expanduser() if dist.get_rank() == 0: output_dir.mkdir(parents=True, exist_ok=True) + while not output_dir.exists(): + time.sleep(1) self.output_dir = output_dir @@ -217,6 +220,8 @@ class ExperimentBase(object): checkpoint_dir = self.output_dir / "checkpoints" if dist.get_rank() == 0: checkpoint_dir.mkdir(exist_ok=True) + while not checkpoint_dir.exists(): + time.sleep(1) self.checkpoint_dir = checkpoint_dir