Fix multiprocessing training: other processes have to wait until the output directory is created.

This commit is contained in:
chenfeiyu 2021-02-23 10:40:14 +08:00
parent 7b0de356f9
commit e69ab88fe6
1 changed file with 5 additions and 0 deletions

View File

@ -21,6 +21,7 @@ from paddle import distributed as dist
from paddle.io import DataLoader, DistributedBatchSampler from paddle.io import DataLoader, DistributedBatchSampler
from tensorboardX import SummaryWriter from tensorboardX import SummaryWriter
from collections import defaultdict from collections import defaultdict
import time
import parakeet import parakeet
from parakeet.utils import checkpoint, mp_tools from parakeet.utils import checkpoint, mp_tools
@ -205,6 +206,8 @@ class ExperimentBase(object):
output_dir = Path(self.args.output).expanduser() output_dir = Path(self.args.output).expanduser()
if dist.get_rank() == 0: if dist.get_rank() == 0:
output_dir.mkdir(parents=True, exist_ok=True) output_dir.mkdir(parents=True, exist_ok=True)
while not output_dir.exists():
time.sleep(1)
self.output_dir = output_dir self.output_dir = output_dir
@ -217,6 +220,8 @@ class ExperimentBase(object):
checkpoint_dir = self.output_dir / "checkpoints" checkpoint_dir = self.output_dir / "checkpoints"
if dist.get_rank() == 0: if dist.get_rank() == 0:
checkpoint_dir.mkdir(exist_ok=True) checkpoint_dir.mkdir(exist_ok=True)
while not checkpoint_dir.exists():
time.sleep(1)
self.checkpoint_dir = checkpoint_dir self.checkpoint_dir = checkpoint_dir