PaddleOCR/ppocr/data/__init__.py

100 lines
3.2 KiB
Python
Raw Normal View History

2020-05-10 16:26:57 +08:00
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
2020-10-13 17:13:33 +08:00
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals
import os
import sys
import numpy as np
import paddle
2020-11-04 20:43:27 +08:00
import signal
import random
2020-10-13 17:13:33 +08:00
# Absolute path of the directory that contains this file.
__dir__ = os.path.dirname(os.path.abspath(__file__))
# Append the directory two levels above this one to sys.path at import time;
# presumably this lets the absolute `ppocr.*` imports below resolve when the
# package is not installed (TODO confirm against how the project is launched).
sys.path.append(os.path.abspath(os.path.join(__dir__, '../..')))
import copy
2020-11-04 20:43:27 +08:00
from paddle.io import Dataset, DataLoader, BatchSampler, DistributedBatchSampler
2020-10-13 17:13:33 +08:00
import paddle.distributed as dist
from ppocr.data.imaug import transform, create_operators
2020-11-04 20:43:27 +08:00
from ppocr.data.simple_dataset import SimpleDataSet
2020-12-30 16:15:49 +08:00
from ppocr.data.lmdb_dataset import LMDBDataSet
2021-03-08 14:15:47 +08:00
from ppocr.data.pgnet_dataset import PGDateSet
2020-10-13 17:13:33 +08:00
# Names exported by `from ppocr.data import *`.
__all__ = ['build_dataloader', 'transform', 'create_operators']
2020-11-05 15:13:36 +08:00
2020-11-04 20:43:27 +08:00
def term_mp(sig_num, frame):
    """Signal handler that SIGKILLs the current process group.

    The process group of the calling process is looked up and every process
    in it (the caller included) is sent SIGKILL, after a short notice is
    printed to stdout.

    Args:
        sig_num: Signal number delivered to the handler (unused).
        frame: Stack frame active when the signal arrived (unused).
    """
    main_pid = os.getpid()
    group_id = os.getpgid(main_pid)
    notice = "main proc {} exit, kill process group {}".format(main_pid,
                                                               group_id)
    print(notice)
    os.killpg(group_id, signal.SIGKILL)
2020-10-13 17:13:33 +08:00
2020-11-05 15:13:36 +08:00
2020-11-04 20:43:27 +08:00
# Install term_mp as the handler for SIGINT and SIGTERM. This executes when
# the module is imported, so importing it replaces whatever handlers were
# previously registered for these two signals in the importing process.
signal.signal(signal.SIGINT, term_mp)
signal.signal(signal.SIGTERM, term_mp)
2020-10-13 17:13:33 +08:00
2020-11-05 15:13:36 +08:00
def build_dataloader(config, mode, device, logger, seed=None):
    """Build a paddle ``DataLoader`` for the dataset configured under ``config[mode]``.

    Args:
        config: Configuration mapping. ``config[mode]['dataset']['name']``
            selects the dataset class, and ``config[mode]['loader']`` must
            provide ``batch_size_per_card``, ``drop_last``, ``shuffle`` and
            ``num_workers``; ``use_shared_memory`` is optional and defaults
            to True. The mapping is deep-copied before use, so the caller's
            object is never modified.
        mode: One of ``'Train'``, ``'Eval'`` or ``'Test'``.
        device: Passed to ``DataLoader`` as ``places``.
        logger: Forwarded to the dataset constructor.
        seed: Forwarded to the dataset constructor.

    Returns:
        The constructed ``DataLoader``.

    Raises:
        AssertionError: If ``mode`` is not a supported mode or the configured
            dataset name is not a supported dataset.
    """
    # Check the mode before indexing config[mode], so an invalid mode is
    # reported with this message instead of surfacing as a bare KeyError.
    assert mode in ['Train', 'Eval', 'Test'
                    ], "Mode should be Train, Eval or Test."

    config = copy.deepcopy(config)

    # Explicit name -> class table. This replaces eval() on a config-supplied
    # string and keeps the accepted names in sync with the classes that are
    # actually imported (the old list spelled 'LMDBDataSet' as 'LMDBDateSet',
    # which rejected the real class name).
    dataset_classes = {
        'SimpleDataSet': SimpleDataSet,
        'LMDBDataSet': LMDBDataSet,
        'PGDateSet': PGDateSet,
    }
    module_name = config[mode]['dataset']['name']
    assert module_name in dataset_classes, (
        'DataSet only support {}'.format(sorted(dataset_classes)))
    dataset = dataset_classes[module_name](config, mode, logger, seed)

    loader_config = config[mode]['loader']
    batch_size = loader_config['batch_size_per_card']
    drop_last = loader_config['drop_last']
    shuffle = loader_config['shuffle']
    num_workers = loader_config['num_workers']
    # Shared memory is on unless the loader config explicitly sets it.
    if 'use_shared_memory' in loader_config:
        use_shared_memory = loader_config['use_shared_memory']
    else:
        use_shared_memory = True

    # Training distributes data to multiple cards; every other mode batches
    # for a single card. Both samplers take the same arguments.
    if mode == "Train":
        sampler_cls = DistributedBatchSampler
    else:
        sampler_cls = BatchSampler
    batch_sampler = sampler_cls(
        dataset=dataset,
        batch_size=batch_size,
        shuffle=shuffle,
        drop_last=drop_last)

    data_loader = DataLoader(
        dataset=dataset,
        batch_sampler=batch_sampler,
        places=device,
        num_workers=num_workers,
        return_list=True,
        use_shared_memory=use_shared_memory)

    return data_loader