Merge pull request #3915 from WenmuZhou/whl

add 2.1 models to paddleocr whl
This commit is contained in:
DanielYang 2021-09-07 00:57:22 +08:00 committed by GitHub
commit 4016805f6c
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
1 changed file with 206 additions and 125 deletions

View File

@ -33,104 +33,141 @@ from tools.infer.utility import draw_ocr, str2bool
from ppstructure.utility import init_args, draw_structure_result from ppstructure.utility import init_args, draw_structure_result
from ppstructure.predict_system import OCRSystem, save_structure_res from ppstructure.predict_system import OCRSystem, save_structure_res
__all__ = ['PaddleOCR', 'PPStructure', 'draw_ocr', 'draw_structure_result', 'save_structure_res','download_with_progressbar'] __all__ = [
'PaddleOCR', 'PPStructure', 'draw_ocr', 'draw_structure_result',
model_urls = { 'save_structure_res', 'download_with_progressbar'
'det': { ]
'ch':
'https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_det_infer.tar',
'en':
'https://paddleocr.bj.bcebos.com/dygraph_v2.0/multilingual/en_ppocr_mobile_v2.0_det_infer.tar',
'structure': 'https://paddleocr.bj.bcebos.com/dygraph_v2.0/table/en_ppocr_mobile_v2.0_table_det_infer.tar'
},
'rec': {
'ch': {
'url':
'https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_rec_infer.tar',
'dict_path': './ppocr/utils/ppocr_keys_v1.txt'
},
'en': {
'url':
'https://paddleocr.bj.bcebos.com/dygraph_v2.0/multilingual/en_number_mobile_v2.0_rec_infer.tar',
'dict_path': './ppocr/utils/en_dict.txt'
},
'french': {
'url':
'https://paddleocr.bj.bcebos.com/dygraph_v2.0/multilingual/french_mobile_v2.0_rec_infer.tar',
'dict_path': './ppocr/utils/dict/french_dict.txt'
},
'german': {
'url':
'https://paddleocr.bj.bcebos.com/dygraph_v2.0/multilingual/german_mobile_v2.0_rec_infer.tar',
'dict_path': './ppocr/utils/dict/german_dict.txt'
},
'korean': {
'url':
'https://paddleocr.bj.bcebos.com/dygraph_v2.0/multilingual/korean_mobile_v2.0_rec_infer.tar',
'dict_path': './ppocr/utils/dict/korean_dict.txt'
},
'japan': {
'url':
'https://paddleocr.bj.bcebos.com/dygraph_v2.0/multilingual/japan_mobile_v2.0_rec_infer.tar',
'dict_path': './ppocr/utils/dict/japan_dict.txt'
},
'chinese_cht': {
'url':
'https://paddleocr.bj.bcebos.com/dygraph_v2.0/multilingual/chinese_cht_mobile_v2.0_rec_infer.tar',
'dict_path': './ppocr/utils/dict/chinese_cht_dict.txt'
},
'ta': {
'url':
'https://paddleocr.bj.bcebos.com/dygraph_v2.0/multilingual/ta_mobile_v2.0_rec_infer.tar',
'dict_path': './ppocr/utils/dict/ta_dict.txt'
},
'te': {
'url':
'https://paddleocr.bj.bcebos.com/dygraph_v2.0/multilingual/te_mobile_v2.0_rec_infer.tar',
'dict_path': './ppocr/utils/dict/te_dict.txt'
},
'ka': {
'url':
'https://paddleocr.bj.bcebos.com/dygraph_v2.0/multilingual/ka_mobile_v2.0_rec_infer.tar',
'dict_path': './ppocr/utils/dict/ka_dict.txt'
},
'latin': {
'url':
'https://paddleocr.bj.bcebos.com/dygraph_v2.0/multilingual/latin_ppocr_mobile_v2.0_rec_infer.tar',
'dict_path': './ppocr/utils/dict/latin_dict.txt'
},
'arabic': {
'url':
'https://paddleocr.bj.bcebos.com/dygraph_v2.0/multilingual/arabic_ppocr_mobile_v2.0_rec_infer.tar',
'dict_path': './ppocr/utils/dict/arabic_dict.txt'
},
'cyrillic': {
'url':
'https://paddleocr.bj.bcebos.com/dygraph_v2.0/multilingual/cyrillic_ppocr_mobile_v2.0_rec_infer.tar',
'dict_path': './ppocr/utils/dict/cyrillic_dict.txt'
},
'devanagari': {
'url':
'https://paddleocr.bj.bcebos.com/dygraph_v2.0/multilingual/devanagari_ppocr_mobile_v2.0_rec_infer.tar',
'dict_path': './ppocr/utils/dict/devanagari_dict.txt'
},
'structure': {
'url': 'https://paddleocr.bj.bcebos.com/dygraph_v2.0/table/en_ppocr_mobile_v2.0_table_rec_infer.tar',
'dict_path': 'ppocr/utils/dict/table_dict.txt'
}
},
'cls': 'https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_infer.tar',
'table': {
'url': 'https://paddleocr.bj.bcebos.com/dygraph_v2.0/table/en_ppocr_mobile_v2.0_table_structure_infer.tar',
'dict_path': 'ppocr/utils/dict/table_structure_dict.txt'
}
}
SUPPORT_DET_MODEL = ['DB'] SUPPORT_DET_MODEL = ['DB']
VERSION = '2.2.0.1' VERSION = '2.2.1'
SUPPORT_REC_MODEL = ['CRNN'] SUPPORT_REC_MODEL = ['CRNN']
BASE_DIR = os.path.expanduser("~/.paddleocr/") BASE_DIR = os.path.expanduser("~/.paddleocr/")
# Model version used as the fallback whenever a requested version, model type
# or language has no entry of its own in MODEL_URLS.
DEFAULT_MODEL_VERSION = '2.0'

# Download registry for pretrained inference models, laid out as
#   MODEL_URLS[version][model_type][lang] -> config dict
# Every config dict carries a 'url' (the model tarball); recognition and table
# entries additionally carry a 'dict_path' pointing at the character dictionary.
MODEL_URLS = {
    # ---- v2.1: Chinese detection / recognition models only ----
    '2.1': {
        'det': {
            'ch': {
                'url': 'https://paddleocr.bj.bcebos.com/dygraph_v2.1/chinese/ch_ppocr_mobile_v2.1_det_infer.tar',
            },
        },
        'rec': {
            'ch': {
                'url': 'https://paddleocr.bj.bcebos.com/dygraph_v2.1/chinese/ch_ppocr_mobile_v2.1_rec_infer.tar',
                'dict_path': './ppocr/utils/ppocr_keys_v1.txt',
            },
        },
    },
    # ---- v2.0: full model set (det / rec / cls / table) ----
    '2.0': {
        # Text detection models.
        'det': {
            'ch': {
                'url': 'https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_det_infer.tar',
            },
            'en': {
                'url': 'https://paddleocr.bj.bcebos.com/dygraph_v2.0/multilingual/en_ppocr_mobile_v2.0_det_infer.tar',
            },
            'structure': {
                'url': 'https://paddleocr.bj.bcebos.com/dygraph_v2.0/table/en_ppocr_mobile_v2.0_table_det_infer.tar',
            },
        },
        # Text recognition models, one per language / script family.
        'rec': {
            'ch': {
                'url': 'https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_rec_infer.tar',
                'dict_path': './ppocr/utils/ppocr_keys_v1.txt',
            },
            'en': {
                'url': 'https://paddleocr.bj.bcebos.com/dygraph_v2.0/multilingual/en_number_mobile_v2.0_rec_infer.tar',
                'dict_path': './ppocr/utils/en_dict.txt',
            },
            'french': {
                'url': 'https://paddleocr.bj.bcebos.com/dygraph_v2.0/multilingual/french_mobile_v2.0_rec_infer.tar',
                'dict_path': './ppocr/utils/dict/french_dict.txt',
            },
            'german': {
                'url': 'https://paddleocr.bj.bcebos.com/dygraph_v2.0/multilingual/german_mobile_v2.0_rec_infer.tar',
                'dict_path': './ppocr/utils/dict/german_dict.txt',
            },
            'korean': {
                'url': 'https://paddleocr.bj.bcebos.com/dygraph_v2.0/multilingual/korean_mobile_v2.0_rec_infer.tar',
                'dict_path': './ppocr/utils/dict/korean_dict.txt',
            },
            'japan': {
                'url': 'https://paddleocr.bj.bcebos.com/dygraph_v2.0/multilingual/japan_mobile_v2.0_rec_infer.tar',
                'dict_path': './ppocr/utils/dict/japan_dict.txt',
            },
            'chinese_cht': {
                'url': 'https://paddleocr.bj.bcebos.com/dygraph_v2.0/multilingual/chinese_cht_mobile_v2.0_rec_infer.tar',
                'dict_path': './ppocr/utils/dict/chinese_cht_dict.txt',
            },
            'ta': {
                'url': 'https://paddleocr.bj.bcebos.com/dygraph_v2.0/multilingual/ta_mobile_v2.0_rec_infer.tar',
                'dict_path': './ppocr/utils/dict/ta_dict.txt',
            },
            'te': {
                'url': 'https://paddleocr.bj.bcebos.com/dygraph_v2.0/multilingual/te_mobile_v2.0_rec_infer.tar',
                'dict_path': './ppocr/utils/dict/te_dict.txt',
            },
            'ka': {
                'url': 'https://paddleocr.bj.bcebos.com/dygraph_v2.0/multilingual/ka_mobile_v2.0_rec_infer.tar',
                'dict_path': './ppocr/utils/dict/ka_dict.txt',
            },
            'latin': {
                'url': 'https://paddleocr.bj.bcebos.com/dygraph_v2.0/multilingual/latin_ppocr_mobile_v2.0_rec_infer.tar',
                'dict_path': './ppocr/utils/dict/latin_dict.txt',
            },
            'arabic': {
                'url': 'https://paddleocr.bj.bcebos.com/dygraph_v2.0/multilingual/arabic_ppocr_mobile_v2.0_rec_infer.tar',
                'dict_path': './ppocr/utils/dict/arabic_dict.txt',
            },
            'cyrillic': {
                'url': 'https://paddleocr.bj.bcebos.com/dygraph_v2.0/multilingual/cyrillic_ppocr_mobile_v2.0_rec_infer.tar',
                'dict_path': './ppocr/utils/dict/cyrillic_dict.txt',
            },
            'devanagari': {
                'url': 'https://paddleocr.bj.bcebos.com/dygraph_v2.0/multilingual/devanagari_ppocr_mobile_v2.0_rec_infer.tar',
                'dict_path': './ppocr/utils/dict/devanagari_dict.txt',
            },
            'structure': {
                'url': 'https://paddleocr.bj.bcebos.com/dygraph_v2.0/table/en_ppocr_mobile_v2.0_table_rec_infer.tar',
                'dict_path': 'ppocr/utils/dict/table_dict.txt',
            },
        },
        # Text direction classifier.
        'cls': {
            'ch': {
                'url': 'https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_infer.tar',
            },
        },
        # Table structure model.
        'table': {
            'en': {
                'url': 'https://paddleocr.bj.bcebos.com/dygraph_v2.0/table/en_ppocr_mobile_v2.0_table_structure_infer.tar',
                'dict_path': 'ppocr/utils/dict/table_structure_dict.txt',
            },
        },
    },
}
def parse_args(mMain=True): def parse_args(mMain=True):
import argparse import argparse
@ -140,6 +177,7 @@ def parse_args(mMain=True):
parser.add_argument("--det", type=str2bool, default=True) parser.add_argument("--det", type=str2bool, default=True)
parser.add_argument("--rec", type=str2bool, default=True) parser.add_argument("--rec", type=str2bool, default=True)
parser.add_argument("--type", type=str, default='ocr') parser.add_argument("--type", type=str, default='ocr')
parser.add_argument("--version", type=str, default='2.1')
for action in parser._actions: for action in parser._actions:
if action.dest in ['rec_char_dict_path', 'table_char_dict_path']: if action.dest in ['rec_char_dict_path', 'table_char_dict_path']:
@ -155,19 +193,19 @@ def parse_args(mMain=True):
def parse_lang(lang): def parse_lang(lang):
latin_lang = [ latin_lang = [
'af', 'az', 'bs', 'cs', 'cy', 'da', 'de', 'es', 'et', 'fr', 'ga', 'af', 'az', 'bs', 'cs', 'cy', 'da', 'de', 'es', 'et', 'fr', 'ga', 'hr',
'hr', 'hu', 'id', 'is', 'it', 'ku', 'la', 'lt', 'lv', 'mi', 'ms', 'hu', 'id', 'is', 'it', 'ku', 'la', 'lt', 'lv', 'mi', 'ms', 'mt', 'nl',
'mt', 'nl', 'no', 'oc', 'pi', 'pl', 'pt', 'ro', 'rs_latin', 'sk', 'no', 'oc', 'pi', 'pl', 'pt', 'ro', 'rs_latin', 'sk', 'sl', 'sq', 'sv',
'sl', 'sq', 'sv', 'sw', 'tl', 'tr', 'uz', 'vi' 'sw', 'tl', 'tr', 'uz', 'vi'
] ]
arabic_lang = ['ar', 'fa', 'ug', 'ur'] arabic_lang = ['ar', 'fa', 'ug', 'ur']
cyrillic_lang = [ cyrillic_lang = [
'ru', 'rs_cyrillic', 'be', 'bg', 'uk', 'mn', 'abq', 'ady', 'kbd', 'ru', 'rs_cyrillic', 'be', 'bg', 'uk', 'mn', 'abq', 'ady', 'kbd', 'ava',
'ava', 'dar', 'inh', 'che', 'lbe', 'lez', 'tab' 'dar', 'inh', 'che', 'lbe', 'lez', 'tab'
] ]
devanagari_lang = [ devanagari_lang = [
'hi', 'mr', 'ne', 'bh', 'mai', 'ang', 'bho', 'mah', 'sck', 'new', 'hi', 'mr', 'ne', 'bh', 'mai', 'ang', 'bho', 'mah', 'sck', 'new', 'gom',
'gom', 'sa', 'bgc' 'sa', 'bgc'
] ]
if lang in latin_lang: if lang in latin_lang:
lang = "latin" lang = "latin"
@ -177,9 +215,9 @@ def parse_lang(lang):
lang = "cyrillic" lang = "cyrillic"
elif lang in devanagari_lang: elif lang in devanagari_lang:
lang = "devanagari" lang = "devanagari"
assert lang in model_urls[ assert lang in MODEL_URLS[DEFAULT_MODEL_VERSION][
'rec'], 'param lang must in {}, but got {}'.format( 'rec'], 'param lang must in {}, but got {}'.format(
model_urls['rec'].keys(), lang) MODEL_URLS[DEFAULT_MODEL_VERSION]['rec'].keys(), lang)
if lang == "ch": if lang == "ch":
det_lang = "ch" det_lang = "ch"
elif lang == 'structure': elif lang == 'structure':
@ -189,6 +227,35 @@ def parse_lang(lang):
return lang, det_lang return lang, det_lang
def get_model_config(version, model_type, lang):
    """Return the MODEL_URLS entry for a model, falling back to the default version.

    The lookup degrades in three steps, each one switching to
    DEFAULT_MODEL_VERSION (with a warning) when the requested combination has
    no entry of its own:

    1. ``version`` is not a key of MODEL_URLS;
    2. ``model_type`` is not available for the selected version;
    3. ``lang`` is not available for the selected version and model type.

    Args:
        version: Requested model version key (e.g. '2.1').
        model_type: Model family key (e.g. 'det', 'rec', 'cls', 'table').
        lang: Language key within that model family.

    Returns:
        The config dict stored at ``MODEL_URLS[version][model_type][lang]``
        after any fallback has been applied.

    Raises:
        SystemExit: via ``sys.exit(-1)`` when the default version does not
            provide the requested model type or language either.
    """
    # Step 1: unknown version -> use the default model set.
    if version not in MODEL_URLS:
        message = 'version {} not in {}, use version {} instead'.format(
            version, MODEL_URLS.keys(), DEFAULT_MODEL_VERSION)
        logger.warning(message)
        version = DEFAULT_MODEL_VERSION

    # Step 2: the selected version lacks this model family.
    if model_type not in MODEL_URLS[version]:
        default_models = MODEL_URLS[DEFAULT_MODEL_VERSION]
        if model_type not in default_models:
            # Nothing to fall back to: report the supported families and stop.
            logger.error('{} models is not support, we only support {}'.format(
                model_type, default_models.keys()))
            sys.exit(-1)
        logger.warning(
            'version {} not support {} models, use version {} instead'.format(
                version, model_type, DEFAULT_MODEL_VERSION))
        version = DEFAULT_MODEL_VERSION

    # Step 3: the selected version lacks this language for the model family.
    if lang not in MODEL_URLS[version][model_type]:
        default_langs = MODEL_URLS[DEFAULT_MODEL_VERSION][model_type]
        if lang not in default_langs:
            # Nothing to fall back to: report the supported languages and stop.
            logger.error(
                'lang {} is not support, we only support {} for {} models'.
                format(lang, default_langs.keys(), model_type))
            sys.exit(-1)
        logger.warning('lang {} is not support in {}, use {} instead'.format(
            lang, version, DEFAULT_MODEL_VERSION))
        version = DEFAULT_MODEL_VERSION

    return MODEL_URLS[version][model_type][lang]
class PaddleOCR(predict_system.TextSystem): class PaddleOCR(predict_system.TextSystem):
def __init__(self, **kwargs): def __init__(self, **kwargs):
""" """
@ -204,15 +271,21 @@ class PaddleOCR(predict_system.TextSystem):
lang, det_lang = parse_lang(params.lang) lang, det_lang = parse_lang(params.lang)
# init model dir # init model dir
params.det_model_dir, det_url = confirm_model_dir_url(params.det_model_dir, det_model_config = get_model_config(params.version, 'det', det_lang)
os.path.join(BASE_DIR, VERSION, 'ocr', 'det', det_lang), params.det_model_dir, det_url = confirm_model_dir_url(
model_urls['det'][det_lang]) params.det_model_dir,
params.rec_model_dir, rec_url = confirm_model_dir_url(params.rec_model_dir, os.path.join(BASE_DIR, VERSION, 'ocr', 'det', det_lang),
os.path.join(BASE_DIR, VERSION, 'ocr', 'rec', lang), det_model_config['url'])
model_urls['rec'][lang]['url']) rec_model_config = get_model_config(params.version, 'rec', lang)
params.cls_model_dir, cls_url = confirm_model_dir_url(params.cls_model_dir, params.rec_model_dir, rec_url = confirm_model_dir_url(
os.path.join(BASE_DIR, VERSION, 'ocr', 'cls'), params.rec_model_dir,
model_urls['cls']) os.path.join(BASE_DIR, VERSION, 'ocr', 'rec', lang),
rec_model_config['url'])
cls_model_config = get_model_config(params.version, 'cls', 'ch')
params.cls_model_dir, cls_url = confirm_model_dir_url(
params.cls_model_dir,
os.path.join(BASE_DIR, VERSION, 'ocr', 'cls'),
cls_model_config['url'])
# download model # download model
maybe_download(params.det_model_dir, det_url) maybe_download(params.det_model_dir, det_url)
maybe_download(params.rec_model_dir, rec_url) maybe_download(params.rec_model_dir, rec_url)
@ -226,7 +299,8 @@ class PaddleOCR(predict_system.TextSystem):
sys.exit(0) sys.exit(0)
if params.rec_char_dict_path is None: if params.rec_char_dict_path is None:
params.rec_char_dict_path = str(Path(__file__).parent / model_urls['rec'][lang]['dict_path']) params.rec_char_dict_path = str(
Path(__file__).parent / rec_model_config['dict_path'])
print(params) print(params)
# init det_model and rec_model # init det_model and rec_model
@ -293,24 +367,32 @@ class PPStructure(OCRSystem):
lang, det_lang = parse_lang(params.lang) lang, det_lang = parse_lang(params.lang)
# init model dir # init model dir
params.det_model_dir, det_url = confirm_model_dir_url(params.det_model_dir, det_model_config = get_model_config(params.version, 'det', det_lang)
os.path.join(BASE_DIR, VERSION, 'ocr', 'det', det_lang), params.det_model_dir, det_url = confirm_model_dir_url(
model_urls['det'][det_lang]) params.det_model_dir,
params.rec_model_dir, rec_url = confirm_model_dir_url(params.rec_model_dir, os.path.join(BASE_DIR, VERSION, 'ocr', 'det', det_lang),
os.path.join(BASE_DIR, VERSION, 'ocr', 'rec', lang), det_model_config['url'])
model_urls['rec'][lang]['url']) rec_model_config = get_model_config(params.version, 'rec', lang)
params.table_model_dir, table_url = confirm_model_dir_url(params.table_model_dir, params.rec_model_dir, rec_url = confirm_model_dir_url(
os.path.join(BASE_DIR, VERSION, 'ocr', 'table'), params.rec_model_dir,
model_urls['table']['url']) os.path.join(BASE_DIR, VERSION, 'ocr', 'rec', lang),
rec_model_config['url'])
table_model_config = get_model_config(params.version, 'table', 'en')
params.table_model_dir, table_url = confirm_model_dir_url(
params.table_model_dir,
os.path.join(BASE_DIR, VERSION, 'ocr', 'table'),
table_model_config['url'])
# download model # download model
maybe_download(params.det_model_dir, det_url) maybe_download(params.det_model_dir, det_url)
maybe_download(params.rec_model_dir, rec_url) maybe_download(params.rec_model_dir, rec_url)
maybe_download(params.table_model_dir, table_url) maybe_download(params.table_model_dir, table_url)
if params.rec_char_dict_path is None: if params.rec_char_dict_path is None:
params.rec_char_dict_path = str(Path(__file__).parent / model_urls['rec'][lang]['dict_path']) params.rec_char_dict_path = str(
Path(__file__).parent / rec_model_config['dict_path'])
if params.table_char_dict_path is None: if params.table_char_dict_path is None:
params.table_char_dict_path = str(Path(__file__).parent / model_urls['table']['dict_path']) params.table_char_dict_path = str(
Path(__file__).parent / table_model_config['dict_path'])
print(params) print(params)
super().__init__(params) super().__init__(params)
@ -374,4 +456,3 @@ def main():
for item in result: for item in result:
item.pop('img') item.pop('img')
logger.info(item) logger.info(item)