diff --git a/configs/rec/rec_chinese_lite_train.yml b/configs/rec/rec_chinese_lite_train.yml index cbc43e06..90e9dab1 100755 --- a/configs/rec/rec_chinese_lite_train.yml +++ b/configs/rec/rec_chinese_lite_train.yml @@ -1,6 +1,6 @@ Global: algorithm: CRNN - use_gpu: true + use_gpu: false epoch_num: 3000 log_smooth_window: 20 print_batch_step: 10 @@ -8,6 +8,7 @@ Global: save_epoch_step: 3 eval_batch_step: 2000 train_batch_size_per_card: 256 + drop_last: true test_batch_size_per_card: 256 image_shape: [3, 32, 320] max_text_length: 25 @@ -15,7 +16,7 @@ Global: character_dict_path: ./ppocr/utils/ppocr_keys_v1.txt loss_type: ctc reader_yml: ./configs/rec/rec_chinese_reader.yml - pretrain_weights: + pretrain_weights: output/rec_CRNN/rec_mv3_crnn/best_accuracy checkpoints: save_inference_dir: infer_img: diff --git a/configs/rec/rec_icdar15_train.yml b/configs/rec/rec_icdar15_train.yml index dacf3243..35a1b17d 100755 --- a/configs/rec/rec_icdar15_train.yml +++ b/configs/rec/rec_icdar15_train.yml @@ -8,13 +8,14 @@ Global: save_epoch_step: 300 eval_batch_step: 500 train_batch_size_per_card: 256 + drop_last: true test_batch_size_per_card: 256 image_shape: [3, 32, 100] max_text_length: 25 character_type: en loss_type: ctc reader_yml: ./configs/rec/rec_icdar15_reader.yml - pretrain_weights: ./pretrain_models/rec_mv3_none_bilstm_ctc/best_accuracy + pretrain_weights: checkpoints: save_inference_dir: infer_img: diff --git a/configs/rec/rec_mv3_none_bilstm_ctc.yml b/configs/rec/rec_mv3_none_bilstm_ctc.yml index cea72bba..6cbca16f 100755 --- a/configs/rec/rec_mv3_none_bilstm_ctc.yml +++ b/configs/rec/rec_mv3_none_bilstm_ctc.yml @@ -1,6 +1,6 @@ Global: algorithm: CRNN - use_gpu: true + use_gpu: false epoch_num: 72 log_smooth_window: 20 print_batch_step: 10 @@ -8,13 +8,14 @@ Global: save_epoch_step: 3 eval_batch_step: 2000 train_batch_size_per_card: 256 + drop_last: true test_batch_size_per_card: 256 image_shape: [3, 32, 100] max_text_length: 25 character_type: en loss_type: ctc reader_yml: ./configs/rec/rec_benchmark_reader.yml - pretrain_weights: ./output/rec_CRNN/rec_mv3_none_bilstm_ctc/best_accuracy + pretrain_weights: checkpoints: save_inference_dir: infer_img: diff --git a/configs/rec/rec_mv3_none_none_ctc.yml b/configs/rec/rec_mv3_none_none_ctc.yml index ceec09ce..3f8a67b9 100755 --- a/configs/rec/rec_mv3_none_none_ctc.yml +++ b/configs/rec/rec_mv3_none_none_ctc.yml @@ -8,6 +8,7 @@ Global: save_epoch_step: 3 eval_batch_step: 2000 train_batch_size_per_card: 256 + drop_last: true test_batch_size_per_card: 256 image_shape: [3, 32, 100] max_text_length: 25 diff --git a/configs/rec/rec_mv3_tps_bilstm_attn.yml b/configs/rec/rec_mv3_tps_bilstm_attn.yml index d2fb512f..792757b3 100755 --- a/configs/rec/rec_mv3_tps_bilstm_attn.yml +++ b/configs/rec/rec_mv3_tps_bilstm_attn.yml @@ -1,6 +1,6 @@ Global: algorithm: RARE - use_gpu: true + use_gpu: false epoch_num: 72 log_smooth_window: 20 print_batch_step: 10 @@ -8,6 +8,7 @@ Global: save_epoch_step: 3 eval_batch_step: 2000 train_batch_size_per_card: 256 + drop_last: true test_batch_size_per_card: 256 image_shape: [3, 32, 100] max_text_length: 25 diff --git a/configs/rec/rec_mv3_tps_bilstm_ctc.yml b/configs/rec/rec_mv3_tps_bilstm_ctc.yml index bc5780bd..f15ddab7 100755 --- a/configs/rec/rec_mv3_tps_bilstm_ctc.yml +++ b/configs/rec/rec_mv3_tps_bilstm_ctc.yml @@ -8,6 +8,7 @@ Global: save_epoch_step: 3 eval_batch_step: 2000 train_batch_size_per_card: 256 + drop_last: true test_batch_size_per_card: 256 image_shape: [3, 32, 100] max_text_length: 25 diff --git a/configs/rec/rec_r34_vd_none_bilstm_ctc.yml b/configs/rec/rec_r34_vd_none_bilstm_ctc.yml index b71e8fea..5f1437fc 100755 --- a/configs/rec/rec_r34_vd_none_bilstm_ctc.yml +++ b/configs/rec/rec_r34_vd_none_bilstm_ctc.yml @@ -8,6 +8,7 @@ Global: save_epoch_step: 3 eval_batch_step: 2000 train_batch_size_per_card: 256 + drop_last: true test_batch_size_per_card: 256 image_shape: [3, 32, 100] max_text_length: 25 diff --git a/configs/rec/rec_r34_vd_none_none_ctc.yml b/configs/rec/rec_r34_vd_none_none_ctc.yml index d9c9458d..ae608755 100755 --- a/configs/rec/rec_r34_vd_none_none_ctc.yml +++ b/configs/rec/rec_r34_vd_none_none_ctc.yml @@ -8,6 +8,7 @@ Global: save_epoch_step: 3 eval_batch_step: 2000 train_batch_size_per_card: 256 + drop_last: true test_batch_size_per_card: 256 image_shape: [3, 32, 100] max_text_length: 25 diff --git a/configs/rec/rec_r34_vd_tps_bilstm_attn.yml b/configs/rec/rec_r34_vd_tps_bilstm_attn.yml index 405082bd..989fbb46 100755 --- a/configs/rec/rec_r34_vd_tps_bilstm_attn.yml +++ b/configs/rec/rec_r34_vd_tps_bilstm_attn.yml @@ -8,6 +8,7 @@ Global: save_epoch_step: 3 eval_batch_step: 2000 train_batch_size_per_card: 256 + drop_last: true test_batch_size_per_card: 256 image_shape: [3, 32, 100] max_text_length: 25 diff --git a/configs/rec/rec_r34_vd_tps_bilstm_ctc.yml b/configs/rec/rec_r34_vd_tps_bilstm_ctc.yml index 517322c3..50f0ff63 100755 --- a/configs/rec/rec_r34_vd_tps_bilstm_ctc.yml +++ b/configs/rec/rec_r34_vd_tps_bilstm_ctc.yml @@ -8,6 +8,7 @@ Global: save_epoch_step: 3 eval_batch_step: 2000 train_batch_size_per_card: 256 + drop_last: true test_batch_size_per_card: 256 image_shape: [3, 32, 100] max_text_length: 25 diff --git a/ppocr/data/det/db_process.py b/ppocr/data/det/db_process.py index d347ed44..0993324a 100644 --- a/ppocr/data/det/db_process.py +++ b/ppocr/data/det/db_process.py @@ -17,6 +17,8 @@ import cv2 import numpy as np import json import sys +from ppocr.utils.utility import initial_logger +logger = initial_logger() from .data_augment import AugmentData from .random_crop_data import RandomCropData @@ -100,6 +102,7 @@ class DBProcessTrain(object): img_path, gt_label = self.convert_label_infor(label_infor) imgvalue = cv2.imread(img_path) if imgvalue is None: + logger.info("{} does not exist!".format(img_path)) return None data = self.make_data_dict(imgvalue, gt_label) data = AugmentData(data) diff --git a/ppocr/data/rec/dataset_traversal.py b/ppocr/data/rec/dataset_traversal.py index a8a090f1..2429dd20 100755 --- a/ppocr/data/rec/dataset_traversal.py +++ b/ppocr/data/rec/dataset_traversal.py @@ -43,6 +43,7 @@ class LMDBReader(object): self.mode = params['mode'] if params['mode'] == 'train': self.batch_size = params['train_batch_size_per_card'] + self.drop_last = params['drop_last'] else: self.batch_size = params['test_batch_size_per_card'] self.infer_img = params['infer_img'] @@ -99,7 +100,7 @@ class LMDBReader(object): process_id = 0 def sample_iter_reader(): - if self.infer_img is not None: + if self.mode != 'train' and self.infer_img is not None: image_file_list = get_image_file_list(self.infer_img) for single_img in image_file_list: img = cv2.imread(single_img) @@ -146,10 +147,11 @@ class LMDBReader(object): if len(batch_outs) == self.batch_size: yield batch_outs batch_outs = [] - if len(batch_outs) != 0: - yield batch_outs + if not self.drop_last: + if len(batch_outs) != 0: + yield batch_outs - if self.infer_img is None: + if self.mode != 'train' and self.infer_img is None: return batch_iter_reader return sample_iter_reader @@ -171,6 +173,7 @@ class SimpleReader(object): self.infer_img = params['infer_img'] if params['mode'] == 'train': self.batch_size = params['train_batch_size_per_card'] + self.drop_last = params['drop_last'] else: self.batch_size = params['test_batch_size_per_card'] @@ -226,8 +229,9 @@ class SimpleReader(object): if len(batch_outs) == self.batch_size: yield batch_outs batch_outs = [] - if len(batch_outs) != 0: - yield batch_outs + if not self.drop_last: + if len(batch_outs) != 0: + yield batch_outs if self.infer_img is None: return batch_iter_reader diff --git a/ppocr/data/rec/img_tools.py b/ppocr/data/rec/img_tools.py index df1d3dd5..303e1fe3 100755 --- a/ppocr/data/rec/img_tools.py +++ b/ppocr/data/rec/img_tools.py @@ -51,7 +51,7 @@ def resize_norm_img(img, image_shape): def resize_norm_img_chinese(img, image_shape): imgC, imgH, imgW = image_shape # todo: change to 0 and modified image shape - max_wh_ratio = 10 + max_wh_ratio = 0 h, w = img.shape[0], img.shape[1] ratio = w * 1.0 / h max_wh_ratio = max(max_wh_ratio, ratio) diff --git a/ppocr/modeling/architectures/rec_model.py b/ppocr/modeling/architectures/rec_model.py index d88c620b..fdc0a641 100755 --- a/ppocr/modeling/architectures/rec_model.py +++ b/ppocr/modeling/architectures/rec_model.py @@ -110,7 +110,11 @@ class RecModel(object): return loader, outputs elif mode == "export": predict = predicts['predict'] - predict = fluid.layers.softmax(predict) + if self.loss_type == "ctc": + predict = fluid.layers.softmax(predict) return [image, {'decoded_out': decoded_out, 'predicts': predict}] else: - return loader, {'decoded_out': decoded_out} + predict = predicts['predict'] + if self.loss_type == "ctc": + predict = fluid.layers.softmax(predict) + return loader, {'decoded_out': decoded_out, 'predicts': predict} diff --git a/ppocr/modeling/heads/rec_attention_head.py b/ppocr/modeling/heads/rec_attention_head.py index 8f5b4cc4..66c8f300 100755 --- a/ppocr/modeling/heads/rec_attention_head.py +++ b/ppocr/modeling/heads/rec_attention_head.py @@ -123,6 +123,8 @@ class AttentionPredict(object): full_ids = fluid.layers.fill_constant_batch_size_like( input=init_state, shape=[-1, 1], dtype='int64', value=1) + full_scores = fluid.layers.fill_constant_batch_size_like( + input=init_state, shape=[-1, 1], dtype='float32', value=1) cond = layers.less_than(x=counter, y=array_len) while_op = layers.While(cond=cond) @@ -171,6 +173,9 @@ class AttentionPredict(object): new_ids = fluid.layers.concat([full_ids, topk_indices], axis=1) fluid.layers.assign(new_ids, full_ids) + new_scores = fluid.layers.concat([full_scores, topk_scores], axis=1) + fluid.layers.assign(new_scores, full_scores) + layers.increment(x=counter, value=1, in_place=True) # update the memories @@ -184,7 +189,7 @@ class AttentionPredict(object): length_cond = layers.less_than(x=counter, y=array_len) finish_cond = layers.logical_not(layers.is_empty(x=topk_indices)) layers.logical_and(x=length_cond, y=finish_cond, out=cond) - return full_ids + return full_ids, full_scores def __call__(self, inputs, labels=None, mode=None): encoder_features = self.encoder(inputs) @@ -223,10 +228,10 @@ class AttentionPredict(object): decoder_size, char_num) _, decoded_out = layers.topk(input=predict, k=1) decoded_out = layers.lod_reset(decoded_out, y=label_out) - predicts = {'predict': predict, 'decoded_out': decoded_out} + predicts = {'predict':predict, 'decoded_out':decoded_out} else: - ids = self.gru_attention_infer( + ids, predict = self.gru_attention_infer( decoder_boot, self.max_length, char_num, word_vector_dim, encoded_vector, encoded_proj, decoder_size) - predicts = {'decoded_out': ids} + predicts = {'predict':predict, 'decoded_out':ids} return predicts diff --git a/tools/infer/predict_rec.py b/tools/infer/predict_rec.py index c005bfb7..48553106 100755 --- a/tools/infer/predict_rec.py +++ b/tools/infer/predict_rec.py @@ -80,26 +80,43 @@ class TextRecognizer(object): starttime = time.time() self.input_tensor.copy_from_cpu(norm_img_batch) self.predictor.zero_copy_run() - rec_idx_batch = self.output_tensors[0].copy_to_cpu() - rec_idx_lod = self.output_tensors[0].lod()[0] - predict_batch = self.output_tensors[1].copy_to_cpu() - predict_lod = self.output_tensors[1].lod()[0] - elapse = time.time() - starttime - predict_time += elapse - starttime = time.time() - for rno in range(len(rec_idx_lod) - 1): - beg = rec_idx_lod[rno] - end = rec_idx_lod[rno + 1] - rec_idx_tmp = rec_idx_batch[beg:end, 0] - preds_text = self.char_ops.decode(rec_idx_tmp) - beg = predict_lod[rno] - end = predict_lod[rno + 1] - probs = predict_batch[beg:end, :] - ind = np.argmax(probs, axis=1) - blank = probs.shape[1] - valid_ind = np.where(ind != (blank - 1))[0] - score = np.mean(probs[valid_ind, ind[valid_ind]]) - rec_res.append([preds_text, score]) + + if args.rec_algorithm != "RARE": + rec_idx_batch = self.output_tensors[0].copy_to_cpu() + rec_idx_lod = self.output_tensors[0].lod()[0] + predict_batch = self.output_tensors[1].copy_to_cpu() + predict_lod = self.output_tensors[1].lod()[0] + elapse = time.time() - starttime + predict_time += elapse + for rno in range(len(rec_idx_lod) - 1): + beg = rec_idx_lod[rno] + end = rec_idx_lod[rno + 1] + rec_idx_tmp = rec_idx_batch[beg:end, 0] + preds_text = self.char_ops.decode(rec_idx_tmp) + beg = predict_lod[rno] + end = predict_lod[rno + 1] + probs = predict_batch[beg:end, :] + ind = np.argmax(probs, axis=1) + blank = probs.shape[1] + valid_ind = np.where(ind != (blank - 1))[0] + score = np.mean(probs[valid_ind, ind[valid_ind]]) + rec_res.append([preds_text, score]) + else: + rec_idx_batch = self.output_tensors[0].copy_to_cpu() + predict_batch = self.output_tensors[1].copy_to_cpu() + for rno in range(len(rec_idx_batch)): + end_pos = np.where(rec_idx_batch[rno, :] == 1)[0] + if len(end_pos) <= 1: + preds = rec_idx_batch[rno, 1:] + score = np.mean(predict_batch[rno, 1:]) + else: + preds = rec_idx_batch[rno, 1:end_pos[1]] + score = np.mean(predict_batch[rno, 1:end_pos[1]]) + #todo: why index has 2 offset + preds = preds - 2 + preds_text = self.char_ops.decode(preds) + rec_res.append([preds_text, score]) + return rec_res, predict_time @@ -116,7 +133,13 @@ if __name__ == "__main__": continue valid_image_file_list.append(image_file) img_list.append(img) - rec_res, predict_time = text_recognizer(img_list) + try: + rec_res, predict_time = text_recognizer(img_list) + except: + logger.info( + "ERROR!! \nInput image shape is not equal with config. TPS does not support variable shape.\n" + "Please set --rec_image_shape=input_shape and --rec_char_type='ch' ") + exit() for ino in range(len(img_list)): print("Predicts of %s:%s" % (valid_image_file_list[ino], rec_res[ino])) print("Total predict time for %d images:%.3f" % diff --git a/tools/infer_rec.py b/tools/infer_rec.py index 67e61451..ec64a38b 100755 --- a/tools/infer_rec.py +++ b/tools/infer_rec.py @@ -55,6 +55,7 @@ def main(): program.merge_config(FLAGS.opt) logger.info(config) char_ops = CharacterOps(config['Global']) + loss_type = config['Global']['loss_type'] config['Global']['char_ops'] = char_ops # check if set use_gpu=True in paddlepaddle cpu version @@ -85,29 +86,38 @@ def main(): if len(infer_list) == 0: logger.info("Can not find img in infer_img dir.") for i in range(max_img_num): - print("infer_img:", infer_list[i]) + print("infer_img:%s" % infer_list[i]) img = next(blobs) predict = exe.run(program=eval_prog, feed={"image": img}, fetch_list=fetch_varname_list, return_numpy=False) - - preds = np.array(predict[0]) - if preds.shape[1] == 1: + if loss_type == "ctc": + preds = np.array(predict[0]) preds = preds.reshape(-1) preds_lod = predict[0].lod()[0] preds_text = char_ops.decode(preds) - else: + probs = np.array(predict[1]) + ind = np.argmax(probs, axis=1) + blank = probs.shape[1] + valid_ind = np.where(ind != (blank - 1))[0] + score = np.mean(probs[valid_ind, ind[valid_ind]]) + elif loss_type == "attention": + preds = np.array(predict[0]) + probs = np.array(predict[1]) end_pos = np.where(preds[0, :] == 1)[0] if len(end_pos) <= 1: - preds_text = preds[0, 1:] + preds = preds[0, 1:] + score = np.mean(probs[0, 1:]) else: - preds_text = preds[0, 1:end_pos[1]] - preds_text = preds_text.reshape(-1) - preds_text = char_ops.decode(preds_text) + preds = preds[0, 1:end_pos[1]] + score = np.mean(probs[0, 1:end_pos[1]]) + preds = preds.reshape(-1) + preds_text = char_ops.decode(preds) print("\t index:", preds) print("\t word :", preds_text) + print("\t score :", score) # save for inference model target_var = []