From 09d8cb6d98ed3a7c4636aa5c324bcaef1e4280c2 Mon Sep 17 00:00:00 2001 From: tink2123 Date: Fri, 14 Aug 2020 16:31:13 +0800 Subject: [PATCH 01/11] update for srn --- .../rec_r50fpn_vd_none_srn_pvam_test_all.yml | 48 + ppocr/data/rec/dataset_traversal.py | 55 +- ppocr/data/rec/img_tools.py | 81 ++ ppocr/modeling/architectures/rec_model.py | 39 +- ppocr/modeling/backbones/rec_resnet50_fpn.py | 172 +++ ppocr/modeling/backbones/rec_resnet_vd.py | 2 +- ppocr/modeling/heads/rec_srn_all_head.py | 218 ++++ .../modeling/heads/self_attention/__init__.py | 0 ppocr/modeling/heads/self_attention/model.py | 1065 +++++++++++++++++ ppocr/modeling/losses/rec_srn_loss.py | 58 + ppocr/utils/character.py | 45 + tools/eval_utils/eval_rec_utils.py | 57 +- tools/program.py | 37 +- tools/train.py | 4 +- 14 files changed, 1838 insertions(+), 43 deletions(-) create mode 100755 configs/rec/rec_r50fpn_vd_none_srn_pvam_test_all.yml create mode 100755 ppocr/modeling/backbones/rec_resnet50_fpn.py create mode 100755 ppocr/modeling/heads/rec_srn_all_head.py create mode 100644 ppocr/modeling/heads/self_attention/__init__.py create mode 100644 ppocr/modeling/heads/self_attention/model.py create mode 100755 ppocr/modeling/losses/rec_srn_loss.py diff --git a/configs/rec/rec_r50fpn_vd_none_srn_pvam_test_all.yml b/configs/rec/rec_r50fpn_vd_none_srn_pvam_test_all.yml new file mode 100755 index 00000000..933a7513 --- /dev/null +++ b/configs/rec/rec_r50fpn_vd_none_srn_pvam_test_all.yml @@ -0,0 +1,48 @@ +Global: + algorithm: SRN + use_gpu: true + epoch_num: 72 + log_smooth_window: 20 + print_batch_step: 10 + save_model_dir: output/rec_pvam_withrotate + save_epoch_step: 1 + eval_batch_step: 8000 + train_batch_size_per_card: 64 + test_batch_size_per_card: 1 + image_shape: [1, 64, 256] + max_text_length: 25 + character_type: en + loss_type: srn + num_heads: 8 + average_window: 0.15 + max_average_window: 15625 + min_average_window: 10000 + reader_yml: ./configs/rec/rec_srn_reader.yml + pretrain_weights: + checkpoints: + save_inference_dir: + +Architecture: + function: ppocr.modeling.architectures.rec_model,RecModel + +Backbone: + function: ppocr.modeling.backbones.rec_resnet50_fpn,ResNet + layers: 50 + +Head: + function: ppocr.modeling.heads.rec_srn_all_head,SRNPredict + encoder_type: rnn + num_encoder_TUs: 2 + num_decoder_TUs: 4 + hidden_dims: 512 + SeqRNN: + hidden_size: 256 + +Loss: + function: ppocr.modeling.losses.rec_srn_loss,SRNLoss + +Optimizer: + function: ppocr.optimizer,AdamDecay + base_lr: 0.0001 + beta1: 0.9 + beta2: 0.999 diff --git a/ppocr/data/rec/dataset_traversal.py b/ppocr/data/rec/dataset_traversal.py index ec3e9d86..7135fca5 100755 --- a/ppocr/data/rec/dataset_traversal.py +++ b/ppocr/data/rec/dataset_traversal.py @@ -26,7 +26,7 @@ from ppocr.utils.utility import initial_logger from ppocr.utils.utility import get_image_file_list logger = initial_logger() -from .img_tools import process_image, get_img_data +from .img_tools import process_image, process_image_srn, get_img_data class LMDBReader(object): @@ -40,6 +40,7 @@ class LMDBReader(object): self.image_shape = params['image_shape'] self.loss_type = params['loss_type'] self.max_text_length = params['max_text_length'] + self.num_heads = params['num_heads'] self.mode = params['mode'] self.drop_last = False self.use_tps = False @@ -117,14 +118,36 @@ class LMDBReader(object): image_file_list = get_image_file_list(self.infer_img) for single_img in image_file_list: img = cv2.imread(single_img) - if img.shape[-1] == 1 or len(list(img.shape)) == 2: + if img.shape[-1]==1 
or len(list(img.shape))==2: + img = cv2.cvtColor(img, cv2.COLOR_GRAY2BGR) + if self.loss_type == 'srn': + norm_img = process_image_srn( + img=img, + image_shape=self.image_shape, + num_heads=self.num_heads, + max_text_length=self.max_text_length + ) + else: + norm_img = process_image( + img=img, + image_shape=self.image_shape, + char_ops=self.char_ops, + tps=self.use_tps, + infer_mode=True) + yield norm_img + elif self.mode == 'test': + image_file_list = get_image_file_list(self.infer_img) + for single_img in image_file_list: + img = cv2.imread(single_img) + if img.shape[-1]==1 or len(list(img.shape))==2: img = cv2.cvtColor(img, cv2.COLOR_GRAY2BGR) norm_img = process_image( img=img, image_shape=self.image_shape, char_ops=self.char_ops, tps=self.use_tps, - infer_mode=True) + infer_mode=True + ) yield norm_img else: lmdb_sets = self.load_hierarchical_lmdb_dataset() @@ -144,14 +167,16 @@ class LMDBReader(object): if sample_info is None: continue img, label = sample_info - outs = process_image( - img=img, - image_shape=self.image_shape, - label=label, - char_ops=self.char_ops, - loss_type=self.loss_type, - max_text_length=self.max_text_length, - distort=self.use_distort) + outs = [] + if self.loss_type == "srn": + outs = process_image_srn(img, self.image_shape, self.num_heads, + self.max_text_length, label, + self.char_ops, self.loss_type) + + else: + outs = process_image(img, self.image_shape, label, + self.char_ops, self.loss_type, + self.max_text_length) if outs is None: continue yield outs @@ -159,7 +184,6 @@ class LMDBReader(object): if finish_read_num == len(lmdb_sets): break self.close_lmdb_dataset(lmdb_sets) - def batch_iter_reader(): batch_outs = [] for outs in sample_iter_reader(): @@ -167,9 +191,8 @@ class LMDBReader(object): if len(batch_outs) == self.batch_size: yield batch_outs batch_outs = [] - if not self.drop_last: - if len(batch_outs) != 0: - yield batch_outs + if len(batch_outs) != 0: + yield batch_outs if self.infer_img is None: return batch_iter_reader @@ -288,4 +311,4 @@ class SimpleReader(object): if self.infer_img is None: return batch_iter_reader - return sample_iter_reader + return sample_iter_reader \ No newline at end of file diff --git a/ppocr/data/rec/img_tools.py b/ppocr/data/rec/img_tools.py index 0835603b..527e0266 100755 --- a/ppocr/data/rec/img_tools.py +++ b/ppocr/data/rec/img_tools.py @@ -381,3 +381,84 @@ def process_image(img, assert False, "Unsupport loss_type %s in process_image"\ % loss_type return (norm_img) + +def resize_norm_img_srn(img, image_shape): + imgC, imgH, imgW = image_shape + + img_black = np.zeros((imgH, imgW)) + im_hei = img.shape[0] + im_wid = img.shape[1] + + if im_wid <= im_hei * 1: + img_new = cv2.resize(img, (imgH * 1, imgH)) + elif im_wid <= im_hei * 2: + img_new = cv2.resize(img, (imgH * 2, imgH)) + elif im_wid <= im_hei * 3: + img_new = cv2.resize(img, (imgH * 3, imgH)) + else: + img_new = cv2.resize(img, (imgW, imgH)) + + img_np = np.asarray(img_new) + img_np = cv2.cvtColor(img_np, cv2.COLOR_BGR2GRAY) + img_black[:, 0:img_np.shape[1]] = img_np + img_black = img_black[:, :, np.newaxis] + + row, col, c = img_black.shape + c = 1 + + return np.reshape(img_black, (c, row, col)).astype(np.float32) + +def srn_other_inputs(image_shape, + num_heads, + max_text_length): + + imgC, imgH, imgW = image_shape + feature_dim = int((imgH / 8) * (imgW / 8)) + + encoder_word_pos = np.array(range(0, feature_dim)).reshape((feature_dim, 1)).astype('int64') + gsrm_word_pos = np.array(range(0, max_text_length)).reshape((max_text_length, 
1)).astype('int64') + + lbl_weight = np.array([37] * max_text_length).reshape((-1,1)).astype('int64') + + gsrm_attn_bias_data = np.ones((1, max_text_length, max_text_length)) + gsrm_slf_attn_bias1 = np.triu(gsrm_attn_bias_data, 1).reshape([-1, 1, max_text_length, max_text_length]) + gsrm_slf_attn_bias1 = np.tile(gsrm_slf_attn_bias1, [1, num_heads, 1, 1]) * [-1e9] + + gsrm_slf_attn_bias2 = np.tril(gsrm_attn_bias_data, -1).reshape([-1, 1, max_text_length, max_text_length]) + gsrm_slf_attn_bias2 = np.tile(gsrm_slf_attn_bias2, [1, num_heads, 1, 1]) * [-1e9] + + encoder_word_pos = encoder_word_pos[np.newaxis, :] + gsrm_word_pos = gsrm_word_pos[np.newaxis, :] + + return [lbl_weight, encoder_word_pos, gsrm_word_pos, gsrm_slf_attn_bias1, gsrm_slf_attn_bias2] + +def process_image_srn(img, + image_shape, + num_heads, + max_text_length, + label=None, + char_ops=None, + loss_type=None): + norm_img = resize_norm_img_srn(img, image_shape) + norm_img = norm_img[np.newaxis, :] + [lbl_weight, encoder_word_pos, gsrm_word_pos, gsrm_slf_attn_bias1, gsrm_slf_attn_bias2] = \ + srn_other_inputs(image_shape, num_heads, max_text_length) + + if label is not None: + char_num = char_ops.get_char_num() + text = char_ops.encode(label) + if len(text) == 0 or len(text) > max_text_length: + return None + else: + if loss_type == "srn": + text_padded = [37] * max_text_length + for i in range(len(text)): + text_padded[i] = text[i] + lbl_weight[i] = [1.0] + text_padded = np.array(text_padded) + text = text_padded.reshape(-1, 1) + return (norm_img, text,encoder_word_pos, gsrm_word_pos, gsrm_slf_attn_bias1, gsrm_slf_attn_bias2,lbl_weight) + else: + assert False, "Unsupport loss_type %s in process_image"\ + % loss_type + return (norm_img, encoder_word_pos, gsrm_word_pos, gsrm_slf_attn_bias1, gsrm_slf_attn_bias2) diff --git a/ppocr/modeling/architectures/rec_model.py b/ppocr/modeling/architectures/rec_model.py index e80a50ab..a030f362 100755 --- a/ppocr/modeling/architectures/rec_model.py +++ b/ppocr/modeling/architectures/rec_model.py @@ -58,6 +58,7 @@ class RecModel(object): self.loss_type = global_params['loss_type'] self.image_shape = global_params['image_shape'] self.max_text_length = global_params['max_text_length'] + self.num_heads = global_params["num_heads"] def create_feed(self, mode): image_shape = deepcopy(self.image_shape) @@ -77,6 +78,18 @@ class RecModel(object): lod_level=1) feed_list = [image, label_in, label_out] labels = {'label_in': label_in, 'label_out': label_out} + elif self.loss_type == "srn": + encoder_word_pos = fluid.data(name="encoder_word_pos", shape=[-1, int((image_shape[-2] / 8) * (image_shape[-1] / 8)), 1], dtype="int64") + gsrm_word_pos = fluid.data(name="gsrm_word_pos", shape=[-1, self.max_text_length, 1], dtype="int64") + gsrm_slf_attn_bias1 = fluid.data(name="gsrm_slf_attn_bias1", shape=[-1, self.num_heads, self.max_text_length, self.max_text_length]) + gsrm_slf_attn_bias2 = fluid.data(name="gsrm_slf_attn_bias2", shape=[-1, self.num_heads, self.max_text_length, self.max_text_length]) + lbl_weight = fluid.layers.data(name="lbl_weight", shape=[-1, 1], dtype='int64') + label = fluid.data( + name='label', shape=[-1, 1], dtype='int32', lod_level=1) + feed_list = [image, label, encoder_word_pos, gsrm_word_pos, gsrm_slf_attn_bias1, gsrm_slf_attn_bias2, lbl_weight] + labels = {'label': label, 'encoder_word_pos': encoder_word_pos, + 'gsrm_word_pos': gsrm_word_pos, 'gsrm_slf_attn_bias1': gsrm_slf_attn_bias1, + 'gsrm_slf_attn_bias2': gsrm_slf_attn_bias2,'lbl_weight':lbl_weight} else: label = fluid.data( 
name='label', shape=[None, 1], dtype='int32', lod_level=1) @@ -88,6 +101,8 @@ class RecModel(object): use_double_buffer=True, iterable=False) else: + labels = None + loader = None if self.char_type == "ch" and self.infer_img: image_shape[-1] = -1 if self.tps != None: @@ -97,9 +112,15 @@ class RecModel(object): "We set img_shape to be the same , it may affect the inference effect" ) image_shape = deepcopy(self.image_shape) - image = fluid.data(name='image', shape=image_shape, dtype='float32') - labels = None - loader = None + image = fluid.data(name='image', shape=image_shape, dtype='float32') + if self.loss_type == "srn": + encoder_word_pos = fluid.data(name="encoder_word_pos", shape=[-1, int((image_shape[-2] / 8) * (image_shape[-1] / 8)), 1], dtype="int64") + gsrm_word_pos = fluid.data(name="gsrm_word_pos", shape=[-1, self.max_text_length, 1], dtype="int64") + gsrm_slf_attn_bias1 = fluid.data(name="gsrm_slf_attn_bias1", shape=[-1, self.num_heads, self.max_text_length, self.max_text_length]) + gsrm_slf_attn_bias2 = fluid.data(name="gsrm_slf_attn_bias2", shape=[-1, self.num_heads, self.max_text_length, self.max_text_length]) + feed_list = [image, encoder_word_pos, gsrm_word_pos, gsrm_slf_attn_bias1, gsrm_slf_attn_bias2] + labels = {'encoder_word_pos': encoder_word_pos, 'gsrm_word_pos': gsrm_word_pos, + 'gsrm_slf_attn_bias1': gsrm_slf_attn_bias1, 'gsrm_slf_attn_bias2': gsrm_slf_attn_bias2} return image, labels, loader def __call__(self, mode): @@ -117,9 +138,15 @@ class RecModel(object): label = labels['label_out'] else: label = labels['label'] - outputs = {'total_loss':loss, 'decoded_out':\ - decoded_out, 'label':label} + if self.loss_type == 'srn': + total_loss, img_loss, word_loss = self.loss(predicts, labels) + outputs = {'total_loss':total_loss, 'img_loss':img_loss, 'word_loss':word_loss, + 'decoded_out':decoded_out, 'label':label} + else: + outputs = {'total_loss':loss, 'decoded_out':\ + decoded_out, 'label':label} return loader, outputs + elif mode == "export": predict = predicts['predict'] if self.loss_type == "ctc": @@ -129,4 +156,4 @@ class RecModel(object): predict = predicts['predict'] if self.loss_type == "ctc": predict = fluid.layers.softmax(predict) - return loader, {'decoded_out': decoded_out, 'predicts': predict} + return loader, {'decoded_out': decoded_out, 'predicts': predict} \ No newline at end of file diff --git a/ppocr/modeling/backbones/rec_resnet50_fpn.py b/ppocr/modeling/backbones/rec_resnet50_fpn.py new file mode 100755 index 00000000..f6d72377 --- /dev/null +++ b/ppocr/modeling/backbones/rec_resnet50_fpn.py @@ -0,0 +1,172 @@ +#copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. 
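+
+# This backbone is a ResNet-50 variant with an FPN-style fusion on top: the stem
+# conv uses stride 2 and the four stages use strides 2/2/1/1, so the final
+# feature map has an overall stride of 8. The outputs of the last three stages
+# are then fused top-down (a transposed conv upsamples when spatial sizes
+# differ, followed by concat and 1x1/3x3 convs) and projected to 512 channels,
+# which matches the (H/8)*(W/8) position encodings assumed by the SRN head in
+# rec_model.py and img_tools.py.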
+ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import math + +import paddle +import paddle.fluid as fluid +from paddle.fluid.param_attr import ParamAttr + + +__all__ = ["ResNet", "ResNet18", "ResNet34", "ResNet50", "ResNet101", "ResNet152"] + +Trainable = True +w_nolr = fluid.ParamAttr( + trainable = Trainable) +train_parameters = { + "input_size": [3, 224, 224], + "input_mean": [0.485, 0.456, 0.406], + "input_std": [0.229, 0.224, 0.225], + "learning_strategy": { + "name": "piecewise_decay", + "batch_size": 256, + "epochs": [30, 60, 90], + "steps": [0.1, 0.01, 0.001, 0.0001] + } +} + +class ResNet(): + def __init__(self, params): + self.layers = params['layers'] + self.params = train_parameters + + + def __call__(self, input): + layers = self.layers + supported_layers = [18, 34, 50, 101, 152] + assert layers in supported_layers, \ + "supported layers are {} but input layer is {}".format(supported_layers, layers) + + if layers == 18: + depth = [2, 2, 2, 2] + elif layers == 34 or layers == 50: + depth = [3, 4, 6, 3] + elif layers == 101: + depth = [3, 4, 23, 3] + elif layers == 152: + depth = [3, 8, 36, 3] + stride_list = [(2,2),(2,2),(1,1),(1,1)] + num_filters = [64, 128, 256, 512] + + conv = self.conv_bn_layer( + input=input, num_filters=64, filter_size=7, stride=2, act='relu', name="conv1") + F = [] + if layers >= 50: + for block in range(len(depth)): + for i in range(depth[block]): + if layers in [101, 152] and block == 2: + if i == 0: + conv_name = "res" + str(block + 2) + "a" + else: + conv_name = "res" + str(block + 2) + "b" + str(i) + else: + conv_name = "res" + str(block + 2) + chr(97 + i) + conv = self.bottleneck_block( + input=conv, + num_filters=num_filters[block], + stride=stride_list[block] if i == 0 else 1, name=conv_name) + F.append(conv) + + base = F[-1] + for i in [-2, -3]: + b, c, w, h = F[i].shape + if (w,h) == base.shape[2:]: + base = base + else: + base = fluid.layers.conv2d_transpose( input=base, num_filters=c,filter_size=4, stride=2, + padding=1,act=None, + param_attr=w_nolr, + bias_attr=w_nolr) + base = fluid.layers.batch_norm(base, act = "relu", param_attr=w_nolr, bias_attr=w_nolr) + base = fluid.layers.concat([base, F[i]], axis=1) + base = fluid.layers.conv2d(base, num_filters=c, filter_size=1, param_attr=w_nolr, bias_attr=w_nolr) + base = fluid.layers.conv2d(base, num_filters=c, filter_size=3,padding = 1, param_attr=w_nolr, bias_attr=w_nolr) + base = fluid.layers.batch_norm(base, act = "relu", param_attr=w_nolr, bias_attr=w_nolr) + + base = fluid.layers.conv2d(base, num_filters=512, filter_size=1,bias_attr=w_nolr,param_attr=w_nolr) + + return base + + def conv_bn_layer(self, + input, + num_filters, + filter_size, + stride=1, + groups=1, + act=None, + name=None): + conv = fluid.layers.conv2d( + input=input, + num_filters=num_filters, + filter_size= 2 if stride==(1,1) else filter_size, + dilation = 2 if stride==(1,1) else 1, + stride=stride, + padding=(filter_size - 1) // 2, + groups=groups, + act=None, + param_attr=ParamAttr(name=name + "_weights",trainable = Trainable), + bias_attr=False, + name=name + '.conv2d.output.1') + + if name == "conv1": + bn_name = "bn_" + name + else: + bn_name = "bn" + name[3:] + return fluid.layers.batch_norm(input=conv, + act=act, + name=bn_name + '.output.1', + param_attr=ParamAttr(name=bn_name + '_scale',trainable = Trainable), + bias_attr=ParamAttr(bn_name + '_offset',trainable = Trainable), + moving_mean_name=bn_name + '_mean', + moving_variance_name=bn_name + 
'_variance', ) + + def shortcut(self, input, ch_out, stride, is_first, name): + ch_in = input.shape[1] + if ch_in != ch_out or stride != 1 or is_first == True: + if stride == (1,1): + return self.conv_bn_layer(input, ch_out, 1, 1, name=name) + else: #stride == (2,2) + return self.conv_bn_layer(input, ch_out, 1, stride, name=name) + + else: + return input + + def bottleneck_block(self, input, num_filters, stride, name): + conv0 = self.conv_bn_layer( + input=input, num_filters=num_filters, filter_size=1, act='relu', name=name + "_branch2a") + conv1 = self.conv_bn_layer( + input=conv0, + num_filters=num_filters, + filter_size=3, + stride=stride, + act='relu', + name=name + "_branch2b") + conv2 = self.conv_bn_layer( + input=conv1, num_filters=num_filters * 4, filter_size=1, act=None, name=name + "_branch2c") + + short = self.shortcut(input, num_filters * 4, stride, is_first=False, name=name + "_branch1") + + return fluid.layers.elementwise_add(x=short, y=conv2, act='relu', name=name + ".add.output.5") + + def basic_block(self, input, num_filters, stride, is_first, name): + conv0 = self.conv_bn_layer(input=input, num_filters=num_filters, filter_size=3, act='relu', stride=stride, + name=name + "_branch2a") + conv1 = self.conv_bn_layer(input=conv0, num_filters=num_filters, filter_size=3, act=None, + name=name + "_branch2b") + short = self.shortcut(input, num_filters, stride, is_first, name=name + "_branch1") + return fluid.layers.elementwise_add(x=short, y=conv1, act='relu') diff --git a/ppocr/modeling/backbones/rec_resnet_vd.py b/ppocr/modeling/backbones/rec_resnet_vd.py index bc58c8ac..2c7cd4c7 100755 --- a/ppocr/modeling/backbones/rec_resnet_vd.py +++ b/ppocr/modeling/backbones/rec_resnet_vd.py @@ -32,7 +32,7 @@ class ResNet(): def __init__(self, params): self.layers = params['layers'] self.is_3x3 = True - supported_layers = [18, 34, 50, 101, 152, 200] + supported_layers = [18, 34, 50, 101, 152] assert self.layers in supported_layers, \ "supported layers are {} but input layer is {}".format(supported_layers, self.layers) diff --git a/ppocr/modeling/heads/rec_srn_all_head.py b/ppocr/modeling/heads/rec_srn_all_head.py new file mode 100755 index 00000000..bf1f4a44 --- /dev/null +++ b/ppocr/modeling/heads/rec_srn_all_head.py @@ -0,0 +1,218 @@ +#copyright (c) 2019 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. 
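+
+# SRN prediction head, following the three-stage structure of the SRN paper:
+#   - pvam(): Parallel Visual Attention Module; runs a transformer encoder over
+#     the backbone feature map and attends to it with learned reading-order
+#     position embeddings, producing one visual feature per decoding position
+#     for all max_text_length positions in parallel.
+#   - gsrm(): Global Semantic Reasoning Module; argmax-decodes the PVAM output
+#     into character ids and reasons over them with forward/backward
+#     transformer units to produce semantic features.
+#   - vsfd(): Visual-Semantic Fusion Decoder; gates the visual and semantic
+#     features together and predicts the final character distribution.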
+ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import math + +import paddle +import paddle.fluid as fluid +from paddle.fluid.param_attr import ParamAttr +#from .rec_seq_encoder import SequenceEncoder +#from ..common_functions import get_para_bias_attr +import numpy as np +from .self_attention.model import wrap_encoder +from .self_attention.model import wrap_encoder_forFeature +gradient_clip = 10 + + + +class SRNPredict(object): + def __init__(self, params): + super(SRNPredict, self).__init__() + self.char_num = params['char_num'] + self.max_length = params['max_text_length'] + + self.num_heads = params['num_heads'] + self.num_encoder_TUs = params['num_encoder_TUs'] + self.num_decoder_TUs = params['num_decoder_TUs'] + self.hidden_dims = params['hidden_dims'] + + + def pvam(self, inputs, others): + + b, c, h, w = inputs.shape + conv_features = fluid.layers.reshape(x=inputs, shape=[-1, c, h * w]) + conv_features = fluid.layers.transpose(x=conv_features, perm=[0, 2, 1]) + + #===== Transformer encoder ===== + b, t, c = conv_features.shape + encoder_word_pos = others["encoder_word_pos"] + gsrm_word_pos = others["gsrm_word_pos"] + + + enc_inputs = [conv_features, encoder_word_pos, None] + word_features = wrap_encoder_forFeature(src_vocab_size=-1, + max_length=t, + n_layer=self.num_encoder_TUs, + n_head=self.num_heads, + d_key= int(self.hidden_dims / self.num_heads), + d_value= int(self.hidden_dims / self.num_heads), + d_model=self.hidden_dims, + d_inner_hid=self.hidden_dims, + prepostprocess_dropout=0.1, + attention_dropout=0.1, + relu_dropout=0.1, + preprocess_cmd="n", + postprocess_cmd="da", + weight_sharing=True, + enc_inputs=enc_inputs, + ) + fluid.clip.set_gradient_clip(fluid.clip.GradientClipByValue(gradient_clip)) + + #===== Parallel Visual Attention Module ===== + b, t, c = word_features.shape + + word_features = fluid.layers.fc(word_features, c, num_flatten_dims=2) + word_features_ = fluid.layers.reshape(word_features, [-1, 1, t, c]) + word_features_ = fluid.layers.expand(word_features_, [1, self.max_length, 1, 1]) + word_pos_feature = fluid.layers.embedding(gsrm_word_pos, [self.max_length, c]) + word_pos_ = fluid.layers.reshape(word_pos_feature, [-1, self.max_length, 1, c]) + word_pos_ = fluid.layers.expand(word_pos_, [1, 1, t, 1]) + temp = fluid.layers.elementwise_add(word_features_, word_pos_, act='tanh') + + attention_weight = fluid.layers.fc(input=temp, size=1, num_flatten_dims=3, bias_attr=False) + attention_weight = fluid.layers.reshape(x=attention_weight, shape=[-1, self.max_length, t]) + attention_weight = fluid.layers.softmax(input=attention_weight, axis=-1) + + pvam_features = fluid.layers.matmul(attention_weight, word_features)#[b, max_length, c] + + return pvam_features + + def gsrm(self, pvam_features, others): + + #===== GSRM Visual-to-semantic embedding block ===== + b, t, c = pvam_features.shape + word_out = fluid.layers.fc(input=fluid.layers.reshape(pvam_features, [-1, c]), + size=self.char_num, + act="softmax") + #word_out.stop_gradient = True + word_ids = fluid.layers.argmax(word_out, axis=1) + word_ids.stop_gradient = True + word_ids = fluid.layers.reshape(x=word_ids, shape=[-1, t, 1]) + + #===== GSRM Semantic reasoning block ===== + """ + This module is achieved through bi-transformers, + ngram_feature1 is the froward one, ngram_fetaure2 is the backward one + """ + pad_idx = self.char_num + gsrm_word_pos = others["gsrm_word_pos"] + gsrm_slf_attn_bias1 = others["gsrm_slf_attn_bias1"] + 
gsrm_slf_attn_bias2 = others["gsrm_slf_attn_bias2"] + + def prepare_bi(word_ids): + """ + prepare bi for gsrm + word1 for forward; word2 for backward + """ + word1 = fluid.layers.cast(word_ids, "float32") + word1 = fluid.layers.pad(word1, [0, 0, 1, 0, 0, 0], pad_value=1.0 * pad_idx) + word1 = fluid.layers.cast(word1, "int64") + word1 = word1[:, :-1, :] + word2 = word_ids + return word1, word2 + + word1, word2 = prepare_bi(word_ids) + word1.stop_gradient = True + word2.stop_gradient = True + enc_inputs_1 = [word1, gsrm_word_pos, gsrm_slf_attn_bias1] + enc_inputs_2 = [word2, gsrm_word_pos, gsrm_slf_attn_bias2] + + gsrm_feature1 = wrap_encoder(src_vocab_size=self.char_num + 1, + max_length=self.max_length, + n_layer=self.num_decoder_TUs, + n_head=self.num_heads, + d_key=int(self.hidden_dims / self.num_heads), + d_value=int(self.hidden_dims / self.num_heads), + d_model=self.hidden_dims, + d_inner_hid=self.hidden_dims, + prepostprocess_dropout=0.1, + attention_dropout=0.1, + relu_dropout=0.1, + preprocess_cmd="n", + postprocess_cmd="da", + weight_sharing=True, + enc_inputs=enc_inputs_1, + ) + gsrm_feature2 = wrap_encoder(src_vocab_size=self.char_num + 1, + max_length=self.max_length, + n_layer=self.num_decoder_TUs, + n_head=self.num_heads, + d_key=int(self.hidden_dims / self.num_heads), + d_value=int(self.hidden_dims / self.num_heads), + d_model=self.hidden_dims, + d_inner_hid=self.hidden_dims, + prepostprocess_dropout=0.1, + attention_dropout=0.1, + relu_dropout=0.1, + preprocess_cmd="n", + postprocess_cmd="da", + weight_sharing=True, + enc_inputs=enc_inputs_2, + ) + gsrm_feature2 = fluid.layers.pad(gsrm_feature2, [0, 0, 0, 1, 0, 0], pad_value=0.) + gsrm_feature2 = gsrm_feature2[:, 1:, ] + gsrm_features = gsrm_feature1 + gsrm_feature2 + + b, t, c = gsrm_features.shape + + gsrm_out = fluid.layers.matmul( + x=gsrm_features, + y=fluid.default_main_program().global_block().var("src_word_emb_table"), + transpose_y=True) + b,t,c = gsrm_out.shape + gsrm_out = fluid.layers.softmax(input=fluid.layers.reshape(gsrm_out, [-1, c])) + + return gsrm_features, word_out, gsrm_out + + def vsfd(self, pvam_features, gsrm_features): + + #===== Visual-Semantic Fusion Decoder Module ===== + b, t, c1 = pvam_features.shape + b, t, c2 = gsrm_features.shape + combine_features_ = fluid.layers.concat([pvam_features, gsrm_features], axis=2) + img_comb_features_ = fluid.layers.reshape(x=combine_features_, shape=[-1, c1 + c2]) + img_comb_features_map = fluid.layers.fc(input=img_comb_features_, size=c1, act="sigmoid") + img_comb_features_map = fluid.layers.reshape(x=img_comb_features_map, shape=[-1, t, c1]) + combine_features = img_comb_features_map * pvam_features + (1.0 - img_comb_features_map) * gsrm_features + img_comb_features = fluid.layers.reshape(x=combine_features, shape=[-1, c1]) + + fc_out = fluid.layers.fc(input=img_comb_features, + size=self.char_num, + act="softmax") + return fc_out + + + def __call__(self, inputs, others, mode=None): + + pvam_features = self.pvam(inputs, others) + gsrm_features, word_out, gsrm_out = self.gsrm(pvam_features, others) + final_out = self.vsfd(pvam_features, gsrm_features) + + _, decoded_out = fluid.layers.topk(input=final_out, k=1) + predicts = {'predict': final_out, 'decoded_out': decoded_out, + 'word_out': word_out, 'gsrm_out': gsrm_out} + + return predicts + + + + + + + + diff --git a/ppocr/modeling/heads/self_attention/__init__.py b/ppocr/modeling/heads/self_attention/__init__.py new file mode 100644 index 00000000..e69de29b diff --git 
a/ppocr/modeling/heads/self_attention/model.py b/ppocr/modeling/heads/self_attention/model.py new file mode 100644 index 00000000..d4aecd5f --- /dev/null +++ b/ppocr/modeling/heads/self_attention/model.py @@ -0,0 +1,1065 @@ +from functools import partial +import numpy as np + +import paddle.fluid as fluid +import paddle.fluid.layers as layers + +from .desc import * +from .config import ModelHyperParams,TrainTaskConfig + +def wrap_layer_with_block(layer, block_idx): + """ + Make layer define support indicating block, by which we can add layers + to other blocks within current block. This will make it easy to define + cache among while loop. + """ + + class BlockGuard(object): + """ + BlockGuard class. + + BlockGuard class is used to switch to the given block in a program by + using the Python `with` keyword. + """ + + def __init__(self, block_idx=None, main_program=None): + self.main_program = fluid.default_main_program( + ) if main_program is None else main_program + self.old_block_idx = self.main_program.current_block().idx + self.new_block_idx = block_idx + + def __enter__(self): + self.main_program.current_block_idx = self.new_block_idx + + def __exit__(self, exc_type, exc_val, exc_tb): + self.main_program.current_block_idx = self.old_block_idx + if exc_type is not None: + return False # re-raise exception + return True + + def layer_wrapper(*args, **kwargs): + with BlockGuard(block_idx): + return layer(*args, **kwargs) + + return layer_wrapper + + +def position_encoding_init(n_position, d_pos_vec): + """ + Generate the initial values for the sinusoid position encoding table. + """ + channels = d_pos_vec + position = np.arange(n_position) + num_timescales = channels // 2 + log_timescale_increment = (np.log(float(1e4) / float(1)) / + (num_timescales - 1)) + inv_timescales = np.exp(np.arange( + num_timescales)) * -log_timescale_increment + scaled_time = np.expand_dims(position, 1) * np.expand_dims(inv_timescales, + 0) + signal = np.concatenate([np.sin(scaled_time), np.cos(scaled_time)], axis=1) + signal = np.pad(signal, [[0, 0], [0, np.mod(channels, 2)]], 'constant') + position_enc = signal + return position_enc.astype("float32") + + +def multi_head_attention(queries, + keys, + values, + attn_bias, + d_key, + d_value, + d_model, + n_head=1, + dropout_rate=0., + cache=None, + gather_idx=None, + static_kv=False): + """ + Multi-Head Attention. Note that attn_bias is added to the logit before + computing softmax activiation to mask certain selected positions so that + they will not considered in attention weights. + """ + keys = queries if keys is None else keys + values = keys if values is None else values + + if not (len(queries.shape) == len(keys.shape) == len(values.shape) == 3): + raise ValueError( + "Inputs: quries, keys and values should all be 3-D tensors.") + + def __compute_qkv(queries, keys, values, n_head, d_key, d_value): + """ + Add linear projection to queries, keys, and values. + """ + q = layers.fc(input=queries, + size=d_key * n_head, + bias_attr=False, + num_flatten_dims=2) + # For encoder-decoder attention in inference, insert the ops and vars + # into global block to use as cache among beam search. 
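+        # wrap_layer_with_block re-creates the key/value projection fc layers in
+        # the parent (global) block, so that with static_kv caching the projected
+        # keys/values are computed once and reused at every decoding step.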
+ fc_layer = wrap_layer_with_block( + layers.fc, fluid.default_main_program().current_block() + .parent_idx) if cache is not None and static_kv else layers.fc + k = fc_layer( + input=keys, + size=d_key * n_head, + bias_attr=False, + num_flatten_dims=2) + v = fc_layer( + input=values, + size=d_value * n_head, + bias_attr=False, + num_flatten_dims=2) + return q, k, v + + def __split_heads_qkv(queries, keys, values, n_head, d_key, d_value): + """ + Reshape input tensors at the last dimension to split multi-heads + and then transpose. Specifically, transform the input tensor with shape + [bs, max_sequence_length, n_head * hidden_dim] to the output tensor + with shape [bs, n_head, max_sequence_length, hidden_dim]. + """ + # The value 0 in shape attr means copying the corresponding dimension + # size of the input as the output dimension size. + reshaped_q = layers.reshape( + x=queries, shape=[0, 0, n_head, d_key], inplace=True) + # permuate the dimensions into: + # [batch_size, n_head, max_sequence_len, hidden_size_per_head] + q = layers.transpose(x=reshaped_q, perm=[0, 2, 1, 3]) + # For encoder-decoder attention in inference, insert the ops and vars + # into global block to use as cache among beam search. + reshape_layer = wrap_layer_with_block( + layers.reshape, + fluid.default_main_program().current_block() + .parent_idx) if cache is not None and static_kv else layers.reshape + transpose_layer = wrap_layer_with_block( + layers.transpose, + fluid.default_main_program().current_block(). + parent_idx) if cache is not None and static_kv else layers.transpose + reshaped_k = reshape_layer( + x=keys, shape=[0, 0, n_head, d_key], inplace=True) + k = transpose_layer(x=reshaped_k, perm=[0, 2, 1, 3]) + reshaped_v = reshape_layer( + x=values, shape=[0, 0, n_head, d_value], inplace=True) + v = transpose_layer(x=reshaped_v, perm=[0, 2, 1, 3]) + + if cache is not None: # only for faster inference + if static_kv: # For encoder-decoder attention in inference + cache_k, cache_v = cache["static_k"], cache["static_v"] + # To init the static_k and static_v in cache. + # Maybe we can use condition_op(if_else) to do these at the first + # step in while loop to replace these, however it might be less + # efficient. + static_cache_init = wrap_layer_with_block( + layers.assign, + fluid.default_main_program().current_block().parent_idx) + static_cache_init(k, cache_k) + static_cache_init(v, cache_v) + else: # For decoder self-attention in inference + cache_k, cache_v = cache["k"], cache["v"] + # gather cell states corresponding to selected parent + select_k = layers.gather(cache_k, index=gather_idx) + select_v = layers.gather(cache_v, index=gather_idx) + if not static_kv: + # For self attention in inference, use cache and concat time steps. + select_k = layers.concat([select_k, k], axis=2) + select_v = layers.concat([select_v, v], axis=2) + # update cell states(caches) cached in global block + layers.assign(select_k, cache_k) + layers.assign(select_v, cache_v) + return q, select_k, select_v + return q, k, v + + def __combine_heads(x): + """ + Transpose and then reshape the last two dimensions of inpunt tensor x + so that it becomes one dimension, which is reverse to __split_heads. + """ + if len(x.shape) != 4: + raise ValueError("Input(x) should be a 4-D Tensor.") + + trans_x = layers.transpose(x, perm=[0, 2, 1, 3]) + # The value 0 in shape attr means copying the corresponding dimension + # size of the input as the output dimension size. 
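+        # Here trans_x has shape [bs, max_seq_len, n_head, head_dim]; the reshape
+        # below merges the last two dimensions back into n_head * head_dim.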
+ return layers.reshape( + x=trans_x, + shape=[0, 0, trans_x.shape[2] * trans_x.shape[3]], + inplace=True) + + def scaled_dot_product_attention(q, k, v, attn_bias, d_key, dropout_rate): + """ + Scaled Dot-Product Attention + """ + # print(q) + # print(k) + + product = layers.matmul(x=q, y=k, transpose_y=True, alpha=d_key**-0.5) + if attn_bias: + product += attn_bias + weights = layers.softmax(product) + if dropout_rate: + weights = layers.dropout( + weights, + dropout_prob=dropout_rate, + seed=dropout_seed, + is_test=False) + out = layers.matmul(weights, v) + return out + + q, k, v = __compute_qkv(queries, keys, values, n_head, d_key, d_value) + q, k, v = __split_heads_qkv(q, k, v, n_head, d_key, d_value) + + ctx_multiheads = scaled_dot_product_attention(q, k, v, attn_bias, d_model, + dropout_rate) + + out = __combine_heads(ctx_multiheads) + + # Project back to the model size. + proj_out = layers.fc(input=out, + size=d_model, + bias_attr=False, + num_flatten_dims=2) + return proj_out + + +def positionwise_feed_forward(x, d_inner_hid, d_hid, dropout_rate): + """ + Position-wise Feed-Forward Networks. + This module consists of two linear transformations with a ReLU activation + in between, which is applied to each position separately and identically. + """ + hidden = layers.fc(input=x, + size=d_inner_hid, + num_flatten_dims=2, + act="relu") + if dropout_rate: + hidden = layers.dropout( + hidden, dropout_prob=dropout_rate, seed=dropout_seed, is_test=False) + out = layers.fc(input=hidden, size=d_hid, num_flatten_dims=2) + return out + + +def pre_post_process_layer(prev_out, out, process_cmd, dropout_rate=0.): + """ + Add residual connection, layer normalization and droput to the out tensor + optionally according to the value of process_cmd. + This will be used before or after multi-head attention and position-wise + feed-forward networks. + """ + for cmd in process_cmd: + if cmd == "a": # add residual connection + out = out + prev_out if prev_out else out + elif cmd == "n": # add layer normalization + out = layers.layer_norm( + out, + begin_norm_axis=len(out.shape) - 1, + param_attr=fluid.initializer.Constant(1.), + bias_attr=fluid.initializer.Constant(0.)) + elif cmd == "d": # add dropout + if dropout_rate: + out = layers.dropout( + out, + dropout_prob=dropout_rate, + seed=dropout_seed, + is_test=False) + return out + + +pre_process_layer = partial(pre_post_process_layer, None) +post_process_layer = pre_post_process_layer + + +def prepare_encoder(src_word,#[b,t,c] + src_pos, + src_vocab_size, + src_emb_dim, + src_max_len, + dropout_rate=0., + bos_idx=0, + word_emb_param_name=None, + pos_enc_param_name=None): + """Add word embeddings and position encodings. + The output tensor has a shape of: + [batch_size, max_src_length_in_batch, d_model]. + This module is used at the bottom of the encoder stacks. 
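+    For SRN, src_word is already a [batch, seq_len, d_model] feature map coming
+    from the CNN backbone rather than word ids, so no embedding lookup is done
+    here; the features are only scaled and added to the position encodings.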
+ """ + + src_word_emb =src_word#layers.concat(res,axis=1) + src_word_emb=layers.cast(src_word_emb,'float32') + # print("src_word_emb",src_word_emb) + + src_word_emb = layers.scale(x=src_word_emb, scale=src_emb_dim**0.5) + src_pos_enc = layers.embedding( + src_pos, + size=[src_max_len, src_emb_dim], + param_attr=fluid.ParamAttr( + name=pos_enc_param_name, trainable=False)) + src_pos_enc.stop_gradient = True + enc_input = src_word_emb + src_pos_enc + return layers.dropout( + enc_input, dropout_prob=dropout_rate, seed=dropout_seed, + is_test=False) if dropout_rate else enc_input + + +def prepare_decoder(src_word, + src_pos, + src_vocab_size, + src_emb_dim, + src_max_len, + dropout_rate=0., + bos_idx=0, + word_emb_param_name=None, + pos_enc_param_name=None): + """Add word embeddings and position encodings. + The output tensor has a shape of: + [batch_size, max_src_length_in_batch, d_model]. + This module is used at the bottom of the encoder stacks. + """ + src_word_emb = layers.embedding( + src_word, + size=[src_vocab_size, src_emb_dim], + padding_idx=bos_idx, # set embedding of bos to 0 + param_attr=fluid.ParamAttr( + name=word_emb_param_name, + initializer=fluid.initializer.Normal(0., src_emb_dim**-0.5))) + # print("target_word_emb",src_word_emb) + src_word_emb = layers.scale(x=src_word_emb, scale=src_emb_dim ** 0.5) + src_pos_enc = layers.embedding( + src_pos, + size=[src_max_len, src_emb_dim], + param_attr=fluid.ParamAttr( + name=pos_enc_param_name, trainable=False)) + src_pos_enc.stop_gradient = True + enc_input = src_word_emb + src_pos_enc + return layers.dropout( + enc_input, dropout_prob=dropout_rate, seed=dropout_seed, + is_test=False) if dropout_rate else enc_input + +# prepare_encoder = partial( +# prepare_encoder_decoder, pos_enc_param_name=pos_enc_param_names[0]) +# prepare_decoder = partial( +# prepare_encoder_decoder, pos_enc_param_name=pos_enc_param_names[1]) + + +def encoder_layer(enc_input, + attn_bias, + n_head, + d_key, + d_value, + d_model, + d_inner_hid, + prepostprocess_dropout, + attention_dropout, + relu_dropout, + preprocess_cmd="n", + postprocess_cmd="da"): + """The encoder layers that can be stacked to form a deep encoder. + This module consits of a multi-head (self) attention followed by + position-wise feed-forward networks and both the two components companied + with the post_process_layer to add residual connection, layer normalization + and droput. + """ + attn_output = multi_head_attention( + pre_process_layer(enc_input, preprocess_cmd, + prepostprocess_dropout), None, None, attn_bias, d_key, + d_value, d_model, n_head, attention_dropout) + attn_output = post_process_layer(enc_input, attn_output, postprocess_cmd, + prepostprocess_dropout) + ffd_output = positionwise_feed_forward( + pre_process_layer(attn_output, preprocess_cmd, prepostprocess_dropout), + d_inner_hid, d_model, relu_dropout) + return post_process_layer(attn_output, ffd_output, postprocess_cmd, + prepostprocess_dropout) + + +def encoder(enc_input, + attn_bias, + n_layer, + n_head, + d_key, + d_value, + d_model, + d_inner_hid, + prepostprocess_dropout, + attention_dropout, + relu_dropout, + preprocess_cmd="n", + postprocess_cmd="da"): + """ + The encoder is composed of a stack of identical layers returned by calling + encoder_layer. 
+ """ + for i in range(n_layer): + enc_output = encoder_layer( + enc_input, + attn_bias, + n_head, + d_key, + d_value, + d_model, + d_inner_hid, + prepostprocess_dropout, + attention_dropout, + relu_dropout, + preprocess_cmd, + postprocess_cmd, ) + enc_input = enc_output + enc_output = pre_process_layer(enc_output, preprocess_cmd, + prepostprocess_dropout) + return enc_output + + +def decoder_layer(dec_input, + enc_output, + slf_attn_bias, + dec_enc_attn_bias, + n_head, + d_key, + d_value, + d_model, + d_inner_hid, + prepostprocess_dropout, + attention_dropout, + relu_dropout, + preprocess_cmd, + postprocess_cmd, + cache=None, + gather_idx=None): + """ The layer to be stacked in decoder part. + The structure of this module is similar to that in the encoder part except + a multi-head attention is added to implement encoder-decoder attention. + """ + slf_attn_output = multi_head_attention( + pre_process_layer(dec_input, preprocess_cmd, prepostprocess_dropout), + None, + None, + slf_attn_bias, + d_key, + d_value, + d_model, + n_head, + attention_dropout, + cache=cache, + gather_idx=gather_idx) + slf_attn_output = post_process_layer( + dec_input, + slf_attn_output, + postprocess_cmd, + prepostprocess_dropout, ) + enc_attn_output = multi_head_attention( + pre_process_layer(slf_attn_output, preprocess_cmd, + prepostprocess_dropout), + enc_output, + enc_output, + dec_enc_attn_bias, + d_key, + d_value, + d_model, + n_head, + attention_dropout, + cache=cache, + gather_idx=gather_idx, + static_kv=True) + enc_attn_output = post_process_layer( + slf_attn_output, + enc_attn_output, + postprocess_cmd, + prepostprocess_dropout, ) + ffd_output = positionwise_feed_forward( + pre_process_layer(enc_attn_output, preprocess_cmd, + prepostprocess_dropout), + d_inner_hid, + d_model, + relu_dropout, ) + dec_output = post_process_layer( + enc_attn_output, + ffd_output, + postprocess_cmd, + prepostprocess_dropout, ) + return dec_output + + +def decoder(dec_input, + enc_output, + dec_slf_attn_bias, + dec_enc_attn_bias, + n_layer, + n_head, + d_key, + d_value, + d_model, + d_inner_hid, + prepostprocess_dropout, + attention_dropout, + relu_dropout, + preprocess_cmd, + postprocess_cmd, + caches=None, + gather_idx=None): + """ + The decoder is composed of a stack of identical decoder_layer layers. + """ + for i in range(n_layer): + dec_output = decoder_layer( + dec_input, + enc_output, + dec_slf_attn_bias, + dec_enc_attn_bias, + n_head, + d_key, + d_value, + d_model, + d_inner_hid, + prepostprocess_dropout, + attention_dropout, + relu_dropout, + preprocess_cmd, + postprocess_cmd, + cache=None if caches is None else caches[i], + gather_idx=gather_idx) + dec_input = dec_output + dec_output = pre_process_layer(dec_output, preprocess_cmd, + prepostprocess_dropout) + return dec_output + + +def make_all_inputs(input_fields): + """ + Define the input data layers for the transformer model. 
+ """ + inputs = [] + for input_field in input_fields: + input_var = layers.data( + name=input_field, + shape=input_descs[input_field][0], + dtype=input_descs[input_field][1], + lod_level=input_descs[input_field][2] + if len(input_descs[input_field]) == 3 else 0, + append_batch_size=False) + inputs.append(input_var) + return inputs + + +def make_all_py_reader_inputs(input_fields, is_test=False): + reader = layers.py_reader( + capacity=20, + name="test_reader" if is_test else "train_reader", + shapes=[input_descs[input_field][0] for input_field in input_fields], + dtypes=[input_descs[input_field][1] for input_field in input_fields], + lod_levels=[ + input_descs[input_field][2] + if len(input_descs[input_field]) == 3 else 0 + for input_field in input_fields + ]) + return layers.read_file(reader), reader + + +def transformer(src_vocab_size, + trg_vocab_size, + max_length, + n_layer, + n_head, + d_key, + d_value, + d_model, + d_inner_hid, + prepostprocess_dropout, + attention_dropout, + relu_dropout, + preprocess_cmd, + postprocess_cmd, + weight_sharing, + label_smooth_eps, + bos_idx=0, + use_py_reader=False, + is_test=False): + if weight_sharing: + assert src_vocab_size == trg_vocab_size, ( + "Vocabularies in source and target should be same for weight sharing." + ) + + data_input_names = encoder_data_input_fields + \ + decoder_data_input_fields[:-1] + label_data_input_fields + + if use_py_reader: + all_inputs, reader = make_all_py_reader_inputs(data_input_names, + is_test) + else: + all_inputs = make_all_inputs(data_input_names) + # print("all inputs",all_inputs) + enc_inputs_len = len(encoder_data_input_fields) + dec_inputs_len = len(decoder_data_input_fields[:-1]) + enc_inputs = all_inputs[0:enc_inputs_len] + dec_inputs = all_inputs[enc_inputs_len:enc_inputs_len + dec_inputs_len] + label = all_inputs[-2] + weights = all_inputs[-1] + + enc_output = wrap_encoder( + src_vocab_size, + ModelHyperParams.src_seq_len, + n_layer, + n_head, + d_key, + d_value, + d_model, + d_inner_hid, + prepostprocess_dropout, + attention_dropout, + relu_dropout, + preprocess_cmd, + postprocess_cmd, + weight_sharing, + enc_inputs) + + predict = wrap_decoder( + trg_vocab_size, + max_length, + n_layer, + n_head, + d_key, + d_value, + d_model, + d_inner_hid, + prepostprocess_dropout, + attention_dropout, + relu_dropout, + preprocess_cmd, + postprocess_cmd, + weight_sharing, + dec_inputs, + enc_output, ) + + # Padding index do not contribute to the total loss. The weights is used to + # cancel padding index in calculating the loss. + if label_smooth_eps: + label = layers.label_smooth( + label=layers.one_hot( + input=label, depth=trg_vocab_size), + epsilon=label_smooth_eps) + + cost = layers.softmax_with_cross_entropy( + logits=predict, + label=label, + soft_label=True if label_smooth_eps else False) + weighted_cost = cost * weights + sum_cost = layers.reduce_sum(weighted_cost) + token_num = layers.reduce_sum(weights) + token_num.stop_gradient = True + avg_cost = sum_cost / token_num + return sum_cost, avg_cost, predict, token_num, reader if use_py_reader else None + + +def wrap_encoder_forFeature(src_vocab_size, + max_length, + n_layer, + n_head, + d_key, + d_value, + d_model, + d_inner_hid, + prepostprocess_dropout, + attention_dropout, + relu_dropout, + preprocess_cmd, + postprocess_cmd, + weight_sharing, + enc_inputs=None, + bos_idx=0): + """ + The wrapper assembles together all needed layers for the encoder. 
+ img, src_pos, src_slf_attn_bias = enc_inputs + img + """ + + if enc_inputs is None: + # This is used to implement independent encoder program in inference. + conv_features, src_pos, src_slf_attn_bias = make_all_inputs( + encoder_data_input_fields) + else: + conv_features, src_pos, src_slf_attn_bias = enc_inputs# + b,t,c = conv_features.shape + #""" + # insert cnn + #""" + #import basemodel + # feat = basemodel.resnet_50(img) + + # mycrnn = basemodel.CRNN() + # feat = mycrnn.ocr_convs(img,use_cudnn=TrainTaskConfig.use_gpu) + # b, c, w, h = feat.shape + # src_word = layers.reshape(feat, shape=[-1, c, w * h]) + + #myconv8 = basemodel.conv8() + #feat = myconv8.net(img ) + #b , c, h, w = feat.shape#h=6 + #print(feat) + #layers.Print(feat,message="conv_feat",summarize=10) + + #feat =layers.conv2d(feat,c,filter_size =[4 , 1],act="relu") + #feat = layers.pool2d(feat,pool_stride=(3,1),pool_size=(3,1)) + #src_word = layers.squeeze(feat,axes=[2]) #src_word [-1,c,ww] + + #feat = layers.transpose(feat, [0,3,1,2]) + #src_word = layers.reshape(feat,[-1,w, c*h]) + #src_word = layers.im2sequence( + # input=feat, + # stride=[1, 1], + # filter_size=[feat.shape[2], 1]) + #layers.Print(src_word,message="src_word",summarize=10) + + # print('feat',feat) + #print("src_word",src_word) + + enc_input = prepare_encoder( + conv_features, + src_pos, + src_vocab_size, + d_model, + max_length, + prepostprocess_dropout, + bos_idx=bos_idx, + word_emb_param_name=word_emb_param_names[0]) + + enc_output = encoder( + enc_input, + src_slf_attn_bias, + n_layer, + n_head, + d_key, + d_value, + d_model, + d_inner_hid, + prepostprocess_dropout, + attention_dropout, + relu_dropout, + preprocess_cmd, + postprocess_cmd, ) + return enc_output + +def wrap_encoder(src_vocab_size, + max_length, + n_layer, + n_head, + d_key, + d_value, + d_model, + d_inner_hid, + prepostprocess_dropout, + attention_dropout, + relu_dropout, + preprocess_cmd, + postprocess_cmd, + weight_sharing, + enc_inputs=None, + bos_idx=0): + """ + The wrapper assembles together all needed layers for the encoder. + img, src_pos, src_slf_attn_bias = enc_inputs + img + """ + if enc_inputs is None: + # This is used to implement independent encoder program in inference. 
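+        # In this SRN integration wrap_encoder is always called with enc_inputs
+        # supplied by the head, so this branch is only used by the stand-alone
+        # transformer inference program.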
+ src_word, src_pos, src_slf_attn_bias = make_all_inputs( + encoder_data_input_fields) + else: + src_word, src_pos, src_slf_attn_bias = enc_inputs# + #""" + # insert cnn + #""" + #import basemodel + # feat = basemodel.resnet_50(img) + + # mycrnn = basemodel.CRNN() + # feat = mycrnn.ocr_convs(img,use_cudnn=TrainTaskConfig.use_gpu) + # b, c, w, h = feat.shape + # src_word = layers.reshape(feat, shape=[-1, c, w * h]) + + #myconv8 = basemodel.conv8() + #feat = myconv8.net(img ) + #b , c, h, w = feat.shape#h=6 + #print(feat) + #layers.Print(feat,message="conv_feat",summarize=10) + + #feat =layers.conv2d(feat,c,filter_size =[4 , 1],act="relu") + #feat = layers.pool2d(feat,pool_stride=(3,1),pool_size=(3,1)) + #src_word = layers.squeeze(feat,axes=[2]) #src_word [-1,c,ww] + + #feat = layers.transpose(feat, [0,3,1,2]) + #src_word = layers.reshape(feat,[-1,w, c*h]) + #src_word = layers.im2sequence( + # input=feat, + # stride=[1, 1], + # filter_size=[feat.shape[2], 1]) + #layers.Print(src_word,message="src_word",summarize=10) + + # print('feat',feat) + #print("src_word",src_word) + enc_input = prepare_decoder( + src_word, + src_pos, + src_vocab_size, + d_model, + max_length, + prepostprocess_dropout, + bos_idx=bos_idx, + word_emb_param_name=word_emb_param_names[0]) + + enc_output = encoder( + enc_input, + src_slf_attn_bias, + n_layer, + n_head, + d_key, + d_value, + d_model, + d_inner_hid, + prepostprocess_dropout, + attention_dropout, + relu_dropout, + preprocess_cmd, + postprocess_cmd, ) + return enc_output + + +def wrap_decoder(trg_vocab_size, + max_length, + n_layer, + n_head, + d_key, + d_value, + d_model, + d_inner_hid, + prepostprocess_dropout, + attention_dropout, + relu_dropout, + preprocess_cmd, + postprocess_cmd, + weight_sharing, + dec_inputs=None, + enc_output=None, + caches=None, + gather_idx=None, + bos_idx=0): + """ + The wrapper assembles together all needed layers for the decoder. + """ + if dec_inputs is None: + # This is used to implement independent decoder program in inference. + trg_word, trg_pos, trg_slf_attn_bias, trg_src_attn_bias, enc_output = \ + make_all_inputs(decoder_data_input_fields) + else: + trg_word, trg_pos, trg_slf_attn_bias, trg_src_attn_bias = dec_inputs + + dec_input = prepare_decoder( + trg_word, + trg_pos, + trg_vocab_size, + d_model, + max_length, + prepostprocess_dropout, + bos_idx=bos_idx, + word_emb_param_name=word_emb_param_names[0] + if weight_sharing else word_emb_param_names[1]) + dec_output = decoder( + dec_input, + enc_output, + trg_slf_attn_bias, + trg_src_attn_bias, + n_layer, + n_head, + d_key, + d_value, + d_model, + d_inner_hid, + prepostprocess_dropout, + attention_dropout, + relu_dropout, + preprocess_cmd, + postprocess_cmd, + caches=caches, + gather_idx=gather_idx) + return dec_output + # Reshape to 2D tensor to use GEMM instead of BatchedGEMM + dec_output = layers.reshape( + dec_output, shape=[-1, dec_output.shape[-1]], inplace=True) + if weight_sharing: + predict = layers.matmul( + x=dec_output, + y=fluid.default_main_program().global_block().var( + word_emb_param_names[0]), + transpose_y=True) + else: + predict = layers.fc(input=dec_output, + size=trg_vocab_size, + bias_attr=False) + if dec_inputs is None: + # Return probs for independent decoder program. 
+ predict = layers.softmax(predict) + return predict + + +def fast_decode(src_vocab_size, + trg_vocab_size, + max_in_len, + n_layer, + n_head, + d_key, + d_value, + d_model, + d_inner_hid, + prepostprocess_dropout, + attention_dropout, + relu_dropout, + preprocess_cmd, + postprocess_cmd, + weight_sharing, + beam_size, + max_out_len, + bos_idx, + eos_idx, + use_py_reader=False): + """ + Use beam search to decode. Caches will be used to store states of history + steps which can make the decoding faster. + """ + data_input_names = encoder_data_input_fields + fast_decoder_data_input_fields + + if use_py_reader: + all_inputs, reader = make_all_py_reader_inputs(data_input_names) + else: + all_inputs = make_all_inputs(data_input_names) + + enc_inputs_len = len(encoder_data_input_fields) + dec_inputs_len = len(fast_decoder_data_input_fields) + enc_inputs = all_inputs[0:enc_inputs_len]#enc_inputs tensor + dec_inputs = all_inputs[enc_inputs_len:enc_inputs_len + dec_inputs_len]#dec_inputs tensor + + enc_output = wrap_encoder( + src_vocab_size, + ModelHyperParams.src_seq_len,##to do !!!!!???? + n_layer, + n_head, + d_key, + d_value, + d_model, + d_inner_hid, + prepostprocess_dropout, + attention_dropout, + relu_dropout, + preprocess_cmd, + postprocess_cmd, + weight_sharing, + enc_inputs, + bos_idx=bos_idx) + start_tokens, init_scores, parent_idx, trg_src_attn_bias = dec_inputs + + def beam_search(): + max_len = layers.fill_constant( + shape=[1], + dtype=start_tokens.dtype, + value=max_out_len, + force_cpu=True) + step_idx = layers.fill_constant( + shape=[1], dtype=start_tokens.dtype, value=0, force_cpu=True) + cond = layers.less_than(x=step_idx, y=max_len) # default force_cpu=True + while_op = layers.While(cond) + # array states will be stored for each step. + ids = layers.array_write( + layers.reshape(start_tokens, (-1, 1)), step_idx) + scores = layers.array_write(init_scores, step_idx) + # cell states will be overwrited at each step. + # caches contains states of history steps in decoder self-attention + # and static encoder output projections in encoder-decoder attention + # to reduce redundant computation. + caches = [ + { + "k": # for self attention + layers.fill_constant_batch_size_like( + input=start_tokens, + shape=[-1, n_head, 0, d_key], + dtype=enc_output.dtype, + value=0), + "v": # for self attention + layers.fill_constant_batch_size_like( + input=start_tokens, + shape=[-1, n_head, 0, d_value], + dtype=enc_output.dtype, + value=0), + "static_k": # for encoder-decoder attention + layers.create_tensor(dtype=enc_output.dtype), + "static_v": # for encoder-decoder attention + layers.create_tensor(dtype=enc_output.dtype) + } for i in range(n_layer) + ] + + with while_op.block(): + pre_ids = layers.array_read(array=ids, i=step_idx) + # Since beam_search_op dosen't enforce pre_ids' shape, we can do + # inplace reshape here which actually change the shape of pre_ids. 
+ pre_ids = layers.reshape(pre_ids, (-1, 1, 1), inplace=True) + pre_scores = layers.array_read(array=scores, i=step_idx) + # gather cell states corresponding to selected parent + pre_src_attn_bias = layers.gather( + trg_src_attn_bias, index=parent_idx) + pre_pos = layers.elementwise_mul( + x=layers.fill_constant_batch_size_like( + input=pre_src_attn_bias, # cann't use lod tensor here + value=1, + shape=[-1, 1, 1], + dtype=pre_ids.dtype), + y=step_idx, + axis=0) + logits = wrap_decoder( + trg_vocab_size, + max_in_len, + n_layer, + n_head, + d_key, + d_value, + d_model, + d_inner_hid, + prepostprocess_dropout, + attention_dropout, + relu_dropout, + preprocess_cmd, + postprocess_cmd, + weight_sharing, + dec_inputs=(pre_ids, pre_pos, None, pre_src_attn_bias), + enc_output=enc_output, + caches=caches, + gather_idx=parent_idx, + bos_idx=bos_idx) + # intra-beam topK + topk_scores, topk_indices = layers.topk( + input=layers.softmax(logits), k=beam_size) + accu_scores = layers.elementwise_add( + x=layers.log(topk_scores), y=pre_scores, axis=0) + # beam_search op uses lod to differentiate branches. + accu_scores = layers.lod_reset(accu_scores, pre_ids) + # topK reduction across beams, also contain special handle of + # end beams and end sentences(batch reduction) + selected_ids, selected_scores, gather_idx = layers.beam_search( + pre_ids=pre_ids, + pre_scores=pre_scores, + ids=topk_indices, + scores=accu_scores, + beam_size=beam_size, + end_id=eos_idx, + return_parent_idx=True) + layers.increment(x=step_idx, value=1.0, in_place=True) + # cell states(caches) have been updated in wrap_decoder, + # only need to update beam search states here. + layers.array_write(selected_ids, i=step_idx, array=ids) + layers.array_write(selected_scores, i=step_idx, array=scores) + layers.assign(gather_idx, parent_idx) + layers.assign(pre_src_attn_bias, trg_src_attn_bias) + length_cond = layers.less_than(x=step_idx, y=max_len) + finish_cond = layers.logical_not(layers.is_empty(x=selected_ids)) + layers.logical_and(x=length_cond, y=finish_cond, out=cond) + + finished_ids, finished_scores = layers.beam_search_decode( + ids, scores, beam_size=beam_size, end_id=eos_idx) + return finished_ids, finished_scores + + finished_ids, finished_scores = beam_search() + return finished_ids, finished_scores, reader if use_py_reader else None diff --git a/ppocr/modeling/losses/rec_srn_loss.py b/ppocr/modeling/losses/rec_srn_loss.py new file mode 100755 index 00000000..68a480ac --- /dev/null +++ b/ppocr/modeling/losses/rec_srn_loss.py @@ -0,0 +1,58 @@ +#copyright (c) 2019 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. 
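+
+# SRN loss: cross-entropy against the label is computed for the three SRN
+# outputs -- word_out (PVAM branch), gsrm_out (GSRM branch) and the fused VSFD
+# prediction -- and combined as word + 2.0 * vsfd + 0.15 * gsrm.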
+ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import math + +import paddle +import paddle.fluid as fluid + + +class SRNLoss(object): + def __init__(self, params): + super(SRNLoss, self).__init__() + self.char_num = params['char_num'] + + def __call__(self, predicts, others): + predict = predicts['predict'] + word_predict = predicts['word_out'] + gsrm_predict = predicts['gsrm_out'] + label = others['label'] + lbl_weight = others['lbl_weight'] + + casted_label = fluid.layers.cast(x=label, dtype='int64') + cost_word = fluid.layers.cross_entropy(input=word_predict, label=casted_label) + cost_gsrm = fluid.layers.cross_entropy(input=gsrm_predict, label=casted_label) + cost_vsfd = fluid.layers.cross_entropy(input=predict, label=casted_label) + + #cost_word = cost_word * lbl_weight + #cost_gsrm = cost_gsrm * lbl_weight + #cost_vsfd = cost_vsfd * lbl_weight + + cost_word = fluid.layers.reshape(x=fluid.layers.reduce_sum(cost_word), shape=[1]) + cost_gsrm = fluid.layers.reshape(x=fluid.layers.reduce_sum(cost_gsrm), shape=[1]) + cost_vsfd = fluid.layers.reshape(x=fluid.layers.reduce_sum(cost_vsfd), shape=[1]) + + sum_cost = fluid.layers.sum([cost_word, cost_vsfd * 2.0, cost_gsrm * 0.15]) + + #sum_cost = fluid.layers.sum([cost_word * 3.0, cost_vsfd, cost_gsrm * 0.15]) + #sum_cost = cost_word + + #fluid.layers.Print(cost_word,message="word_cost") + #fluid.layers.Print(cost_vsfd,message="img_cost") + return [sum_cost,cost_vsfd,cost_word] + #return [sum_cost, cost_vsfd, cost_word] diff --git a/ppocr/utils/character.py b/ppocr/utils/character.py index 9a3db8dd..79d6f5ca 100755 --- a/ppocr/utils/character.py +++ b/ppocr/utils/character.py @@ -25,6 +25,7 @@ class CharacterOps(object): def __init__(self, config): self.character_type = config['character_type'] self.loss_type = config['loss_type'] + self.max_text_len = config['max_text_length'] if self.character_type == "en": self.character_str = "0123456789abcdefghijklmnopqrstuvwxyz" dict_character = list(self.character_str) @@ -54,6 +55,8 @@ class CharacterOps(object): self.end_str = "eos" if self.loss_type == "attention": dict_character = [self.beg_str, self.end_str] + dict_character + elif self.loss_type == "srn": + dict_character = dict_character + [self.beg_str, self.end_str] self.dict = {} for i, char in enumerate(dict_character): self.dict[char] = i @@ -146,6 +149,48 @@ def cal_predicts_accuracy(char_ops, acc = acc_num * 1.0 / img_num return acc, acc_num, img_num +def cal_predicts_accuracy_srn(char_ops, + preds, + labels, + max_text_len, + is_debug=False): + acc_num = 0 + img_num = 0 + + total_len = preds.shape[0] + img_num = int(total_len / max_text_len) + #print (img_num) + for i in range(img_num): + cur_label = [] + cur_pred = [] + for j in range(max_text_len): + if labels[j + i * max_text_len] != 37: #0 + cur_label.append(labels[j + i * max_text_len][0]) + else: + break + + if is_debug: + for j in range(max_text_len): + if preds[j + i * max_text_len] != 37: #0 + cur_pred.append(preds[j + i * max_text_len][0]) + else: + break + print ("cur_label: ", cur_label) + print ("cur_pred: ", cur_pred) + + + for j in range(max_text_len + 1): + if j < len(cur_label) and preds[j + i * max_text_len][0] != cur_label[j]: + break + elif j == len(cur_label) and j == max_text_len: + acc_num += 1 + break + elif j == len(cur_label) and preds[j + i * max_text_len][0] == 37: + acc_num += 1 + break + acc = acc_num * 1.0 / img_num + return acc, acc_num, img_num + def 
convert_rec_attention_infer_res(preds): img_num = preds.shape[0] diff --git a/tools/eval_utils/eval_rec_utils.py b/tools/eval_utils/eval_rec_utils.py index aebb9f90..3d496bd3 100644 --- a/tools/eval_utils/eval_rec_utils.py +++ b/tools/eval_utils/eval_rec_utils.py @@ -29,7 +29,7 @@ FORMAT = '%(asctime)s-%(levelname)s: %(message)s' logging.basicConfig(level=logging.INFO, format=FORMAT) logger = logging.getLogger(__name__) -from ppocr.utils.character import cal_predicts_accuracy +from ppocr.utils.character import cal_predicts_accuracy, cal_predicts_accuracy_srn from ppocr.utils.character import convert_rec_label_to_lod from ppocr.utils.character import convert_rec_attention_infer_res from ppocr.utils.utility import create_module @@ -60,19 +60,52 @@ def eval_rec_run(exe, config, eval_info_dict, mode): for ino in range(img_num): img_list.append(data[ino][0]) label_list.append(data[ino][1]) - img_list = np.concatenate(img_list, axis=0) - outs = exe.run(eval_info_dict['program'], \ + + if config['Global']['loss_type'] != "srn": + img_list = np.concatenate(img_list, axis=0) + outs = exe.run(eval_info_dict['program'], \ feed={'image': img_list}, \ fetch_list=eval_info_dict['fetch_varname_list'], \ return_numpy=False) - preds = np.array(outs[0]) - if preds.shape[1] != 1: - preds, preds_lod = convert_rec_attention_infer_res(preds) + preds = np.array(outs[0]) + + if preds.shape[1] != 1: + preds, preds_lod = convert_rec_attention_infer_res(preds) + else: + preds_lod = outs[0].lod()[0] + labels, labels_lod = convert_rec_label_to_lod(label_list) + acc, acc_num, sample_num = cal_predicts_accuracy( + char_ops, preds, preds_lod, labels, labels_lod, is_remove_duplicate) else: - preds_lod = outs[0].lod()[0] - labels, labels_lod = convert_rec_label_to_lod(label_list) - acc, acc_num, sample_num = cal_predicts_accuracy( - char_ops, preds, preds_lod, labels, labels_lod, is_remove_duplicate) + encoder_word_pos_list = [] + gsrm_word_pos_list = [] + gsrm_slf_attn_bias1_list = [] + gsrm_slf_attn_bias2_list = [] + for ino in range(img_num): + encoder_word_pos_list.append(data[ino][2]) + gsrm_word_pos_list.append(data[ino][3]) + gsrm_slf_attn_bias1_list.append(data[ino][4]) + gsrm_slf_attn_bias2_list.append(data[ino][5]) + + img_list = np.concatenate(img_list, axis=0) + label_list = np.concatenate(label_list, axis=0) + encoder_word_pos_list = np.concatenate(encoder_word_pos_list, axis=0).astype(np.int64) + gsrm_word_pos_list = np.concatenate(gsrm_word_pos_list, axis=0).astype(np.int64) + gsrm_slf_attn_bias1_list = np.concatenate(gsrm_slf_attn_bias1_list, axis=0).astype(np.float32) + gsrm_slf_attn_bias2_list = np.concatenate(gsrm_slf_attn_bias2_list, axis=0).astype(np.float32) + + labels = label_list + + outs = exe.run(eval_info_dict['program'], \ + feed={'image': img_list, 'encoder_word_pos': encoder_word_pos_list, + 'gsrm_word_pos': gsrm_word_pos_list, 'gsrm_slf_attn_bias1': gsrm_slf_attn_bias1_list, + 'gsrm_slf_attn_bias2': gsrm_slf_attn_bias2_list}, \ + fetch_list=eval_info_dict['fetch_varname_list'], \ + return_numpy=False) + preds = np.array(outs[0]) + acc, acc_num, sample_num = cal_predicts_accuracy_srn( + char_ops, preds, labels, config['Global']['max_text_length']) + total_acc_num += acc_num total_sample_num += sample_num logger.info("eval batch id: {}, acc: {}".format(total_batch_num, acc)) @@ -85,8 +118,8 @@ def eval_rec_run(exe, config, eval_info_dict, mode): def test_rec_benchmark(exe, config, eval_info_dict): " Evaluate lmdb dataset " - eval_data_list = ['IIIT5k_3000', 'SVT', 'IC03_860', 'IC03_867', \ - 
'IC13_857', 'IC13_1015', 'IC15_1811', 'IC15_2077', 'SVTP', 'CUTE80'] + eval_data_list = ['IIIT5k_3000', 'SVT', 'IC03_860', \ + 'IC13_857', 'IC15_1811', 'IC15_2077','SVTP', 'CUTE80'] eval_data_dir = config['TestReader']['lmdb_sets_dir'] total_evaluation_data_number = 0 total_correct_number = 0 diff --git a/tools/program.py b/tools/program.py index 4ebc1167..64c827e7 100755 --- a/tools/program.py +++ b/tools/program.py @@ -32,7 +32,7 @@ from eval_utils.eval_det_utils import eval_det_run from eval_utils.eval_rec_utils import eval_rec_run from ppocr.utils.save_load import save_model import numpy as np -from ppocr.utils.character import cal_predicts_accuracy, CharacterOps +from ppocr.utils.character import cal_predicts_accuracy, cal_predicts_accuracy_srn, CharacterOps class ArgsParser(ArgumentParser): def __init__(self): @@ -176,8 +176,16 @@ def build(config, main_prog, startup_prog, mode): fetch_name_list = list(outputs.keys()) fetch_varname_list = [outputs[v].name for v in fetch_name_list] opt_loss_name = None + model_average = None + img_loss_name = None + word_loss_name = None if mode == "train": opt_loss = outputs['total_loss'] + # srn loss + #img_loss = outputs['img_loss'] + #word_loss = outputs['word_loss'] + #img_loss_name = img_loss.name + #word_loss_name = word_loss.name opt_params = config['Optimizer'] optimizer = create_module(opt_params['function'])(opt_params) optimizer.minimize(opt_loss) @@ -185,7 +193,13 @@ def build(config, main_prog, startup_prog, mode): global_lr = optimizer._global_learning_rate() fetch_name_list.insert(0, "lr") fetch_varname_list.insert(0, global_lr.name) - return (dataloader, fetch_name_list, fetch_varname_list, opt_loss_name) + if config['Global']["loss_type"] == 'srn': + model_average = fluid.optimizer.ModelAverage( + config['Global']['average_window'], + min_average_window=config['Global']['min_average_window'], + max_average_window=config['Global']['max_average_window']) + + return (dataloader, fetch_name_list, fetch_varname_list, opt_loss_name,model_average) def build_export(config, main_prog, startup_prog): @@ -329,14 +343,20 @@ def train_eval_rec_run(config, exe, train_info_dict, eval_info_dict): lr = np.mean(np.array(train_outs[fetch_map['lr']])) preds_idx = fetch_map['decoded_out'] preds = np.array(train_outs[preds_idx]) - preds_lod = train_outs[preds_idx].lod()[0] labels_idx = fetch_map['label'] labels = np.array(train_outs[labels_idx]) - labels_lod = train_outs[labels_idx].lod()[0] - acc, acc_num, img_num = cal_predicts_accuracy( - config['Global']['char_ops'], preds, preds_lod, labels, - labels_lod) + if config['Global']['loss_type'] != 'srn': + preds_lod = train_outs[preds_idx].lod()[0] + labels_lod = train_outs[labels_idx].lod()[0] + + acc, acc_num, img_num = cal_predicts_accuracy( + config['Global']['char_ops'], preds, preds_lod, labels, + labels_lod) + else: + acc, acc_num, img_num = cal_predicts_accuracy_srn( + config['Global']['char_ops'], preds, labels, + config['Global']['max_text_length']) t2 = time.time() train_batch_elapse = t2 - t1 stats = {'loss': loss, 'acc': acc} @@ -350,6 +370,9 @@ def train_eval_rec_run(config, exe, train_info_dict, eval_info_dict): if train_batch_id > 0 and\ train_batch_id % eval_batch_step == 0: + model_average = train_info_dict['model_average'] + if model_average != None: + model_average.apply(exe) metrics = eval_rec_run(exe, config, eval_info_dict, "eval") eval_acc = metrics['avg_acc'] eval_sample_num = metrics['total_sample_num'] diff --git a/tools/train.py b/tools/train.py index 68e792b7..2ea9d0e0 100755 
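For SRN the fetched predictions and labels are flat arrays holding max_text_length entries per image rather than LoD tensors, which is why the branch above calls cal_predicts_accuracy_srn instead of cal_predicts_accuracy. A rough NumPy equivalent of that per-image matching (the helper name, the two-image batch and the sample values are made up for illustration; 37 is the end index that follows the 36-character English charset):

import numpy as np

def srn_accuracy(preds, labels, max_text_len, eos=37):
    preds = preds.reshape(-1, max_text_len)
    labels = labels.reshape(-1, max_text_len)
    correct = 0
    for p, l in zip(preds, labels):
        l = l[l != eos]                                  # label sequence up to EOS
        p = p[:np.argmax(p == eos)] if (p == eos).any() else p
        correct += int(len(p) == len(l) and np.array_equal(p, l))
    return correct / len(preds), correct, len(preds)

preds = np.array([3, 10, 37, 37, 5, 6, 7, 37])    # two images, max_text_len = 4
labels = np.array([3, 10, 37, 37, 5, 6, 8, 37])
print(srn_accuracy(preds, labels, max_text_len=4))  # (0.5, 1, 2)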
--- a/tools/train.py +++ b/tools/train.py @@ -52,6 +52,7 @@ def main(): train_fetch_name_list = train_build_outputs[1] train_fetch_varname_list = train_build_outputs[2] train_opt_loss_name = train_build_outputs[3] + model_average = train_build_outputs[-1] eval_program = fluid.Program() eval_build_outputs = program.build( @@ -85,7 +86,8 @@ def main(): 'train_program':train_program,\ 'reader':train_loader,\ 'fetch_name_list':train_fetch_name_list,\ - 'fetch_varname_list':train_fetch_varname_list} + 'fetch_varname_list':train_fetch_varname_list,\ + 'model_average': model_average} eval_info_dict = {'program':eval_program,\ 'reader':eval_reader,\ From 6832ca029fe6d7bccd68fddcfe1aedc8e4d6618f Mon Sep 17 00:00:00 2001 From: tink2123 Date: Sat, 15 Aug 2020 12:39:07 +0800 Subject: [PATCH 02/11] update config --- .../rec_r50fpn_vd_none_srn_pvam_test_all.yml | 5 +- ppocr/data/rec/dataset_traversal.py | 49 +++--- ppocr/modeling/architectures/rec_model.py | 99 ++++++++++--- ppocr/modeling/heads/self_attention/model.py | 139 +++++++++--------- tools/eval_utils/eval_rec_utils.py | 21 ++- tools/program.py | 15 +- train_data | 1 + 7 files changed, 197 insertions(+), 132 deletions(-) create mode 120000 train_data diff --git a/configs/rec/rec_r50fpn_vd_none_srn_pvam_test_all.yml b/configs/rec/rec_r50fpn_vd_none_srn_pvam_test_all.yml index 933a7513..7a0f136c 100755 --- a/configs/rec/rec_r50fpn_vd_none_srn_pvam_test_all.yml +++ b/configs/rec/rec_r50fpn_vd_none_srn_pvam_test_all.yml @@ -17,11 +17,12 @@ Global: average_window: 0.15 max_average_window: 15625 min_average_window: 10000 - reader_yml: ./configs/rec/rec_srn_reader.yml + reader_yml: ./configs/rec/rec_benchmark_reader.yml pretrain_weights: checkpoints: save_inference_dir: - + infer_img: + Architecture: function: ppocr.modeling.architectures.rec_model,RecModel diff --git a/ppocr/data/rec/dataset_traversal.py b/ppocr/data/rec/dataset_traversal.py index 7135fca5..b46e37da 100755 --- a/ppocr/data/rec/dataset_traversal.py +++ b/ppocr/data/rec/dataset_traversal.py @@ -118,15 +118,14 @@ class LMDBReader(object): image_file_list = get_image_file_list(self.infer_img) for single_img in image_file_list: img = cv2.imread(single_img) - if img.shape[-1]==1 or len(list(img.shape))==2: + if img.shape[-1] == 1 or len(list(img.shape)) == 2: img = cv2.cvtColor(img, cv2.COLOR_GRAY2BGR) if self.loss_type == 'srn': norm_img = process_image_srn( img=img, image_shape=self.image_shape, num_heads=self.num_heads, - max_text_length=self.max_text_length - ) + max_text_length=self.max_text_length) else: norm_img = process_image( img=img, @@ -135,20 +134,20 @@ class LMDBReader(object): tps=self.use_tps, infer_mode=True) yield norm_img - elif self.mode == 'test': - image_file_list = get_image_file_list(self.infer_img) - for single_img in image_file_list: - img = cv2.imread(single_img) - if img.shape[-1]==1 or len(list(img.shape))==2: - img = cv2.cvtColor(img, cv2.COLOR_GRAY2BGR) - norm_img = process_image( - img=img, - image_shape=self.image_shape, - char_ops=self.char_ops, - tps=self.use_tps, - infer_mode=True - ) - yield norm_img + #elif self.mode == 'eval': + # image_file_list = get_image_file_list(self.infer_img) + # for single_img in image_file_list: + # img = cv2.imread(single_img) + # if img.shape[-1]==1 or len(list(img.shape))==2: + # img = cv2.cvtColor(img, cv2.COLOR_GRAY2BGR) + # norm_img = process_image( + # img=img, + # image_shape=self.image_shape, + # char_ops=self.char_ops, + # tps=self.use_tps, + # infer_mode=True + # ) + # yield norm_img else: lmdb_sets = 
self.load_hierarchical_lmdb_dataset() if process_id == 0: @@ -169,14 +168,15 @@ class LMDBReader(object): img, label = sample_info outs = [] if self.loss_type == "srn": - outs = process_image_srn(img, self.image_shape, self.num_heads, - self.max_text_length, label, - self.char_ops, self.loss_type) + outs = process_image_srn( + img, self.image_shape, self.num_heads, + self.max_text_length, label, self.char_ops, + self.loss_type) else: - outs = process_image(img, self.image_shape, label, - self.char_ops, self.loss_type, - self.max_text_length) + outs = process_image( + img, self.image_shape, label, self.char_ops, + self.loss_type, self.max_text_length) if outs is None: continue yield outs @@ -184,6 +184,7 @@ class LMDBReader(object): if finish_read_num == len(lmdb_sets): break self.close_lmdb_dataset(lmdb_sets) + def batch_iter_reader(): batch_outs = [] for outs in sample_iter_reader(): @@ -311,4 +312,4 @@ class SimpleReader(object): if self.infer_img is None: return batch_iter_reader - return sample_iter_reader \ No newline at end of file + return sample_iter_reader diff --git a/ppocr/modeling/architectures/rec_model.py b/ppocr/modeling/architectures/rec_model.py index a030f362..d2e01a43 100755 --- a/ppocr/modeling/architectures/rec_model.py +++ b/ppocr/modeling/architectures/rec_model.py @@ -79,17 +79,45 @@ class RecModel(object): feed_list = [image, label_in, label_out] labels = {'label_in': label_in, 'label_out': label_out} elif self.loss_type == "srn": - encoder_word_pos = fluid.data(name="encoder_word_pos", shape=[-1, int((image_shape[-2] / 8) * (image_shape[-1] / 8)), 1], dtype="int64") - gsrm_word_pos = fluid.data(name="gsrm_word_pos", shape=[-1, self.max_text_length, 1], dtype="int64") - gsrm_slf_attn_bias1 = fluid.data(name="gsrm_slf_attn_bias1", shape=[-1, self.num_heads, self.max_text_length, self.max_text_length]) - gsrm_slf_attn_bias2 = fluid.data(name="gsrm_slf_attn_bias2", shape=[-1, self.num_heads, self.max_text_length, self.max_text_length]) - lbl_weight = fluid.layers.data(name="lbl_weight", shape=[-1, 1], dtype='int64') + encoder_word_pos = fluid.data( + name="encoder_word_pos", + shape=[ + -1, int((image_shape[-2] / 8) * (image_shape[-1] / 8)), + 1 + ], + dtype="int64") + gsrm_word_pos = fluid.data( + name="gsrm_word_pos", + shape=[-1, self.max_text_length, 1], + dtype="int64") + gsrm_slf_attn_bias1 = fluid.data( + name="gsrm_slf_attn_bias1", + shape=[ + -1, self.num_heads, self.max_text_length, + self.max_text_length + ]) + gsrm_slf_attn_bias2 = fluid.data( + name="gsrm_slf_attn_bias2", + shape=[ + -1, self.num_heads, self.max_text_length, + self.max_text_length + ]) + lbl_weight = fluid.layers.data( + name="lbl_weight", shape=[-1, 1], dtype='int64') label = fluid.data( name='label', shape=[-1, 1], dtype='int32', lod_level=1) - feed_list = [image, label, encoder_word_pos, gsrm_word_pos, gsrm_slf_attn_bias1, gsrm_slf_attn_bias2, lbl_weight] - labels = {'label': label, 'encoder_word_pos': encoder_word_pos, - 'gsrm_word_pos': gsrm_word_pos, 'gsrm_slf_attn_bias1': gsrm_slf_attn_bias1, - 'gsrm_slf_attn_bias2': gsrm_slf_attn_bias2,'lbl_weight':lbl_weight} + feed_list = [ + image, label, encoder_word_pos, gsrm_word_pos, + gsrm_slf_attn_bias1, gsrm_slf_attn_bias2, lbl_weight + ] + labels = { + 'label': label, + 'encoder_word_pos': encoder_word_pos, + 'gsrm_word_pos': gsrm_word_pos, + 'gsrm_slf_attn_bias1': gsrm_slf_attn_bias1, + 'gsrm_slf_attn_bias2': gsrm_slf_attn_bias2, + 'lbl_weight': lbl_weight + } else: label = fluid.data( name='label', shape=[None, 1], dtype='int32', 
lod_level=1) @@ -112,15 +140,41 @@ class RecModel(object): "We set img_shape to be the same , it may affect the inference effect" ) image_shape = deepcopy(self.image_shape) - image = fluid.data(name='image', shape=image_shape, dtype='float32') + image = fluid.data(name='image', shape=image_shape, dtype='float32') if self.loss_type == "srn": - encoder_word_pos = fluid.data(name="encoder_word_pos", shape=[-1, int((image_shape[-2] / 8) * (image_shape[-1] / 8)), 1], dtype="int64") - gsrm_word_pos = fluid.data(name="gsrm_word_pos", shape=[-1, self.max_text_length, 1], dtype="int64") - gsrm_slf_attn_bias1 = fluid.data(name="gsrm_slf_attn_bias1", shape=[-1, self.num_heads, self.max_text_length, self.max_text_length]) - gsrm_slf_attn_bias2 = fluid.data(name="gsrm_slf_attn_bias2", shape=[-1, self.num_heads, self.max_text_length, self.max_text_length]) - feed_list = [image, encoder_word_pos, gsrm_word_pos, gsrm_slf_attn_bias1, gsrm_slf_attn_bias2] - labels = {'encoder_word_pos': encoder_word_pos, 'gsrm_word_pos': gsrm_word_pos, - 'gsrm_slf_attn_bias1': gsrm_slf_attn_bias1, 'gsrm_slf_attn_bias2': gsrm_slf_attn_bias2} + encoder_word_pos = fluid.data( + name="encoder_word_pos", + shape=[ + -1, int((image_shape[-2] / 8) * (image_shape[-1] / 8)), + 1 + ], + dtype="int64") + gsrm_word_pos = fluid.data( + name="gsrm_word_pos", + shape=[-1, self.max_text_length, 1], + dtype="int64") + gsrm_slf_attn_bias1 = fluid.data( + name="gsrm_slf_attn_bias1", + shape=[ + -1, self.num_heads, self.max_text_length, + self.max_text_length + ]) + gsrm_slf_attn_bias2 = fluid.data( + name="gsrm_slf_attn_bias2", + shape=[ + -1, self.num_heads, self.max_text_length, + self.max_text_length + ]) + feed_list = [ + image, encoder_word_pos, gsrm_word_pos, gsrm_slf_attn_bias1, + gsrm_slf_attn_bias2 + ] + labels = { + 'encoder_word_pos': encoder_word_pos, + 'gsrm_word_pos': gsrm_word_pos, + 'gsrm_slf_attn_bias1': gsrm_slf_attn_bias1, + 'gsrm_slf_attn_bias2': gsrm_slf_attn_bias2 + } return image, labels, loader def __call__(self, mode): @@ -140,8 +194,13 @@ class RecModel(object): label = labels['label'] if self.loss_type == 'srn': total_loss, img_loss, word_loss = self.loss(predicts, labels) - outputs = {'total_loss':total_loss, 'img_loss':img_loss, 'word_loss':word_loss, - 'decoded_out':decoded_out, 'label':label} + outputs = { + 'total_loss': total_loss, + 'img_loss': img_loss, + 'word_loss': word_loss, + 'decoded_out': decoded_out, + 'label': label + } else: outputs = {'total_loss':loss, 'decoded_out':\ decoded_out, 'label':label} @@ -156,4 +215,4 @@ class RecModel(object): predict = predicts['predict'] if self.loss_type == "ctc": predict = fluid.layers.softmax(predict) - return loader, {'decoded_out': decoded_out, 'predicts': predict} \ No newline at end of file + return loader, {'decoded_out': decoded_out, 'predicts': predict} diff --git a/ppocr/modeling/heads/self_attention/model.py b/ppocr/modeling/heads/self_attention/model.py index d4aecd5f..8ac1458b 100644 --- a/ppocr/modeling/heads/self_attention/model.py +++ b/ppocr/modeling/heads/self_attention/model.py @@ -4,8 +4,9 @@ import numpy as np import paddle.fluid as fluid import paddle.fluid.layers as layers -from .desc import * -from .config import ModelHyperParams,TrainTaskConfig +# Set seed for CE +dropout_seed = None + def wrap_layer_with_block(layer, block_idx): """ @@ -114,7 +115,7 @@ def multi_head_attention(queries, def __split_heads_qkv(queries, keys, values, n_head, d_key, d_value): """ - Reshape input tensors at the last dimension to split multi-heads + Reshape input 
tensors at the last dimension to split multi-heads and then transpose. Specifically, transform the input tensor with shape [bs, max_sequence_length, n_head * hidden_dim] to the output tensor with shape [bs, n_head, max_sequence_length, hidden_dim]. @@ -269,23 +270,24 @@ pre_process_layer = partial(pre_post_process_layer, None) post_process_layer = pre_post_process_layer -def prepare_encoder(src_word,#[b,t,c] - src_pos, - src_vocab_size, - src_emb_dim, - src_max_len, - dropout_rate=0., - bos_idx=0, - word_emb_param_name=None, - pos_enc_param_name=None): +def prepare_encoder( + src_word, #[b,t,c] + src_pos, + src_vocab_size, + src_emb_dim, + src_max_len, + dropout_rate=0., + bos_idx=0, + word_emb_param_name=None, + pos_enc_param_name=None): """Add word embeddings and position encodings. The output tensor has a shape of: [batch_size, max_src_length_in_batch, d_model]. This module is used at the bottom of the encoder stacks. """ - - src_word_emb =src_word#layers.concat(res,axis=1) - src_word_emb=layers.cast(src_word_emb,'float32') + + src_word_emb = src_word #layers.concat(res,axis=1) + src_word_emb = layers.cast(src_word_emb, 'float32') # print("src_word_emb",src_word_emb) src_word_emb = layers.scale(x=src_word_emb, scale=src_emb_dim**0.5) @@ -302,14 +304,14 @@ def prepare_encoder(src_word,#[b,t,c] def prepare_decoder(src_word, - src_pos, - src_vocab_size, - src_emb_dim, - src_max_len, - dropout_rate=0., - bos_idx=0, - word_emb_param_name=None, - pos_enc_param_name=None): + src_pos, + src_vocab_size, + src_emb_dim, + src_max_len, + dropout_rate=0., + bos_idx=0, + word_emb_param_name=None, + pos_enc_param_name=None): """Add word embeddings and position encodings. The output tensor has a shape of: [batch_size, max_src_length_in_batch, d_model]. @@ -323,7 +325,7 @@ def prepare_decoder(src_word, name=word_emb_param_name, initializer=fluid.initializer.Normal(0., src_emb_dim**-0.5))) # print("target_word_emb",src_word_emb) - src_word_emb = layers.scale(x=src_word_emb, scale=src_emb_dim ** 0.5) + src_word_emb = layers.scale(x=src_word_emb, scale=src_emb_dim**0.5) src_pos_enc = layers.embedding( src_pos, size=[src_max_len, src_emb_dim], @@ -335,6 +337,7 @@ def prepare_decoder(src_word, enc_input, dropout_prob=dropout_rate, seed=dropout_seed, is_test=False) if dropout_rate else enc_input + # prepare_encoder = partial( # prepare_encoder_decoder, pos_enc_param_name=pos_enc_param_names[0]) # prepare_decoder = partial( @@ -595,21 +598,9 @@ def transformer(src_vocab_size, weights = all_inputs[-1] enc_output = wrap_encoder( - src_vocab_size, - ModelHyperParams.src_seq_len, - n_layer, - n_head, - d_key, - d_value, - d_model, - d_inner_hid, - prepostprocess_dropout, - attention_dropout, - relu_dropout, - preprocess_cmd, - postprocess_cmd, - weight_sharing, - enc_inputs) + src_vocab_size, 64, n_layer, n_head, d_key, d_value, d_model, + d_inner_hid, prepostprocess_dropout, attention_dropout, relu_dropout, + preprocess_cmd, postprocess_cmd, weight_sharing, enc_inputs) predict = wrap_decoder( trg_vocab_size, @@ -650,34 +641,34 @@ def transformer(src_vocab_size, def wrap_encoder_forFeature(src_vocab_size, - max_length, - n_layer, - n_head, - d_key, - d_value, - d_model, - d_inner_hid, - prepostprocess_dropout, - attention_dropout, - relu_dropout, - preprocess_cmd, - postprocess_cmd, - weight_sharing, - enc_inputs=None, - bos_idx=0): + max_length, + n_layer, + n_head, + d_key, + d_value, + d_model, + d_inner_hid, + prepostprocess_dropout, + attention_dropout, + relu_dropout, + preprocess_cmd, + postprocess_cmd, + 
weight_sharing, + enc_inputs=None, + bos_idx=0): """ The wrapper assembles together all needed layers for the encoder. img, src_pos, src_slf_attn_bias = enc_inputs img """ - + if enc_inputs is None: # This is used to implement independent encoder program in inference. conv_features, src_pos, src_slf_attn_bias = make_all_inputs( encoder_data_input_fields) else: - conv_features, src_pos, src_slf_attn_bias = enc_inputs# - b,t,c = conv_features.shape + conv_features, src_pos, src_slf_attn_bias = enc_inputs # + b, t, c = conv_features.shape #""" # insert cnn #""" @@ -694,11 +685,11 @@ def wrap_encoder_forFeature(src_vocab_size, #b , c, h, w = feat.shape#h=6 #print(feat) #layers.Print(feat,message="conv_feat",summarize=10) - + #feat =layers.conv2d(feat,c,filter_size =[4 , 1],act="relu") #feat = layers.pool2d(feat,pool_stride=(3,1),pool_size=(3,1)) #src_word = layers.squeeze(feat,axes=[2]) #src_word [-1,c,ww] - + #feat = layers.transpose(feat, [0,3,1,2]) #src_word = layers.reshape(feat,[-1,w, c*h]) #src_word = layers.im2sequence( @@ -706,10 +697,10 @@ def wrap_encoder_forFeature(src_vocab_size, # stride=[1, 1], # filter_size=[feat.shape[2], 1]) #layers.Print(src_word,message="src_word",summarize=10) - + # print('feat',feat) #print("src_word",src_word) - + enc_input = prepare_encoder( conv_features, src_pos, @@ -718,7 +709,7 @@ def wrap_encoder_forFeature(src_vocab_size, max_length, prepostprocess_dropout, bos_idx=bos_idx, - word_emb_param_name=word_emb_param_names[0]) + word_emb_param_name="src_word_emb_table") enc_output = encoder( enc_input, @@ -736,6 +727,7 @@ def wrap_encoder_forFeature(src_vocab_size, postprocess_cmd, ) return enc_output + def wrap_encoder(src_vocab_size, max_length, n_layer, @@ -762,7 +754,7 @@ def wrap_encoder(src_vocab_size, src_word, src_pos, src_slf_attn_bias = make_all_inputs( encoder_data_input_fields) else: - src_word, src_pos, src_slf_attn_bias = enc_inputs# + src_word, src_pos, src_slf_attn_bias = enc_inputs # #""" # insert cnn #""" @@ -779,11 +771,11 @@ def wrap_encoder(src_vocab_size, #b , c, h, w = feat.shape#h=6 #print(feat) #layers.Print(feat,message="conv_feat",summarize=10) - + #feat =layers.conv2d(feat,c,filter_size =[4 , 1],act="relu") #feat = layers.pool2d(feat,pool_stride=(3,1),pool_size=(3,1)) #src_word = layers.squeeze(feat,axes=[2]) #src_word [-1,c,ww] - + #feat = layers.transpose(feat, [0,3,1,2]) #src_word = layers.reshape(feat,[-1,w, c*h]) #src_word = layers.im2sequence( @@ -791,7 +783,7 @@ def wrap_encoder(src_vocab_size, # stride=[1, 1], # filter_size=[feat.shape[2], 1]) #layers.Print(src_word,message="src_word",summarize=10) - + # print('feat',feat) #print("src_word",src_word) enc_input = prepare_decoder( @@ -802,7 +794,7 @@ def wrap_encoder(src_vocab_size, max_length, prepostprocess_dropout, bos_idx=bos_idx, - word_emb_param_name=word_emb_param_names[0]) + word_emb_param_name="src_word_emb_table") enc_output = encoder( enc_input, @@ -858,8 +850,8 @@ def wrap_decoder(trg_vocab_size, max_length, prepostprocess_dropout, bos_idx=bos_idx, - word_emb_param_name=word_emb_param_names[0] - if weight_sharing else word_emb_param_names[1]) + word_emb_param_name="src_word_emb_table" + if weight_sharing else "trg_word_emb_table") dec_output = decoder( dec_input, enc_output, @@ -886,7 +878,7 @@ def wrap_decoder(trg_vocab_size, predict = layers.matmul( x=dec_output, y=fluid.default_main_program().global_block().var( - word_emb_param_names[0]), + "trg_word_emb_table"), transpose_y=True) else: predict = layers.fc(input=dec_output, @@ -931,12 +923,13 @@ def 
fast_decode(src_vocab_size, enc_inputs_len = len(encoder_data_input_fields) dec_inputs_len = len(fast_decoder_data_input_fields) - enc_inputs = all_inputs[0:enc_inputs_len]#enc_inputs tensor - dec_inputs = all_inputs[enc_inputs_len:enc_inputs_len + dec_inputs_len]#dec_inputs tensor + enc_inputs = all_inputs[0:enc_inputs_len] #enc_inputs tensor + dec_inputs = all_inputs[enc_inputs_len:enc_inputs_len + + dec_inputs_len] #dec_inputs tensor enc_output = wrap_encoder( src_vocab_size, - ModelHyperParams.src_seq_len,##to do !!!!!???? + 64, ##to do !!!!!???? n_layer, n_head, d_key, diff --git a/tools/eval_utils/eval_rec_utils.py b/tools/eval_utils/eval_rec_utils.py index 3d496bd3..ecdf0aaf 100644 --- a/tools/eval_utils/eval_rec_utils.py +++ b/tools/eval_utils/eval_rec_utils.py @@ -61,7 +61,7 @@ def eval_rec_run(exe, config, eval_info_dict, mode): img_list.append(data[ino][0]) label_list.append(data[ino][1]) - if config['Global']['loss_type'] != "srn": + if config['Global']['loss_type'] != "srn": img_list = np.concatenate(img_list, axis=0) outs = exe.run(eval_info_dict['program'], \ feed={'image': img_list}, \ @@ -75,7 +75,8 @@ def eval_rec_run(exe, config, eval_info_dict, mode): preds_lod = outs[0].lod()[0] labels, labels_lod = convert_rec_label_to_lod(label_list) acc, acc_num, sample_num = cal_predicts_accuracy( - char_ops, preds, preds_lod, labels, labels_lod, is_remove_duplicate) + char_ops, preds, preds_lod, labels, labels_lod, + is_remove_duplicate) else: encoder_word_pos_list = [] gsrm_word_pos_list = [] @@ -89,15 +90,19 @@ def eval_rec_run(exe, config, eval_info_dict, mode): img_list = np.concatenate(img_list, axis=0) label_list = np.concatenate(label_list, axis=0) - encoder_word_pos_list = np.concatenate(encoder_word_pos_list, axis=0).astype(np.int64) - gsrm_word_pos_list = np.concatenate(gsrm_word_pos_list, axis=0).astype(np.int64) - gsrm_slf_attn_bias1_list = np.concatenate(gsrm_slf_attn_bias1_list, axis=0).astype(np.float32) - gsrm_slf_attn_bias2_list = np.concatenate(gsrm_slf_attn_bias2_list, axis=0).astype(np.float32) + encoder_word_pos_list = np.concatenate( + encoder_word_pos_list, axis=0).astype(np.int64) + gsrm_word_pos_list = np.concatenate( + gsrm_word_pos_list, axis=0).astype(np.int64) + gsrm_slf_attn_bias1_list = np.concatenate( + gsrm_slf_attn_bias1_list, axis=0).astype(np.float32) + gsrm_slf_attn_bias2_list = np.concatenate( + gsrm_slf_attn_bias2_list, axis=0).astype(np.float32) labels = label_list outs = exe.run(eval_info_dict['program'], \ - feed={'image': img_list, 'encoder_word_pos': encoder_word_pos_list, + feed={'image': img_list, 'encoder_word_pos': encoder_word_pos_list, 'gsrm_word_pos': gsrm_word_pos_list, 'gsrm_slf_attn_bias1': gsrm_slf_attn_bias1_list, 'gsrm_slf_attn_bias2': gsrm_slf_attn_bias2_list}, \ fetch_list=eval_info_dict['fetch_varname_list'], \ @@ -108,7 +113,7 @@ def eval_rec_run(exe, config, eval_info_dict, mode): total_acc_num += acc_num total_sample_num += sample_num - logger.info("eval batch id: {}, acc: {}".format(total_batch_num, acc)) + #logger.info("eval batch id: {}, acc: {}".format(total_batch_num, acc)) total_batch_num += 1 avg_acc = total_acc_num * 1.0 / total_sample_num metrics = {'avg_acc': avg_acc, "total_acc_num": total_acc_num, \ diff --git a/tools/program.py b/tools/program.py index 64c827e7..6ebc27cb 100755 --- a/tools/program.py +++ b/tools/program.py @@ -34,6 +34,7 @@ from ppocr.utils.save_load import save_model import numpy as np from ppocr.utils.character import cal_predicts_accuracy, cal_predicts_accuracy_srn, CharacterOps + class 
ArgsParser(ArgumentParser): def __init__(self): super(ArgsParser, self).__init__( @@ -196,10 +197,13 @@ def build(config, main_prog, startup_prog, mode): if config['Global']["loss_type"] == 'srn': model_average = fluid.optimizer.ModelAverage( config['Global']['average_window'], - min_average_window=config['Global']['min_average_window'], - max_average_window=config['Global']['max_average_window']) + min_average_window=config['Global'][ + 'min_average_window'], + max_average_window=config['Global'][ + 'max_average_window']) - return (dataloader, fetch_name_list, fetch_varname_list, opt_loss_name,model_average) + return (dataloader, fetch_name_list, fetch_varname_list, opt_loss_name, + model_average) def build_export(config, main_prog, startup_prog): @@ -398,6 +402,7 @@ def train_eval_rec_run(config, exe, train_info_dict, eval_info_dict): save_model(train_info_dict['train_program'], save_path) return + def preprocess(): FLAGS = ArgsParser().parse_args() config = load_config(FLAGS.config) @@ -409,8 +414,8 @@ def preprocess(): check_gpu(use_gpu) alg = config['Global']['algorithm'] - assert alg in ['EAST', 'DB', 'Rosetta', 'CRNN', 'STARNet', 'RARE'] - if alg in ['Rosetta', 'CRNN', 'STARNet', 'RARE']: + assert alg in ['EAST', 'DB', 'Rosetta', 'CRNN', 'STARNet', 'RARE', 'SRN'] + if alg in ['Rosetta', 'CRNN', 'STARNet', 'RARE', 'SRN']: config['Global']['char_ops'] = CharacterOps(config['Global']) place = fluid.CUDAPlace(0) if use_gpu else fluid.CPUPlace() diff --git a/train_data b/train_data new file mode 120000 index 00000000..7c2082ab --- /dev/null +++ b/train_data @@ -0,0 +1 @@ +/workspace/PaddleOCR/train_data/ \ No newline at end of file From bf4863c95082651cc8daf5c39455632ca6c113db Mon Sep 17 00:00:00 2001 From: tink2123 Date: Sat, 15 Aug 2020 15:45:55 +0800 Subject: [PATCH 03/11] update infer_rec for srn --- ppocr/data/rec/dataset_traversal.py | 41 +++-- ppocr/modeling/architectures/rec_model.py | 5 +- ppocr/modeling/backbones/rec_resnet_vd.py | 2 +- ppocr/modeling/heads/rec_srn_all_head.py | 194 ++++++++++++---------- ppocr/modeling/losses/rec_srn_loss.py | 33 ++-- ppocr/utils/character.py | 25 +-- tools/eval_utils/eval_rec_utils.py | 4 +- tools/infer_rec.py | 48 +++++- 8 files changed, 194 insertions(+), 158 deletions(-) diff --git a/ppocr/data/rec/dataset_traversal.py b/ppocr/data/rec/dataset_traversal.py index b46e37da..53c7e87b 100755 --- a/ppocr/data/rec/dataset_traversal.py +++ b/ppocr/data/rec/dataset_traversal.py @@ -40,10 +40,12 @@ class LMDBReader(object): self.image_shape = params['image_shape'] self.loss_type = params['loss_type'] self.max_text_length = params['max_text_length'] - self.num_heads = params['num_heads'] self.mode = params['mode'] self.drop_last = False self.use_tps = False + self.num_heads = None + if "num_heads" in params: + self.num_heads = params['num_heads'] if "tps" in params: self.ues_tps = True self.use_distort = False @@ -134,20 +136,6 @@ class LMDBReader(object): tps=self.use_tps, infer_mode=True) yield norm_img - #elif self.mode == 'eval': - # image_file_list = get_image_file_list(self.infer_img) - # for single_img in image_file_list: - # img = cv2.imread(single_img) - # if img.shape[-1]==1 or len(list(img.shape))==2: - # img = cv2.cvtColor(img, cv2.COLOR_GRAY2BGR) - # norm_img = process_image( - # img=img, - # image_shape=self.image_shape, - # char_ops=self.char_ops, - # tps=self.use_tps, - # infer_mode=True - # ) - # yield norm_img else: lmdb_sets = self.load_hierarchical_lmdb_dataset() if process_id == 0: @@ -169,14 +157,22 @@ class LMDBReader(object): 
outs = [] if self.loss_type == "srn": outs = process_image_srn( - img, self.image_shape, self.num_heads, - self.max_text_length, label, self.char_ops, - self.loss_type) + img=img, + image_shape=self.image_shape, + num_heads=self.num_heads, + max_text_length=self.max_text_length, + label=label, + char_ops=self.char_ops, + loss_type=self.loss_type) else: outs = process_image( - img, self.image_shape, label, self.char_ops, - self.loss_type, self.max_text_length) + img=img, + image_shape=self.image_shape, + label=label, + char_ops=self.char_ops, + loss_type=self.loss_type, + max_text_length=self.max_text_length) if outs is None: continue yield outs @@ -192,8 +188,9 @@ class LMDBReader(object): if len(batch_outs) == self.batch_size: yield batch_outs batch_outs = [] - if len(batch_outs) != 0: - yield batch_outs + if not self.drop_last: + if len(batch_outs) != 0: + yield batch_outs if self.infer_img is None: return batch_iter_reader diff --git a/ppocr/modeling/architectures/rec_model.py b/ppocr/modeling/architectures/rec_model.py index d2e01a43..5eacd5de 100755 --- a/ppocr/modeling/architectures/rec_model.py +++ b/ppocr/modeling/architectures/rec_model.py @@ -58,7 +58,10 @@ class RecModel(object): self.loss_type = global_params['loss_type'] self.image_shape = global_params['image_shape'] self.max_text_length = global_params['max_text_length'] - self.num_heads = global_params["num_heads"] + if "num_heads" in params: + self.num_heads = global_params["num_heads"] + else: + self.num_heads = None def create_feed(self, mode): image_shape = deepcopy(self.image_shape) diff --git a/ppocr/modeling/backbones/rec_resnet_vd.py b/ppocr/modeling/backbones/rec_resnet_vd.py index 2c7cd4c7..bc58c8ac 100755 --- a/ppocr/modeling/backbones/rec_resnet_vd.py +++ b/ppocr/modeling/backbones/rec_resnet_vd.py @@ -32,7 +32,7 @@ class ResNet(): def __init__(self, params): self.layers = params['layers'] self.is_3x3 = True - supported_layers = [18, 34, 50, 101, 152] + supported_layers = [18, 34, 50, 101, 152, 200] assert self.layers in supported_layers, \ "supported layers are {} but input layer is {}".format(supported_layers, self.layers) diff --git a/ppocr/modeling/heads/rec_srn_all_head.py b/ppocr/modeling/heads/rec_srn_all_head.py index bf1f4a44..e1bb955d 100755 --- a/ppocr/modeling/heads/rec_srn_all_head.py +++ b/ppocr/modeling/heads/rec_srn_all_head.py @@ -21,15 +21,12 @@ import math import paddle import paddle.fluid as fluid from paddle.fluid.param_attr import ParamAttr -#from .rec_seq_encoder import SequenceEncoder -#from ..common_functions import get_para_bias_attr import numpy as np from .self_attention.model import wrap_encoder from .self_attention.model import wrap_encoder_forFeature gradient_clip = 10 - class SRNPredict(object): def __init__(self, params): super(SRNPredict, self).__init__() @@ -41,7 +38,6 @@ class SRNPredict(object): self.num_decoder_TUs = params['num_decoder_TUs'] self.hidden_dims = params['hidden_dims'] - def pvam(self, inputs, others): b, c, h, w = inputs.shape @@ -53,52 +49,62 @@ class SRNPredict(object): encoder_word_pos = others["encoder_word_pos"] gsrm_word_pos = others["gsrm_word_pos"] - enc_inputs = [conv_features, encoder_word_pos, None] - word_features = wrap_encoder_forFeature(src_vocab_size=-1, - max_length=t, - n_layer=self.num_encoder_TUs, - n_head=self.num_heads, - d_key= int(self.hidden_dims / self.num_heads), - d_value= int(self.hidden_dims / self.num_heads), - d_model=self.hidden_dims, - d_inner_hid=self.hidden_dims, - prepostprocess_dropout=0.1, - attention_dropout=0.1, - 
relu_dropout=0.1, - preprocess_cmd="n", - postprocess_cmd="da", - weight_sharing=True, - enc_inputs=enc_inputs, - ) - fluid.clip.set_gradient_clip(fluid.clip.GradientClipByValue(gradient_clip)) + word_features = wrap_encoder_forFeature( + src_vocab_size=-1, + max_length=t, + n_layer=self.num_encoder_TUs, + n_head=self.num_heads, + d_key=int(self.hidden_dims / self.num_heads), + d_value=int(self.hidden_dims / self.num_heads), + d_model=self.hidden_dims, + d_inner_hid=self.hidden_dims, + prepostprocess_dropout=0.1, + attention_dropout=0.1, + relu_dropout=0.1, + preprocess_cmd="n", + postprocess_cmd="da", + weight_sharing=True, + enc_inputs=enc_inputs, ) + fluid.clip.set_gradient_clip( + fluid.clip.GradientClipByValue(gradient_clip)) #===== Parallel Visual Attention Module ===== b, t, c = word_features.shape - word_features = fluid.layers.fc(word_features, c, num_flatten_dims=2) + word_features = fluid.layers.fc(word_features, c, num_flatten_dims=2) word_features_ = fluid.layers.reshape(word_features, [-1, 1, t, c]) - word_features_ = fluid.layers.expand(word_features_, [1, self.max_length, 1, 1]) - word_pos_feature = fluid.layers.embedding(gsrm_word_pos, [self.max_length, c]) - word_pos_ = fluid.layers.reshape(word_pos_feature, [-1, self.max_length, 1, c]) + word_features_ = fluid.layers.expand(word_features_, + [1, self.max_length, 1, 1]) + word_pos_feature = fluid.layers.embedding(gsrm_word_pos, + [self.max_length, c]) + word_pos_ = fluid.layers.reshape(word_pos_feature, + [-1, self.max_length, 1, c]) word_pos_ = fluid.layers.expand(word_pos_, [1, 1, t, 1]) - temp = fluid.layers.elementwise_add(word_features_, word_pos_, act='tanh') + temp = fluid.layers.elementwise_add( + word_features_, word_pos_, act='tanh') - attention_weight = fluid.layers.fc(input=temp, size=1, num_flatten_dims=3, bias_attr=False) - attention_weight = fluid.layers.reshape(x=attention_weight, shape=[-1, self.max_length, t]) - attention_weight = fluid.layers.softmax(input=attention_weight, axis=-1) + attention_weight = fluid.layers.fc(input=temp, + size=1, + num_flatten_dims=3, + bias_attr=False) + attention_weight = fluid.layers.reshape( + x=attention_weight, shape=[-1, self.max_length, t]) + attention_weight = fluid.layers.softmax(input=attention_weight, axis=-1) + + pvam_features = fluid.layers.matmul(attention_weight, + word_features) #[b, max_length, c] - pvam_features = fluid.layers.matmul(attention_weight, word_features)#[b, max_length, c] - return pvam_features - + def gsrm(self, pvam_features, others): #===== GSRM Visual-to-semantic embedding block ===== b, t, c = pvam_features.shape - word_out = fluid.layers.fc(input=fluid.layers.reshape(pvam_features, [-1, c]), - size=self.char_num, - act="softmax") + word_out = fluid.layers.fc( + input=fluid.layers.reshape(pvam_features, [-1, c]), + size=self.char_num, + act="softmax") #word_out.stop_gradient = True word_ids = fluid.layers.argmax(word_out, axis=1) word_ids.stop_gradient = True @@ -106,7 +112,7 @@ class SRNPredict(object): #===== GSRM Semantic reasoning block ===== """ - This module is achieved through bi-transformers, + This module is achieved through bi-transformers, ngram_feature1 is the froward one, ngram_fetaure2 is the backward one """ pad_idx = self.char_num @@ -120,7 +126,8 @@ class SRNPredict(object): word1 for forward; word2 for backward """ word1 = fluid.layers.cast(word_ids, "float32") - word1 = fluid.layers.pad(word1, [0, 0, 1, 0, 0, 0], pad_value=1.0 * pad_idx) + word1 = fluid.layers.pad(word1, [0, 0, 1, 0, 0, 0], + pad_value=1.0 * pad_idx) 
word1 = fluid.layers.cast(word1, "int64") word1 = word1[:, :-1, :] word2 = word_ids @@ -132,39 +139,40 @@ class SRNPredict(object): enc_inputs_1 = [word1, gsrm_word_pos, gsrm_slf_attn_bias1] enc_inputs_2 = [word2, gsrm_word_pos, gsrm_slf_attn_bias2] - gsrm_feature1 = wrap_encoder(src_vocab_size=self.char_num + 1, - max_length=self.max_length, - n_layer=self.num_decoder_TUs, - n_head=self.num_heads, - d_key=int(self.hidden_dims / self.num_heads), - d_value=int(self.hidden_dims / self.num_heads), - d_model=self.hidden_dims, - d_inner_hid=self.hidden_dims, - prepostprocess_dropout=0.1, - attention_dropout=0.1, - relu_dropout=0.1, - preprocess_cmd="n", - postprocess_cmd="da", - weight_sharing=True, - enc_inputs=enc_inputs_1, - ) - gsrm_feature2 = wrap_encoder(src_vocab_size=self.char_num + 1, - max_length=self.max_length, - n_layer=self.num_decoder_TUs, - n_head=self.num_heads, - d_key=int(self.hidden_dims / self.num_heads), - d_value=int(self.hidden_dims / self.num_heads), - d_model=self.hidden_dims, - d_inner_hid=self.hidden_dims, - prepostprocess_dropout=0.1, - attention_dropout=0.1, - relu_dropout=0.1, - preprocess_cmd="n", - postprocess_cmd="da", - weight_sharing=True, - enc_inputs=enc_inputs_2, - ) - gsrm_feature2 = fluid.layers.pad(gsrm_feature2, [0, 0, 0, 1, 0, 0], pad_value=0.) + gsrm_feature1 = wrap_encoder( + src_vocab_size=self.char_num + 1, + max_length=self.max_length, + n_layer=self.num_decoder_TUs, + n_head=self.num_heads, + d_key=int(self.hidden_dims / self.num_heads), + d_value=int(self.hidden_dims / self.num_heads), + d_model=self.hidden_dims, + d_inner_hid=self.hidden_dims, + prepostprocess_dropout=0.1, + attention_dropout=0.1, + relu_dropout=0.1, + preprocess_cmd="n", + postprocess_cmd="da", + weight_sharing=True, + enc_inputs=enc_inputs_1, ) + gsrm_feature2 = wrap_encoder( + src_vocab_size=self.char_num + 1, + max_length=self.max_length, + n_layer=self.num_decoder_TUs, + n_head=self.num_heads, + d_key=int(self.hidden_dims / self.num_heads), + d_value=int(self.hidden_dims / self.num_heads), + d_model=self.hidden_dims, + d_inner_hid=self.hidden_dims, + prepostprocess_dropout=0.1, + attention_dropout=0.1, + relu_dropout=0.1, + preprocess_cmd="n", + postprocess_cmd="da", + weight_sharing=True, + enc_inputs=enc_inputs_2, ) + gsrm_feature2 = fluid.layers.pad(gsrm_feature2, [0, 0, 0, 1, 0, 0], + pad_value=0.) 
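        # Data layout: word1 feeds the forward transformer with a right-shifted
        # copy of the decoded characters (pad at the front, drop the last step)
        # and word2 feeds the backward one with a left-shifted copy (drop the
        # first step, pad at the end), so each stream only sees one side of the
        # context. The pad plus the [:, 1:, ] slice applied to gsrm_feature2
        # below re-align the backward features with the forward ones before the
        # two streams are summed into gsrm_features.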
gsrm_feature2 = gsrm_feature2[:, 1:, ] gsrm_features = gsrm_feature1 + gsrm_feature2 @@ -172,10 +180,12 @@ class SRNPredict(object): gsrm_out = fluid.layers.matmul( x=gsrm_features, - y=fluid.default_main_program().global_block().var("src_word_emb_table"), + y=fluid.default_main_program().global_block().var( + "src_word_emb_table"), transpose_y=True) - b,t,c = gsrm_out.shape - gsrm_out = fluid.layers.softmax(input=fluid.layers.reshape(gsrm_out, [-1, c])) + b, t, c = gsrm_out.shape + gsrm_out = fluid.layers.softmax(input=fluid.layers.reshape(gsrm_out, + [-1, c])) return gsrm_features, word_out, gsrm_out @@ -184,19 +194,25 @@ class SRNPredict(object): #===== Visual-Semantic Fusion Decoder Module ===== b, t, c1 = pvam_features.shape b, t, c2 = gsrm_features.shape - combine_features_ = fluid.layers.concat([pvam_features, gsrm_features], axis=2) - img_comb_features_ = fluid.layers.reshape(x=combine_features_, shape=[-1, c1 + c2]) - img_comb_features_map = fluid.layers.fc(input=img_comb_features_, size=c1, act="sigmoid") - img_comb_features_map = fluid.layers.reshape(x=img_comb_features_map, shape=[-1, t, c1]) - combine_features = img_comb_features_map * pvam_features + (1.0 - img_comb_features_map) * gsrm_features - img_comb_features = fluid.layers.reshape(x=combine_features, shape=[-1, c1]) + combine_features_ = fluid.layers.concat( + [pvam_features, gsrm_features], axis=2) + img_comb_features_ = fluid.layers.reshape( + x=combine_features_, shape=[-1, c1 + c2]) + img_comb_features_map = fluid.layers.fc(input=img_comb_features_, + size=c1, + act="sigmoid") + img_comb_features_map = fluid.layers.reshape( + x=img_comb_features_map, shape=[-1, t, c1]) + combine_features = img_comb_features_map * pvam_features + ( + 1.0 - img_comb_features_map) * gsrm_features + img_comb_features = fluid.layers.reshape( + x=combine_features, shape=[-1, c1]) fc_out = fluid.layers.fc(input=img_comb_features, size=self.char_num, act="softmax") return fc_out - def __call__(self, inputs, others, mode=None): pvam_features = self.pvam(inputs, others) @@ -204,15 +220,11 @@ class SRNPredict(object): final_out = self.vsfd(pvam_features, gsrm_features) _, decoded_out = fluid.layers.topk(input=final_out, k=1) - predicts = {'predict': final_out, 'decoded_out': decoded_out, - 'word_out': word_out, 'gsrm_out': gsrm_out} + predicts = { + 'predict': final_out, + 'decoded_out': decoded_out, + 'word_out': word_out, + 'gsrm_out': gsrm_out + } return predicts - - - - - - - - diff --git a/ppocr/modeling/losses/rec_srn_loss.py b/ppocr/modeling/losses/rec_srn_loss.py index 68a480ac..b1ebd86f 100755 --- a/ppocr/modeling/losses/rec_srn_loss.py +++ b/ppocr/modeling/losses/rec_srn_loss.py @@ -35,24 +35,21 @@ class SRNLoss(object): lbl_weight = others['lbl_weight'] casted_label = fluid.layers.cast(x=label, dtype='int64') - cost_word = fluid.layers.cross_entropy(input=word_predict, label=casted_label) - cost_gsrm = fluid.layers.cross_entropy(input=gsrm_predict, label=casted_label) - cost_vsfd = fluid.layers.cross_entropy(input=predict, label=casted_label) + cost_word = fluid.layers.cross_entropy( + input=word_predict, label=casted_label) + cost_gsrm = fluid.layers.cross_entropy( + input=gsrm_predict, label=casted_label) + cost_vsfd = fluid.layers.cross_entropy( + input=predict, label=casted_label) - #cost_word = cost_word * lbl_weight - #cost_gsrm = cost_gsrm * lbl_weight - #cost_vsfd = cost_vsfd * lbl_weight + cost_word = fluid.layers.reshape( + x=fluid.layers.reduce_sum(cost_word), shape=[1]) + cost_gsrm = fluid.layers.reshape( + 
x=fluid.layers.reduce_sum(cost_gsrm), shape=[1]) + cost_vsfd = fluid.layers.reshape( + x=fluid.layers.reduce_sum(cost_vsfd), shape=[1]) - cost_word = fluid.layers.reshape(x=fluid.layers.reduce_sum(cost_word), shape=[1]) - cost_gsrm = fluid.layers.reshape(x=fluid.layers.reduce_sum(cost_gsrm), shape=[1]) - cost_vsfd = fluid.layers.reshape(x=fluid.layers.reduce_sum(cost_vsfd), shape=[1]) + sum_cost = fluid.layers.sum( + [cost_word, cost_vsfd * 2.0, cost_gsrm * 0.15]) - sum_cost = fluid.layers.sum([cost_word, cost_vsfd * 2.0, cost_gsrm * 0.15]) - - #sum_cost = fluid.layers.sum([cost_word * 3.0, cost_vsfd, cost_gsrm * 0.15]) - #sum_cost = cost_word - - #fluid.layers.Print(cost_word,message="word_cost") - #fluid.layers.Print(cost_vsfd,message="img_cost") - return [sum_cost,cost_vsfd,cost_word] - #return [sum_cost, cost_vsfd, cost_word] + return [sum_cost, cost_vsfd, cost_word] diff --git a/ppocr/utils/character.py b/ppocr/utils/character.py index 79d6f5ca..5f2963ac 100755 --- a/ppocr/utils/character.py +++ b/ppocr/utils/character.py @@ -149,38 +149,29 @@ def cal_predicts_accuracy(char_ops, acc = acc_num * 1.0 / img_num return acc, acc_num, img_num + def cal_predicts_accuracy_srn(char_ops, - preds, - labels, - max_text_len, - is_debug=False): + preds, + labels, + max_text_len, + is_debug=False): acc_num = 0 img_num = 0 total_len = preds.shape[0] img_num = int(total_len / max_text_len) - #print (img_num) for i in range(img_num): cur_label = [] cur_pred = [] for j in range(max_text_len): - if labels[j + i * max_text_len] != 37: #0 + if labels[j + i * max_text_len] != 37: #0 cur_label.append(labels[j + i * max_text_len][0]) else: break - if is_debug: - for j in range(max_text_len): - if preds[j + i * max_text_len] != 37: #0 - cur_pred.append(preds[j + i * max_text_len][0]) - else: - break - print ("cur_label: ", cur_label) - print ("cur_pred: ", cur_pred) - - for j in range(max_text_len + 1): - if j < len(cur_label) and preds[j + i * max_text_len][0] != cur_label[j]: + if j < len(cur_label) and preds[j + i * max_text_len][ + 0] != cur_label[j]: break elif j == len(cur_label) and j == max_text_len: acc_num += 1 diff --git a/tools/eval_utils/eval_rec_utils.py b/tools/eval_utils/eval_rec_utils.py index ecdf0aaf..5a653678 100644 --- a/tools/eval_utils/eval_rec_utils.py +++ b/tools/eval_utils/eval_rec_utils.py @@ -123,8 +123,8 @@ def eval_rec_run(exe, config, eval_info_dict, mode): def test_rec_benchmark(exe, config, eval_info_dict): " Evaluate lmdb dataset " - eval_data_list = ['IIIT5k_3000', 'SVT', 'IC03_860', \ - 'IC13_857', 'IC15_1811', 'IC15_2077','SVTP', 'CUTE80'] + eval_data_list = ['IIIT5k_3000', 'SVT', 'IC03_860','IC03_867', \ + 'IC13_857', 'IC13_1015', 'IC15_1811', 'IC15_2077','SVTP', 'CUTE80'] eval_data_dir = config['TestReader']['lmdb_sets_dir'] total_evaluation_data_number = 0 total_correct_number = 0 diff --git a/tools/infer_rec.py b/tools/infer_rec.py index 8cde44d8..21b503cc 100755 --- a/tools/infer_rec.py +++ b/tools/infer_rec.py @@ -64,7 +64,6 @@ def main(): exe = fluid.Executor(place) rec_model = create_module(config['Architecture']['function'])(params=config) - startup_prog = fluid.Program() eval_prog = fluid.Program() with fluid.program_guard(eval_prog, startup_prog): @@ -86,10 +85,36 @@ def main(): for i in range(max_img_num): logger.info("infer_img:%s" % infer_list[i]) img = next(blobs) - predict = exe.run(program=eval_prog, - feed={"image": img}, - fetch_list=fetch_varname_list, - return_numpy=False) + if loss_type != "srn": + predict = exe.run(program=eval_prog, + feed={"image": 
img}, + fetch_list=fetch_varname_list, + return_numpy=False) + else: + encoder_word_pos_list = [] + gsrm_word_pos_list = [] + gsrm_slf_attn_bias1_list = [] + gsrm_slf_attn_bias2_list = [] + encoder_word_pos_list.append(img[1]) + gsrm_word_pos_list.append(img[2]) + gsrm_slf_attn_bias1_list.append(img[3]) + gsrm_slf_attn_bias2_list.append(img[4]) + + encoder_word_pos_list = np.concatenate( + encoder_word_pos_list, axis=0).astype(np.int64) + gsrm_word_pos_list = np.concatenate( + gsrm_word_pos_list, axis=0).astype(np.int64) + gsrm_slf_attn_bias1_list = np.concatenate( + gsrm_slf_attn_bias1_list, axis=0).astype(np.float32) + gsrm_slf_attn_bias2_list = np.concatenate( + gsrm_slf_attn_bias2_list, axis=0).astype(np.float32) + + predict = exe.run(program=eval_prog, \ + feed={'image': img[0], 'encoder_word_pos': encoder_word_pos_list, + 'gsrm_word_pos': gsrm_word_pos_list, 'gsrm_slf_attn_bias1': gsrm_slf_attn_bias1_list, + 'gsrm_slf_attn_bias2': gsrm_slf_attn_bias2_list}, \ + fetch_list=fetch_varname_list, \ + return_numpy=False) if loss_type == "ctc": preds = np.array(predict[0]) preds = preds.reshape(-1) @@ -114,7 +139,18 @@ def main(): score = np.mean(probs[0, 1:end_pos[1]]) preds = preds.reshape(-1) preds_text = char_ops.decode(preds) - + elif loss_type == "srn": + cur_pred = [] + preds = np.array(predict[0]) + preds = preds.reshape(-1) + probs = np.array(predict[1]) + ind = np.argmax(probs, axis=1) + valid_ind = np.where(preds != 37)[0] + if len(valid_ind) == 0: + continue + score = np.mean(probs[valid_ind, ind[valid_ind]]) + preds = preds[:valid_ind[-1] + 1] + preds_text = char_ops.decode(preds) logger.info("\t index: {}".format(preds)) logger.info("\t word : {}".format(preds_text)) logger.info("\t score: {}".format(score)) From d3ed210ac66ef026f3e641b365d1a7e74e43fe47 Mon Sep 17 00:00:00 2001 From: tink2123 Date: Sat, 15 Aug 2020 15:48:59 +0800 Subject: [PATCH 04/11] deleted dataset --- train_data | 1 - 1 file changed, 1 deletion(-) delete mode 120000 train_data diff --git a/train_data b/train_data deleted file mode 120000 index 7c2082ab..00000000 --- a/train_data +++ /dev/null @@ -1 +0,0 @@ -/workspace/PaddleOCR/train_data/ \ No newline at end of file From 9cb30720331cd84265b020be14a36dda3e88d2bf Mon Sep 17 00:00:00 2001 From: tink2123 Date: Sun, 16 Aug 2020 12:53:26 +0800 Subject: [PATCH 05/11] fix bug and update doc --- README_cn.md | 6 +++++- ...am_test_all.yml => rec_r50fpn_vd_none_srn.yml} | 0 ppocr/utils/character.py | 2 +- tools/infer/predict_rec.py | 3 ++- tools/infer/utility.py | 4 ++-- tools/program.py | 15 ++++++++------- 6 files changed, 18 insertions(+), 12 deletions(-) rename configs/rec/{rec_r50fpn_vd_none_srn_pvam_test_all.yml => rec_r50fpn_vd_none_srn.yml} (100%) diff --git a/README_cn.md b/README_cn.md index cc5cb00a..ebfc4b1d 100644 --- a/README_cn.md +++ b/README_cn.md @@ -122,7 +122,10 @@ PaddleOCR开源的文本识别算法列表: - [x] Rosetta([paper](https://arxiv.org/abs/1910.05085)) - [x] STAR-Net([paper](http://www.bmva.org/bmvc/2016/papers/paper043/index.html)) - [x] RARE([paper](https://arxiv.org/abs/1603.03915v1)) -- [ ] SRN([paper](https://arxiv.org/abs/2003.12294))(百度自研, coming soon) +- [x] SRN([paper](https://arxiv.org/abs/2003.12294))(百度自研) + +*备注:* SRN模型使用了数据扰动方法对上述提到对两个训练集进行增广,增广后的数据可以在[百度网盘](todo)上下载。 +原始论文使用两阶段训练平均精度为89.74%,PaddleOCR中使用one-stage训练,平均精度为88.33%。两种预训练权重均在[下载链接](https://paddleocr.bj.bcebos.com/SRN/rec_r50fpn_vd_none_srn.tar)中。 参考[DTRB](https://arxiv.org/abs/1904.01906)文字识别训练和评估流程,使用MJSynth和SynthText两个文字识别数据集训练,在IIIT, SVT, IC03, IC13, IC15, SVTP, 
CUTE数据集上进行评估,算法效果如下: @@ -136,6 +139,7 @@ PaddleOCR开源的文本识别算法列表: |STAR-Net|MobileNetV3|81.56%|rec_mv3_tps_bilstm_ctc|[下载链接](https://paddleocr.bj.bcebos.com/rec_mv3_tps_bilstm_ctc.tar)| |RARE|Resnet34_vd|84.90%|rec_r34_vd_tps_bilstm_attn|[下载链接](https://paddleocr.bj.bcebos.com/rec_r34_vd_tps_bilstm_attn.tar)| |RARE|MobileNetV3|83.32%|rec_mv3_tps_bilstm_attn|[下载链接](https://paddleocr.bj.bcebos.com/rec_mv3_tps_bilstm_attn.tar)| +|SRN|Resnet50_vd_fpn|88.33%|rec_r50fpn_vd_none_srn|[下载链接](https://paddleocr.bj.bcebos.com/SRN/rec_r50fpn_vd_none_srn.tar)| 使用[LSVT](https://github.com/PaddlePaddle/PaddleOCR/blob/develop/doc/doc_ch/datasets.md#1icdar2019-lsvt)街景数据集根据真值将图crop出来30w数据,进行位置校准。此外基于LSVT语料生成500w合成数据训练中文模型,相关配置和预训练文件如下: diff --git a/configs/rec/rec_r50fpn_vd_none_srn_pvam_test_all.yml b/configs/rec/rec_r50fpn_vd_none_srn.yml similarity index 100% rename from configs/rec/rec_r50fpn_vd_none_srn_pvam_test_all.yml rename to configs/rec/rec_r50fpn_vd_none_srn.yml diff --git a/ppocr/utils/character.py b/ppocr/utils/character.py index 5f2963ac..575658ef 100755 --- a/ppocr/utils/character.py +++ b/ppocr/utils/character.py @@ -25,7 +25,7 @@ class CharacterOps(object): def __init__(self, config): self.character_type = config['character_type'] self.loss_type = config['loss_type'] - self.max_text_len = config['max_text_length'] + self.max_text_len = 25 if self.character_type == "en": self.character_str = "0123456789abcdefghijklmnopqrstuvwxyz" dict_character = list(self.character_str) diff --git a/tools/infer/predict_rec.py b/tools/infer/predict_rec.py index b51c49fc..c81b4eb2 100755 --- a/tools/infer/predict_rec.py +++ b/tools/infer/predict_rec.py @@ -40,7 +40,8 @@ class TextRecognizer(object): char_ops_params = { "character_type": args.rec_char_type, "character_dict_path": args.rec_char_dict_path, - "use_space_char": args.use_space_char + "use_space_char": args.use_space_char, + "max_text_length": args.max_text_length } if self.rec_algorithm != "RARE": char_ops_params['loss_type'] = 'ctc' diff --git a/tools/infer/utility.py b/tools/infer/utility.py index fc91880e..fe590c7e 100755 --- a/tools/infer/utility.py +++ b/tools/infer/utility.py @@ -56,8 +56,8 @@ def parse_args(): #params for text recognizer parser.add_argument("--rec_algorithm", type=str, default='CRNN') parser.add_argument("--rec_model_dir", type=str) - parser.add_argument("--rec_image_shape", type=str, default="3, 32, 320") - parser.add_argument("--rec_char_type", type=str, default='ch') + parser.add_argument("--rec_image_shape", type=str, default="1, 64, 320") + parser.add_argument("--rec_char_type", type=str, default='en') parser.add_argument("--rec_batch_num", type=int, default=30) parser.add_argument( "--rec_char_dict_path", diff --git a/tools/program.py b/tools/program.py index 6ebc27cb..354cf8dd 100755 --- a/tools/program.py +++ b/tools/program.py @@ -194,13 +194,14 @@ def build(config, main_prog, startup_prog, mode): global_lr = optimizer._global_learning_rate() fetch_name_list.insert(0, "lr") fetch_varname_list.insert(0, global_lr.name) - if config['Global']["loss_type"] == 'srn': - model_average = fluid.optimizer.ModelAverage( - config['Global']['average_window'], - min_average_window=config['Global'][ - 'min_average_window'], - max_average_window=config['Global'][ - 'max_average_window']) + if "loss_type" in config["Global"]: + if config['Global']["loss_type"] == 'srn': + model_average = fluid.optimizer.ModelAverage( + config['Global']['average_window'], + min_average_window=config['Global'][ + 'min_average_window'], + 

From d179c7c4468da9f0bb38977febda51a9b495715c Mon Sep 17 00:00:00 2001
From: tink2123
Date: Sun, 16 Aug 2020 12:58:05 +0800
Subject: [PATCH 06/11] fix bug

---
 ppocr/utils/character.py | 2 +-
 tools/infer/utility.py   | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/ppocr/utils/character.py b/ppocr/utils/character.py
index 575658ef..5f2963ac 100755
--- a/ppocr/utils/character.py
+++ b/ppocr/utils/character.py
@@ -25,7 +25,7 @@ class CharacterOps(object):
     def __init__(self, config):
         self.character_type = config['character_type']
         self.loss_type = config['loss_type']
-        self.max_text_len = 25
+        self.max_text_len = config['max_text_length']
         if self.character_type == "en":
             self.character_str = "0123456789abcdefghijklmnopqrstuvwxyz"
             dict_character = list(self.character_str)
diff --git a/tools/infer/utility.py b/tools/infer/utility.py
index fe590c7e..fc91880e 100755
--- a/tools/infer/utility.py
+++ b/tools/infer/utility.py
@@ -56,8 +56,8 @@ def parse_args():
     #params for text recognizer
     parser.add_argument("--rec_algorithm", type=str, default='CRNN')
     parser.add_argument("--rec_model_dir", type=str)
-    parser.add_argument("--rec_image_shape", type=str, default="1, 64, 320")
-    parser.add_argument("--rec_char_type", type=str, default='en')
+    parser.add_argument("--rec_image_shape", type=str, default="3, 32, 320")
+    parser.add_argument("--rec_char_type", type=str, default='ch')
     parser.add_argument("--rec_batch_num", type=int, default=30)
     parser.add_argument(
         "--rec_char_dict_path",

From d0d5de7f4ddf02964a39b9645e6fb4efc75a9d2b Mon Sep 17 00:00:00 2001
From: tink2123
Date: Sun, 16 Aug 2020 13:30:25 +0800
Subject: [PATCH 07/11] fix bug

---
 tools/infer/utility.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tools/infer/utility.py b/tools/infer/utility.py
index fc91880e..b0a0ec1f 100755
--- a/tools/infer/utility.py
+++ b/tools/infer/utility.py
@@ -59,6 +59,7 @@ def parse_args():
     parser.add_argument("--rec_image_shape", type=str, default="3, 32, 320")
     parser.add_argument("--rec_char_type", type=str, default='ch')
     parser.add_argument("--rec_batch_num", type=int, default=30)
+    parser.add_argument("--max_text_length", type=int, default=25)
     parser.add_argument(
         "--rec_char_dict_path",
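
Note: after these two fixes, `max_text_length` reaches `CharacterOps` through `char_ops_params` instead of being hard-coded, while the recognition defaults stay CRNN-oriented and SRN users override them on the command line. A minimal sketch of the CharacterOps setup the SRN path now expects (values mirror the SRN recipe in this series; treat the exact combination as an assumption):

    from ppocr.utils.character import CharacterOps

    char_ops = CharacterOps({
        "character_type": "en",     # SRN is limited to the built-in 36-character English set
        "loss_type": "srn",
        "max_text_length": 25,      # exposed on the CLI as --max_text_length in PATCH 07
    })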

From a3b291928b64687b5e6f437636eeeaa2e785f6e5 Mon Sep 17 00:00:00 2001
From: tink2123
Date: Sun, 16 Aug 2020 16:46:22 +0800
Subject: [PATCH 08/11] polish code

---
 doc/doc_ch/config.md                      | 3 +++
 ppocr/modeling/architectures/rec_model.py | 3 +++
 tools/eval_utils/eval_rec_utils.py        | 6 +++---
 3 files changed, 9 insertions(+), 3 deletions(-)

diff --git a/doc/doc_ch/config.md b/doc/doc_ch/config.md
index 5e579096..03fe1b32 100644
--- a/doc/doc_ch/config.md
+++ b/doc/doc_ch/config.md
@@ -32,6 +32,9 @@
 | loss_type | 设置 loss 类型 | ctc | 支持两种loss: ctc / attention |
 | distort | 设置是否使用数据增强 | false | 设置为true时,将在训练时随机进行扰动,支持的扰动操作可阅读[img_tools.py](https://github.com/PaddlePaddle/PaddleOCR/blob/develop/ppocr/data/rec/img_tools.py) |
 | use_space_char | 设置是否识别空格 | false | 仅在 character_type=ch 时支持空格 |
+| average_window | ModelAverage优化器中的窗口长度计算比例 | 0.15 | 目前仅应用于SRN |
+| max_average_window | 平均值计算窗口长度的最大值 | 15625 | 推荐设置为一轮训练中mini-batches的数目 |
+| min_average_window | 平均值计算窗口长度的最小值 | 10000 | \ |
 | reader_yml | 设置reader配置文件 | ./configs/rec/rec_icdar15_reader.yml | \ |
 | pretrain_weights | 加载预训练模型路径 | ./pretrain_models/CRNN/best_accuracy | \ |
 | checkpoints | 加载模型参数路径 | None | 用于中断后加载参数继续训练 |
diff --git a/ppocr/modeling/architectures/rec_model.py b/ppocr/modeling/architectures/rec_model.py
index 5eacd5de..f4e3eea2 100755
--- a/ppocr/modeling/architectures/rec_model.py
+++ b/ppocr/modeling/architectures/rec_model.py
@@ -213,6 +213,9 @@ class RecModel(object):
             predict = predicts['predict']
             if self.loss_type == "ctc":
                 predict = fluid.layers.softmax(predict)
+            if self.loss_type == "srn":
+                logger.infor(
+                    "Warning! SRN does not support export model currently")
             return [image, {'decoded_out': decoded_out, 'predicts': predict}]
         else:
             predict = predicts['predict']
diff --git a/tools/eval_utils/eval_rec_utils.py b/tools/eval_utils/eval_rec_utils.py
index 5a653678..4479d9df 100644
--- a/tools/eval_utils/eval_rec_utils.py
+++ b/tools/eval_utils/eval_rec_utils.py
@@ -69,7 +69,7 @@ def eval_rec_run(exe, config, eval_info_dict, mode):
                 return_numpy=False)
 
             preds = np.array(outs[0])
-            if preds.shape[1] != 1:
+            if config['Global']['loss_type'] == "attention":
                 preds, preds_lod = convert_rec_attention_infer_res(preds)
             else:
                 preds_lod = outs[0].lod()[0]
@@ -123,8 +123,8 @@ def eval_rec_run(exe, config, eval_info_dict, mode):
 
 def test_rec_benchmark(exe, config, eval_info_dict):
     " Evaluate lmdb dataset "
-    eval_data_list = ['IIIT5k_3000', 'SVT', 'IC03_860','IC03_867', \
-        'IC13_857', 'IC13_1015', 'IC15_1811', 'IC15_2077','SVTP', 'CUTE80']
+    eval_data_list = ['IIIT5k_3000', 'SVT', 'IC03_860', 'IC03_867', \
+        'IC13_857', 'IC13_1015', 'IC15_1811', 'IC15_2077', 'SVTP', 'CUTE80']
     eval_data_dir = config['TestReader']['lmdb_sets_dir']
     total_evaluation_data_number = 0
     total_correct_number = 0

From fe8ce9afdfdf9b73d55ee64aaf476b2a4b0c70c1 Mon Sep 17 00:00:00 2001
From: tink2123
Date: Sun, 16 Aug 2020 16:51:24 +0800
Subject: [PATCH 09/11] polish code

---
 ppocr/utils/character.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/ppocr/utils/character.py b/ppocr/utils/character.py
index 5f2963ac..f27e1b85 100755
--- a/ppocr/utils/character.py
+++ b/ppocr/utils/character.py
@@ -30,6 +30,8 @@ class CharacterOps(object):
             self.character_str = "0123456789abcdefghijklmnopqrstuvwxyz"
             dict_character = list(self.character_str)
         elif self.character_type == "ch":
+            if self.loss_type == "srn":
+                raise Exception("SRN can only support in character_type == en")
             character_dict_path = config['character_dict_path']
             add_space = False
             if 'use_space_char' in config:
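
Note: the three window parameters documented in the config table above are exactly what the guarded ModelAverage construction from PATCH 05 reads out of the Global section, and they only take effect when loss_type is srn. A minimal sketch of a Global dict that exercises that branch (values are the documented defaults):

    global_config = {
        "loss_type": "srn",              # ModelAverage is only built for SRN
        "average_window": 0.15,          # window length ratio
        "min_average_window": 10000,
        "max_average_window": 15625,     # recommended: mini-batches per epoch
    }
    use_model_average = ("loss_type" in global_config
                         and global_config["loss_type"] == "srn")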

From ab0acf78b4fb2c23a685c5c3ce17ea78870118ec Mon Sep 17 00:00:00 2001
From: tink2123
Date: Sun, 16 Aug 2020 17:09:17 +0800
Subject: [PATCH 10/11] polish code

---
 ppocr/modeling/architectures/rec_model.py | 14 +++++++++-----
 ppocr/utils/character.py                  |  4 ++--
 2 files changed, 11 insertions(+), 7 deletions(-)

diff --git a/ppocr/modeling/architectures/rec_model.py b/ppocr/modeling/architectures/rec_model.py
index f4e3eea2..91f778ce 100755
--- a/ppocr/modeling/architectures/rec_model.py
+++ b/ppocr/modeling/architectures/rec_model.py
@@ -98,13 +98,15 @@
                 shape=[
                     -1, self.num_heads, self.max_text_length,
                     self.max_text_length
-                ])
+                ],
+                dtype="float32")
             gsrm_slf_attn_bias2 = fluid.data(
                 name="gsrm_slf_attn_bias2",
                 shape=[
                     -1, self.num_heads, self.max_text_length,
                     self.max_text_length
-                ])
+                ],
+                dtype="float32")
             lbl_weight = fluid.layers.data(
                 name="lbl_weight", shape=[-1, 1], dtype='int64')
             label = fluid.data(
@@ -161,13 +163,15 @@
                 shape=[
                     -1, self.num_heads, self.max_text_length,
                     self.max_text_length
-                ])
+                ],
+                dtype="float32")
             gsrm_slf_attn_bias2 = fluid.data(
                 name="gsrm_slf_attn_bias2",
                 shape=[
                     -1, self.num_heads, self.max_text_length,
                     self.max_text_length
-                ])
+                ],
+                dtype="float32")
             feed_list = [
                 image, encoder_word_pos, gsrm_word_pos, gsrm_slf_attn_bias1,
                 gsrm_slf_attn_bias2
@@ -214,7 +218,7 @@
             if self.loss_type == "ctc":
                 predict = fluid.layers.softmax(predict)
             if self.loss_type == "srn":
-                logger.infor(
+                raise Exception(
                     "Warning! SRN does not support export model currently")
             return [image, {'decoded_out': decoded_out, 'predicts': predict}]
         else:
diff --git a/ppocr/utils/character.py b/ppocr/utils/character.py
index f27e1b85..2db0151e 100755
--- a/ppocr/utils/character.py
+++ b/ppocr/utils/character.py
@@ -26,12 +26,12 @@ class CharacterOps(object):
         self.character_type = config['character_type']
         self.loss_type = config['loss_type']
         self.max_text_len = config['max_text_length']
+        if self.loss_type == "srn" and self.character_type == "ch":
+            raise Exception("SRN can only support in character_type == en")
         if self.character_type == "en":
             self.character_str = "0123456789abcdefghijklmnopqrstuvwxyz"
             dict_character = list(self.character_str)
         elif self.character_type == "ch":
-            if self.loss_type == "srn":
-                raise Exception("SRN can only support in character_type == en")
             character_dict_path = config['character_dict_path']
             add_space = False
             if 'use_space_char' in config:

From 9c89310292ad77ad6cfad719219e6d07110e0c1b Mon Sep 17 00:00:00 2001
From: tink2123
Date: Sun, 16 Aug 2020 17:19:00 +0800
Subject: [PATCH 11/11] polish code

---
 ppocr/utils/character.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ppocr/utils/character.py b/ppocr/utils/character.py
index 2db0151e..c7c93fc5 100755
--- a/ppocr/utils/character.py
+++ b/ppocr/utils/character.py
@@ -26,7 +26,7 @@ class CharacterOps(object):
         self.character_type = config['character_type']
         self.loss_type = config['loss_type']
         self.max_text_len = config['max_text_length']
-        if self.loss_type == "srn" and self.character_type == "ch":
+        if self.loss_type == "srn" and self.character_type != "en":
             raise Exception("SRN can only support in character_type == en")
         if self.character_type == "en":
             self.character_str = "0123456789abcdefghijklmnopqrstuvwxyz"