From ffa94415c35497a73cda435f67182920edb04c1f Mon Sep 17 00:00:00 2001 From: andyjpaddle Date: Tue, 24 Aug 2021 03:45:59 +0000 Subject: [PATCH 01/15] add_rec_sar, test=dygraph --- configs/rec/rec_r31_sar.yml | 99 ++++++ ppocr/losses/rec_sar_loss.py | 25 ++ ppocr/modeling/backbones/rec_resnet_31.py | 176 ++++++++++ ppocr/modeling/heads/rec_sar_head.py | 378 ++++++++++++++++++++++ ppocr/utils/dict90.txt | 90 ++++++ 5 files changed, 768 insertions(+) create mode 100644 configs/rec/rec_r31_sar.yml create mode 100644 ppocr/losses/rec_sar_loss.py create mode 100644 ppocr/modeling/backbones/rec_resnet_31.py create mode 100644 ppocr/modeling/heads/rec_sar_head.py create mode 100644 ppocr/utils/dict90.txt diff --git a/configs/rec/rec_r31_sar.yml b/configs/rec/rec_r31_sar.yml new file mode 100644 index 00000000..3a398064 --- /dev/null +++ b/configs/rec/rec_r31_sar.yml @@ -0,0 +1,99 @@ +Global: + use_gpu: true + epoch_num: 5 + log_smooth_window: 20 + print_batch_step: 20 + save_model_dir: /paddle/backup/sar_rec/sar_train_v3 + save_epoch_step: 1 + # evaluation is run every 2000 iterations + eval_batch_step: [0, 2000] + cal_metric_during_train: True + pretrained_model: #/paddle/backup/sar_rec/sar_train_v2/best_accuracy + checkpoints: + save_inference_dir: + use_visualdl: False + infer_img: demo_text_recog.jpg + # for data or label process + character_dict_path: ppocr/utils/dict90.txt + character_type: ch + max_text_length: 30 + infer_mode: False + use_space_char: False + save_res_path: ./output/rec/predicts_sar.txt + +Optimizer: + name: Adam + beta1: 0.9 + beta2: 0.999 + lr: + name: Piecewise + decay_epochs: [3, 4] + values: [0.001, 0.0001, 0.00001] + regularizer: + name: 'L2' + factor: 0 + +Architecture: + model_type: rec + algorithm: SAR + Transform: + Backbone: + name: ResNet31 + Head: + name: SARHead + +Loss: + name: SARLoss + +PostProcess: + name: SARLabelDecode + +Metric: + name: RecMetric + + +Train: + dataset: + name: LMDBDataSet #SimpleDataSet + # delimiter: ' ' + # 
label_file_list: ['/paddle/data/concat_data/icdar_2013_train20.txt', '/paddle/data/concat_data/icdar_2015_train20.txt', '/paddle/data/concat_data/coco_text_train20.txt', '/paddle/data/concat_data/IIIt5k_train20.txt', '/paddle/data/concat_data/SynthAdd_train.txt', '/paddle/data/concat_data/SynthText_train.txt', '/paddle/data/concat_data/Syn90k_train.txt'] + data_dir: /paddle/data/ocr_data/training/ #/paddle/data/concat_data/ + # ratio_list: 1.0 + transforms: + - DecodeImage: # load image + img_mode: BGR + channel_first: False + - SARLabelEncode: # Class handling label + - SARRecResizeImg: + image_shape: [3, 48, 48, 160] # h:48 w:[48,160] + width_downsample_ratio: 0.25 + - KeepKeys: + keep_keys: ['image', 'label', 'valid_ratio'] # dataloader will return list in this order + loader: + shuffle: True + batch_size_per_card: 64 # 32 + drop_last: True + num_workers: 8 + use_shared_memory: False + +Eval: + dataset: + name: LMDBDataSet + data_dir: /paddle/data/ocr_data/evaluation/ + transforms: + - DecodeImage: # load image + img_mode: BGR + channel_first: False + - SARLabelEncode: # Class handling label + - SARRecResizeImg: + image_shape: [3, 48, 48, 160] + width_downsample_ratio: 0.25 + - KeepKeys: + keep_keys: ['image', 'label', 'valid_ratio'] # dataloader will return list in this order + loader: + shuffle: False + drop_last: False + batch_size_per_card: 64 + num_workers: 4 + use_shared_memory: False + \ No newline at end of file diff --git a/ppocr/losses/rec_sar_loss.py b/ppocr/losses/rec_sar_loss.py new file mode 100644 index 00000000..1afb21fe --- /dev/null +++ b/ppocr/losses/rec_sar_loss.py @@ -0,0 +1,25 @@ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import paddle +from paddle import nn + + +class SARLoss(nn.Layer): + def __init__(self, **kwargs): + super(SARLoss, self).__init__() + self.loss_func = paddle.nn.loss.CrossEntropyLoss(reduction="mean", ignore_index=92) + + def forward(self, predicts, 
batch): + predict = predicts[:, :-1, :] # ignore last index of outputs to be in same seq_len with targets + label = batch[1].astype("int64")[:, 1:] # ignore first index of target in loss calculation + batch_size, num_steps, num_classes = predict.shape[0], predict.shape[ + 1], predict.shape[2] + assert len(label.shape) == len(list(predict.shape)) - 1, \ + "The target's shape and inputs's shape is [N, d] and [N, num_steps]" + + inputs = paddle.reshape(predict, [-1, num_classes]) + targets = paddle.reshape(label, [-1]) + loss = self.loss_func(inputs, targets) + return {'loss': loss} diff --git a/ppocr/modeling/backbones/rec_resnet_31.py b/ppocr/modeling/backbones/rec_resnet_31.py new file mode 100644 index 00000000..f60729cd --- /dev/null +++ b/ppocr/modeling/backbones/rec_resnet_31.py @@ -0,0 +1,176 @@ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import paddle +from paddle import ParamAttr +import paddle.nn as nn +import paddle.nn.functional as F +import numpy as np + +__all__ = ["ResNet31"] + + +def conv3x3(in_channel, out_channel, stride=1): + return nn.Conv2D( + in_channel, + out_channel, + kernel_size=3, + stride=stride, + padding=1, + bias_attr=False + ) + + +class BasicBlock(nn.Layer): + expansion = 1 + def __init__(self, in_channels, channels, stride=1, downsample=False): + super().__init__() + self.conv1 = conv3x3(in_channels, channels, stride) + self.bn1 = nn.BatchNorm2D(channels) + self.relu = nn.ReLU() + self.conv2 = conv3x3(channels, channels) + self.bn2 = nn.BatchNorm2D(channels) + self.downsample = downsample + if downsample: + self.downsample = nn.Sequential( + nn.Conv2D(in_channels, channels * self.expansion, 1, stride, bias_attr=False), + nn.BatchNorm2D(channels * self.expansion), + ) + else: + self.downsample = nn.Sequential() + self.stride = stride + + def forward(self, x): + residual = x + + out = self.conv1(x) + out = self.bn1(out) + out = self.relu(out) + + out = 
self.conv2(out) + out = self.bn2(out) + + if self.downsample: + residual = self.downsample(x) + + out += residual + out = self.relu(out) + + return out + + +class ResNet31(nn.Layer): + ''' + Args: + in_channels (int): Number of channels of input image tensor. + layers (list[int]): List of BasicBlock number for each stage. + channels (list[int]): List of out_channels of Conv2d layer. + out_indices (None | Sequence[int]): Indices of output stages. + last_stage_pool (bool): If True, add `MaxPool2d` layer to last stage. + ''' + def __init__(self, + in_channels=3, + layers=[1, 2, 5, 3], + channels=[64, 128, 256, 256, 512, 512, 512], + out_indices=None, + last_stage_pool=False): + super(ResNet31, self).__init__() + assert isinstance(in_channels, int) + assert isinstance(last_stage_pool, bool) + + self.out_indices = out_indices + self.last_stage_pool = last_stage_pool + + # conv 1 (Conv Conv) + self.conv1_1 = nn.Conv2D(in_channels, channels[0], kernel_size=3, stride=1, padding=1) + self.bn1_1 = nn.BatchNorm2D(channels[0]) + self.relu1_1 = nn.ReLU() + + self.conv1_2 = nn.Conv2D(channels[0], channels[1], kernel_size=3, stride=1, padding=1) + self.bn1_2 = nn.BatchNorm2D(channels[1]) + self.relu1_2 = nn.ReLU() + + # conv 2 (Max-pooling, Residual block, Conv) + self.pool2 = nn.MaxPool2D(kernel_size=2, stride=2, padding=0, ceil_mode=True) + self.block2 = self._make_layer(channels[1], channels[2], layers[0]) + self.conv2 = nn.Conv2D(channels[2], channels[2], kernel_size=3, stride=1, padding=1) + self.bn2 = nn.BatchNorm2D(channels[2]) + self.relu2 = nn.ReLU() + + # conv 3 (Max-pooling, Residual block, Conv) + self.pool3 = nn.MaxPool2D(kernel_size=2, stride=2, padding=0, ceil_mode=True) + self.block3 = self._make_layer(channels[2], channels[3], layers[1]) + self.conv3 = nn.Conv2D(channels[3], channels[3], kernel_size=3, stride=1, padding=1) + self.bn3 = nn.BatchNorm2D(channels[3]) + self.relu3 = nn.ReLU() + + # conv 4 (Max-pooling, Residual block, Conv) + self.pool4 = 
nn.MaxPool2D(kernel_size=(2, 1), stride=(2, 1), padding=0, ceil_mode=True) + self.block4 = self._make_layer(channels[3], channels[4], layers[2]) + self.conv4 = nn.Conv2D(channels[4], channels[4], kernel_size=3, stride=1, padding=1) + self.bn4 = nn.BatchNorm2D(channels[4]) + self.relu4 = nn.ReLU() + + # conv 5 ((Max-pooling), Residual block, Conv) + self.pool5 = None + if self.last_stage_pool: + self.pool5 = nn.MaxPool2D(kernel_size=2, stride=2, padding=0, ceil_mode=True) + self.block5 = self._make_layer(channels[4], channels[5], layers[3]) + self.conv5 = nn.Conv2D(channels[5], channels[5], kernel_size=3, stride=1, padding=1) + self.bn5 = nn.BatchNorm2D(channels[5]) + self.relu5 = nn.ReLU() + + self.out_channels = channels[-1] + + def _make_layer(self, input_channels, output_channels, blocks): + layers = [] + for _ in range(blocks): + downsample = None + if input_channels != output_channels: + downsample = nn.Sequential( + nn.Conv2D( + input_channels, + output_channels, + kernel_size=1, + stride=1, + bias_attr=False), + nn.BatchNorm2D(output_channels), + ) + + layers.append(BasicBlock(input_channels, output_channels, downsample=downsample)) + input_channels = output_channels + return nn.Sequential(*layers) + + + def forward(self, x): + x = self.conv1_1(x) + x = self.bn1_1(x) + x = self.relu1_1(x) + + x = self.conv1_2(x) + x = self.bn1_2(x) + x = self.relu1_2(x) + + outs = [] + for i in range(4): + layer_index = i + 2 + pool_layer = getattr(self, f'pool{layer_index}') + block_layer = getattr(self, f'block{layer_index}') + conv_layer = getattr(self, f'conv{layer_index}') + bn_layer = getattr(self, f'bn{layer_index}') + relu_layer = getattr(self, f'relu{layer_index}') + + if pool_layer is not None: + x = pool_layer(x) + x = block_layer(x) + x = conv_layer(x) + x = bn_layer(x) + x= relu_layer(x) + + outs.append(x) + + if self.out_indices is not None: + return tuple([outs[i] for i in self.out_indices]) + + return x diff --git a/ppocr/modeling/heads/rec_sar_head.py 
b/ppocr/modeling/heads/rec_sar_head.py new file mode 100644 index 00000000..fb37b8ce --- /dev/null +++ b/ppocr/modeling/heads/rec_sar_head.py @@ -0,0 +1,378 @@ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import math +import paddle +from paddle import ParamAttr +import paddle.nn as nn +import paddle.nn.functional as F + + +class SAREncoder(nn.Layer): + """ + Args: + enc_bi_rnn (bool): If True, use bidirectional RNN in encoder. + enc_drop_rnn (float): Dropout probability of RNN layer in encoder. + enc_gru (bool): If True, use GRU, else LSTM in encoder. + d_model (int): Dim of channels from backbone. + d_enc (int): Dim of encoder RNN layer. + mask (bool): If True, mask padding in RNN sequence. + """ + def __init__(self, + enc_bi_rnn=False, + enc_drop_rnn=0.1, + enc_gru=False, + d_model=512, + d_enc=512, + mask=True, + **kwargs): + super().__init__() + assert isinstance(enc_bi_rnn, bool) + assert isinstance(enc_drop_rnn, (int, float)) + assert 0 <= enc_drop_rnn < 1.0 + assert isinstance(enc_gru, bool) + assert isinstance(d_model, int) + assert isinstance(d_enc, int) + assert isinstance(mask, bool) + + self.enc_bi_rnn = enc_bi_rnn + self.enc_drop_rnn = enc_drop_rnn + self.mask = mask + + # LSTM Encoder + if enc_bi_rnn: + direction = 'bidirectional' + else: + direction = 'forward' + kwargs = dict( + input_size=d_model, + hidden_size=d_enc, + num_layers=2, + time_major=False, + dropout=enc_drop_rnn, + direction=direction + ) + if enc_gru: + self.rnn_encoder = nn.GRU(**kwargs) + else: + self.rnn_encoder = nn.LSTM(**kwargs) + + # global feature transformation + encoder_rnn_out_size = d_enc * (int(enc_bi_rnn) + 1) + self.linear = nn.Linear(encoder_rnn_out_size, encoder_rnn_out_size) + + def forward(self, feat, img_metas=None): + if img_metas is not None: + assert len(img_metas[0]) == feat.shape[0] + + valid_ratios = None + if img_metas is not None and self.mask: + valid_ratios = img_metas[-1] + + h_feat = 
feat.shape[2] # bsz c h w + feat_v = F.max_pool2d( + feat, kernel_size=(h_feat, 1), stride=1, padding=0 + ) + feat_v = feat_v.squeeze(2) # bsz * C * W + feat_v = paddle.transpose(feat_v, perm=[0, 2, 1]) # bsz * W * C + holistic_feat = self.rnn_encoder(feat_v)[0] # bsz * T * C + + if valid_ratios is not None: + valid_hf = [] + T = holistic_feat.shape[1] + for i, valid_ratio in enumerate(valid_ratios): + valid_step = min(T, math.ceil(T * valid_ratio)) - 1 + valid_hf.append(holistic_feat[i, valid_step, :]) + valid_hf = paddle.stack(valid_hf, axis=0) + else: + valid_hf = holistic_feat[:, -1, :] # bsz * C + holistic_feat = self.linear(valid_hf) # bsz * C + + return holistic_feat + + +class BaseDecoder(nn.Layer): + def __init__(self, **kwargs): + super().__init__() + + def forward_train(self, feat, out_enc, targets, img_metas): + raise NotImplementedError + + def forward_test(self, feat, out_enc, img_metas): + raise NotImplementedError + + def forward(self, + feat, + out_enc, + label=None, + img_metas=None, + train_mode=True): + self.train_mode = train_mode + + if train_mode: + return self.forward_train(feat, out_enc, label, img_metas) + return self.forward_test(feat, out_enc, img_metas) + + +class ParallelSARDecoder(BaseDecoder): + """ + Args: + num_classes (int): Output class number. + channels (list[int]): Network layer channels. + enc_bi_rnn (bool): If True, use bidirectional RNN in encoder. + dec_bi_rnn (bool): If True, use bidirectional RNN in decoder. + dec_drop_rnn (float): Dropout of RNN layer in decoder. + dec_gru (bool): If True, use GRU, else LSTM in decoder. + d_model (int): Dim of channels from backbone. + d_enc (int): Dim of encoder RNN layer. + d_k (int): Dim of channels of attention module. + pred_dropout (float): Dropout probability of prediction layer. + max_seq_len (int): Maximum sequence length for decoding. + mask (bool): If True, mask padding in feature map. + start_idx (int): Index of start token. + padding_idx (int): Index of padding token. 
+ pred_concat (bool): If True, concat glimpse feature from + attention with holistic feature and hidden state. + """ + + def __init__(self, + num_classes=93, # 90 + unknown + start + padding + enc_bi_rnn=False, + dec_bi_rnn=False, + dec_drop_rnn=0.0, + dec_gru=False, + d_model=512, + d_enc=512, + d_k=64, + pred_dropout=0.1, + max_text_length=30, + mask=True, + start_idx=91, + padding_idx=92, # 92 + pred_concat=True, + **kwargs): + super().__init__() + + self.num_classes = num_classes + self.enc_bi_rnn = enc_bi_rnn + self.d_k = d_k + self.start_idx = start_idx + self.max_seq_len = max_text_length + self.mask = mask + self.pred_concat = pred_concat + + encoder_rnn_out_size = d_enc * (int(enc_bi_rnn) + 1) + decoder_rnn_out_size = encoder_rnn_out_size * (int(dec_bi_rnn) + 1) + + # 2D attention layer + self.conv1x1_1 = nn.Linear(decoder_rnn_out_size, d_k) + self.conv3x3_1 = nn.Conv2D(d_model, d_k, kernel_size=3, stride=1, padding=1) + self.conv1x1_2 = nn.Linear(d_k, 1) + + # Decoder RNN layer + if dec_bi_rnn: + direction = 'bidirectional' + else: + direction = 'forward' + + kwargs = dict( + input_size=encoder_rnn_out_size, + hidden_size=encoder_rnn_out_size, + num_layers=2, + time_major=False, + dropout=dec_drop_rnn, + direction=direction + ) + if dec_gru: + self.rnn_decoder = nn.GRU(**kwargs) + else: + self.rnn_decoder = nn.LSTM(**kwargs) + + # Decoder input embedding + self.embedding = nn.Embedding( + self.num_classes, encoder_rnn_out_size, padding_idx=padding_idx) + + # Prediction layer + self.pred_dropout = nn.Dropout(pred_dropout) + pred_num_classes = num_classes - 1 + if pred_concat: + fc_in_channel = decoder_rnn_out_size + d_model + d_enc + else: + fc_in_channel = d_model + self.prediction = nn.Linear(fc_in_channel, pred_num_classes) + + def _2d_attention(self, + decoder_input, + feat, + holistic_feat, + valid_ratios=None): + + y = self.rnn_decoder(decoder_input)[0] + # y: bsz * (seq_len + 1) * hidden_size + + attn_query = self.conv1x1_1(y) # bsz * (seq_len + 1) 
* attn_size + bsz, seq_len, attn_size = attn_query.shape + attn_query = paddle.unsqueeze(attn_query, axis=[3, 4]) + # (bsz, seq_len + 1, attn_size, 1, 1) + + attn_key = self.conv3x3_1(feat) + # bsz * attn_size * h * w + attn_key = attn_key.unsqueeze(1) + # bsz * 1 * attn_size * h * w + + attn_weight = paddle.tanh(paddle.add(attn_key, attn_query)) + + # bsz * (seq_len + 1) * attn_size * h * w + attn_weight = paddle.transpose(attn_weight, perm=[0, 1, 3, 4, 2]) + # bsz * (seq_len + 1) * h * w * attn_size + attn_weight = self.conv1x1_2(attn_weight) + # bsz * (seq_len + 1) * h * w * 1 + bsz, T, h, w, c = attn_weight.shape + assert c == 1 + + if valid_ratios is not None: + # cal mask of attention weight + for i, valid_ratio in enumerate(valid_ratios): + valid_width = min(w, math.ceil(w * valid_ratio)) + attn_weight[i, :, :, valid_width:, :] = float('-inf') + + attn_weight = paddle.reshape(attn_weight, [bsz, T, -1]) + attn_weight = F.softmax(attn_weight, axis=-1) + + attn_weight = paddle.reshape(attn_weight, [bsz, T, h, w, c]) + attn_weight = paddle.transpose(attn_weight, perm=[0, 1, 4, 2, 3]) + # attn_weight: bsz * T * c * h * w + # feat: bsz * c * h * w + attn_feat = paddle.sum(paddle.multiply(feat.unsqueeze(1), attn_weight), (3, 4), keepdim=False) + # bsz * (seq_len + 1) * C + + # Linear transformation + if self.pred_concat: + hf_c = holistic_feat.shape[-1] + holistic_feat = paddle.expand(holistic_feat, shape=[bsz, seq_len, hf_c]) + y = self.prediction(paddle.concat((y, attn_feat, holistic_feat), 2)) + else: + y = self.prediction(attn_feat) + # bsz * (seq_len + 1) * num_classes + if self.train_mode: + y = self.pred_dropout(y) + + return y + + def forward_train(self, feat, out_enc, label, img_metas): + ''' + img_metas: [label, valid_ratio] + ''' + if img_metas is not None: + assert len(img_metas[0]) == feat.shape[0] + + valid_ratios = None + if img_metas is not None and self.mask: + valid_ratios = img_metas[-1] + + label = label.cuda() + lab_embedding = 
self.embedding(label) + # bsz * seq_len * emb_dim + out_enc = out_enc.unsqueeze(1) + # bsz * 1 * emb_dim + in_dec = paddle.concat((out_enc, lab_embedding), axis=1) + # bsz * (seq_len + 1) * C + out_dec = self._2d_attention( + in_dec, feat, out_enc, valid_ratios=valid_ratios + ) + # bsz * (seq_len + 1) * num_classes + + return out_dec[:, 1:, :] # bsz * seq_len * num_classes + + def forward_test(self, feat, out_enc, img_metas): + if img_metas is not None: + assert len(img_metas[0]) == feat.shape[0] + + valid_ratios = None + if img_metas is not None and self.mask: + valid_ratios = img_metas[-1] + + seq_len = self.max_seq_len + bsz = feat.shape[0] + start_token = paddle.full((bsz, ), + fill_value=self.start_idx, + dtype='int64') + # bsz + start_token = self.embedding(start_token) + # bsz * emb_dim + emb_dim = start_token.shape[1] + start_token = start_token.unsqueeze(1) + start_token = paddle.expand(start_token, shape=[bsz, seq_len, emb_dim]) + # bsz * seq_len * emb_dim + out_enc = out_enc.unsqueeze(1) + # bsz * 1 * emb_dim + decoder_input = paddle.concat((out_enc, start_token), axis=1) + # bsz * (seq_len + 1) * emb_dim + + outputs = [] + for i in range(1, seq_len + 1): + decoder_output = self._2d_attention( + decoder_input, feat, out_enc, valid_ratios=valid_ratios + ) + char_output = decoder_output[:, i, :] # bsz * num_classes + char_output = F.softmax(char_output, -1) + outputs.append(char_output) + max_idx = paddle.argmax(char_output, axis=1, keepdim=False) + char_embedding = self.embedding(max_idx) # bsz * emb_dim + if i < seq_len: + decoder_input[:, i + 1, :] = char_embedding + + outputs = paddle.stack(outputs, 1) # bsz * seq_len * num_classes + + return outputs + + +class SARHead(nn.Layer): + def __init__(self, + enc_bi_rnn=False, + enc_drop_rnn=0.1, + enc_gru=False, + dec_bi_rnn=False, + dec_drop_rnn=0.0, + dec_gru=False, + d_k=512, + pred_dropout=0.1, + max_text_length=30, + pred_concat=True, + **kwargs): + super(SARHead, self).__init__() + + # encoder module + 
self.encoder = SAREncoder( + enc_bi_rnn=enc_bi_rnn, + enc_drop_rnn=enc_drop_rnn, + enc_gru=enc_gru) + + # decoder module + self.decoder = ParallelSARDecoder( + enc_bi_rnn=enc_bi_rnn, + dec_bi_rnn=dec_bi_rnn, + dec_drop_rnn=dec_drop_rnn, + dec_gru=dec_gru, + d_k=d_k, + pred_dropout=pred_dropout, + max_text_length=max_text_length, + pred_concat=pred_concat) + + def forward(self, feat, targets=None): + ''' + img_metas: [label, valid_ratio] + ''' + holistic_feat = self.encoder(feat, targets) # bsz c + + if self.training: + label = targets[0] # label + label = paddle.to_tensor(label, dtype='int64') + final_out = self.decoder(feat, holistic_feat, label, img_metas=targets) + if not self.training: + final_out = self.decoder(feat, holistic_feat, label=None, img_metas=targets, train_mode=False) + # (bsz, seq_len, num_classes) + + return final_out + \ No newline at end of file diff --git a/ppocr/utils/dict90.txt b/ppocr/utils/dict90.txt new file mode 100644 index 00000000..a945ae9c --- /dev/null +++ b/ppocr/utils/dict90.txt @@ -0,0 +1,90 @@ +0 +1 +2 +3 +4 +5 +6 +7 +8 +9 +a +b +c +d +e +f +g +h +i +j +k +l +m +n +o +p +q +r +s +t +u +v +w +x +y +z +A +B +C +D +E +F +G +H +I +J +K +L +M +N +O +P +Q +R +S +T +U +V +W +X +Y +Z +! +" +# +$ +% +& +' +( +) +* ++ +, +- +. +/ +: +; +< += +> +? 
+@ +[ +\ +] +_ +` +~ \ No newline at end of file From 8a95b3352df44307a1e9f0aff5458881356170ec Mon Sep 17 00:00:00 2001 From: andyjpaddle Date: Tue, 24 Aug 2021 03:49:26 +0000 Subject: [PATCH 02/15] add_rec_sar, test=dygraph --- doc/doc_ch/algorithm_overview.md | 3 +- doc/doc_ch/recognition.md | 1 + doc/doc_en/algorithm_overview_en.md | 2 + doc/doc_en/recognition_en.md | 1 + ppocr/data/imaug/__init__.py | 2 +- ppocr/data/imaug/label_ops.py | 46 +++++++++++++++++ ppocr/data/imaug/rec_img_aug.py | 50 ++++++++++++++++++ ppocr/losses/__init__.py | 3 +- ppocr/modeling/backbones/__init__.py | 3 +- ppocr/modeling/heads/__init__.py | 3 +- ppocr/postprocess/__init__.py | 4 +- ppocr/postprocess/rec_postprocess.py | 77 ++++++++++++++++++++++++++++ tools/eval.py | 3 +- tools/infer_rec.py | 9 ++++ tools/program.py | 13 +++-- 15 files changed, 207 insertions(+), 13 deletions(-) diff --git a/doc/doc_ch/algorithm_overview.md b/doc/doc_ch/algorithm_overview.md index 19d7a69c..b6a365b3 100755 --- a/doc/doc_ch/algorithm_overview.md +++ b/doc/doc_ch/algorithm_overview.md @@ -44,6 +44,7 @@ PaddleOCR基于动态图开源的文本识别算法列表: - [x] STAR-Net([paper](http://www.bmva.org/bmvc/2016/papers/paper043/index.html))[11] - [x] RARE([paper](https://arxiv.org/abs/1603.03915v1))[12] - [x] SRN([paper](https://arxiv.org/abs/2003.12294))[5] +- [x] SAR([paper](https://arxiv.org/abs/1811.00751v2)) 参考[DTRB][3](https://arxiv.org/abs/1904.01906)文字识别训练和评估流程,使用MJSynth和SynthText两个文字识别数据集训练,在IIIT, SVT, IC03, IC13, IC15, SVTP, CUTE数据集上进行评估,算法效果如下: @@ -58,6 +59,6 @@ PaddleOCR基于动态图开源的文本识别算法列表: |RARE|MobileNetV3|82.5%|rec_mv3_tps_bilstm_att |[下载链接](https://paddleocr.bj.bcebos.com/dygraph_v2.0/en/rec_mv3_tps_bilstm_att_v2.0_train.tar)| |RARE|Resnet34_vd|83.6%|rec_r34_vd_tps_bilstm_att |[下载链接](https://paddleocr.bj.bcebos.com/dygraph_v2.0/en/rec_r34_vd_tps_bilstm_att_v2.0_train.tar)| |SRN|Resnet50_vd_fpn| 88.52% | rec_r50fpn_vd_none_srn | [下载链接](https://paddleocr.bj.bcebos.com/dygraph_v2.0/en/rec_r50_vd_srn_train.tar) | - 
+|SAR|Resnet31| 87.1% | rec_r31_sar | [下载链接](https://paddleocr.bj.bcebos.com/dygraph_v2.1/rec/rec_r31_sar_train.tar) | PaddleOCR文本识别算法的训练和使用请参考文档教程中[模型训练/评估中的文本识别部分](./recognition.md)。 diff --git a/doc/doc_ch/recognition.md b/doc/doc_ch/recognition.md index 0ff0513a..0ac6da87 100644 --- a/doc/doc_ch/recognition.md +++ b/doc/doc_ch/recognition.md @@ -215,6 +215,7 @@ PaddleOCR支持训练和评估交替进行, 可以在 `configs/rec/rec_icdar15_t | rec_mv3_tps_bilstm_att.yml | CRNN | Mobilenet_v3 | TPS | BiLSTM | att | | rec_r34_vd_tps_bilstm_att.yml | CRNN | Resnet34_vd | TPS | BiLSTM | att | | rec_r50fpn_vd_none_srn.yml | SRN | Resnet50_fpn_vd | None | rnn | srn | +| rec_r31_sar.yml | SAR | ResNet31 | None | LSTM encoder | LSTM decoder | 训练中文数据,推荐使用[rec_chinese_lite_train_v2.0.yml](../../configs/rec/ch_ppocr_v2.0/rec_chinese_lite_train_v2.0.yml),如您希望尝试其他算法在中文数据集上的效果,请参考下列说明修改配置文件: diff --git a/doc/doc_en/algorithm_overview_en.md b/doc/doc_en/algorithm_overview_en.md index d70f99bb..f201589a 100755 --- a/doc/doc_en/algorithm_overview_en.md +++ b/doc/doc_en/algorithm_overview_en.md @@ -46,6 +46,7 @@ PaddleOCR open-source text recognition algorithms list: - [x] STAR-Net([paper](http://www.bmva.org/bmvc/2016/papers/paper043/index.html))[11] - [x] RARE([paper](https://arxiv.org/abs/1603.03915v1))[12] - [x] SRN([paper](https://arxiv.org/abs/2003.12294))[5] +- [x] SAR([paper](https://arxiv.org/abs/1811.00751v2)) Refer to [DTRB](https://arxiv.org/abs/1904.01906), the training and evaluation result of these above text recognition (using MJSynth and SynthText for training, evaluate on IIIT, SVT, IC03, IC13, IC15, SVTP, CUTE) is as follow: @@ -60,5 +61,6 @@ Refer to [DTRB](https://arxiv.org/abs/1904.01906), the training and evaluation r |RARE|MobileNetV3|82.5%|rec_mv3_tps_bilstm_att |[Download link](https://paddleocr.bj.bcebos.com/dygraph_v2.0/en/rec_mv3_tps_bilstm_att_v2.0_train.tar)| |RARE|Resnet34_vd|83.6%|rec_r34_vd_tps_bilstm_att |[Download 
link](https://paddleocr.bj.bcebos.com/dygraph_v2.0/en/rec_r34_vd_tps_bilstm_att_v2.0_train.tar)| |SRN|Resnet50_vd_fpn| 88.52% | rec_r50fpn_vd_none_srn |[Download link](https://paddleocr.bj.bcebos.com/dygraph_v2.0/en/rec_r50_vd_srn_train.tar)| +|SAR|Resnet31| 87.1% | rec_r31_sar | [Download link](https://paddleocr.bj.bcebos.com/dygraph_v2.1/rec/rec_r31_sar_train.tar) | Please refer to the document for training guide and use of PaddleOCR text recognition algorithms [Text recognition model training/evaluation/prediction](./recognition_en.md) diff --git a/doc/doc_en/recognition_en.md b/doc/doc_en/recognition_en.md index 634ec783..91f81a6a 100644 --- a/doc/doc_en/recognition_en.md +++ b/doc/doc_en/recognition_en.md @@ -207,6 +207,7 @@ If the evaluation set is large, the test will be time-consuming. It is recommend | rec_mv3_tps_bilstm_att.yml | CRNN | Mobilenet_v3 | TPS | BiLSTM | att | | rec_r34_vd_tps_bilstm_att.yml | CRNN | Resnet34_vd | TPS | BiLSTM | att | | rec_r50fpn_vd_none_srn.yml | SRN | Resnet50_fpn_vd | None | rnn | srn | +| rec_r31_sar.yml | SAR | ResNet31 | None | LSTM encoder | LSTM decoder | For training Chinese data, it is recommended to use diff --git a/ppocr/data/imaug/__init__.py b/ppocr/data/imaug/__init__.py index 52194eb9..6f0492e1 100644 --- a/ppocr/data/imaug/__init__.py +++ b/ppocr/data/imaug/__init__.py @@ -21,7 +21,7 @@ from .make_border_map import MakeBorderMap from .make_shrink_map import MakeShrinkMap from .random_crop_data import EastRandomCropData, PSERandomCrop -from .rec_img_aug import RecAug, RecResizeImg, ClsResizeImg, SRNRecResizeImg +from .rec_img_aug import RecAug, RecResizeImg, ClsResizeImg, SRNRecResizeImg, SARRecResizeImg from .randaugment import RandAugment from .copy_paste import CopyPaste from .operators import * diff --git a/ppocr/data/imaug/label_ops.py b/ppocr/data/imaug/label_ops.py index d222c410..56da029b 100644 --- a/ppocr/data/imaug/label_ops.py +++ b/ppocr/data/imaug/label_ops.py @@ -521,3 +521,49 @@ class 
TableLabelEncode(object): assert False, "Unsupport type %s in char_or_elem" \ % char_or_elem return idx + + +class SARLabelEncode(BaseRecLabelEncode): + """ Convert between text-label and text-index """ + + def __init__(self, + max_text_length, + character_dict_path=None, + character_type='ch', + use_space_char=False, + **kwargs): + super(SARLabelEncode, + self).__init__(max_text_length, character_dict_path, + character_type, use_space_char) + + def add_special_char(self, dict_character): + beg_end_str = "" + unknown_str = "" + padding_str = "" + dict_character = dict_character + [unknown_str] + self.unknown_idx = len(dict_character) - 1 + dict_character = dict_character + [beg_end_str] + self.start_idx = len(dict_character) - 1 + self.end_idx = len(dict_character) - 1 + dict_character = dict_character + [padding_str] + self.padding_idx = len(dict_character) - 1 + + return dict_character + + def __call__(self, data): + text = data['label'] + text = self.encode(text) + if text is None: + return None + if len(text) >= self.max_text_len - 1: + return None + data['length'] = np.array(len(text)) + target = [self.start_idx] + text + [self.end_idx] + padded_text = [self.padding_idx for _ in range(self.max_text_len)] + + padded_text[:len(target)] = target + data['label'] = np.array(padded_text) + return data + + def get_ignored_tokens(self): + return [self.padding_idx] diff --git a/ppocr/data/imaug/rec_img_aug.py b/ppocr/data/imaug/rec_img_aug.py index 28e6bd0b..d968f437 100644 --- a/ppocr/data/imaug/rec_img_aug.py +++ b/ppocr/data/imaug/rec_img_aug.py @@ -83,6 +83,56 @@ class SRNRecResizeImg(object): return data +class SARRecResizeImg(object): + def __init__(self, image_shape, width_downsample_ratio=0.25, **kwargs): + self.image_shape = image_shape + self.width_downsample_ratio = width_downsample_ratio + + def __call__(self, data): + img = data['image'] + norm_img, resize_shape, pad_shape, valid_ratio = resize_norm_img_sar(img, self.image_shape, 
self.width_downsample_ratio) + data['image'] = norm_img + data['resized_shape'] = resize_shape + data['pad_shape'] = pad_shape + data['valid_ratio'] = valid_ratio + return data + + +def resize_norm_img_sar(img, image_shape, width_downsample_ratio=0.25): + imgC, imgH, imgW_min, imgW_max = image_shape + h = img.shape[0] + w = img.shape[1] + valid_ratio = 1.0 + # make sure new_width is an integral multiple of width_divisor. + width_divisor = int(1 / width_downsample_ratio) + # resize + ratio = w / float(h) + resize_w = math.ceil(imgH * ratio) + if resize_w % width_divisor != 0: + resize_w = round(resize_w / width_divisor) * width_divisor + if imgW_min is not None: + resize_w = max(imgW_min, resize_w) + if imgW_max is not None: + valid_ratio = min(1.0, 1.0 * resize_w / imgW_max) + resize_w = min(imgW_max, resize_w) + resized_image = cv2.resize(img, (resize_w, imgH)) + resized_image = resized_image.astype('float32') + # norm + if image_shape[0] == 1: + resized_image = resized_image / 255 + resized_image = resized_image[np.newaxis, :] + else: + resized_image = resized_image.transpose((2, 0, 1)) / 255 + resized_image -= 0.5 + resized_image /= 0.5 + resize_shape = resized_image.shape + padding_im = -1.0 * np.ones((imgC, imgH, imgW_max), dtype=np.float32) + padding_im[:, :, 0:resize_w] = resized_image + pad_shape = padding_im.shape + + return padding_im, resize_shape, pad_shape, valid_ratio + + def resize_norm_img(img, image_shape): imgC, imgH, imgW = image_shape h = img.shape[0] diff --git a/ppocr/losses/__init__.py b/ppocr/losses/__init__.py index 025ae7ca..d731185b 100755 --- a/ppocr/losses/__init__.py +++ b/ppocr/losses/__init__.py @@ -25,6 +25,7 @@ from .det_sast_loss import SASTLoss from .rec_ctc_loss import CTCLoss from .rec_att_loss import AttentionLoss from .rec_srn_loss import SRNLoss +from .rec_sar_loss import SARLoss # cls loss from .cls_loss import ClsLoss @@ -44,7 +45,7 @@ from .table_att_loss import TableAttentionLoss def build_loss(config): support_dict = [ 
'DBLoss', 'EASTLoss', 'SASTLoss', 'CTCLoss', 'ClsLoss', 'AttentionLoss', - 'SRNLoss', 'PGLoss', 'CombinedLoss', 'TableAttentionLoss' + 'SRNLoss', 'PGLoss', 'CombinedLoss', 'TableAttentionLoss', 'SARLoss' ] config = copy.deepcopy(config) module_name = config.pop('name') diff --git a/ppocr/modeling/backbones/__init__.py b/ppocr/modeling/backbones/__init__.py index f4fe8c76..ce1d6bcf 100755 --- a/ppocr/modeling/backbones/__init__.py +++ b/ppocr/modeling/backbones/__init__.py @@ -26,8 +26,9 @@ def build_backbone(config, model_type): from .rec_resnet_vd import ResNet from .rec_resnet_fpn import ResNetFPN from .rec_mv1_enhance import MobileNetV1Enhance + from .rec_resnet_31 import ResNet31 support_dict = [ - "MobileNetV1Enhance", "MobileNetV3", "ResNet", "ResNetFPN" + "MobileNetV1Enhance", "MobileNetV3", "ResNet", "ResNetFPN", "ResNet31" ] elif model_type == "e2e": from .e2e_resnet_vd_pg import ResNet diff --git a/ppocr/modeling/heads/__init__.py b/ppocr/modeling/heads/__init__.py index 50964794..8414f2ad 100755 --- a/ppocr/modeling/heads/__init__.py +++ b/ppocr/modeling/heads/__init__.py @@ -26,12 +26,13 @@ def build_head(config): from .rec_ctc_head import CTCHead from .rec_att_head import AttentionHead from .rec_srn_head import SRNHead + from .rec_sar_head import SARHead # cls head from .cls_head import ClsHead support_dict = [ 'DBHead', 'EASTHead', 'SASTHead', 'CTCHead', 'ClsHead', 'AttentionHead', - 'SRNHead', 'PGHead', 'TableAttentionHead'] + 'SRNHead', 'PGHead', 'TableAttentionHead', 'SARHead'] #table head from .table_att_head import TableAttentionHead diff --git a/ppocr/postprocess/__init__.py b/ppocr/postprocess/__init__.py index 654ddf39..86f9ede4 100644 --- a/ppocr/postprocess/__init__.py +++ b/ppocr/postprocess/__init__.py @@ -25,7 +25,7 @@ from .db_postprocess import DBPostProcess, DistillationDBPostProcess from .east_postprocess import EASTPostProcess from .sast_postprocess import SASTPostProcess from .rec_postprocess import CTCLabelDecode, AttnLabelDecode, 
SRNLabelDecode, DistillationCTCLabelDecode, \ - TableLabelDecode + TableLabelDecode, SARLabelDecode from .cls_postprocess import ClsPostProcess from .pg_postprocess import PGPostProcess @@ -35,7 +35,7 @@ def build_post_process(config, global_config=None): 'DBPostProcess', 'EASTPostProcess', 'SASTPostProcess', 'CTCLabelDecode', 'AttnLabelDecode', 'ClsPostProcess', 'SRNLabelDecode', 'PGPostProcess', 'DistillationCTCLabelDecode', 'TableLabelDecode', - 'DistillationDBPostProcess' + 'DistillationDBPostProcess', 'SARLabelDecode' ] config = copy.deepcopy(config) diff --git a/ppocr/postprocess/rec_postprocess.py b/ppocr/postprocess/rec_postprocess.py index 8ebe5b27..9e9ddd8f 100644 --- a/ppocr/postprocess/rec_postprocess.py +++ b/ppocr/postprocess/rec_postprocess.py @@ -15,6 +15,7 @@ import numpy as np import string import paddle from paddle.nn import functional as F +import re class BaseRecLabelDecode(object): @@ -454,3 +455,79 @@ class TableLabelDecode(object): assert False, "Unsupport type %s in char_or_elem" \ % char_or_elem return idx + + +class SARLabelDecode(BaseRecLabelDecode): + """ Convert between text-label and text-index """ + + def __init__(self, + character_dict_path=None, + character_type='ch', + use_space_char=False, + **kwargs): + super(SARLabelDecode, self).__init__(character_dict_path, + character_type, use_space_char) + + def add_special_char(self, dict_character): + beg_end_str = "" + unknown_str = "" + padding_str = "" + dict_character = dict_character + [unknown_str] + self.unknown_idx = len(dict_character) - 1 + dict_character = dict_character + [beg_end_str] + self.start_idx = len(dict_character) - 1 + self.end_idx = len(dict_character) - 1 + dict_character = dict_character + [padding_str] + self.padding_idx = len(dict_character) - 1 + return dict_character + + def decode(self, text_index, text_prob=None, is_remove_duplicate=False): + """ convert text-index into text-label. 
""" + result_list = [] + ignored_tokens = self.get_ignored_tokens() + + batch_size = len(text_index) + for batch_idx in range(batch_size): + char_list = [] + conf_list = [] + for idx in range(len(text_index[batch_idx])): + if text_index[batch_idx][idx] in ignored_tokens: + continue + if int(text_index[batch_idx][idx]) == int(self.end_idx): + if text_prob is None and idx ==0: + continue + else: + break + if is_remove_duplicate: + # only for predict + if idx > 0 and text_index[batch_idx][idx - 1] == text_index[ + batch_idx][idx]: + continue + char_list.append(self.character[int(text_index[batch_idx][ + idx])]) + if text_prob is not None: + conf_list.append(text_prob[batch_idx][idx]) + else: + conf_list.append(1) + text = ''.join(char_list) + comp = re.compile('[^A-Z^a-z^0-9^\u4e00-\u9fa5]') + text = text.lower() + text = comp.sub('', text) + result_list.append((text, np.mean(conf_list))) + return result_list + + def __call__(self, preds, label=None, *args, **kwargs): + if isinstance(preds, paddle.Tensor): + preds = preds.numpy() + preds_idx = preds.argmax(axis=2) + preds_prob = preds.max(axis=2) + + text = self.decode(preds_idx, preds_prob, is_remove_duplicate=False) + + if label is None: + return text + label = self.decode(label, is_remove_duplicate=False) + return text, label + + def get_ignored_tokens(self): + return [self.padding_idx] diff --git a/tools/eval.py b/tools/eval.py index 0120baab..fb8c7925 100755 --- a/tools/eval.py +++ b/tools/eval.py @@ -55,6 +55,7 @@ def main(): model = build_model(config['Architecture']) use_srn = config['Architecture']['algorithm'] == "SRN" + use_sar = config['Architecture']['algorithm'] == "SAR" if "model_type" in config['Architecture'].keys(): model_type = config['Architecture']['model_type'] else: @@ -71,7 +72,7 @@ def main(): # start eval metric = program.eval(model, valid_dataloader, post_process_class, - eval_class, model_type, use_srn) + eval_class, model_type, use_srn, use_sar) logger.info('metric eval ***************') 
for k, v in metric.items(): logger.info('{}:{}'.format(k, v)) diff --git a/tools/infer_rec.py b/tools/infer_rec.py index 09f5a0c7..f16cd7d3 100755 --- a/tools/infer_rec.py +++ b/tools/infer_rec.py @@ -74,6 +74,10 @@ def main(): 'image', 'encoder_word_pos', 'gsrm_word_pos', 'gsrm_slf_attn_bias1', 'gsrm_slf_attn_bias2' ] + elif config['Architecture']['algorithm'] == "SAR": + op[op_name]['keep_keys'] = [ + 'image', 'valid_ratio' + ] else: op[op_name]['keep_keys'] = ['image'] transforms.append(op) @@ -106,11 +110,16 @@ def main(): paddle.to_tensor(gsrm_slf_attn_bias1_list), paddle.to_tensor(gsrm_slf_attn_bias2_list) ] + if config['Architecture']['algorithm'] == "SAR": + valid_ratio = np.expand_dims(batch[-1], axis=0) + img_metas = [paddle.to_tensor(valid_ratio)] images = np.expand_dims(batch[0], axis=0) images = paddle.to_tensor(images) if config['Architecture']['algorithm'] == "SRN": preds = model(images, others) + elif config['Architecture']['algorithm'] == "SAR": + preds = model(images, img_metas) else: preds = model(images) post_result = post_process_class(preds) diff --git a/tools/program.py b/tools/program.py index 595fe4cb..cb6f8a8b 100755 --- a/tools/program.py +++ b/tools/program.py @@ -186,6 +186,7 @@ def train(config, model.train() use_srn = config['Architecture']['algorithm'] == "SRN" + use_sar = config['Architecture']['algorithm'] == 'SAR' try: model_type = config['Architecture']['model_type'] except: @@ -213,7 +214,7 @@ def train(config, images = batch[0] if use_srn: model_average = True - if use_srn or model_type == 'table': + if use_srn or model_type == 'table' or use_sar: preds = model(images, data=batch[1:]) else: preds = model(images) @@ -277,7 +278,8 @@ def train(config, post_process_class, eval_class, model_type, - use_srn=use_srn) + use_srn=use_srn, + use_sar=use_sar) cur_metric_str = 'cur metric, {}'.format(', '.join( ['{}: {}'.format(k, v) for k, v in cur_metric.items()])) logger.info(cur_metric_str) @@ -349,7 +351,8 @@ def eval(model, 
post_process_class, eval_class, model_type, - use_srn=False): + use_srn=False, + use_sar=False): model.eval() with paddle.no_grad(): total_frame = 0.0 @@ -362,7 +365,7 @@ def eval(model, break images = batch[0] start = time.time() - if use_srn or model_type == 'table': + if use_srn or model_type == 'table' or use_sar: preds = model(images, data=batch[1:]) else: preds = model(images) @@ -398,7 +401,7 @@ def preprocess(is_train=False): alg = config['Architecture']['algorithm'] assert alg in [ 'EAST', 'DB', 'SAST', 'Rosetta', 'CRNN', 'STARNet', 'RARE', 'SRN', - 'CLS', 'PGNet', 'Distillation', 'TableAttn' + 'CLS', 'PGNet', 'Distillation', 'TableAttn', 'SAR' ] device = 'gpu:{}'.format(dist.ParallelEnv().dev_id) if use_gpu else 'cpu' From 69f9bdd8f6730762b2b4348b5a2791d736968af5 Mon Sep 17 00:00:00 2001 From: andyjpaddle Date: Tue, 24 Aug 2021 04:23:03 +0000 Subject: [PATCH 03/15] add_rec_sar,test=dygraph --- configs/rec/rec_r31_sar.yml | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/configs/rec/rec_r31_sar.yml b/configs/rec/rec_r31_sar.yml index 3a398064..b761d28b 100644 --- a/configs/rec/rec_r31_sar.yml +++ b/configs/rec/rec_r31_sar.yml @@ -3,16 +3,16 @@ Global: epoch_num: 5 log_smooth_window: 20 print_batch_step: 20 - save_model_dir: /paddle/backup/sar_rec/sar_train_v3 + save_model_dir: ./sar_rec save_epoch_step: 1 # evaluation is run every 2000 iterations eval_batch_step: [0, 2000] cal_metric_during_train: True - pretrained_model: #/paddle/backup/sar_rec/sar_train_v2/best_accuracy + pretrained_model: checkpoints: save_inference_dir: use_visualdl: False - infer_img: demo_text_recog.jpg + infer_img: # for data or label process character_dict_path: ppocr/utils/dict90.txt character_type: ch @@ -54,11 +54,11 @@ Metric: Train: dataset: - name: LMDBDataSet #SimpleDataSet - # delimiter: ' ' - # label_file_list: ['/paddle/data/concat_data/icdar_2013_train20.txt', '/paddle/data/concat_data/icdar_2015_train20.txt', 
'/paddle/data/concat_data/coco_text_train20.txt', '/paddle/data/concat_data/IIIt5k_train20.txt', '/paddle/data/concat_data/SynthAdd_train.txt', '/paddle/data/concat_data/SynthText_train.txt', '/paddle/data/concat_data/Syn90k_train.txt'] - data_dir: /paddle/data/ocr_data/training/ #/paddle/data/concat_data/ - # ratio_list: 1.0 + name: SimpleDataSet + delimiter: ' ' + label_file_list: ['/paddle/data/concat_data/icdar_2013_train20.txt', '/paddle/data/concat_data/icdar_2015_train20.txt', '/paddle/data/concat_data/coco_text_train20.txt', '/paddle/data/concat_data/IIIt5k_train20.txt', '/paddle/data/concat_data/SynthAdd_train.txt', '/paddle/data/concat_data/SynthText_train.txt', '/paddle/data/concat_data/Syn90k_train.txt'] + data_dir: /paddle/data/concat_data/ + ratio_list: 1.0 transforms: - DecodeImage: # load image img_mode: BGR @@ -96,4 +96,4 @@ Eval: batch_size_per_card: 64 num_workers: 4 use_shared_memory: False - \ No newline at end of file + From d43688a4da377269b6469bc818ce325301eb51a7 Mon Sep 17 00:00:00 2001 From: andyjpaddle Date: Tue, 31 Aug 2021 11:33:41 +0000 Subject: [PATCH 04/15] update rec_r31_sar.yml and sar docs --- configs/rec/rec_r31_sar.yml | 4 ++-- doc/doc_ch/recognition.md | 2 ++ doc/doc_en/recognition_en.md | 2 ++ 3 files changed, 6 insertions(+), 2 deletions(-) diff --git a/configs/rec/rec_r31_sar.yml b/configs/rec/rec_r31_sar.yml index b761d28b..c19bcdee 100644 --- a/configs/rec/rec_r31_sar.yml +++ b/configs/rec/rec_r31_sar.yml @@ -56,7 +56,7 @@ Train: dataset: name: SimpleDataSet delimiter: ' ' - label_file_list: ['/paddle/data/concat_data/icdar_2013_train20.txt', '/paddle/data/concat_data/icdar_2015_train20.txt', '/paddle/data/concat_data/coco_text_train20.txt', '/paddle/data/concat_data/IIIt5k_train20.txt', '/paddle/data/concat_data/SynthAdd_train.txt', '/paddle/data/concat_data/SynthText_train.txt', '/paddle/data/concat_data/Syn90k_train.txt'] + label_file_list: ['/paddle/data/concat_data/train_list.txt'] data_dir: /paddle/data/concat_data/ 
ratio_list: 1.0 transforms: @@ -71,7 +71,7 @@ Train: keep_keys: ['image', 'label', 'valid_ratio'] # dataloader will return list in this order loader: shuffle: True - batch_size_per_card: 64 # 32 + batch_size_per_card: 64 drop_last: True num_workers: 8 use_shared_memory: False diff --git a/doc/doc_ch/recognition.md b/doc/doc_ch/recognition.md index 0ac6da87..bcad6b86 100644 --- a/doc/doc_ch/recognition.md +++ b/doc/doc_ch/recognition.md @@ -90,6 +90,8 @@ train_data/rec/train/word_002.jpg 用科技让复杂的世界更简单 如果希望复现SRN的论文指标,需要下载离线[增广数据](https://pan.baidu.com/s/1-HSZ-ZVdqBF2HaBZ5pRAKA),提取码: y3ry。增广数据是由MJSynth和SynthText做旋转和扰动得到的。数据下载完成后请解压到 {your_path}/PaddleOCR/train_data/data_lmdb_release/training/ 路径下。 +如果希望复现SAR的论文指标,需要下载[SynthAdd](https://pan.baidu.com/share/init?surl=uV0LtoNmcxbO-0YA7Ch4dg), 提取码:627x。此外,真实数据集icdar2013, icdar2015, cocotext, IIIT5也作为训练数据的一部分。具体数据细节可以参考论文SAR。 + ``` # 训练集标签 wget -P ./train_data/ic15_data https://paddleocr.bj.bcebos.com/dataset/rec_gt_train.txt diff --git a/doc/doc_en/recognition_en.md b/doc/doc_en/recognition_en.md index 91f81a6a..65030b4a 100644 --- a/doc/doc_en/recognition_en.md +++ b/doc/doc_en/recognition_en.md @@ -90,6 +90,8 @@ If you do not have a dataset locally, you can download it on the official websit If you want to reproduce the paper indicators of SRN, you need to download offline [augmented data](https://pan.baidu.com/s/1-HSZ-ZVdqBF2HaBZ5pRAKA), extraction code: y3ry. The augmented data is obtained by rotation and perturbation of mjsynth and synthtext. Please unzip the data to {your_path}/PaddleOCR/train_data/data_lmdb_Release/training/path. +If you want to reproduce the results of the SAR paper, you need to download the extra dataset [SynthAdd](https://pan.baidu.com/share/init?surl=uV0LtoNmcxbO-0YA7Ch4dg), extraction code: 627x. In addition, the real-world datasets icdar2013, icdar2015, cocotext, and IIIT5k are also used as training data. For details, please refer to the SAR paper.
+ PaddleOCR provides label files for training the icdar2015 dataset, which can be downloaded in the following ways: ``` From ea019601a712ba99e24c2b4dcad1944c38a90f2b Mon Sep 17 00:00:00 2001 From: andyjpaddle Date: Tue, 31 Aug 2021 12:19:11 +0000 Subject: [PATCH 05/15] fix conflic in program --- tools/program.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/tools/program.py b/tools/program.py index 2d3114f6..d6d47d04 100755 --- a/tools/program.py +++ b/tools/program.py @@ -402,11 +402,7 @@ def preprocess(is_train=False): alg = config['Architecture']['algorithm'] assert alg in [ 'EAST', 'DB', 'SAST', 'Rosetta', 'CRNN', 'STARNet', 'RARE', 'SRN', -<<<<<<< HEAD - 'CLS', 'PGNet', 'Distillation', 'TableAttn', 'SAR' -======= - 'CLS', 'PGNet', 'Distillation', 'NRTR', 'TableAttn' ->>>>>>> 63ed5fcab30801626ecf55a89f5dc9faf79a16d2 + 'CLS', 'PGNet', 'Distillation', 'NRTR', 'TableAttn', 'SAR' ] device = 'gpu:{}'.format(dist.ParallelEnv().dev_id) if use_gpu else 'cpu' From 37cd4ed333acff9620f7fb0c4c003935fff2b154 Mon Sep 17 00:00:00 2001 From: andyjpaddle Date: Thu, 2 Sep 2021 07:18:13 +0000 Subject: [PATCH 06/15] delete dict90, update sar postprocess, update sar.yml --- configs/rec/rec_r31_sar.yml | 9 +-- ppocr/postprocess/rec_postprocess.py | 8 ++- ppocr/utils/dict90.txt | 90 ---------------------------- 3 files changed, 10 insertions(+), 97 deletions(-) delete mode 100644 ppocr/utils/dict90.txt diff --git a/configs/rec/rec_r31_sar.yml b/configs/rec/rec_r31_sar.yml index c19bcdee..73d56f37 100644 --- a/configs/rec/rec_r31_sar.yml +++ b/configs/rec/rec_r31_sar.yml @@ -15,7 +15,7 @@ Global: infer_img: # for data or label process character_dict_path: ppocr/utils/dict90.txt - character_type: ch + character_type: EN_symbol max_text_length: 30 infer_mode: False use_space_char: False @@ -47,6 +47,7 @@ Loss: PostProcess: name: SARLabelDecode + rm_symbol: True Metric: name: RecMetric @@ -56,8 +57,8 @@ Train: dataset: name: SimpleDataSet delimiter: ' ' - 
label_file_list: ['/paddle/data/concat_data/train_list.txt'] - data_dir: /paddle/data/concat_data/ + label_file_list: ['./train_data/train_list.txt'] + data_dir: ./train_data/ ratio_list: 1.0 transforms: - DecodeImage: # load image @@ -79,7 +80,7 @@ Train: Eval: dataset: name: LMDBDataSet - data_dir: /paddle/data/ocr_data/evaluation/ + data_dir: ./eval_data/evaluation/ transforms: - DecodeImage: # load image img_mode: BGR diff --git a/ppocr/postprocess/rec_postprocess.py b/ppocr/postprocess/rec_postprocess.py index 83d7b215..20439281 100644 --- a/ppocr/postprocess/rec_postprocess.py +++ b/ppocr/postprocess/rec_postprocess.py @@ -526,6 +526,7 @@ class SARLabelDecode(BaseRecLabelDecode): character_dict_path=None, character_type='ch', use_space_char=False, + rm_symbol=True, **kwargs): super(SARLabelDecode, self).__init__(character_dict_path, character_type, use_space_char) @@ -572,9 +573,10 @@ class SARLabelDecode(BaseRecLabelDecode): else: conf_list.append(1) text = ''.join(char_list) - comp = re.compile('[^A-Z^a-z^0-9^\u4e00-\u9fa5]') - text = text.lower() - text = comp.sub('', text) + if self.rm_symbol: + comp = re.compile('[^A-Z^a-z^0-9^\u4e00-\u9fa5]') + text = text.lower() + text = comp.sub('', text) result_list.append((text, np.mean(conf_list))) return result_list diff --git a/ppocr/utils/dict90.txt b/ppocr/utils/dict90.txt deleted file mode 100644 index a945ae9c..00000000 --- a/ppocr/utils/dict90.txt +++ /dev/null @@ -1,90 +0,0 @@ -0 -1 -2 -3 -4 -5 -6 -7 -8 -9 -a -b -c -d -e -f -g -h -i -j -k -l -m -n -o -p -q -r -s -t -u -v -w -x -y -z -A -B -C -D -E -F -G -H -I -J -K -L -M -N -O -P -Q -R -S -T -U -V -W -X -Y -Z -! -" -# -$ -% -& -' -( -) -* -+ -, -- -. -/ -: -; -< -= -> -? 
-@ -[ -\ -] -_ -` -~ \ No newline at end of file From dc3bce76916b515870655a1e9150a91f01ac7649 Mon Sep 17 00:00:00 2001 From: andyjpaddle Date: Thu, 2 Sep 2021 07:28:30 +0000 Subject: [PATCH 07/15] update sar postprocess --- configs/rec/rec_r31_sar.yml | 2 +- ppocr/postprocess/rec_postprocess.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/configs/rec/rec_r31_sar.yml b/configs/rec/rec_r31_sar.yml index 73d56f37..7adb7136 100644 --- a/configs/rec/rec_r31_sar.yml +++ b/configs/rec/rec_r31_sar.yml @@ -19,6 +19,7 @@ Global: max_text_length: 30 infer_mode: False use_space_char: False + rm_symbol: True save_res_path: ./output/rec/predicts_sar.txt Optimizer: @@ -47,7 +48,6 @@ Loss: PostProcess: name: SARLabelDecode - rm_symbol: True Metric: name: RecMetric diff --git a/ppocr/postprocess/rec_postprocess.py b/ppocr/postprocess/rec_postprocess.py index 20439281..f2c3f289 100644 --- a/ppocr/postprocess/rec_postprocess.py +++ b/ppocr/postprocess/rec_postprocess.py @@ -526,10 +526,10 @@ class SARLabelDecode(BaseRecLabelDecode): character_dict_path=None, character_type='ch', use_space_char=False, - rm_symbol=True, **kwargs): super(SARLabelDecode, self).__init__(character_dict_path, character_type, use_space_char) + self.rm_symbol = kwargs.get('rm_symbol', True) def add_special_char(self, dict_character): beg_end_str = "" From 073fad37ba1e0237b2d56bcfe54e32e7850b34c3 Mon Sep 17 00:00:00 2001 From: andyjpaddle Date: Thu, 2 Sep 2021 07:38:08 +0000 Subject: [PATCH 08/15] update sar postprocess --- ppocr/postprocess/rec_postprocess.py | 1 + 1 file changed, 1 insertion(+) diff --git a/ppocr/postprocess/rec_postprocess.py b/ppocr/postprocess/rec_postprocess.py index f2c3f289..2bc60648 100644 --- a/ppocr/postprocess/rec_postprocess.py +++ b/ppocr/postprocess/rec_postprocess.py @@ -529,6 +529,7 @@ class SARLabelDecode(BaseRecLabelDecode): **kwargs): super(SARLabelDecode, self).__init__(character_dict_path, character_type, use_space_char) + self.rm_symbol = 
kwargs.get('rm_symbol', True) def add_special_char(self, dict_character): From df4a2f6a7ee5efbf48a36b42a70c89511fdb4ac6 Mon Sep 17 00:00:00 2001 From: andyjpaddle Date: Tue, 7 Sep 2021 03:33:02 +0000 Subject: [PATCH 09/15] update rec_sar_head --- ppocr/losses/rec_sar_loss.py | 2 +- ppocr/modeling/heads/rec_sar_head.py | 18 +++++++++--------- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/ppocr/losses/rec_sar_loss.py b/ppocr/losses/rec_sar_loss.py index 1afb21fe..9e1c6495 100644 --- a/ppocr/losses/rec_sar_loss.py +++ b/ppocr/losses/rec_sar_loss.py @@ -9,7 +9,7 @@ from paddle import nn class SARLoss(nn.Layer): def __init__(self, **kwargs): super(SARLoss, self).__init__() - self.loss_func = paddle.nn.loss.CrossEntropyLoss(reduction="mean", ignore_index=92) + self.loss_func = paddle.nn.loss.CrossEntropyLoss(reduction="mean", ignore_index=96) def forward(self, predicts, batch): predict = predicts[:, :-1, :] # ignore last index of outputs to be in same seq_len with targets diff --git a/ppocr/modeling/heads/rec_sar_head.py b/ppocr/modeling/heads/rec_sar_head.py index fb37b8ce..98b00ed0 100644 --- a/ppocr/modeling/heads/rec_sar_head.py +++ b/ppocr/modeling/heads/rec_sar_head.py @@ -118,8 +118,7 @@ class BaseDecoder(nn.Layer): class ParallelSARDecoder(BaseDecoder): """ Args: - num_classes (int): Output class number. - channels (list[int]): Network layer channels. + out_channels (int): Output class number. enc_bi_rnn (bool): If True, use bidirectional RNN in encoder. dec_bi_rnn (bool): If True, use bidirectional RNN in decoder. dec_drop_rnn (float): Dropout of RNN layer in decoder. 
@@ -137,7 +136,7 @@ class ParallelSARDecoder(BaseDecoder): """ def __init__(self, - num_classes=93, # 90 + unknown + start + padding + out_channels, # 90 + unknown + start + padding enc_bi_rnn=False, dec_bi_rnn=False, dec_drop_rnn=0.0, @@ -148,8 +147,6 @@ class ParallelSARDecoder(BaseDecoder): pred_dropout=0.1, max_text_length=30, mask=True, - start_idx=91, - padding_idx=92, # 92 pred_concat=True, **kwargs): super().__init__() @@ -157,7 +154,8 @@ class ParallelSARDecoder(BaseDecoder): self.num_classes = num_classes self.enc_bi_rnn = enc_bi_rnn self.d_k = d_k - self.start_idx = start_idx + self.start_idx = out_channels - 2 + self.padding_idx = out_channels - 1 self.max_seq_len = max_text_length self.mask = mask self.pred_concat = pred_concat @@ -191,7 +189,7 @@ class ParallelSARDecoder(BaseDecoder): # Decoder input embedding self.embedding = nn.Embedding( - self.num_classes, encoder_rnn_out_size, padding_idx=padding_idx) + self.num_classes, encoder_rnn_out_size, padding_idx=self.padding_idx) # Prediction layer self.pred_dropout = nn.Dropout(pred_dropout) @@ -330,6 +328,7 @@ class ParallelSARDecoder(BaseDecoder): class SARHead(nn.Layer): def __init__(self, + out_channels, enc_bi_rnn=False, enc_drop_rnn=0.1, enc_gru=False, @@ -351,7 +350,8 @@ class SARHead(nn.Layer): # decoder module self.decoder = ParallelSARDecoder( - enc_bi_rnn=enc_bi_rnn, + out_channels=out_channels, + enc_bi_rnn=enc_bi_rnn, dec_bi_rnn=dec_bi_rnn, dec_drop_rnn=dec_drop_rnn, dec_gru=dec_gru, @@ -375,4 +375,4 @@ class SARHead(nn.Layer): # (bsz, seq_len, num_classes) return final_out - \ No newline at end of file + From 8123688a0941be22f5c9868c882d8c40e2472321 Mon Sep 17 00:00:00 2001 From: andyjpaddle Date: Tue, 7 Sep 2021 03:38:34 +0000 Subject: [PATCH 10/15] update rec_sar_head --- ppocr/modeling/heads/rec_sar_head.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ppocr/modeling/heads/rec_sar_head.py b/ppocr/modeling/heads/rec_sar_head.py index 98b00ed0..ba0aa8eb 100644 --- 
a/ppocr/modeling/heads/rec_sar_head.py +++ b/ppocr/modeling/heads/rec_sar_head.py @@ -151,11 +151,11 @@ class ParallelSARDecoder(BaseDecoder): **kwargs): super().__init__() - self.num_classes = num_classes + self.num_classes = out_channels self.enc_bi_rnn = enc_bi_rnn self.d_k = d_k self.start_idx = out_channels - 2 - self.padding_idx = out_channels - 1 + self.padding_idx = out_channels - 1 self.max_seq_len = max_text_length self.mask = mask self.pred_concat = pred_concat From ae09ef607fd7b39d43aed73abec3212c756b10ee Mon Sep 17 00:00:00 2001 From: andyjpaddle Date: Tue, 7 Sep 2021 06:09:59 +0000 Subject: [PATCH 11/15] fix code style --- ppocr/postprocess/rec_postprocess.py | 62 +++++++++++++++------------- 1 file changed, 34 insertions(+), 28 deletions(-) diff --git a/ppocr/postprocess/rec_postprocess.py b/ppocr/postprocess/rec_postprocess.py index 2bc60648..39ef16ec 100644 --- a/ppocr/postprocess/rec_postprocess.py +++ b/ppocr/postprocess/rec_postprocess.py @@ -166,21 +166,21 @@ class NRTRLabelDecode(BaseRecLabelDecode): use_space_char=True, **kwargs): super(NRTRLabelDecode, self).__init__(character_dict_path, - character_type, use_space_char) + character_type, use_space_char) def __call__(self, preds, label=None, *args, **kwargs): if preds.dtype == paddle.int64: if isinstance(preds, paddle.Tensor): preds = preds.numpy() - if preds[0][0]==2: - preds_idx = preds[:,1:] + if preds[0][0] == 2: + preds_idx = preds[:, 1:] else: preds_idx = preds text = self.decode(preds_idx) if label is None: return text - label = self.decode(label[:,1:]) + label = self.decode(label[:, 1:]) else: if isinstance(preds, paddle.Tensor): preds = preds.numpy() @@ -189,13 +189,13 @@ class NRTRLabelDecode(BaseRecLabelDecode): text = self.decode(preds_idx, preds_prob, is_remove_duplicate=False) if label is None: return text - label = self.decode(label[:,1:]) + label = self.decode(label[:, 1:]) return text, label def add_special_char(self, dict_character): - dict_character = ['blank','','',''] + 
dict_character + dict_character = ['blank', '', '', ''] + dict_character return dict_character - + def decode(self, text_index, text_prob=None, is_remove_duplicate=False): """ convert text-index into text-label. """ result_list = [] @@ -204,10 +204,11 @@ class NRTRLabelDecode(BaseRecLabelDecode): char_list = [] conf_list = [] for idx in range(len(text_index[batch_idx])): - if text_index[batch_idx][idx] == 3: # end + if text_index[batch_idx][idx] == 3: # end break try: - char_list.append(self.character[int(text_index[batch_idx][idx])]) + char_list.append(self.character[int(text_index[batch_idx][ + idx])]) except: continue if text_prob is not None: @@ -219,7 +220,6 @@ class NRTRLabelDecode(BaseRecLabelDecode): return result_list - class AttnLabelDecode(BaseRecLabelDecode): """ Convert between text-label and text-index """ @@ -257,7 +257,8 @@ class AttnLabelDecode(BaseRecLabelDecode): if idx > 0 and text_index[batch_idx][idx - 1] == text_index[ batch_idx][idx]: continue - char_list.append(self.character[int(text_index[batch_idx][idx])]) + char_list.append(self.character[int(text_index[batch_idx][ + idx])]) if text_prob is not None: conf_list.append(text_prob[batch_idx][idx]) else: @@ -387,10 +388,9 @@ class SRNLabelDecode(BaseRecLabelDecode): class TableLabelDecode(object): """ """ - def __init__(self, - character_dict_path, - **kwargs): - list_character, list_elem = self.load_char_elem_dict(character_dict_path) + def __init__(self, character_dict_path, **kwargs): + list_character, list_elem = self.load_char_elem_dict( + character_dict_path) list_character = self.add_special_char(list_character) list_elem = self.add_special_char(list_elem) self.dict_character = {} @@ -409,7 +409,8 @@ class TableLabelDecode(object): list_elem = [] with open(character_dict_path, "rb") as fin: lines = fin.readlines() - substr = lines[0].decode('utf-8').strip("\n").strip("\r\n").split("\t") + substr = lines[0].decode('utf-8').strip("\n").strip("\r\n").split( + "\t") character_num = 
int(substr[0]) elem_num = int(substr[1]) for cno in range(1, 1 + character_num): @@ -429,14 +430,14 @@ class TableLabelDecode(object): def __call__(self, preds): structure_probs = preds['structure_probs'] loc_preds = preds['loc_preds'] - if isinstance(structure_probs,paddle.Tensor): + if isinstance(structure_probs, paddle.Tensor): structure_probs = structure_probs.numpy() - if isinstance(loc_preds,paddle.Tensor): + if isinstance(loc_preds, paddle.Tensor): loc_preds = loc_preds.numpy() structure_idx = structure_probs.argmax(axis=2) structure_probs = structure_probs.max(axis=2) - structure_str, structure_pos, result_score_list, result_elem_idx_list = self.decode(structure_idx, - structure_probs, 'elem') + structure_str, structure_pos, result_score_list, result_elem_idx_list = self.decode( + structure_idx, structure_probs, 'elem') res_html_code_list = [] res_loc_list = [] batch_num = len(structure_str) @@ -451,8 +452,13 @@ class TableLabelDecode(object): res_loc = np.array(res_loc) res_html_code_list.append(res_html_code) res_loc_list.append(res_loc) - return {'res_html_code': res_html_code_list, 'res_loc': res_loc_list, 'res_score_list': result_score_list, - 'res_elem_idx_list': result_elem_idx_list,'structure_str_list':structure_str} + return { + 'res_html_code': res_html_code_list, + 'res_loc': res_loc_list, + 'res_score_list': result_score_list, + 'res_elem_idx_list': result_elem_idx_list, + 'structure_str_list': structure_str + } def decode(self, text_index, structure_probs, char_or_elem): """convert text-label into text-index. 
@@ -528,9 +534,9 @@ class SARLabelDecode(BaseRecLabelDecode): use_space_char=False, **kwargs): super(SARLabelDecode, self).__init__(character_dict_path, - character_type, use_space_char) - - self.rm_symbol = kwargs.get('rm_symbol', True) + character_type, use_space_char) + + self.rm_symbol = kwargs.get('rm_symbol', True) def add_special_char(self, dict_character): beg_end_str = "" @@ -549,7 +555,7 @@ class SARLabelDecode(BaseRecLabelDecode): """ convert text-index into text-label. """ result_list = [] ignored_tokens = self.get_ignored_tokens() - + batch_size = len(text_index) for batch_idx in range(batch_size): char_list = [] @@ -558,7 +564,7 @@ class SARLabelDecode(BaseRecLabelDecode): if text_index[batch_idx][idx] in ignored_tokens: continue if int(text_index[batch_idx][idx]) == int(self.end_idx): - if text_prob is None and idx ==0: + if text_prob is None and idx == 0: continue else: break @@ -586,7 +592,7 @@ class SARLabelDecode(BaseRecLabelDecode): preds = preds.numpy() preds_idx = preds.argmax(axis=2) preds_prob = preds.max(axis=2) - + text = self.decode(preds_idx, preds_prob, is_remove_duplicate=False) if label is None: From d611515803d71a847f7b262704fe7e92607b0ef6 Mon Sep 17 00:00:00 2001 From: andyjpaddle Date: Tue, 7 Sep 2021 06:13:56 +0000 Subject: [PATCH 12/15] fix code style --- ppocr/modeling/heads/rec_sar_head.py | 179 ++++++++++++++------------- 1 file changed, 92 insertions(+), 87 deletions(-) diff --git a/ppocr/modeling/heads/rec_sar_head.py b/ppocr/modeling/heads/rec_sar_head.py index ba0aa8eb..3c131c8b 100644 --- a/ppocr/modeling/heads/rec_sar_head.py +++ b/ppocr/modeling/heads/rec_sar_head.py @@ -19,6 +19,7 @@ class SAREncoder(nn.Layer): d_enc (int): Dim of encoder RNN layer. mask (bool): If True, mask padding in RNN sequence. 
""" + def __init__(self, enc_bi_rnn=False, enc_drop_rnn=0.1, @@ -51,33 +52,31 @@ class SAREncoder(nn.Layer): num_layers=2, time_major=False, dropout=enc_drop_rnn, - direction=direction - ) + direction=direction) if enc_gru: self.rnn_encoder = nn.GRU(**kwargs) else: self.rnn_encoder = nn.LSTM(**kwargs) - + # global feature transformation encoder_rnn_out_size = d_enc * (int(enc_bi_rnn) + 1) self.linear = nn.Linear(encoder_rnn_out_size, encoder_rnn_out_size) - + def forward(self, feat, img_metas=None): if img_metas is not None: assert len(img_metas[0]) == feat.shape[0] - + valid_ratios = None if img_metas is not None and self.mask: valid_ratios = img_metas[-1] - - h_feat = feat.shape[2] # bsz c h w + + h_feat = feat.shape[2] # bsz c h w feat_v = F.max_pool2d( - feat, kernel_size=(h_feat, 1), stride=1, padding=0 - ) - feat_v = feat_v.squeeze(2) # bsz * C * W - feat_v = paddle.transpose(feat_v, perm=[0, 2, 1]) # bsz * W * C - holistic_feat = self.rnn_encoder(feat_v)[0] # bsz * T * C - + feat, kernel_size=(h_feat, 1), stride=1, padding=0) + feat_v = feat_v.squeeze(2) # bsz * C * W + feat_v = paddle.transpose(feat_v, perm=[0, 2, 1]) # bsz * W * C + holistic_feat = self.rnn_encoder(feat_v)[0] # bsz * T * C + if valid_ratios is not None: valid_hf = [] T = holistic_feat.shape[1] @@ -86,11 +85,11 @@ class SAREncoder(nn.Layer): valid_hf.append(holistic_feat[i, valid_step, :]) valid_hf = paddle.stack(valid_hf, axis=0) else: - valid_hf = holistic_feat[:, -1, :] # bsz * C - holistic_feat = self.linear(valid_hf) # bsz * C - + valid_hf = holistic_feat[:, -1, :] # bsz * C + holistic_feat = self.linear(valid_hf) # bsz * C + return holistic_feat - + class BaseDecoder(nn.Layer): def __init__(self, **kwargs): @@ -102,7 +101,7 @@ class BaseDecoder(nn.Layer): def forward_test(self, feat, out_enc, img_metas): raise NotImplementedError - def forward(self, + def forward(self, feat, out_enc, label=None, @@ -135,20 +134,21 @@ class ParallelSARDecoder(BaseDecoder): attention with holistic 
feature and hidden state. """ - def __init__(self, - out_channels, # 90 + unknown + start + padding - enc_bi_rnn=False, - dec_bi_rnn=False, - dec_drop_rnn=0.0, - dec_gru=False, - d_model=512, - d_enc=512, - d_k=64, - pred_dropout=0.1, - max_text_length=30, - mask=True, - pred_concat=True, - **kwargs): + def __init__( + self, + out_channels, # 90 + unknown + start + padding + enc_bi_rnn=False, + dec_bi_rnn=False, + dec_drop_rnn=0.0, + dec_gru=False, + d_model=512, + d_enc=512, + d_k=64, + pred_dropout=0.1, + max_text_length=30, + mask=True, + pred_concat=True, + **kwargs): super().__init__() self.num_classes = out_channels @@ -165,7 +165,8 @@ class ParallelSARDecoder(BaseDecoder): # 2D attention layer self.conv1x1_1 = nn.Linear(decoder_rnn_out_size, d_k) - self.conv3x3_1 = nn.Conv2D(d_model, d_k, kernel_size=3, stride=1, padding=1) + self.conv3x3_1 = nn.Conv2D( + d_model, d_k, kernel_size=3, stride=1, padding=1) self.conv1x1_2 = nn.Linear(d_k, 1) # Decoder RNN layer @@ -180,8 +181,7 @@ class ParallelSARDecoder(BaseDecoder): num_layers=2, time_major=False, dropout=dec_drop_rnn, - direction=direction - ) + direction=direction) if dec_gru: self.rnn_decoder = nn.GRU(**kwargs) else: @@ -189,8 +189,10 @@ class ParallelSARDecoder(BaseDecoder): # Decoder input embedding self.embedding = nn.Embedding( - self.num_classes, encoder_rnn_out_size, padding_idx=self.padding_idx) - + self.num_classes, + encoder_rnn_out_size, + padding_idx=self.padding_idx) + # Prediction layer self.pred_dropout = nn.Dropout(pred_dropout) pred_num_classes = num_classes - 1 @@ -205,11 +207,11 @@ class ParallelSARDecoder(BaseDecoder): feat, holistic_feat, valid_ratios=None): - + y = self.rnn_decoder(decoder_input)[0] # y: bsz * (seq_len + 1) * hidden_size - - attn_query = self.conv1x1_1(y) # bsz * (seq_len + 1) * attn_size + + attn_query = self.conv1x1_1(y) # bsz * (seq_len + 1) * attn_size bsz, seq_len, attn_size = attn_query.shape attn_query = paddle.unsqueeze(attn_query, axis=[3, 4]) # (bsz, seq_len 
+ 1, attn_size, 1, 1) @@ -220,7 +222,7 @@ class ParallelSARDecoder(BaseDecoder): # bsz * 1 * attn_size * h * w attn_weight = paddle.tanh(paddle.add(attn_key, attn_query)) - + # bsz * (seq_len + 1) * attn_size * h * w attn_weight = paddle.transpose(attn_weight, perm=[0, 1, 3, 4, 2]) # bsz * (seq_len + 1) * h * w * attn_size @@ -237,25 +239,28 @@ class ParallelSARDecoder(BaseDecoder): attn_weight = paddle.reshape(attn_weight, [bsz, T, -1]) attn_weight = F.softmax(attn_weight, axis=-1) - + attn_weight = paddle.reshape(attn_weight, [bsz, T, h, w, c]) attn_weight = paddle.transpose(attn_weight, perm=[0, 1, 4, 2, 3]) # attn_weight: bsz * T * c * h * w # feat: bsz * c * h * w - attn_feat = paddle.sum(paddle.multiply(feat.unsqueeze(1), attn_weight), (3, 4), keepdim=False) + attn_feat = paddle.sum(paddle.multiply(feat.unsqueeze(1), attn_weight), + (3, 4), + keepdim=False) # bsz * (seq_len + 1) * C # Linear transformation if self.pred_concat: hf_c = holistic_feat.shape[-1] - holistic_feat = paddle.expand(holistic_feat, shape=[bsz, seq_len, hf_c]) + holistic_feat = paddle.expand( + holistic_feat, shape=[bsz, seq_len, hf_c]) y = self.prediction(paddle.concat((y, attn_feat, holistic_feat), 2)) else: y = self.prediction(attn_feat) # bsz * (seq_len + 1) * num_classes if self.train_mode: y = self.pred_dropout(y) - + return y def forward_train(self, feat, out_enc, label, img_metas): @@ -268,7 +273,7 @@ class ParallelSARDecoder(BaseDecoder): valid_ratios = None if img_metas is not None and self.mask: valid_ratios = img_metas[-1] - + label = label.cuda() lab_embedding = self.embedding(label) # bsz * seq_len * emb_dim @@ -277,11 +282,10 @@ class ParallelSARDecoder(BaseDecoder): in_dec = paddle.concat((out_enc, lab_embedding), axis=1) # bsz * (seq_len + 1) * C out_dec = self._2d_attention( - in_dec, feat, out_enc, valid_ratios=valid_ratios - ) + in_dec, feat, out_enc, valid_ratios=valid_ratios) # bsz * (seq_len + 1) * num_classes - - return out_dec[:, 1:, :] # bsz * seq_len * 
num_classes + + return out_dec[:, 1:, :] # bsz * seq_len * num_classes def forward_test(self, feat, out_enc, img_metas): if img_metas is not None: @@ -289,13 +293,12 @@ class ParallelSARDecoder(BaseDecoder): valid_ratios = None if img_metas is not None and self.mask: - valid_ratios = img_metas[-1] - + valid_ratios = img_metas[-1] + seq_len = self.max_seq_len bsz = feat.shape[0] - start_token = paddle.full((bsz, ), - fill_value=self.start_idx, - dtype='int64') + start_token = paddle.full( + (bsz, ), fill_value=self.start_idx, dtype='int64') # bsz start_token = self.embedding(start_token) # bsz * emb_dim @@ -311,68 +314,70 @@ class ParallelSARDecoder(BaseDecoder): outputs = [] for i in range(1, seq_len + 1): decoder_output = self._2d_attention( - decoder_input, feat, out_enc, valid_ratios=valid_ratios - ) - char_output = decoder_output[:, i, :] # bsz * num_classes + decoder_input, feat, out_enc, valid_ratios=valid_ratios) + char_output = decoder_output[:, i, :] # bsz * num_classes char_output = F.softmax(char_output, -1) outputs.append(char_output) max_idx = paddle.argmax(char_output, axis=1, keepdim=False) - char_embedding = self.embedding(max_idx) # bsz * emb_dim + char_embedding = self.embedding(max_idx) # bsz * emb_dim if i < seq_len: decoder_input[:, i + 1, :] = char_embedding - - outputs = paddle.stack(outputs, 1) # bsz * seq_len * num_classes + + outputs = paddle.stack(outputs, 1) # bsz * seq_len * num_classes return outputs class SARHead(nn.Layer): - def __init__(self, - out_channels, - enc_bi_rnn=False, - enc_drop_rnn=0.1, - enc_gru=False, - dec_bi_rnn=False, - dec_drop_rnn=0.0, - dec_gru=False, - d_k=512, - pred_dropout=0.1, - max_text_length=30, - pred_concat=True, - **kwargs): + def __init__(self, + out_channels, + enc_bi_rnn=False, + enc_drop_rnn=0.1, + enc_gru=False, + dec_bi_rnn=False, + dec_drop_rnn=0.0, + dec_gru=False, + d_k=512, + pred_dropout=0.1, + max_text_length=30, + pred_concat=True, + **kwargs): super(SARHead, self).__init__() # encoder 
module self.encoder = SAREncoder( - enc_bi_rnn=enc_bi_rnn, - enc_drop_rnn=enc_drop_rnn, - enc_gru=enc_gru) + enc_bi_rnn=enc_bi_rnn, enc_drop_rnn=enc_drop_rnn, enc_gru=enc_gru) # decoder module self.decoder = ParallelSARDecoder( out_channels=out_channels, - enc_bi_rnn=enc_bi_rnn, + enc_bi_rnn=enc_bi_rnn, dec_bi_rnn=dec_bi_rnn, dec_drop_rnn=dec_drop_rnn, dec_gru=dec_gru, d_k=d_k, pred_dropout=pred_dropout, max_text_length=max_text_length, - pred_concat=pred_concat) - + pred_concat=pred_concat) + def forward(self, feat, targets=None): ''' img_metas: [label, valid_ratio] ''' - holistic_feat = self.encoder(feat, targets) # bsz c - + holistic_feat = self.encoder(feat, targets) # bsz c + if self.training: - label = targets[0] # label + label = targets[0] # label label = paddle.to_tensor(label, dtype='int64') - final_out = self.decoder(feat, holistic_feat, label, img_metas=targets) + final_out = self.decoder( + feat, holistic_feat, label, img_metas=targets) if not self.training: - final_out = self.decoder(feat, holistic_feat, label=None, img_metas=targets, train_mode=False) + final_out = self.decoder( + feat, + holistic_feat, + label=None, + img_metas=targets, + train_mode=False) # (bsz, seq_len, num_classes) - + return final_out - From b68c895827641b3459069fe804546407fd6c8711 Mon Sep 17 00:00:00 2001 From: andyjpaddle Date: Tue, 7 Sep 2021 07:39:24 +0000 Subject: [PATCH 13/15] fix code style --- ppocr/modeling/heads/rec_sar_head.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ppocr/modeling/heads/rec_sar_head.py b/ppocr/modeling/heads/rec_sar_head.py index 3c131c8b..647f5820 100644 --- a/ppocr/modeling/heads/rec_sar_head.py +++ b/ppocr/modeling/heads/rec_sar_head.py @@ -195,7 +195,7 @@ class ParallelSARDecoder(BaseDecoder): # Prediction layer self.pred_dropout = nn.Dropout(pred_dropout) - pred_num_classes = num_classes - 1 + pred_num_classes = self.num_classes - 1 if pred_concat: fc_in_channel = decoder_rnn_out_size + d_model + d_enc else: From 
e10f71df100d9dd51d7f3e767da368f7d45e1c7b Mon Sep 17 00:00:00 2001 From: andyjpaddle Date: Tue, 7 Sep 2021 09:47:09 +0000 Subject: [PATCH 14/15] update sar_yml --- configs/rec/rec_r31_sar.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/configs/rec/rec_r31_sar.yml b/configs/rec/rec_r31_sar.yml index 7adb7136..053b1ae8 100644 --- a/configs/rec/rec_r31_sar.yml +++ b/configs/rec/rec_r31_sar.yml @@ -56,7 +56,6 @@ Metric: Train: dataset: name: SimpleDataSet - delimiter: ' ' label_file_list: ['./train_data/train_list.txt'] data_dir: ./train_data/ ratio_list: 1.0 From 38fb2ba3c99edcb6da322239f3f781e22cbec5a2 Mon Sep 17 00:00:00 2001 From: andyjpaddle Date: Tue, 7 Sep 2021 11:31:23 +0000 Subject: [PATCH 15/15] update rec postprocess --- ppocr/postprocess/rec_postprocess.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ppocr/postprocess/rec_postprocess.py b/ppocr/postprocess/rec_postprocess.py index 39ef16ec..6ff375eb 100644 --- a/ppocr/postprocess/rec_postprocess.py +++ b/ppocr/postprocess/rec_postprocess.py @@ -536,7 +536,7 @@ class SARLabelDecode(BaseRecLabelDecode): super(SARLabelDecode, self).__init__(character_dict_path, character_type, use_space_char) - self.rm_symbol = kwargs.get('rm_symbol', True) + self.rm_symbol = kwargs.get('rm_symbol', False) def add_special_char(self, dict_character): beg_end_str = ""