From 09d8cb6d98ed3a7c4636aa5c324bcaef1e4280c2 Mon Sep 17 00:00:00 2001 From: tink2123 Date: Fri, 14 Aug 2020 16:31:13 +0800 Subject: [PATCH 01/11] update for srn --- .../rec_r50fpn_vd_none_srn_pvam_test_all.yml | 48 + ppocr/data/rec/dataset_traversal.py | 55 +- ppocr/data/rec/img_tools.py | 81 ++ ppocr/modeling/architectures/rec_model.py | 39 +- ppocr/modeling/backbones/rec_resnet50_fpn.py | 172 +++ ppocr/modeling/backbones/rec_resnet_vd.py | 2 +- ppocr/modeling/heads/rec_srn_all_head.py | 218 ++++ .../modeling/heads/self_attention/__init__.py | 0 ppocr/modeling/heads/self_attention/model.py | 1065 +++++++++++++++++ ppocr/modeling/losses/rec_srn_loss.py | 58 + ppocr/utils/character.py | 45 + tools/eval_utils/eval_rec_utils.py | 57 +- tools/program.py | 37 +- tools/train.py | 4 +- 14 files changed, 1838 insertions(+), 43 deletions(-) create mode 100755 configs/rec/rec_r50fpn_vd_none_srn_pvam_test_all.yml create mode 100755 ppocr/modeling/backbones/rec_resnet50_fpn.py create mode 100755 ppocr/modeling/heads/rec_srn_all_head.py create mode 100644 ppocr/modeling/heads/self_attention/__init__.py create mode 100644 ppocr/modeling/heads/self_attention/model.py create mode 100755 ppocr/modeling/losses/rec_srn_loss.py diff --git a/configs/rec/rec_r50fpn_vd_none_srn_pvam_test_all.yml b/configs/rec/rec_r50fpn_vd_none_srn_pvam_test_all.yml new file mode 100755 index 00000000..933a7513 --- /dev/null +++ b/configs/rec/rec_r50fpn_vd_none_srn_pvam_test_all.yml @@ -0,0 +1,48 @@ +Global: + algorithm: SRN + use_gpu: true + epoch_num: 72 + log_smooth_window: 20 + print_batch_step: 10 + save_model_dir: output/rec_pvam_withrotate + save_epoch_step: 1 + eval_batch_step: 8000 + train_batch_size_per_card: 64 + test_batch_size_per_card: 1 + image_shape: [1, 64, 256] + max_text_length: 25 + character_type: en + loss_type: srn + num_heads: 8 + average_window: 0.15 + max_average_window: 15625 + min_average_window: 10000 + reader_yml: ./configs/rec/rec_srn_reader.yml + pretrain_weights: + checkpoints: + save_inference_dir: + +Architecture: + function: ppocr.modeling.architectures.rec_model,RecModel + +Backbone: + function: ppocr.modeling.backbones.rec_resnet50_fpn,ResNet + layers: 50 + +Head: + function: ppocr.modeling.heads.rec_srn_all_head,SRNPredict + encoder_type: rnn + num_encoder_TUs: 2 + num_decoder_TUs: 4 + hidden_dims: 512 + SeqRNN: + hidden_size: 256 + +Loss: + function: ppocr.modeling.losses.rec_srn_loss,SRNLoss + +Optimizer: + function: ppocr.optimizer,AdamDecay + base_lr: 0.0001 + beta1: 0.9 + beta2: 0.999 diff --git a/ppocr/data/rec/dataset_traversal.py b/ppocr/data/rec/dataset_traversal.py index ec3e9d86..7135fca5 100755 --- a/ppocr/data/rec/dataset_traversal.py +++ b/ppocr/data/rec/dataset_traversal.py @@ -26,7 +26,7 @@ from ppocr.utils.utility import initial_logger from ppocr.utils.utility import get_image_file_list logger = initial_logger() -from .img_tools import process_image, get_img_data +from .img_tools import process_image, process_image_srn, get_img_data class LMDBReader(object): @@ -40,6 +40,7 @@ class LMDBReader(object): self.image_shape = params['image_shape'] self.loss_type = params['loss_type'] self.max_text_length = params['max_text_length'] + self.num_heads = params['num_heads'] self.mode = params['mode'] self.drop_last = False self.use_tps = False @@ -117,14 +118,36 @@ class LMDBReader(object): image_file_list = get_image_file_list(self.infer_img) for single_img in image_file_list: img = cv2.imread(single_img) - if img.shape[-1] == 1 or len(list(img.shape)) == 2: + if img.shape[-1]==1 
or len(list(img.shape))==2: + img = cv2.cvtColor(img, cv2.COLOR_GRAY2BGR) + if self.loss_type == 'srn': + norm_img = process_image_srn( + img=img, + image_shape=self.image_shape, + num_heads=self.num_heads, + max_text_length=self.max_text_length + ) + else: + norm_img = process_image( + img=img, + image_shape=self.image_shape, + char_ops=self.char_ops, + tps=self.use_tps, + infer_mode=True) + yield norm_img + elif self.mode == 'test': + image_file_list = get_image_file_list(self.infer_img) + for single_img in image_file_list: + img = cv2.imread(single_img) + if img.shape[-1]==1 or len(list(img.shape))==2: img = cv2.cvtColor(img, cv2.COLOR_GRAY2BGR) norm_img = process_image( img=img, image_shape=self.image_shape, char_ops=self.char_ops, tps=self.use_tps, - infer_mode=True) + infer_mode=True + ) yield norm_img else: lmdb_sets = self.load_hierarchical_lmdb_dataset() @@ -144,14 +167,16 @@ class LMDBReader(object): if sample_info is None: continue img, label = sample_info - outs = process_image( - img=img, - image_shape=self.image_shape, - label=label, - char_ops=self.char_ops, - loss_type=self.loss_type, - max_text_length=self.max_text_length, - distort=self.use_distort) + outs = [] + if self.loss_type == "srn": + outs = process_image_srn(img, self.image_shape, self.num_heads, + self.max_text_length, label, + self.char_ops, self.loss_type) + + else: + outs = process_image(img, self.image_shape, label, + self.char_ops, self.loss_type, + self.max_text_length) if outs is None: continue yield outs @@ -159,7 +184,6 @@ class LMDBReader(object): if finish_read_num == len(lmdb_sets): break self.close_lmdb_dataset(lmdb_sets) - def batch_iter_reader(): batch_outs = [] for outs in sample_iter_reader(): @@ -167,9 +191,8 @@ class LMDBReader(object): if len(batch_outs) == self.batch_size: yield batch_outs batch_outs = [] - if not self.drop_last: - if len(batch_outs) != 0: - yield batch_outs + if len(batch_outs) != 0: + yield batch_outs if self.infer_img is None: return batch_iter_reader @@ -288,4 +311,4 @@ class SimpleReader(object): if self.infer_img is None: return batch_iter_reader - return sample_iter_reader + return sample_iter_reader \ No newline at end of file diff --git a/ppocr/data/rec/img_tools.py b/ppocr/data/rec/img_tools.py index 0835603b..527e0266 100755 --- a/ppocr/data/rec/img_tools.py +++ b/ppocr/data/rec/img_tools.py @@ -381,3 +381,84 @@ def process_image(img, assert False, "Unsupport loss_type %s in process_image"\ % loss_type return (norm_img) + +def resize_norm_img_srn(img, image_shape): + imgC, imgH, imgW = image_shape + + img_black = np.zeros((imgH, imgW)) + im_hei = img.shape[0] + im_wid = img.shape[1] + + if im_wid <= im_hei * 1: + img_new = cv2.resize(img, (imgH * 1, imgH)) + elif im_wid <= im_hei * 2: + img_new = cv2.resize(img, (imgH * 2, imgH)) + elif im_wid <= im_hei * 3: + img_new = cv2.resize(img, (imgH * 3, imgH)) + else: + img_new = cv2.resize(img, (imgW, imgH)) + + img_np = np.asarray(img_new) + img_np = cv2.cvtColor(img_np, cv2.COLOR_BGR2GRAY) + img_black[:, 0:img_np.shape[1]] = img_np + img_black = img_black[:, :, np.newaxis] + + row, col, c = img_black.shape + c = 1 + + return np.reshape(img_black, (c, row, col)).astype(np.float32) + +def srn_other_inputs(image_shape, + num_heads, + max_text_length): + + imgC, imgH, imgW = image_shape + feature_dim = int((imgH / 8) * (imgW / 8)) + + encoder_word_pos = np.array(range(0, feature_dim)).reshape((feature_dim, 1)).astype('int64') + gsrm_word_pos = np.array(range(0, max_text_length)).reshape((max_text_length, 
1)).astype('int64') + + lbl_weight = np.array([37] * max_text_length).reshape((-1,1)).astype('int64') + + gsrm_attn_bias_data = np.ones((1, max_text_length, max_text_length)) + gsrm_slf_attn_bias1 = np.triu(gsrm_attn_bias_data, 1).reshape([-1, 1, max_text_length, max_text_length]) + gsrm_slf_attn_bias1 = np.tile(gsrm_slf_attn_bias1, [1, num_heads, 1, 1]) * [-1e9] + + gsrm_slf_attn_bias2 = np.tril(gsrm_attn_bias_data, -1).reshape([-1, 1, max_text_length, max_text_length]) + gsrm_slf_attn_bias2 = np.tile(gsrm_slf_attn_bias2, [1, num_heads, 1, 1]) * [-1e9] + + encoder_word_pos = encoder_word_pos[np.newaxis, :] + gsrm_word_pos = gsrm_word_pos[np.newaxis, :] + + return [lbl_weight, encoder_word_pos, gsrm_word_pos, gsrm_slf_attn_bias1, gsrm_slf_attn_bias2] + +def process_image_srn(img, + image_shape, + num_heads, + max_text_length, + label=None, + char_ops=None, + loss_type=None): + norm_img = resize_norm_img_srn(img, image_shape) + norm_img = norm_img[np.newaxis, :] + [lbl_weight, encoder_word_pos, gsrm_word_pos, gsrm_slf_attn_bias1, gsrm_slf_attn_bias2] = \ + srn_other_inputs(image_shape, num_heads, max_text_length) + + if label is not None: + char_num = char_ops.get_char_num() + text = char_ops.encode(label) + if len(text) == 0 or len(text) > max_text_length: + return None + else: + if loss_type == "srn": + text_padded = [37] * max_text_length + for i in range(len(text)): + text_padded[i] = text[i] + lbl_weight[i] = [1.0] + text_padded = np.array(text_padded) + text = text_padded.reshape(-1, 1) + return (norm_img, text,encoder_word_pos, gsrm_word_pos, gsrm_slf_attn_bias1, gsrm_slf_attn_bias2,lbl_weight) + else: + assert False, "Unsupport loss_type %s in process_image"\ + % loss_type + return (norm_img, encoder_word_pos, gsrm_word_pos, gsrm_slf_attn_bias1, gsrm_slf_attn_bias2) diff --git a/ppocr/modeling/architectures/rec_model.py b/ppocr/modeling/architectures/rec_model.py index e80a50ab..a030f362 100755 --- a/ppocr/modeling/architectures/rec_model.py +++ b/ppocr/modeling/architectures/rec_model.py @@ -58,6 +58,7 @@ class RecModel(object): self.loss_type = global_params['loss_type'] self.image_shape = global_params['image_shape'] self.max_text_length = global_params['max_text_length'] + self.num_heads = global_params["num_heads"] def create_feed(self, mode): image_shape = deepcopy(self.image_shape) @@ -77,6 +78,18 @@ class RecModel(object): lod_level=1) feed_list = [image, label_in, label_out] labels = {'label_in': label_in, 'label_out': label_out} + elif self.loss_type == "srn": + encoder_word_pos = fluid.data(name="encoder_word_pos", shape=[-1, int((image_shape[-2] / 8) * (image_shape[-1] / 8)), 1], dtype="int64") + gsrm_word_pos = fluid.data(name="gsrm_word_pos", shape=[-1, self.max_text_length, 1], dtype="int64") + gsrm_slf_attn_bias1 = fluid.data(name="gsrm_slf_attn_bias1", shape=[-1, self.num_heads, self.max_text_length, self.max_text_length]) + gsrm_slf_attn_bias2 = fluid.data(name="gsrm_slf_attn_bias2", shape=[-1, self.num_heads, self.max_text_length, self.max_text_length]) + lbl_weight = fluid.layers.data(name="lbl_weight", shape=[-1, 1], dtype='int64') + label = fluid.data( + name='label', shape=[-1, 1], dtype='int32', lod_level=1) + feed_list = [image, label, encoder_word_pos, gsrm_word_pos, gsrm_slf_attn_bias1, gsrm_slf_attn_bias2, lbl_weight] + labels = {'label': label, 'encoder_word_pos': encoder_word_pos, + 'gsrm_word_pos': gsrm_word_pos, 'gsrm_slf_attn_bias1': gsrm_slf_attn_bias1, + 'gsrm_slf_attn_bias2': gsrm_slf_attn_bias2,'lbl_weight':lbl_weight} else: label = fluid.data( 
name='label', shape=[None, 1], dtype='int32', lod_level=1) @@ -88,6 +101,8 @@ class RecModel(object): use_double_buffer=True, iterable=False) else: + labels = None + loader = None if self.char_type == "ch" and self.infer_img: image_shape[-1] = -1 if self.tps != None: @@ -97,9 +112,15 @@ class RecModel(object): "We set img_shape to be the same , it may affect the inference effect" ) image_shape = deepcopy(self.image_shape) - image = fluid.data(name='image', shape=image_shape, dtype='float32') - labels = None - loader = None + image = fluid.data(name='image', shape=image_shape, dtype='float32') + if self.loss_type == "srn": + encoder_word_pos = fluid.data(name="encoder_word_pos", shape=[-1, int((image_shape[-2] / 8) * (image_shape[-1] / 8)), 1], dtype="int64") + gsrm_word_pos = fluid.data(name="gsrm_word_pos", shape=[-1, self.max_text_length, 1], dtype="int64") + gsrm_slf_attn_bias1 = fluid.data(name="gsrm_slf_attn_bias1", shape=[-1, self.num_heads, self.max_text_length, self.max_text_length]) + gsrm_slf_attn_bias2 = fluid.data(name="gsrm_slf_attn_bias2", shape=[-1, self.num_heads, self.max_text_length, self.max_text_length]) + feed_list = [image, encoder_word_pos, gsrm_word_pos, gsrm_slf_attn_bias1, gsrm_slf_attn_bias2] + labels = {'encoder_word_pos': encoder_word_pos, 'gsrm_word_pos': gsrm_word_pos, + 'gsrm_slf_attn_bias1': gsrm_slf_attn_bias1, 'gsrm_slf_attn_bias2': gsrm_slf_attn_bias2} return image, labels, loader def __call__(self, mode): @@ -117,9 +138,15 @@ class RecModel(object): label = labels['label_out'] else: label = labels['label'] - outputs = {'total_loss':loss, 'decoded_out':\ - decoded_out, 'label':label} + if self.loss_type == 'srn': + total_loss, img_loss, word_loss = self.loss(predicts, labels) + outputs = {'total_loss':total_loss, 'img_loss':img_loss, 'word_loss':word_loss, + 'decoded_out':decoded_out, 'label':label} + else: + outputs = {'total_loss':loss, 'decoded_out':\ + decoded_out, 'label':label} return loader, outputs + elif mode == "export": predict = predicts['predict'] if self.loss_type == "ctc": @@ -129,4 +156,4 @@ class RecModel(object): predict = predicts['predict'] if self.loss_type == "ctc": predict = fluid.layers.softmax(predict) - return loader, {'decoded_out': decoded_out, 'predicts': predict} + return loader, {'decoded_out': decoded_out, 'predicts': predict} \ No newline at end of file diff --git a/ppocr/modeling/backbones/rec_resnet50_fpn.py b/ppocr/modeling/backbones/rec_resnet50_fpn.py new file mode 100755 index 00000000..f6d72377 --- /dev/null +++ b/ppocr/modeling/backbones/rec_resnet50_fpn.py @@ -0,0 +1,172 @@ +#copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. 
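+
+# This backbone is a ResNet-50 variant with an FPN-style fusion on top: the stem
+# conv uses stride 2 and the four stages use strides 2/2/1/1, so the final
+# feature map has an overall stride of 8. The outputs of the last three stages
+# are then fused top-down (a transposed conv upsamples when spatial sizes
+# differ, followed by concat and 1x1/3x3 convs) and projected to 512 channels,
+# which matches the (H/8)*(W/8) position encodings assumed by the SRN head in
+# rec_model.py and img_tools.py.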
+ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import math + +import paddle +import paddle.fluid as fluid +from paddle.fluid.param_attr import ParamAttr + + +__all__ = ["ResNet", "ResNet18", "ResNet34", "ResNet50", "ResNet101", "ResNet152"] + +Trainable = True +w_nolr = fluid.ParamAttr( + trainable = Trainable) +train_parameters = { + "input_size": [3, 224, 224], + "input_mean": [0.485, 0.456, 0.406], + "input_std": [0.229, 0.224, 0.225], + "learning_strategy": { + "name": "piecewise_decay", + "batch_size": 256, + "epochs": [30, 60, 90], + "steps": [0.1, 0.01, 0.001, 0.0001] + } +} + +class ResNet(): + def __init__(self, params): + self.layers = params['layers'] + self.params = train_parameters + + + def __call__(self, input): + layers = self.layers + supported_layers = [18, 34, 50, 101, 152] + assert layers in supported_layers, \ + "supported layers are {} but input layer is {}".format(supported_layers, layers) + + if layers == 18: + depth = [2, 2, 2, 2] + elif layers == 34 or layers == 50: + depth = [3, 4, 6, 3] + elif layers == 101: + depth = [3, 4, 23, 3] + elif layers == 152: + depth = [3, 8, 36, 3] + stride_list = [(2,2),(2,2),(1,1),(1,1)] + num_filters = [64, 128, 256, 512] + + conv = self.conv_bn_layer( + input=input, num_filters=64, filter_size=7, stride=2, act='relu', name="conv1") + F = [] + if layers >= 50: + for block in range(len(depth)): + for i in range(depth[block]): + if layers in [101, 152] and block == 2: + if i == 0: + conv_name = "res" + str(block + 2) + "a" + else: + conv_name = "res" + str(block + 2) + "b" + str(i) + else: + conv_name = "res" + str(block + 2) + chr(97 + i) + conv = self.bottleneck_block( + input=conv, + num_filters=num_filters[block], + stride=stride_list[block] if i == 0 else 1, name=conv_name) + F.append(conv) + + base = F[-1] + for i in [-2, -3]: + b, c, w, h = F[i].shape + if (w,h) == base.shape[2:]: + base = base + else: + base = fluid.layers.conv2d_transpose( input=base, num_filters=c,filter_size=4, stride=2, + padding=1,act=None, + param_attr=w_nolr, + bias_attr=w_nolr) + base = fluid.layers.batch_norm(base, act = "relu", param_attr=w_nolr, bias_attr=w_nolr) + base = fluid.layers.concat([base, F[i]], axis=1) + base = fluid.layers.conv2d(base, num_filters=c, filter_size=1, param_attr=w_nolr, bias_attr=w_nolr) + base = fluid.layers.conv2d(base, num_filters=c, filter_size=3,padding = 1, param_attr=w_nolr, bias_attr=w_nolr) + base = fluid.layers.batch_norm(base, act = "relu", param_attr=w_nolr, bias_attr=w_nolr) + + base = fluid.layers.conv2d(base, num_filters=512, filter_size=1,bias_attr=w_nolr,param_attr=w_nolr) + + return base + + def conv_bn_layer(self, + input, + num_filters, + filter_size, + stride=1, + groups=1, + act=None, + name=None): + conv = fluid.layers.conv2d( + input=input, + num_filters=num_filters, + filter_size= 2 if stride==(1,1) else filter_size, + dilation = 2 if stride==(1,1) else 1, + stride=stride, + padding=(filter_size - 1) // 2, + groups=groups, + act=None, + param_attr=ParamAttr(name=name + "_weights",trainable = Trainable), + bias_attr=False, + name=name + '.conv2d.output.1') + + if name == "conv1": + bn_name = "bn_" + name + else: + bn_name = "bn" + name[3:] + return fluid.layers.batch_norm(input=conv, + act=act, + name=bn_name + '.output.1', + param_attr=ParamAttr(name=bn_name + '_scale',trainable = Trainable), + bias_attr=ParamAttr(bn_name + '_offset',trainable = Trainable), + moving_mean_name=bn_name + '_mean', + moving_variance_name=bn_name + 
'_variance', ) + + def shortcut(self, input, ch_out, stride, is_first, name): + ch_in = input.shape[1] + if ch_in != ch_out or stride != 1 or is_first == True: + if stride == (1,1): + return self.conv_bn_layer(input, ch_out, 1, 1, name=name) + else: #stride == (2,2) + return self.conv_bn_layer(input, ch_out, 1, stride, name=name) + + else: + return input + + def bottleneck_block(self, input, num_filters, stride, name): + conv0 = self.conv_bn_layer( + input=input, num_filters=num_filters, filter_size=1, act='relu', name=name + "_branch2a") + conv1 = self.conv_bn_layer( + input=conv0, + num_filters=num_filters, + filter_size=3, + stride=stride, + act='relu', + name=name + "_branch2b") + conv2 = self.conv_bn_layer( + input=conv1, num_filters=num_filters * 4, filter_size=1, act=None, name=name + "_branch2c") + + short = self.shortcut(input, num_filters * 4, stride, is_first=False, name=name + "_branch1") + + return fluid.layers.elementwise_add(x=short, y=conv2, act='relu', name=name + ".add.output.5") + + def basic_block(self, input, num_filters, stride, is_first, name): + conv0 = self.conv_bn_layer(input=input, num_filters=num_filters, filter_size=3, act='relu', stride=stride, + name=name + "_branch2a") + conv1 = self.conv_bn_layer(input=conv0, num_filters=num_filters, filter_size=3, act=None, + name=name + "_branch2b") + short = self.shortcut(input, num_filters, stride, is_first, name=name + "_branch1") + return fluid.layers.elementwise_add(x=short, y=conv1, act='relu') diff --git a/ppocr/modeling/backbones/rec_resnet_vd.py b/ppocr/modeling/backbones/rec_resnet_vd.py index bc58c8ac..2c7cd4c7 100755 --- a/ppocr/modeling/backbones/rec_resnet_vd.py +++ b/ppocr/modeling/backbones/rec_resnet_vd.py @@ -32,7 +32,7 @@ class ResNet(): def __init__(self, params): self.layers = params['layers'] self.is_3x3 = True - supported_layers = [18, 34, 50, 101, 152, 200] + supported_layers = [18, 34, 50, 101, 152] assert self.layers in supported_layers, \ "supported layers are {} but input layer is {}".format(supported_layers, self.layers) diff --git a/ppocr/modeling/heads/rec_srn_all_head.py b/ppocr/modeling/heads/rec_srn_all_head.py new file mode 100755 index 00000000..bf1f4a44 --- /dev/null +++ b/ppocr/modeling/heads/rec_srn_all_head.py @@ -0,0 +1,218 @@ +#copyright (c) 2019 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. 
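+
+# SRN prediction head, following the three-stage structure of the SRN paper:
+#   - pvam(): Parallel Visual Attention Module; runs a transformer encoder over
+#     the backbone feature map and attends to it with learned reading-order
+#     position embeddings, producing one visual feature per decoding position
+#     for all max_text_length positions in parallel.
+#   - gsrm(): Global Semantic Reasoning Module; argmax-decodes the PVAM output
+#     into character ids and reasons over them with forward/backward
+#     transformer units to produce semantic features.
+#   - vsfd(): Visual-Semantic Fusion Decoder; gates the visual and semantic
+#     features together and predicts the final character distribution.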
+ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import math + +import paddle +import paddle.fluid as fluid +from paddle.fluid.param_attr import ParamAttr +#from .rec_seq_encoder import SequenceEncoder +#from ..common_functions import get_para_bias_attr +import numpy as np +from .self_attention.model import wrap_encoder +from .self_attention.model import wrap_encoder_forFeature +gradient_clip = 10 + + + +class SRNPredict(object): + def __init__(self, params): + super(SRNPredict, self).__init__() + self.char_num = params['char_num'] + self.max_length = params['max_text_length'] + + self.num_heads = params['num_heads'] + self.num_encoder_TUs = params['num_encoder_TUs'] + self.num_decoder_TUs = params['num_decoder_TUs'] + self.hidden_dims = params['hidden_dims'] + + + def pvam(self, inputs, others): + + b, c, h, w = inputs.shape + conv_features = fluid.layers.reshape(x=inputs, shape=[-1, c, h * w]) + conv_features = fluid.layers.transpose(x=conv_features, perm=[0, 2, 1]) + + #===== Transformer encoder ===== + b, t, c = conv_features.shape + encoder_word_pos = others["encoder_word_pos"] + gsrm_word_pos = others["gsrm_word_pos"] + + + enc_inputs = [conv_features, encoder_word_pos, None] + word_features = wrap_encoder_forFeature(src_vocab_size=-1, + max_length=t, + n_layer=self.num_encoder_TUs, + n_head=self.num_heads, + d_key= int(self.hidden_dims / self.num_heads), + d_value= int(self.hidden_dims / self.num_heads), + d_model=self.hidden_dims, + d_inner_hid=self.hidden_dims, + prepostprocess_dropout=0.1, + attention_dropout=0.1, + relu_dropout=0.1, + preprocess_cmd="n", + postprocess_cmd="da", + weight_sharing=True, + enc_inputs=enc_inputs, + ) + fluid.clip.set_gradient_clip(fluid.clip.GradientClipByValue(gradient_clip)) + + #===== Parallel Visual Attention Module ===== + b, t, c = word_features.shape + + word_features = fluid.layers.fc(word_features, c, num_flatten_dims=2) + word_features_ = fluid.layers.reshape(word_features, [-1, 1, t, c]) + word_features_ = fluid.layers.expand(word_features_, [1, self.max_length, 1, 1]) + word_pos_feature = fluid.layers.embedding(gsrm_word_pos, [self.max_length, c]) + word_pos_ = fluid.layers.reshape(word_pos_feature, [-1, self.max_length, 1, c]) + word_pos_ = fluid.layers.expand(word_pos_, [1, 1, t, 1]) + temp = fluid.layers.elementwise_add(word_features_, word_pos_, act='tanh') + + attention_weight = fluid.layers.fc(input=temp, size=1, num_flatten_dims=3, bias_attr=False) + attention_weight = fluid.layers.reshape(x=attention_weight, shape=[-1, self.max_length, t]) + attention_weight = fluid.layers.softmax(input=attention_weight, axis=-1) + + pvam_features = fluid.layers.matmul(attention_weight, word_features)#[b, max_length, c] + + return pvam_features + + def gsrm(self, pvam_features, others): + + #===== GSRM Visual-to-semantic embedding block ===== + b, t, c = pvam_features.shape + word_out = fluid.layers.fc(input=fluid.layers.reshape(pvam_features, [-1, c]), + size=self.char_num, + act="softmax") + #word_out.stop_gradient = True + word_ids = fluid.layers.argmax(word_out, axis=1) + word_ids.stop_gradient = True + word_ids = fluid.layers.reshape(x=word_ids, shape=[-1, t, 1]) + + #===== GSRM Semantic reasoning block ===== + """ + This module is achieved through bi-transformers, + ngram_feature1 is the froward one, ngram_fetaure2 is the backward one + """ + pad_idx = self.char_num + gsrm_word_pos = others["gsrm_word_pos"] + gsrm_slf_attn_bias1 = others["gsrm_slf_attn_bias1"] + 
gsrm_slf_attn_bias2 = others["gsrm_slf_attn_bias2"] + + def prepare_bi(word_ids): + """ + prepare bi for gsrm + word1 for forward; word2 for backward + """ + word1 = fluid.layers.cast(word_ids, "float32") + word1 = fluid.layers.pad(word1, [0, 0, 1, 0, 0, 0], pad_value=1.0 * pad_idx) + word1 = fluid.layers.cast(word1, "int64") + word1 = word1[:, :-1, :] + word2 = word_ids + return word1, word2 + + word1, word2 = prepare_bi(word_ids) + word1.stop_gradient = True + word2.stop_gradient = True + enc_inputs_1 = [word1, gsrm_word_pos, gsrm_slf_attn_bias1] + enc_inputs_2 = [word2, gsrm_word_pos, gsrm_slf_attn_bias2] + + gsrm_feature1 = wrap_encoder(src_vocab_size=self.char_num + 1, + max_length=self.max_length, + n_layer=self.num_decoder_TUs, + n_head=self.num_heads, + d_key=int(self.hidden_dims / self.num_heads), + d_value=int(self.hidden_dims / self.num_heads), + d_model=self.hidden_dims, + d_inner_hid=self.hidden_dims, + prepostprocess_dropout=0.1, + attention_dropout=0.1, + relu_dropout=0.1, + preprocess_cmd="n", + postprocess_cmd="da", + weight_sharing=True, + enc_inputs=enc_inputs_1, + ) + gsrm_feature2 = wrap_encoder(src_vocab_size=self.char_num + 1, + max_length=self.max_length, + n_layer=self.num_decoder_TUs, + n_head=self.num_heads, + d_key=int(self.hidden_dims / self.num_heads), + d_value=int(self.hidden_dims / self.num_heads), + d_model=self.hidden_dims, + d_inner_hid=self.hidden_dims, + prepostprocess_dropout=0.1, + attention_dropout=0.1, + relu_dropout=0.1, + preprocess_cmd="n", + postprocess_cmd="da", + weight_sharing=True, + enc_inputs=enc_inputs_2, + ) + gsrm_feature2 = fluid.layers.pad(gsrm_feature2, [0, 0, 0, 1, 0, 0], pad_value=0.) + gsrm_feature2 = gsrm_feature2[:, 1:, ] + gsrm_features = gsrm_feature1 + gsrm_feature2 + + b, t, c = gsrm_features.shape + + gsrm_out = fluid.layers.matmul( + x=gsrm_features, + y=fluid.default_main_program().global_block().var("src_word_emb_table"), + transpose_y=True) + b,t,c = gsrm_out.shape + gsrm_out = fluid.layers.softmax(input=fluid.layers.reshape(gsrm_out, [-1, c])) + + return gsrm_features, word_out, gsrm_out + + def vsfd(self, pvam_features, gsrm_features): + + #===== Visual-Semantic Fusion Decoder Module ===== + b, t, c1 = pvam_features.shape + b, t, c2 = gsrm_features.shape + combine_features_ = fluid.layers.concat([pvam_features, gsrm_features], axis=2) + img_comb_features_ = fluid.layers.reshape(x=combine_features_, shape=[-1, c1 + c2]) + img_comb_features_map = fluid.layers.fc(input=img_comb_features_, size=c1, act="sigmoid") + img_comb_features_map = fluid.layers.reshape(x=img_comb_features_map, shape=[-1, t, c1]) + combine_features = img_comb_features_map * pvam_features + (1.0 - img_comb_features_map) * gsrm_features + img_comb_features = fluid.layers.reshape(x=combine_features, shape=[-1, c1]) + + fc_out = fluid.layers.fc(input=img_comb_features, + size=self.char_num, + act="softmax") + return fc_out + + + def __call__(self, inputs, others, mode=None): + + pvam_features = self.pvam(inputs, others) + gsrm_features, word_out, gsrm_out = self.gsrm(pvam_features, others) + final_out = self.vsfd(pvam_features, gsrm_features) + + _, decoded_out = fluid.layers.topk(input=final_out, k=1) + predicts = {'predict': final_out, 'decoded_out': decoded_out, + 'word_out': word_out, 'gsrm_out': gsrm_out} + + return predicts + + + + + + + + diff --git a/ppocr/modeling/heads/self_attention/__init__.py b/ppocr/modeling/heads/self_attention/__init__.py new file mode 100644 index 00000000..e69de29b diff --git 
a/ppocr/modeling/heads/self_attention/model.py b/ppocr/modeling/heads/self_attention/model.py new file mode 100644 index 00000000..d4aecd5f --- /dev/null +++ b/ppocr/modeling/heads/self_attention/model.py @@ -0,0 +1,1065 @@ +from functools import partial +import numpy as np + +import paddle.fluid as fluid +import paddle.fluid.layers as layers + +from .desc import * +from .config import ModelHyperParams,TrainTaskConfig + +def wrap_layer_with_block(layer, block_idx): + """ + Make layer define support indicating block, by which we can add layers + to other blocks within current block. This will make it easy to define + cache among while loop. + """ + + class BlockGuard(object): + """ + BlockGuard class. + + BlockGuard class is used to switch to the given block in a program by + using the Python `with` keyword. + """ + + def __init__(self, block_idx=None, main_program=None): + self.main_program = fluid.default_main_program( + ) if main_program is None else main_program + self.old_block_idx = self.main_program.current_block().idx + self.new_block_idx = block_idx + + def __enter__(self): + self.main_program.current_block_idx = self.new_block_idx + + def __exit__(self, exc_type, exc_val, exc_tb): + self.main_program.current_block_idx = self.old_block_idx + if exc_type is not None: + return False # re-raise exception + return True + + def layer_wrapper(*args, **kwargs): + with BlockGuard(block_idx): + return layer(*args, **kwargs) + + return layer_wrapper + + +def position_encoding_init(n_position, d_pos_vec): + """ + Generate the initial values for the sinusoid position encoding table. + """ + channels = d_pos_vec + position = np.arange(n_position) + num_timescales = channels // 2 + log_timescale_increment = (np.log(float(1e4) / float(1)) / + (num_timescales - 1)) + inv_timescales = np.exp(np.arange( + num_timescales)) * -log_timescale_increment + scaled_time = np.expand_dims(position, 1) * np.expand_dims(inv_timescales, + 0) + signal = np.concatenate([np.sin(scaled_time), np.cos(scaled_time)], axis=1) + signal = np.pad(signal, [[0, 0], [0, np.mod(channels, 2)]], 'constant') + position_enc = signal + return position_enc.astype("float32") + + +def multi_head_attention(queries, + keys, + values, + attn_bias, + d_key, + d_value, + d_model, + n_head=1, + dropout_rate=0., + cache=None, + gather_idx=None, + static_kv=False): + """ + Multi-Head Attention. Note that attn_bias is added to the logit before + computing softmax activiation to mask certain selected positions so that + they will not considered in attention weights. + """ + keys = queries if keys is None else keys + values = keys if values is None else values + + if not (len(queries.shape) == len(keys.shape) == len(values.shape) == 3): + raise ValueError( + "Inputs: quries, keys and values should all be 3-D tensors.") + + def __compute_qkv(queries, keys, values, n_head, d_key, d_value): + """ + Add linear projection to queries, keys, and values. + """ + q = layers.fc(input=queries, + size=d_key * n_head, + bias_attr=False, + num_flatten_dims=2) + # For encoder-decoder attention in inference, insert the ops and vars + # into global block to use as cache among beam search. 
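+        # wrap_layer_with_block re-creates the key/value projection fc layers in
+        # the parent (global) block, so that with static_kv caching the projected
+        # keys/values are computed once and reused at every decoding step.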
+ fc_layer = wrap_layer_with_block( + layers.fc, fluid.default_main_program().current_block() + .parent_idx) if cache is not None and static_kv else layers.fc + k = fc_layer( + input=keys, + size=d_key * n_head, + bias_attr=False, + num_flatten_dims=2) + v = fc_layer( + input=values, + size=d_value * n_head, + bias_attr=False, + num_flatten_dims=2) + return q, k, v + + def __split_heads_qkv(queries, keys, values, n_head, d_key, d_value): + """ + Reshape input tensors at the last dimension to split multi-heads + and then transpose. Specifically, transform the input tensor with shape + [bs, max_sequence_length, n_head * hidden_dim] to the output tensor + with shape [bs, n_head, max_sequence_length, hidden_dim]. + """ + # The value 0 in shape attr means copying the corresponding dimension + # size of the input as the output dimension size. + reshaped_q = layers.reshape( + x=queries, shape=[0, 0, n_head, d_key], inplace=True) + # permuate the dimensions into: + # [batch_size, n_head, max_sequence_len, hidden_size_per_head] + q = layers.transpose(x=reshaped_q, perm=[0, 2, 1, 3]) + # For encoder-decoder attention in inference, insert the ops and vars + # into global block to use as cache among beam search. + reshape_layer = wrap_layer_with_block( + layers.reshape, + fluid.default_main_program().current_block() + .parent_idx) if cache is not None and static_kv else layers.reshape + transpose_layer = wrap_layer_with_block( + layers.transpose, + fluid.default_main_program().current_block(). + parent_idx) if cache is not None and static_kv else layers.transpose + reshaped_k = reshape_layer( + x=keys, shape=[0, 0, n_head, d_key], inplace=True) + k = transpose_layer(x=reshaped_k, perm=[0, 2, 1, 3]) + reshaped_v = reshape_layer( + x=values, shape=[0, 0, n_head, d_value], inplace=True) + v = transpose_layer(x=reshaped_v, perm=[0, 2, 1, 3]) + + if cache is not None: # only for faster inference + if static_kv: # For encoder-decoder attention in inference + cache_k, cache_v = cache["static_k"], cache["static_v"] + # To init the static_k and static_v in cache. + # Maybe we can use condition_op(if_else) to do these at the first + # step in while loop to replace these, however it might be less + # efficient. + static_cache_init = wrap_layer_with_block( + layers.assign, + fluid.default_main_program().current_block().parent_idx) + static_cache_init(k, cache_k) + static_cache_init(v, cache_v) + else: # For decoder self-attention in inference + cache_k, cache_v = cache["k"], cache["v"] + # gather cell states corresponding to selected parent + select_k = layers.gather(cache_k, index=gather_idx) + select_v = layers.gather(cache_v, index=gather_idx) + if not static_kv: + # For self attention in inference, use cache and concat time steps. + select_k = layers.concat([select_k, k], axis=2) + select_v = layers.concat([select_v, v], axis=2) + # update cell states(caches) cached in global block + layers.assign(select_k, cache_k) + layers.assign(select_v, cache_v) + return q, select_k, select_v + return q, k, v + + def __combine_heads(x): + """ + Transpose and then reshape the last two dimensions of inpunt tensor x + so that it becomes one dimension, which is reverse to __split_heads. + """ + if len(x.shape) != 4: + raise ValueError("Input(x) should be a 4-D Tensor.") + + trans_x = layers.transpose(x, perm=[0, 2, 1, 3]) + # The value 0 in shape attr means copying the corresponding dimension + # size of the input as the output dimension size. 
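+        # Here trans_x has shape [bs, max_seq_len, n_head, head_dim]; the reshape
+        # below merges the last two dimensions back into n_head * head_dim.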
+ return layers.reshape( + x=trans_x, + shape=[0, 0, trans_x.shape[2] * trans_x.shape[3]], + inplace=True) + + def scaled_dot_product_attention(q, k, v, attn_bias, d_key, dropout_rate): + """ + Scaled Dot-Product Attention + """ + # print(q) + # print(k) + + product = layers.matmul(x=q, y=k, transpose_y=True, alpha=d_key**-0.5) + if attn_bias: + product += attn_bias + weights = layers.softmax(product) + if dropout_rate: + weights = layers.dropout( + weights, + dropout_prob=dropout_rate, + seed=dropout_seed, + is_test=False) + out = layers.matmul(weights, v) + return out + + q, k, v = __compute_qkv(queries, keys, values, n_head, d_key, d_value) + q, k, v = __split_heads_qkv(q, k, v, n_head, d_key, d_value) + + ctx_multiheads = scaled_dot_product_attention(q, k, v, attn_bias, d_model, + dropout_rate) + + out = __combine_heads(ctx_multiheads) + + # Project back to the model size. + proj_out = layers.fc(input=out, + size=d_model, + bias_attr=False, + num_flatten_dims=2) + return proj_out + + +def positionwise_feed_forward(x, d_inner_hid, d_hid, dropout_rate): + """ + Position-wise Feed-Forward Networks. + This module consists of two linear transformations with a ReLU activation + in between, which is applied to each position separately and identically. + """ + hidden = layers.fc(input=x, + size=d_inner_hid, + num_flatten_dims=2, + act="relu") + if dropout_rate: + hidden = layers.dropout( + hidden, dropout_prob=dropout_rate, seed=dropout_seed, is_test=False) + out = layers.fc(input=hidden, size=d_hid, num_flatten_dims=2) + return out + + +def pre_post_process_layer(prev_out, out, process_cmd, dropout_rate=0.): + """ + Add residual connection, layer normalization and droput to the out tensor + optionally according to the value of process_cmd. + This will be used before or after multi-head attention and position-wise + feed-forward networks. + """ + for cmd in process_cmd: + if cmd == "a": # add residual connection + out = out + prev_out if prev_out else out + elif cmd == "n": # add layer normalization + out = layers.layer_norm( + out, + begin_norm_axis=len(out.shape) - 1, + param_attr=fluid.initializer.Constant(1.), + bias_attr=fluid.initializer.Constant(0.)) + elif cmd == "d": # add dropout + if dropout_rate: + out = layers.dropout( + out, + dropout_prob=dropout_rate, + seed=dropout_seed, + is_test=False) + return out + + +pre_process_layer = partial(pre_post_process_layer, None) +post_process_layer = pre_post_process_layer + + +def prepare_encoder(src_word,#[b,t,c] + src_pos, + src_vocab_size, + src_emb_dim, + src_max_len, + dropout_rate=0., + bos_idx=0, + word_emb_param_name=None, + pos_enc_param_name=None): + """Add word embeddings and position encodings. + The output tensor has a shape of: + [batch_size, max_src_length_in_batch, d_model]. + This module is used at the bottom of the encoder stacks. 
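+    For SRN, src_word is already a [batch, seq_len, d_model] feature map coming
+    from the CNN backbone rather than word ids, so no embedding lookup is done
+    here; the features are only scaled and added to the position encodings.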
+ """ + + src_word_emb =src_word#layers.concat(res,axis=1) + src_word_emb=layers.cast(src_word_emb,'float32') + # print("src_word_emb",src_word_emb) + + src_word_emb = layers.scale(x=src_word_emb, scale=src_emb_dim**0.5) + src_pos_enc = layers.embedding( + src_pos, + size=[src_max_len, src_emb_dim], + param_attr=fluid.ParamAttr( + name=pos_enc_param_name, trainable=False)) + src_pos_enc.stop_gradient = True + enc_input = src_word_emb + src_pos_enc + return layers.dropout( + enc_input, dropout_prob=dropout_rate, seed=dropout_seed, + is_test=False) if dropout_rate else enc_input + + +def prepare_decoder(src_word, + src_pos, + src_vocab_size, + src_emb_dim, + src_max_len, + dropout_rate=0., + bos_idx=0, + word_emb_param_name=None, + pos_enc_param_name=None): + """Add word embeddings and position encodings. + The output tensor has a shape of: + [batch_size, max_src_length_in_batch, d_model]. + This module is used at the bottom of the encoder stacks. + """ + src_word_emb = layers.embedding( + src_word, + size=[src_vocab_size, src_emb_dim], + padding_idx=bos_idx, # set embedding of bos to 0 + param_attr=fluid.ParamAttr( + name=word_emb_param_name, + initializer=fluid.initializer.Normal(0., src_emb_dim**-0.5))) + # print("target_word_emb",src_word_emb) + src_word_emb = layers.scale(x=src_word_emb, scale=src_emb_dim ** 0.5) + src_pos_enc = layers.embedding( + src_pos, + size=[src_max_len, src_emb_dim], + param_attr=fluid.ParamAttr( + name=pos_enc_param_name, trainable=False)) + src_pos_enc.stop_gradient = True + enc_input = src_word_emb + src_pos_enc + return layers.dropout( + enc_input, dropout_prob=dropout_rate, seed=dropout_seed, + is_test=False) if dropout_rate else enc_input + +# prepare_encoder = partial( +# prepare_encoder_decoder, pos_enc_param_name=pos_enc_param_names[0]) +# prepare_decoder = partial( +# prepare_encoder_decoder, pos_enc_param_name=pos_enc_param_names[1]) + + +def encoder_layer(enc_input, + attn_bias, + n_head, + d_key, + d_value, + d_model, + d_inner_hid, + prepostprocess_dropout, + attention_dropout, + relu_dropout, + preprocess_cmd="n", + postprocess_cmd="da"): + """The encoder layers that can be stacked to form a deep encoder. + This module consits of a multi-head (self) attention followed by + position-wise feed-forward networks and both the two components companied + with the post_process_layer to add residual connection, layer normalization + and droput. + """ + attn_output = multi_head_attention( + pre_process_layer(enc_input, preprocess_cmd, + prepostprocess_dropout), None, None, attn_bias, d_key, + d_value, d_model, n_head, attention_dropout) + attn_output = post_process_layer(enc_input, attn_output, postprocess_cmd, + prepostprocess_dropout) + ffd_output = positionwise_feed_forward( + pre_process_layer(attn_output, preprocess_cmd, prepostprocess_dropout), + d_inner_hid, d_model, relu_dropout) + return post_process_layer(attn_output, ffd_output, postprocess_cmd, + prepostprocess_dropout) + + +def encoder(enc_input, + attn_bias, + n_layer, + n_head, + d_key, + d_value, + d_model, + d_inner_hid, + prepostprocess_dropout, + attention_dropout, + relu_dropout, + preprocess_cmd="n", + postprocess_cmd="da"): + """ + The encoder is composed of a stack of identical layers returned by calling + encoder_layer. 
+ """ + for i in range(n_layer): + enc_output = encoder_layer( + enc_input, + attn_bias, + n_head, + d_key, + d_value, + d_model, + d_inner_hid, + prepostprocess_dropout, + attention_dropout, + relu_dropout, + preprocess_cmd, + postprocess_cmd, ) + enc_input = enc_output + enc_output = pre_process_layer(enc_output, preprocess_cmd, + prepostprocess_dropout) + return enc_output + + +def decoder_layer(dec_input, + enc_output, + slf_attn_bias, + dec_enc_attn_bias, + n_head, + d_key, + d_value, + d_model, + d_inner_hid, + prepostprocess_dropout, + attention_dropout, + relu_dropout, + preprocess_cmd, + postprocess_cmd, + cache=None, + gather_idx=None): + """ The layer to be stacked in decoder part. + The structure of this module is similar to that in the encoder part except + a multi-head attention is added to implement encoder-decoder attention. + """ + slf_attn_output = multi_head_attention( + pre_process_layer(dec_input, preprocess_cmd, prepostprocess_dropout), + None, + None, + slf_attn_bias, + d_key, + d_value, + d_model, + n_head, + attention_dropout, + cache=cache, + gather_idx=gather_idx) + slf_attn_output = post_process_layer( + dec_input, + slf_attn_output, + postprocess_cmd, + prepostprocess_dropout, ) + enc_attn_output = multi_head_attention( + pre_process_layer(slf_attn_output, preprocess_cmd, + prepostprocess_dropout), + enc_output, + enc_output, + dec_enc_attn_bias, + d_key, + d_value, + d_model, + n_head, + attention_dropout, + cache=cache, + gather_idx=gather_idx, + static_kv=True) + enc_attn_output = post_process_layer( + slf_attn_output, + enc_attn_output, + postprocess_cmd, + prepostprocess_dropout, ) + ffd_output = positionwise_feed_forward( + pre_process_layer(enc_attn_output, preprocess_cmd, + prepostprocess_dropout), + d_inner_hid, + d_model, + relu_dropout, ) + dec_output = post_process_layer( + enc_attn_output, + ffd_output, + postprocess_cmd, + prepostprocess_dropout, ) + return dec_output + + +def decoder(dec_input, + enc_output, + dec_slf_attn_bias, + dec_enc_attn_bias, + n_layer, + n_head, + d_key, + d_value, + d_model, + d_inner_hid, + prepostprocess_dropout, + attention_dropout, + relu_dropout, + preprocess_cmd, + postprocess_cmd, + caches=None, + gather_idx=None): + """ + The decoder is composed of a stack of identical decoder_layer layers. + """ + for i in range(n_layer): + dec_output = decoder_layer( + dec_input, + enc_output, + dec_slf_attn_bias, + dec_enc_attn_bias, + n_head, + d_key, + d_value, + d_model, + d_inner_hid, + prepostprocess_dropout, + attention_dropout, + relu_dropout, + preprocess_cmd, + postprocess_cmd, + cache=None if caches is None else caches[i], + gather_idx=gather_idx) + dec_input = dec_output + dec_output = pre_process_layer(dec_output, preprocess_cmd, + prepostprocess_dropout) + return dec_output + + +def make_all_inputs(input_fields): + """ + Define the input data layers for the transformer model. 
+ """ + inputs = [] + for input_field in input_fields: + input_var = layers.data( + name=input_field, + shape=input_descs[input_field][0], + dtype=input_descs[input_field][1], + lod_level=input_descs[input_field][2] + if len(input_descs[input_field]) == 3 else 0, + append_batch_size=False) + inputs.append(input_var) + return inputs + + +def make_all_py_reader_inputs(input_fields, is_test=False): + reader = layers.py_reader( + capacity=20, + name="test_reader" if is_test else "train_reader", + shapes=[input_descs[input_field][0] for input_field in input_fields], + dtypes=[input_descs[input_field][1] for input_field in input_fields], + lod_levels=[ + input_descs[input_field][2] + if len(input_descs[input_field]) == 3 else 0 + for input_field in input_fields + ]) + return layers.read_file(reader), reader + + +def transformer(src_vocab_size, + trg_vocab_size, + max_length, + n_layer, + n_head, + d_key, + d_value, + d_model, + d_inner_hid, + prepostprocess_dropout, + attention_dropout, + relu_dropout, + preprocess_cmd, + postprocess_cmd, + weight_sharing, + label_smooth_eps, + bos_idx=0, + use_py_reader=False, + is_test=False): + if weight_sharing: + assert src_vocab_size == trg_vocab_size, ( + "Vocabularies in source and target should be same for weight sharing." + ) + + data_input_names = encoder_data_input_fields + \ + decoder_data_input_fields[:-1] + label_data_input_fields + + if use_py_reader: + all_inputs, reader = make_all_py_reader_inputs(data_input_names, + is_test) + else: + all_inputs = make_all_inputs(data_input_names) + # print("all inputs",all_inputs) + enc_inputs_len = len(encoder_data_input_fields) + dec_inputs_len = len(decoder_data_input_fields[:-1]) + enc_inputs = all_inputs[0:enc_inputs_len] + dec_inputs = all_inputs[enc_inputs_len:enc_inputs_len + dec_inputs_len] + label = all_inputs[-2] + weights = all_inputs[-1] + + enc_output = wrap_encoder( + src_vocab_size, + ModelHyperParams.src_seq_len, + n_layer, + n_head, + d_key, + d_value, + d_model, + d_inner_hid, + prepostprocess_dropout, + attention_dropout, + relu_dropout, + preprocess_cmd, + postprocess_cmd, + weight_sharing, + enc_inputs) + + predict = wrap_decoder( + trg_vocab_size, + max_length, + n_layer, + n_head, + d_key, + d_value, + d_model, + d_inner_hid, + prepostprocess_dropout, + attention_dropout, + relu_dropout, + preprocess_cmd, + postprocess_cmd, + weight_sharing, + dec_inputs, + enc_output, ) + + # Padding index do not contribute to the total loss. The weights is used to + # cancel padding index in calculating the loss. + if label_smooth_eps: + label = layers.label_smooth( + label=layers.one_hot( + input=label, depth=trg_vocab_size), + epsilon=label_smooth_eps) + + cost = layers.softmax_with_cross_entropy( + logits=predict, + label=label, + soft_label=True if label_smooth_eps else False) + weighted_cost = cost * weights + sum_cost = layers.reduce_sum(weighted_cost) + token_num = layers.reduce_sum(weights) + token_num.stop_gradient = True + avg_cost = sum_cost / token_num + return sum_cost, avg_cost, predict, token_num, reader if use_py_reader else None + + +def wrap_encoder_forFeature(src_vocab_size, + max_length, + n_layer, + n_head, + d_key, + d_value, + d_model, + d_inner_hid, + prepostprocess_dropout, + attention_dropout, + relu_dropout, + preprocess_cmd, + postprocess_cmd, + weight_sharing, + enc_inputs=None, + bos_idx=0): + """ + The wrapper assembles together all needed layers for the encoder. 
+ img, src_pos, src_slf_attn_bias = enc_inputs + img + """ + + if enc_inputs is None: + # This is used to implement independent encoder program in inference. + conv_features, src_pos, src_slf_attn_bias = make_all_inputs( + encoder_data_input_fields) + else: + conv_features, src_pos, src_slf_attn_bias = enc_inputs# + b,t,c = conv_features.shape + #""" + # insert cnn + #""" + #import basemodel + # feat = basemodel.resnet_50(img) + + # mycrnn = basemodel.CRNN() + # feat = mycrnn.ocr_convs(img,use_cudnn=TrainTaskConfig.use_gpu) + # b, c, w, h = feat.shape + # src_word = layers.reshape(feat, shape=[-1, c, w * h]) + + #myconv8 = basemodel.conv8() + #feat = myconv8.net(img ) + #b , c, h, w = feat.shape#h=6 + #print(feat) + #layers.Print(feat,message="conv_feat",summarize=10) + + #feat =layers.conv2d(feat,c,filter_size =[4 , 1],act="relu") + #feat = layers.pool2d(feat,pool_stride=(3,1),pool_size=(3,1)) + #src_word = layers.squeeze(feat,axes=[2]) #src_word [-1,c,ww] + + #feat = layers.transpose(feat, [0,3,1,2]) + #src_word = layers.reshape(feat,[-1,w, c*h]) + #src_word = layers.im2sequence( + # input=feat, + # stride=[1, 1], + # filter_size=[feat.shape[2], 1]) + #layers.Print(src_word,message="src_word",summarize=10) + + # print('feat',feat) + #print("src_word",src_word) + + enc_input = prepare_encoder( + conv_features, + src_pos, + src_vocab_size, + d_model, + max_length, + prepostprocess_dropout, + bos_idx=bos_idx, + word_emb_param_name=word_emb_param_names[0]) + + enc_output = encoder( + enc_input, + src_slf_attn_bias, + n_layer, + n_head, + d_key, + d_value, + d_model, + d_inner_hid, + prepostprocess_dropout, + attention_dropout, + relu_dropout, + preprocess_cmd, + postprocess_cmd, ) + return enc_output + +def wrap_encoder(src_vocab_size, + max_length, + n_layer, + n_head, + d_key, + d_value, + d_model, + d_inner_hid, + prepostprocess_dropout, + attention_dropout, + relu_dropout, + preprocess_cmd, + postprocess_cmd, + weight_sharing, + enc_inputs=None, + bos_idx=0): + """ + The wrapper assembles together all needed layers for the encoder. + img, src_pos, src_slf_attn_bias = enc_inputs + img + """ + if enc_inputs is None: + # This is used to implement independent encoder program in inference. 
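+        # In this SRN integration wrap_encoder is always called with enc_inputs
+        # supplied by the head, so this branch is only used by the stand-alone
+        # transformer inference program.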
+ src_word, src_pos, src_slf_attn_bias = make_all_inputs( + encoder_data_input_fields) + else: + src_word, src_pos, src_slf_attn_bias = enc_inputs# + #""" + # insert cnn + #""" + #import basemodel + # feat = basemodel.resnet_50(img) + + # mycrnn = basemodel.CRNN() + # feat = mycrnn.ocr_convs(img,use_cudnn=TrainTaskConfig.use_gpu) + # b, c, w, h = feat.shape + # src_word = layers.reshape(feat, shape=[-1, c, w * h]) + + #myconv8 = basemodel.conv8() + #feat = myconv8.net(img ) + #b , c, h, w = feat.shape#h=6 + #print(feat) + #layers.Print(feat,message="conv_feat",summarize=10) + + #feat =layers.conv2d(feat,c,filter_size =[4 , 1],act="relu") + #feat = layers.pool2d(feat,pool_stride=(3,1),pool_size=(3,1)) + #src_word = layers.squeeze(feat,axes=[2]) #src_word [-1,c,ww] + + #feat = layers.transpose(feat, [0,3,1,2]) + #src_word = layers.reshape(feat,[-1,w, c*h]) + #src_word = layers.im2sequence( + # input=feat, + # stride=[1, 1], + # filter_size=[feat.shape[2], 1]) + #layers.Print(src_word,message="src_word",summarize=10) + + # print('feat',feat) + #print("src_word",src_word) + enc_input = prepare_decoder( + src_word, + src_pos, + src_vocab_size, + d_model, + max_length, + prepostprocess_dropout, + bos_idx=bos_idx, + word_emb_param_name=word_emb_param_names[0]) + + enc_output = encoder( + enc_input, + src_slf_attn_bias, + n_layer, + n_head, + d_key, + d_value, + d_model, + d_inner_hid, + prepostprocess_dropout, + attention_dropout, + relu_dropout, + preprocess_cmd, + postprocess_cmd, ) + return enc_output + + +def wrap_decoder(trg_vocab_size, + max_length, + n_layer, + n_head, + d_key, + d_value, + d_model, + d_inner_hid, + prepostprocess_dropout, + attention_dropout, + relu_dropout, + preprocess_cmd, + postprocess_cmd, + weight_sharing, + dec_inputs=None, + enc_output=None, + caches=None, + gather_idx=None, + bos_idx=0): + """ + The wrapper assembles together all needed layers for the decoder. + """ + if dec_inputs is None: + # This is used to implement independent decoder program in inference. + trg_word, trg_pos, trg_slf_attn_bias, trg_src_attn_bias, enc_output = \ + make_all_inputs(decoder_data_input_fields) + else: + trg_word, trg_pos, trg_slf_attn_bias, trg_src_attn_bias = dec_inputs + + dec_input = prepare_decoder( + trg_word, + trg_pos, + trg_vocab_size, + d_model, + max_length, + prepostprocess_dropout, + bos_idx=bos_idx, + word_emb_param_name=word_emb_param_names[0] + if weight_sharing else word_emb_param_names[1]) + dec_output = decoder( + dec_input, + enc_output, + trg_slf_attn_bias, + trg_src_attn_bias, + n_layer, + n_head, + d_key, + d_value, + d_model, + d_inner_hid, + prepostprocess_dropout, + attention_dropout, + relu_dropout, + preprocess_cmd, + postprocess_cmd, + caches=caches, + gather_idx=gather_idx) + return dec_output + # Reshape to 2D tensor to use GEMM instead of BatchedGEMM + dec_output = layers.reshape( + dec_output, shape=[-1, dec_output.shape[-1]], inplace=True) + if weight_sharing: + predict = layers.matmul( + x=dec_output, + y=fluid.default_main_program().global_block().var( + word_emb_param_names[0]), + transpose_y=True) + else: + predict = layers.fc(input=dec_output, + size=trg_vocab_size, + bias_attr=False) + if dec_inputs is None: + # Return probs for independent decoder program. 
+ predict = layers.softmax(predict) + return predict + + +def fast_decode(src_vocab_size, + trg_vocab_size, + max_in_len, + n_layer, + n_head, + d_key, + d_value, + d_model, + d_inner_hid, + prepostprocess_dropout, + attention_dropout, + relu_dropout, + preprocess_cmd, + postprocess_cmd, + weight_sharing, + beam_size, + max_out_len, + bos_idx, + eos_idx, + use_py_reader=False): + """ + Use beam search to decode. Caches will be used to store states of history + steps which can make the decoding faster. + """ + data_input_names = encoder_data_input_fields + fast_decoder_data_input_fields + + if use_py_reader: + all_inputs, reader = make_all_py_reader_inputs(data_input_names) + else: + all_inputs = make_all_inputs(data_input_names) + + enc_inputs_len = len(encoder_data_input_fields) + dec_inputs_len = len(fast_decoder_data_input_fields) + enc_inputs = all_inputs[0:enc_inputs_len]#enc_inputs tensor + dec_inputs = all_inputs[enc_inputs_len:enc_inputs_len + dec_inputs_len]#dec_inputs tensor + + enc_output = wrap_encoder( + src_vocab_size, + ModelHyperParams.src_seq_len,##to do !!!!!???? + n_layer, + n_head, + d_key, + d_value, + d_model, + d_inner_hid, + prepostprocess_dropout, + attention_dropout, + relu_dropout, + preprocess_cmd, + postprocess_cmd, + weight_sharing, + enc_inputs, + bos_idx=bos_idx) + start_tokens, init_scores, parent_idx, trg_src_attn_bias = dec_inputs + + def beam_search(): + max_len = layers.fill_constant( + shape=[1], + dtype=start_tokens.dtype, + value=max_out_len, + force_cpu=True) + step_idx = layers.fill_constant( + shape=[1], dtype=start_tokens.dtype, value=0, force_cpu=True) + cond = layers.less_than(x=step_idx, y=max_len) # default force_cpu=True + while_op = layers.While(cond) + # array states will be stored for each step. + ids = layers.array_write( + layers.reshape(start_tokens, (-1, 1)), step_idx) + scores = layers.array_write(init_scores, step_idx) + # cell states will be overwrited at each step. + # caches contains states of history steps in decoder self-attention + # and static encoder output projections in encoder-decoder attention + # to reduce redundant computation. + caches = [ + { + "k": # for self attention + layers.fill_constant_batch_size_like( + input=start_tokens, + shape=[-1, n_head, 0, d_key], + dtype=enc_output.dtype, + value=0), + "v": # for self attention + layers.fill_constant_batch_size_like( + input=start_tokens, + shape=[-1, n_head, 0, d_value], + dtype=enc_output.dtype, + value=0), + "static_k": # for encoder-decoder attention + layers.create_tensor(dtype=enc_output.dtype), + "static_v": # for encoder-decoder attention + layers.create_tensor(dtype=enc_output.dtype) + } for i in range(n_layer) + ] + + with while_op.block(): + pre_ids = layers.array_read(array=ids, i=step_idx) + # Since beam_search_op dosen't enforce pre_ids' shape, we can do + # inplace reshape here which actually change the shape of pre_ids. 
+ pre_ids = layers.reshape(pre_ids, (-1, 1, 1), inplace=True) + pre_scores = layers.array_read(array=scores, i=step_idx) + # gather cell states corresponding to selected parent + pre_src_attn_bias = layers.gather( + trg_src_attn_bias, index=parent_idx) + pre_pos = layers.elementwise_mul( + x=layers.fill_constant_batch_size_like( + input=pre_src_attn_bias, # cann't use lod tensor here + value=1, + shape=[-1, 1, 1], + dtype=pre_ids.dtype), + y=step_idx, + axis=0) + logits = wrap_decoder( + trg_vocab_size, + max_in_len, + n_layer, + n_head, + d_key, + d_value, + d_model, + d_inner_hid, + prepostprocess_dropout, + attention_dropout, + relu_dropout, + preprocess_cmd, + postprocess_cmd, + weight_sharing, + dec_inputs=(pre_ids, pre_pos, None, pre_src_attn_bias), + enc_output=enc_output, + caches=caches, + gather_idx=parent_idx, + bos_idx=bos_idx) + # intra-beam topK + topk_scores, topk_indices = layers.topk( + input=layers.softmax(logits), k=beam_size) + accu_scores = layers.elementwise_add( + x=layers.log(topk_scores), y=pre_scores, axis=0) + # beam_search op uses lod to differentiate branches. + accu_scores = layers.lod_reset(accu_scores, pre_ids) + # topK reduction across beams, also contain special handle of + # end beams and end sentences(batch reduction) + selected_ids, selected_scores, gather_idx = layers.beam_search( + pre_ids=pre_ids, + pre_scores=pre_scores, + ids=topk_indices, + scores=accu_scores, + beam_size=beam_size, + end_id=eos_idx, + return_parent_idx=True) + layers.increment(x=step_idx, value=1.0, in_place=True) + # cell states(caches) have been updated in wrap_decoder, + # only need to update beam search states here. + layers.array_write(selected_ids, i=step_idx, array=ids) + layers.array_write(selected_scores, i=step_idx, array=scores) + layers.assign(gather_idx, parent_idx) + layers.assign(pre_src_attn_bias, trg_src_attn_bias) + length_cond = layers.less_than(x=step_idx, y=max_len) + finish_cond = layers.logical_not(layers.is_empty(x=selected_ids)) + layers.logical_and(x=length_cond, y=finish_cond, out=cond) + + finished_ids, finished_scores = layers.beam_search_decode( + ids, scores, beam_size=beam_size, end_id=eos_idx) + return finished_ids, finished_scores + + finished_ids, finished_scores = beam_search() + return finished_ids, finished_scores, reader if use_py_reader else None diff --git a/ppocr/modeling/losses/rec_srn_loss.py b/ppocr/modeling/losses/rec_srn_loss.py new file mode 100755 index 00000000..68a480ac --- /dev/null +++ b/ppocr/modeling/losses/rec_srn_loss.py @@ -0,0 +1,58 @@ +#copyright (c) 2019 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. 
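+
+# SRN loss: cross-entropy against the label is computed for the three SRN
+# outputs -- word_out (PVAM branch), gsrm_out (GSRM branch) and the fused VSFD
+# prediction -- and combined as word + 2.0 * vsfd + 0.15 * gsrm.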
+ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import math + +import paddle +import paddle.fluid as fluid + + +class SRNLoss(object): + def __init__(self, params): + super(SRNLoss, self).__init__() + self.char_num = params['char_num'] + + def __call__(self, predicts, others): + predict = predicts['predict'] + word_predict = predicts['word_out'] + gsrm_predict = predicts['gsrm_out'] + label = others['label'] + lbl_weight = others['lbl_weight'] + + casted_label = fluid.layers.cast(x=label, dtype='int64') + cost_word = fluid.layers.cross_entropy(input=word_predict, label=casted_label) + cost_gsrm = fluid.layers.cross_entropy(input=gsrm_predict, label=casted_label) + cost_vsfd = fluid.layers.cross_entropy(input=predict, label=casted_label) + + #cost_word = cost_word * lbl_weight + #cost_gsrm = cost_gsrm * lbl_weight + #cost_vsfd = cost_vsfd * lbl_weight + + cost_word = fluid.layers.reshape(x=fluid.layers.reduce_sum(cost_word), shape=[1]) + cost_gsrm = fluid.layers.reshape(x=fluid.layers.reduce_sum(cost_gsrm), shape=[1]) + cost_vsfd = fluid.layers.reshape(x=fluid.layers.reduce_sum(cost_vsfd), shape=[1]) + + sum_cost = fluid.layers.sum([cost_word, cost_vsfd * 2.0, cost_gsrm * 0.15]) + + #sum_cost = fluid.layers.sum([cost_word * 3.0, cost_vsfd, cost_gsrm * 0.15]) + #sum_cost = cost_word + + #fluid.layers.Print(cost_word,message="word_cost") + #fluid.layers.Print(cost_vsfd,message="img_cost") + return [sum_cost,cost_vsfd,cost_word] + #return [sum_cost, cost_vsfd, cost_word] diff --git a/ppocr/utils/character.py b/ppocr/utils/character.py index 9a3db8dd..79d6f5ca 100755 --- a/ppocr/utils/character.py +++ b/ppocr/utils/character.py @@ -25,6 +25,7 @@ class CharacterOps(object): def __init__(self, config): self.character_type = config['character_type'] self.loss_type = config['loss_type'] + self.max_text_len = config['max_text_length'] if self.character_type == "en": self.character_str = "0123456789abcdefghijklmnopqrstuvwxyz" dict_character = list(self.character_str) @@ -54,6 +55,8 @@ class CharacterOps(object): self.end_str = "eos" if self.loss_type == "attention": dict_character = [self.beg_str, self.end_str] + dict_character + elif self.loss_type == "srn": + dict_character = dict_character + [self.beg_str, self.end_str] self.dict = {} for i, char in enumerate(dict_character): self.dict[char] = i @@ -146,6 +149,48 @@ def cal_predicts_accuracy(char_ops, acc = acc_num * 1.0 / img_num return acc, acc_num, img_num +def cal_predicts_accuracy_srn(char_ops, + preds, + labels, + max_text_len, + is_debug=False): + acc_num = 0 + img_num = 0 + + total_len = preds.shape[0] + img_num = int(total_len / max_text_len) + #print (img_num) + for i in range(img_num): + cur_label = [] + cur_pred = [] + for j in range(max_text_len): + if labels[j + i * max_text_len] != 37: #0 + cur_label.append(labels[j + i * max_text_len][0]) + else: + break + + if is_debug: + for j in range(max_text_len): + if preds[j + i * max_text_len] != 37: #0 + cur_pred.append(preds[j + i * max_text_len][0]) + else: + break + print ("cur_label: ", cur_label) + print ("cur_pred: ", cur_pred) + + + for j in range(max_text_len + 1): + if j < len(cur_label) and preds[j + i * max_text_len][0] != cur_label[j]: + break + elif j == len(cur_label) and j == max_text_len: + acc_num += 1 + break + elif j == len(cur_label) and preds[j + i * max_text_len][0] == 37: + acc_num += 1 + break + acc = acc_num * 1.0 / img_num + return acc, acc_num, img_num + def 
convert_rec_attention_infer_res(preds): img_num = preds.shape[0] diff --git a/tools/eval_utils/eval_rec_utils.py b/tools/eval_utils/eval_rec_utils.py index aebb9f90..3d496bd3 100644 --- a/tools/eval_utils/eval_rec_utils.py +++ b/tools/eval_utils/eval_rec_utils.py @@ -29,7 +29,7 @@ FORMAT = '%(asctime)s-%(levelname)s: %(message)s' logging.basicConfig(level=logging.INFO, format=FORMAT) logger = logging.getLogger(__name__) -from ppocr.utils.character import cal_predicts_accuracy +from ppocr.utils.character import cal_predicts_accuracy, cal_predicts_accuracy_srn from ppocr.utils.character import convert_rec_label_to_lod from ppocr.utils.character import convert_rec_attention_infer_res from ppocr.utils.utility import create_module @@ -60,19 +60,52 @@ def eval_rec_run(exe, config, eval_info_dict, mode): for ino in range(img_num): img_list.append(data[ino][0]) label_list.append(data[ino][1]) - img_list = np.concatenate(img_list, axis=0) - outs = exe.run(eval_info_dict['program'], \ + + if config['Global']['loss_type'] != "srn": + img_list = np.concatenate(img_list, axis=0) + outs = exe.run(eval_info_dict['program'], \ feed={'image': img_list}, \ fetch_list=eval_info_dict['fetch_varname_list'], \ return_numpy=False) - preds = np.array(outs[0]) - if preds.shape[1] != 1: - preds, preds_lod = convert_rec_attention_infer_res(preds) + preds = np.array(outs[0]) + + if preds.shape[1] != 1: + preds, preds_lod = convert_rec_attention_infer_res(preds) + else: + preds_lod = outs[0].lod()[0] + labels, labels_lod = convert_rec_label_to_lod(label_list) + acc, acc_num, sample_num = cal_predicts_accuracy( + char_ops, preds, preds_lod, labels, labels_lod, is_remove_duplicate) else: - preds_lod = outs[0].lod()[0] - labels, labels_lod = convert_rec_label_to_lod(label_list) - acc, acc_num, sample_num = cal_predicts_accuracy( - char_ops, preds, preds_lod, labels, labels_lod, is_remove_duplicate) + encoder_word_pos_list = [] + gsrm_word_pos_list = [] + gsrm_slf_attn_bias1_list = [] + gsrm_slf_attn_bias2_list = [] + for ino in range(img_num): + encoder_word_pos_list.append(data[ino][2]) + gsrm_word_pos_list.append(data[ino][3]) + gsrm_slf_attn_bias1_list.append(data[ino][4]) + gsrm_slf_attn_bias2_list.append(data[ino][5]) + + img_list = np.concatenate(img_list, axis=0) + label_list = np.concatenate(label_list, axis=0) + encoder_word_pos_list = np.concatenate(encoder_word_pos_list, axis=0).astype(np.int64) + gsrm_word_pos_list = np.concatenate(gsrm_word_pos_list, axis=0).astype(np.int64) + gsrm_slf_attn_bias1_list = np.concatenate(gsrm_slf_attn_bias1_list, axis=0).astype(np.float32) + gsrm_slf_attn_bias2_list = np.concatenate(gsrm_slf_attn_bias2_list, axis=0).astype(np.float32) + + labels = label_list + + outs = exe.run(eval_info_dict['program'], \ + feed={'image': img_list, 'encoder_word_pos': encoder_word_pos_list, + 'gsrm_word_pos': gsrm_word_pos_list, 'gsrm_slf_attn_bias1': gsrm_slf_attn_bias1_list, + 'gsrm_slf_attn_bias2': gsrm_slf_attn_bias2_list}, \ + fetch_list=eval_info_dict['fetch_varname_list'], \ + return_numpy=False) + preds = np.array(outs[0]) + acc, acc_num, sample_num = cal_predicts_accuracy_srn( + char_ops, preds, labels, config['Global']['max_text_length']) + total_acc_num += acc_num total_sample_num += sample_num logger.info("eval batch id: {}, acc: {}".format(total_batch_num, acc)) @@ -85,8 +118,8 @@ def eval_rec_run(exe, config, eval_info_dict, mode): def test_rec_benchmark(exe, config, eval_info_dict): " Evaluate lmdb dataset " - eval_data_list = ['IIIT5k_3000', 'SVT', 'IC03_860', 'IC03_867', \ - 
'IC13_857', 'IC13_1015', 'IC15_1811', 'IC15_2077', 'SVTP', 'CUTE80'] + eval_data_list = ['IIIT5k_3000', 'SVT', 'IC03_860', \ + 'IC13_857', 'IC15_1811', 'IC15_2077','SVTP', 'CUTE80'] eval_data_dir = config['TestReader']['lmdb_sets_dir'] total_evaluation_data_number = 0 total_correct_number = 0 diff --git a/tools/program.py b/tools/program.py index 4ebc1167..64c827e7 100755 --- a/tools/program.py +++ b/tools/program.py @@ -32,7 +32,7 @@ from eval_utils.eval_det_utils import eval_det_run from eval_utils.eval_rec_utils import eval_rec_run from ppocr.utils.save_load import save_model import numpy as np -from ppocr.utils.character import cal_predicts_accuracy, CharacterOps +from ppocr.utils.character import cal_predicts_accuracy, cal_predicts_accuracy_srn, CharacterOps class ArgsParser(ArgumentParser): def __init__(self): @@ -176,8 +176,16 @@ def build(config, main_prog, startup_prog, mode): fetch_name_list = list(outputs.keys()) fetch_varname_list = [outputs[v].name for v in fetch_name_list] opt_loss_name = None + model_average = None + img_loss_name = None + word_loss_name = None if mode == "train": opt_loss = outputs['total_loss'] + # srn loss + #img_loss = outputs['img_loss'] + #word_loss = outputs['word_loss'] + #img_loss_name = img_loss.name + #word_loss_name = word_loss.name opt_params = config['Optimizer'] optimizer = create_module(opt_params['function'])(opt_params) optimizer.minimize(opt_loss) @@ -185,7 +193,13 @@ def build(config, main_prog, startup_prog, mode): global_lr = optimizer._global_learning_rate() fetch_name_list.insert(0, "lr") fetch_varname_list.insert(0, global_lr.name) - return (dataloader, fetch_name_list, fetch_varname_list, opt_loss_name) + if config['Global']["loss_type"] == 'srn': + model_average = fluid.optimizer.ModelAverage( + config['Global']['average_window'], + min_average_window=config['Global']['min_average_window'], + max_average_window=config['Global']['max_average_window']) + + return (dataloader, fetch_name_list, fetch_varname_list, opt_loss_name,model_average) def build_export(config, main_prog, startup_prog): @@ -329,14 +343,20 @@ def train_eval_rec_run(config, exe, train_info_dict, eval_info_dict): lr = np.mean(np.array(train_outs[fetch_map['lr']])) preds_idx = fetch_map['decoded_out'] preds = np.array(train_outs[preds_idx]) - preds_lod = train_outs[preds_idx].lod()[0] labels_idx = fetch_map['label'] labels = np.array(train_outs[labels_idx]) - labels_lod = train_outs[labels_idx].lod()[0] - acc, acc_num, img_num = cal_predicts_accuracy( - config['Global']['char_ops'], preds, preds_lod, labels, - labels_lod) + if config['Global']['loss_type'] != 'srn': + preds_lod = train_outs[preds_idx].lod()[0] + labels_lod = train_outs[labels_idx].lod()[0] + + acc, acc_num, img_num = cal_predicts_accuracy( + config['Global']['char_ops'], preds, preds_lod, labels, + labels_lod) + else: + acc, acc_num, img_num = cal_predicts_accuracy_srn( + config['Global']['char_ops'], preds, labels, + config['Global']['max_text_length']) t2 = time.time() train_batch_elapse = t2 - t1 stats = {'loss': loss, 'acc': acc} @@ -350,6 +370,9 @@ def train_eval_rec_run(config, exe, train_info_dict, eval_info_dict): if train_batch_id > 0 and\ train_batch_id % eval_batch_step == 0: + model_average = train_info_dict['model_average'] + if model_average != None: + model_average.apply(exe) metrics = eval_rec_run(exe, config, eval_info_dict, "eval") eval_acc = metrics['avg_acc'] eval_sample_num = metrics['total_sample_num'] diff --git a/tools/train.py b/tools/train.py index 68e792b7..2ea9d0e0 100755 
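For SRN the fetched predictions and labels are flat arrays holding max_text_length entries per image rather than LoD tensors, which is why the branch above calls cal_predicts_accuracy_srn instead of cal_predicts_accuracy. A rough NumPy equivalent of that per-image matching (the helper name, the two-image batch and the sample values are made up for illustration; 37 is the end index that follows the 36-character English charset):

import numpy as np

def srn_accuracy(preds, labels, max_text_len, eos=37):
    preds = preds.reshape(-1, max_text_len)
    labels = labels.reshape(-1, max_text_len)
    correct = 0
    for p, l in zip(preds, labels):
        l = l[l != eos]                                  # label sequence up to EOS
        p = p[:np.argmax(p == eos)] if (p == eos).any() else p
        correct += int(len(p) == len(l) and np.array_equal(p, l))
    return correct / len(preds), correct, len(preds)

preds = np.array([3, 10, 37, 37, 5, 6, 7, 37])    # two images, max_text_len = 4
labels = np.array([3, 10, 37, 37, 5, 6, 8, 37])
print(srn_accuracy(preds, labels, max_text_len=4))  # (0.5, 1, 2)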
--- a/tools/train.py +++ b/tools/train.py @@ -52,6 +52,7 @@ def main(): train_fetch_name_list = train_build_outputs[1] train_fetch_varname_list = train_build_outputs[2] train_opt_loss_name = train_build_outputs[3] + model_average = train_build_outputs[-1] eval_program = fluid.Program() eval_build_outputs = program.build( @@ -85,7 +86,8 @@ def main(): 'train_program':train_program,\ 'reader':train_loader,\ 'fetch_name_list':train_fetch_name_list,\ - 'fetch_varname_list':train_fetch_varname_list} + 'fetch_varname_list':train_fetch_varname_list,\ + 'model_average': model_average} eval_info_dict = {'program':eval_program,\ 'reader':eval_reader,\ From 6832ca029fe6d7bccd68fddcfe1aedc8e4d6618f Mon Sep 17 00:00:00 2001 From: tink2123 Date: Sat, 15 Aug 2020 12:39:07 +0800 Subject: [PATCH 02/11] update config --- .../rec_r50fpn_vd_none_srn_pvam_test_all.yml | 5 +- ppocr/data/rec/dataset_traversal.py | 49 +++--- ppocr/modeling/architectures/rec_model.py | 99 ++++++++++--- ppocr/modeling/heads/self_attention/model.py | 139 +++++++++--------- tools/eval_utils/eval_rec_utils.py | 21 ++- tools/program.py | 15 +- train_data | 1 + 7 files changed, 197 insertions(+), 132 deletions(-) create mode 120000 train_data diff --git a/configs/rec/rec_r50fpn_vd_none_srn_pvam_test_all.yml b/configs/rec/rec_r50fpn_vd_none_srn_pvam_test_all.yml index 933a7513..7a0f136c 100755 --- a/configs/rec/rec_r50fpn_vd_none_srn_pvam_test_all.yml +++ b/configs/rec/rec_r50fpn_vd_none_srn_pvam_test_all.yml @@ -17,11 +17,12 @@ Global: average_window: 0.15 max_average_window: 15625 min_average_window: 10000 - reader_yml: ./configs/rec/rec_srn_reader.yml + reader_yml: ./configs/rec/rec_benchmark_reader.yml pretrain_weights: checkpoints: save_inference_dir: - + infer_img: + Architecture: function: ppocr.modeling.architectures.rec_model,RecModel diff --git a/ppocr/data/rec/dataset_traversal.py b/ppocr/data/rec/dataset_traversal.py index 7135fca5..b46e37da 100755 --- a/ppocr/data/rec/dataset_traversal.py +++ b/ppocr/data/rec/dataset_traversal.py @@ -118,15 +118,14 @@ class LMDBReader(object): image_file_list = get_image_file_list(self.infer_img) for single_img in image_file_list: img = cv2.imread(single_img) - if img.shape[-1]==1 or len(list(img.shape))==2: + if img.shape[-1] == 1 or len(list(img.shape)) == 2: img = cv2.cvtColor(img, cv2.COLOR_GRAY2BGR) if self.loss_type == 'srn': norm_img = process_image_srn( img=img, image_shape=self.image_shape, num_heads=self.num_heads, - max_text_length=self.max_text_length - ) + max_text_length=self.max_text_length) else: norm_img = process_image( img=img, @@ -135,20 +134,20 @@ class LMDBReader(object): tps=self.use_tps, infer_mode=True) yield norm_img - elif self.mode == 'test': - image_file_list = get_image_file_list(self.infer_img) - for single_img in image_file_list: - img = cv2.imread(single_img) - if img.shape[-1]==1 or len(list(img.shape))==2: - img = cv2.cvtColor(img, cv2.COLOR_GRAY2BGR) - norm_img = process_image( - img=img, - image_shape=self.image_shape, - char_ops=self.char_ops, - tps=self.use_tps, - infer_mode=True - ) - yield norm_img + #elif self.mode == 'eval': + # image_file_list = get_image_file_list(self.infer_img) + # for single_img in image_file_list: + # img = cv2.imread(single_img) + # if img.shape[-1]==1 or len(list(img.shape))==2: + # img = cv2.cvtColor(img, cv2.COLOR_GRAY2BGR) + # norm_img = process_image( + # img=img, + # image_shape=self.image_shape, + # char_ops=self.char_ops, + # tps=self.use_tps, + # infer_mode=True + # ) + # yield norm_img else: lmdb_sets = 
self.load_hierarchical_lmdb_dataset() if process_id == 0: @@ -169,14 +168,15 @@ class LMDBReader(object): img, label = sample_info outs = [] if self.loss_type == "srn": - outs = process_image_srn(img, self.image_shape, self.num_heads, - self.max_text_length, label, - self.char_ops, self.loss_type) + outs = process_image_srn( + img, self.image_shape, self.num_heads, + self.max_text_length, label, self.char_ops, + self.loss_type) else: - outs = process_image(img, self.image_shape, label, - self.char_ops, self.loss_type, - self.max_text_length) + outs = process_image( + img, self.image_shape, label, self.char_ops, + self.loss_type, self.max_text_length) if outs is None: continue yield outs @@ -184,6 +184,7 @@ class LMDBReader(object): if finish_read_num == len(lmdb_sets): break self.close_lmdb_dataset(lmdb_sets) + def batch_iter_reader(): batch_outs = [] for outs in sample_iter_reader(): @@ -311,4 +312,4 @@ class SimpleReader(object): if self.infer_img is None: return batch_iter_reader - return sample_iter_reader \ No newline at end of file + return sample_iter_reader diff --git a/ppocr/modeling/architectures/rec_model.py b/ppocr/modeling/architectures/rec_model.py index a030f362..d2e01a43 100755 --- a/ppocr/modeling/architectures/rec_model.py +++ b/ppocr/modeling/architectures/rec_model.py @@ -79,17 +79,45 @@ class RecModel(object): feed_list = [image, label_in, label_out] labels = {'label_in': label_in, 'label_out': label_out} elif self.loss_type == "srn": - encoder_word_pos = fluid.data(name="encoder_word_pos", shape=[-1, int((image_shape[-2] / 8) * (image_shape[-1] / 8)), 1], dtype="int64") - gsrm_word_pos = fluid.data(name="gsrm_word_pos", shape=[-1, self.max_text_length, 1], dtype="int64") - gsrm_slf_attn_bias1 = fluid.data(name="gsrm_slf_attn_bias1", shape=[-1, self.num_heads, self.max_text_length, self.max_text_length]) - gsrm_slf_attn_bias2 = fluid.data(name="gsrm_slf_attn_bias2", shape=[-1, self.num_heads, self.max_text_length, self.max_text_length]) - lbl_weight = fluid.layers.data(name="lbl_weight", shape=[-1, 1], dtype='int64') + encoder_word_pos = fluid.data( + name="encoder_word_pos", + shape=[ + -1, int((image_shape[-2] / 8) * (image_shape[-1] / 8)), + 1 + ], + dtype="int64") + gsrm_word_pos = fluid.data( + name="gsrm_word_pos", + shape=[-1, self.max_text_length, 1], + dtype="int64") + gsrm_slf_attn_bias1 = fluid.data( + name="gsrm_slf_attn_bias1", + shape=[ + -1, self.num_heads, self.max_text_length, + self.max_text_length + ]) + gsrm_slf_attn_bias2 = fluid.data( + name="gsrm_slf_attn_bias2", + shape=[ + -1, self.num_heads, self.max_text_length, + self.max_text_length + ]) + lbl_weight = fluid.layers.data( + name="lbl_weight", shape=[-1, 1], dtype='int64') label = fluid.data( name='label', shape=[-1, 1], dtype='int32', lod_level=1) - feed_list = [image, label, encoder_word_pos, gsrm_word_pos, gsrm_slf_attn_bias1, gsrm_slf_attn_bias2, lbl_weight] - labels = {'label': label, 'encoder_word_pos': encoder_word_pos, - 'gsrm_word_pos': gsrm_word_pos, 'gsrm_slf_attn_bias1': gsrm_slf_attn_bias1, - 'gsrm_slf_attn_bias2': gsrm_slf_attn_bias2,'lbl_weight':lbl_weight} + feed_list = [ + image, label, encoder_word_pos, gsrm_word_pos, + gsrm_slf_attn_bias1, gsrm_slf_attn_bias2, lbl_weight + ] + labels = { + 'label': label, + 'encoder_word_pos': encoder_word_pos, + 'gsrm_word_pos': gsrm_word_pos, + 'gsrm_slf_attn_bias1': gsrm_slf_attn_bias1, + 'gsrm_slf_attn_bias2': gsrm_slf_attn_bias2, + 'lbl_weight': lbl_weight + } else: label = fluid.data( name='label', shape=[None, 1], dtype='int32', 
lod_level=1) @@ -112,15 +140,41 @@ class RecModel(object): "We set img_shape to be the same , it may affect the inference effect" ) image_shape = deepcopy(self.image_shape) - image = fluid.data(name='image', shape=image_shape, dtype='float32') + image = fluid.data(name='image', shape=image_shape, dtype='float32') if self.loss_type == "srn": - encoder_word_pos = fluid.data(name="encoder_word_pos", shape=[-1, int((image_shape[-2] / 8) * (image_shape[-1] / 8)), 1], dtype="int64") - gsrm_word_pos = fluid.data(name="gsrm_word_pos", shape=[-1, self.max_text_length, 1], dtype="int64") - gsrm_slf_attn_bias1 = fluid.data(name="gsrm_slf_attn_bias1", shape=[-1, self.num_heads, self.max_text_length, self.max_text_length]) - gsrm_slf_attn_bias2 = fluid.data(name="gsrm_slf_attn_bias2", shape=[-1, self.num_heads, self.max_text_length, self.max_text_length]) - feed_list = [image, encoder_word_pos, gsrm_word_pos, gsrm_slf_attn_bias1, gsrm_slf_attn_bias2] - labels = {'encoder_word_pos': encoder_word_pos, 'gsrm_word_pos': gsrm_word_pos, - 'gsrm_slf_attn_bias1': gsrm_slf_attn_bias1, 'gsrm_slf_attn_bias2': gsrm_slf_attn_bias2} + encoder_word_pos = fluid.data( + name="encoder_word_pos", + shape=[ + -1, int((image_shape[-2] / 8) * (image_shape[-1] / 8)), + 1 + ], + dtype="int64") + gsrm_word_pos = fluid.data( + name="gsrm_word_pos", + shape=[-1, self.max_text_length, 1], + dtype="int64") + gsrm_slf_attn_bias1 = fluid.data( + name="gsrm_slf_attn_bias1", + shape=[ + -1, self.num_heads, self.max_text_length, + self.max_text_length + ]) + gsrm_slf_attn_bias2 = fluid.data( + name="gsrm_slf_attn_bias2", + shape=[ + -1, self.num_heads, self.max_text_length, + self.max_text_length + ]) + feed_list = [ + image, encoder_word_pos, gsrm_word_pos, gsrm_slf_attn_bias1, + gsrm_slf_attn_bias2 + ] + labels = { + 'encoder_word_pos': encoder_word_pos, + 'gsrm_word_pos': gsrm_word_pos, + 'gsrm_slf_attn_bias1': gsrm_slf_attn_bias1, + 'gsrm_slf_attn_bias2': gsrm_slf_attn_bias2 + } return image, labels, loader def __call__(self, mode): @@ -140,8 +194,13 @@ class RecModel(object): label = labels['label'] if self.loss_type == 'srn': total_loss, img_loss, word_loss = self.loss(predicts, labels) - outputs = {'total_loss':total_loss, 'img_loss':img_loss, 'word_loss':word_loss, - 'decoded_out':decoded_out, 'label':label} + outputs = { + 'total_loss': total_loss, + 'img_loss': img_loss, + 'word_loss': word_loss, + 'decoded_out': decoded_out, + 'label': label + } else: outputs = {'total_loss':loss, 'decoded_out':\ decoded_out, 'label':label} @@ -156,4 +215,4 @@ class RecModel(object): predict = predicts['predict'] if self.loss_type == "ctc": predict = fluid.layers.softmax(predict) - return loader, {'decoded_out': decoded_out, 'predicts': predict} \ No newline at end of file + return loader, {'decoded_out': decoded_out, 'predicts': predict} diff --git a/ppocr/modeling/heads/self_attention/model.py b/ppocr/modeling/heads/self_attention/model.py index d4aecd5f..8ac1458b 100644 --- a/ppocr/modeling/heads/self_attention/model.py +++ b/ppocr/modeling/heads/self_attention/model.py @@ -4,8 +4,9 @@ import numpy as np import paddle.fluid as fluid import paddle.fluid.layers as layers -from .desc import * -from .config import ModelHyperParams,TrainTaskConfig +# Set seed for CE +dropout_seed = None + def wrap_layer_with_block(layer, block_idx): """ @@ -114,7 +115,7 @@ def multi_head_attention(queries, def __split_heads_qkv(queries, keys, values, n_head, d_key, d_value): """ - Reshape input tensors at the last dimension to split multi-heads + Reshape input 
tensors at the last dimension to split multi-heads and then transpose. Specifically, transform the input tensor with shape [bs, max_sequence_length, n_head * hidden_dim] to the output tensor with shape [bs, n_head, max_sequence_length, hidden_dim]. @@ -269,23 +270,24 @@ pre_process_layer = partial(pre_post_process_layer, None) post_process_layer = pre_post_process_layer -def prepare_encoder(src_word,#[b,t,c] - src_pos, - src_vocab_size, - src_emb_dim, - src_max_len, - dropout_rate=0., - bos_idx=0, - word_emb_param_name=None, - pos_enc_param_name=None): +def prepare_encoder( + src_word, #[b,t,c] + src_pos, + src_vocab_size, + src_emb_dim, + src_max_len, + dropout_rate=0., + bos_idx=0, + word_emb_param_name=None, + pos_enc_param_name=None): """Add word embeddings and position encodings. The output tensor has a shape of: [batch_size, max_src_length_in_batch, d_model]. This module is used at the bottom of the encoder stacks. """ - - src_word_emb =src_word#layers.concat(res,axis=1) - src_word_emb=layers.cast(src_word_emb,'float32') + + src_word_emb = src_word #layers.concat(res,axis=1) + src_word_emb = layers.cast(src_word_emb, 'float32') # print("src_word_emb",src_word_emb) src_word_emb = layers.scale(x=src_word_emb, scale=src_emb_dim**0.5) @@ -302,14 +304,14 @@ def prepare_encoder(src_word,#[b,t,c] def prepare_decoder(src_word, - src_pos, - src_vocab_size, - src_emb_dim, - src_max_len, - dropout_rate=0., - bos_idx=0, - word_emb_param_name=None, - pos_enc_param_name=None): + src_pos, + src_vocab_size, + src_emb_dim, + src_max_len, + dropout_rate=0., + bos_idx=0, + word_emb_param_name=None, + pos_enc_param_name=None): """Add word embeddings and position encodings. The output tensor has a shape of: [batch_size, max_src_length_in_batch, d_model]. @@ -323,7 +325,7 @@ def prepare_decoder(src_word, name=word_emb_param_name, initializer=fluid.initializer.Normal(0., src_emb_dim**-0.5))) # print("target_word_emb",src_word_emb) - src_word_emb = layers.scale(x=src_word_emb, scale=src_emb_dim ** 0.5) + src_word_emb = layers.scale(x=src_word_emb, scale=src_emb_dim**0.5) src_pos_enc = layers.embedding( src_pos, size=[src_max_len, src_emb_dim], @@ -335,6 +337,7 @@ def prepare_decoder(src_word, enc_input, dropout_prob=dropout_rate, seed=dropout_seed, is_test=False) if dropout_rate else enc_input + # prepare_encoder = partial( # prepare_encoder_decoder, pos_enc_param_name=pos_enc_param_names[0]) # prepare_decoder = partial( @@ -595,21 +598,9 @@ def transformer(src_vocab_size, weights = all_inputs[-1] enc_output = wrap_encoder( - src_vocab_size, - ModelHyperParams.src_seq_len, - n_layer, - n_head, - d_key, - d_value, - d_model, - d_inner_hid, - prepostprocess_dropout, - attention_dropout, - relu_dropout, - preprocess_cmd, - postprocess_cmd, - weight_sharing, - enc_inputs) + src_vocab_size, 64, n_layer, n_head, d_key, d_value, d_model, + d_inner_hid, prepostprocess_dropout, attention_dropout, relu_dropout, + preprocess_cmd, postprocess_cmd, weight_sharing, enc_inputs) predict = wrap_decoder( trg_vocab_size, @@ -650,34 +641,34 @@ def transformer(src_vocab_size, def wrap_encoder_forFeature(src_vocab_size, - max_length, - n_layer, - n_head, - d_key, - d_value, - d_model, - d_inner_hid, - prepostprocess_dropout, - attention_dropout, - relu_dropout, - preprocess_cmd, - postprocess_cmd, - weight_sharing, - enc_inputs=None, - bos_idx=0): + max_length, + n_layer, + n_head, + d_key, + d_value, + d_model, + d_inner_hid, + prepostprocess_dropout, + attention_dropout, + relu_dropout, + preprocess_cmd, + postprocess_cmd, + 
weight_sharing, + enc_inputs=None, + bos_idx=0): """ The wrapper assembles together all needed layers for the encoder. img, src_pos, src_slf_attn_bias = enc_inputs img """ - + if enc_inputs is None: # This is used to implement independent encoder program in inference. conv_features, src_pos, src_slf_attn_bias = make_all_inputs( encoder_data_input_fields) else: - conv_features, src_pos, src_slf_attn_bias = enc_inputs# - b,t,c = conv_features.shape + conv_features, src_pos, src_slf_attn_bias = enc_inputs # + b, t, c = conv_features.shape #""" # insert cnn #""" @@ -694,11 +685,11 @@ def wrap_encoder_forFeature(src_vocab_size, #b , c, h, w = feat.shape#h=6 #print(feat) #layers.Print(feat,message="conv_feat",summarize=10) - + #feat =layers.conv2d(feat,c,filter_size =[4 , 1],act="relu") #feat = layers.pool2d(feat,pool_stride=(3,1),pool_size=(3,1)) #src_word = layers.squeeze(feat,axes=[2]) #src_word [-1,c,ww] - + #feat = layers.transpose(feat, [0,3,1,2]) #src_word = layers.reshape(feat,[-1,w, c*h]) #src_word = layers.im2sequence( @@ -706,10 +697,10 @@ def wrap_encoder_forFeature(src_vocab_size, # stride=[1, 1], # filter_size=[feat.shape[2], 1]) #layers.Print(src_word,message="src_word",summarize=10) - + # print('feat',feat) #print("src_word",src_word) - + enc_input = prepare_encoder( conv_features, src_pos, @@ -718,7 +709,7 @@ def wrap_encoder_forFeature(src_vocab_size, max_length, prepostprocess_dropout, bos_idx=bos_idx, - word_emb_param_name=word_emb_param_names[0]) + word_emb_param_name="src_word_emb_table") enc_output = encoder( enc_input, @@ -736,6 +727,7 @@ def wrap_encoder_forFeature(src_vocab_size, postprocess_cmd, ) return enc_output + def wrap_encoder(src_vocab_size, max_length, n_layer, @@ -762,7 +754,7 @@ def wrap_encoder(src_vocab_size, src_word, src_pos, src_slf_attn_bias = make_all_inputs( encoder_data_input_fields) else: - src_word, src_pos, src_slf_attn_bias = enc_inputs# + src_word, src_pos, src_slf_attn_bias = enc_inputs # #""" # insert cnn #""" @@ -779,11 +771,11 @@ def wrap_encoder(src_vocab_size, #b , c, h, w = feat.shape#h=6 #print(feat) #layers.Print(feat,message="conv_feat",summarize=10) - + #feat =layers.conv2d(feat,c,filter_size =[4 , 1],act="relu") #feat = layers.pool2d(feat,pool_stride=(3,1),pool_size=(3,1)) #src_word = layers.squeeze(feat,axes=[2]) #src_word [-1,c,ww] - + #feat = layers.transpose(feat, [0,3,1,2]) #src_word = layers.reshape(feat,[-1,w, c*h]) #src_word = layers.im2sequence( @@ -791,7 +783,7 @@ def wrap_encoder(src_vocab_size, # stride=[1, 1], # filter_size=[feat.shape[2], 1]) #layers.Print(src_word,message="src_word",summarize=10) - + # print('feat',feat) #print("src_word",src_word) enc_input = prepare_decoder( @@ -802,7 +794,7 @@ def wrap_encoder(src_vocab_size, max_length, prepostprocess_dropout, bos_idx=bos_idx, - word_emb_param_name=word_emb_param_names[0]) + word_emb_param_name="src_word_emb_table") enc_output = encoder( enc_input, @@ -858,8 +850,8 @@ def wrap_decoder(trg_vocab_size, max_length, prepostprocess_dropout, bos_idx=bos_idx, - word_emb_param_name=word_emb_param_names[0] - if weight_sharing else word_emb_param_names[1]) + word_emb_param_name="src_word_emb_table" + if weight_sharing else "trg_word_emb_table") dec_output = decoder( dec_input, enc_output, @@ -886,7 +878,7 @@ def wrap_decoder(trg_vocab_size, predict = layers.matmul( x=dec_output, y=fluid.default_main_program().global_block().var( - word_emb_param_names[0]), + "trg_word_emb_table"), transpose_y=True) else: predict = layers.fc(input=dec_output, @@ -931,12 +923,13 @@ def 
fast_decode(src_vocab_size, enc_inputs_len = len(encoder_data_input_fields) dec_inputs_len = len(fast_decoder_data_input_fields) - enc_inputs = all_inputs[0:enc_inputs_len]#enc_inputs tensor - dec_inputs = all_inputs[enc_inputs_len:enc_inputs_len + dec_inputs_len]#dec_inputs tensor + enc_inputs = all_inputs[0:enc_inputs_len] #enc_inputs tensor + dec_inputs = all_inputs[enc_inputs_len:enc_inputs_len + + dec_inputs_len] #dec_inputs tensor enc_output = wrap_encoder( src_vocab_size, - ModelHyperParams.src_seq_len,##to do !!!!!???? + 64, ##to do !!!!!???? n_layer, n_head, d_key, diff --git a/tools/eval_utils/eval_rec_utils.py b/tools/eval_utils/eval_rec_utils.py index 3d496bd3..ecdf0aaf 100644 --- a/tools/eval_utils/eval_rec_utils.py +++ b/tools/eval_utils/eval_rec_utils.py @@ -61,7 +61,7 @@ def eval_rec_run(exe, config, eval_info_dict, mode): img_list.append(data[ino][0]) label_list.append(data[ino][1]) - if config['Global']['loss_type'] != "srn": + if config['Global']['loss_type'] != "srn": img_list = np.concatenate(img_list, axis=0) outs = exe.run(eval_info_dict['program'], \ feed={'image': img_list}, \ @@ -75,7 +75,8 @@ def eval_rec_run(exe, config, eval_info_dict, mode): preds_lod = outs[0].lod()[0] labels, labels_lod = convert_rec_label_to_lod(label_list) acc, acc_num, sample_num = cal_predicts_accuracy( - char_ops, preds, preds_lod, labels, labels_lod, is_remove_duplicate) + char_ops, preds, preds_lod, labels, labels_lod, + is_remove_duplicate) else: encoder_word_pos_list = [] gsrm_word_pos_list = [] @@ -89,15 +90,19 @@ def eval_rec_run(exe, config, eval_info_dict, mode): img_list = np.concatenate(img_list, axis=0) label_list = np.concatenate(label_list, axis=0) - encoder_word_pos_list = np.concatenate(encoder_word_pos_list, axis=0).astype(np.int64) - gsrm_word_pos_list = np.concatenate(gsrm_word_pos_list, axis=0).astype(np.int64) - gsrm_slf_attn_bias1_list = np.concatenate(gsrm_slf_attn_bias1_list, axis=0).astype(np.float32) - gsrm_slf_attn_bias2_list = np.concatenate(gsrm_slf_attn_bias2_list, axis=0).astype(np.float32) + encoder_word_pos_list = np.concatenate( + encoder_word_pos_list, axis=0).astype(np.int64) + gsrm_word_pos_list = np.concatenate( + gsrm_word_pos_list, axis=0).astype(np.int64) + gsrm_slf_attn_bias1_list = np.concatenate( + gsrm_slf_attn_bias1_list, axis=0).astype(np.float32) + gsrm_slf_attn_bias2_list = np.concatenate( + gsrm_slf_attn_bias2_list, axis=0).astype(np.float32) labels = label_list outs = exe.run(eval_info_dict['program'], \ - feed={'image': img_list, 'encoder_word_pos': encoder_word_pos_list, + feed={'image': img_list, 'encoder_word_pos': encoder_word_pos_list, 'gsrm_word_pos': gsrm_word_pos_list, 'gsrm_slf_attn_bias1': gsrm_slf_attn_bias1_list, 'gsrm_slf_attn_bias2': gsrm_slf_attn_bias2_list}, \ fetch_list=eval_info_dict['fetch_varname_list'], \ @@ -108,7 +113,7 @@ def eval_rec_run(exe, config, eval_info_dict, mode): total_acc_num += acc_num total_sample_num += sample_num - logger.info("eval batch id: {}, acc: {}".format(total_batch_num, acc)) + #logger.info("eval batch id: {}, acc: {}".format(total_batch_num, acc)) total_batch_num += 1 avg_acc = total_acc_num * 1.0 / total_sample_num metrics = {'avg_acc': avg_acc, "total_acc_num": total_acc_num, \ diff --git a/tools/program.py b/tools/program.py index 64c827e7..6ebc27cb 100755 --- a/tools/program.py +++ b/tools/program.py @@ -34,6 +34,7 @@ from ppocr.utils.save_load import save_model import numpy as np from ppocr.utils.character import cal_predicts_accuracy, cal_predicts_accuracy_srn, CharacterOps + class 
ArgsParser(ArgumentParser): def __init__(self): super(ArgsParser, self).__init__( @@ -196,10 +197,13 @@ def build(config, main_prog, startup_prog, mode): if config['Global']["loss_type"] == 'srn': model_average = fluid.optimizer.ModelAverage( config['Global']['average_window'], - min_average_window=config['Global']['min_average_window'], - max_average_window=config['Global']['max_average_window']) + min_average_window=config['Global'][ + 'min_average_window'], + max_average_window=config['Global'][ + 'max_average_window']) - return (dataloader, fetch_name_list, fetch_varname_list, opt_loss_name,model_average) + return (dataloader, fetch_name_list, fetch_varname_list, opt_loss_name, + model_average) def build_export(config, main_prog, startup_prog): @@ -398,6 +402,7 @@ def train_eval_rec_run(config, exe, train_info_dict, eval_info_dict): save_model(train_info_dict['train_program'], save_path) return + def preprocess(): FLAGS = ArgsParser().parse_args() config = load_config(FLAGS.config) @@ -409,8 +414,8 @@ def preprocess(): check_gpu(use_gpu) alg = config['Global']['algorithm'] - assert alg in ['EAST', 'DB', 'Rosetta', 'CRNN', 'STARNet', 'RARE'] - if alg in ['Rosetta', 'CRNN', 'STARNet', 'RARE']: + assert alg in ['EAST', 'DB', 'Rosetta', 'CRNN', 'STARNet', 'RARE', 'SRN'] + if alg in ['Rosetta', 'CRNN', 'STARNet', 'RARE', 'SRN']: config['Global']['char_ops'] = CharacterOps(config['Global']) place = fluid.CUDAPlace(0) if use_gpu else fluid.CPUPlace() diff --git a/train_data b/train_data new file mode 120000 index 00000000..7c2082ab --- /dev/null +++ b/train_data @@ -0,0 +1 @@ +/workspace/PaddleOCR/train_data/ \ No newline at end of file From bf4863c95082651cc8daf5c39455632ca6c113db Mon Sep 17 00:00:00 2001 From: tink2123 Date: Sat, 15 Aug 2020 15:45:55 +0800 Subject: [PATCH 03/11] update infer_rec for srn --- ppocr/data/rec/dataset_traversal.py | 41 +++-- ppocr/modeling/architectures/rec_model.py | 5 +- ppocr/modeling/backbones/rec_resnet_vd.py | 2 +- ppocr/modeling/heads/rec_srn_all_head.py | 194 ++++++++++++---------- ppocr/modeling/losses/rec_srn_loss.py | 33 ++-- ppocr/utils/character.py | 25 +-- tools/eval_utils/eval_rec_utils.py | 4 +- tools/infer_rec.py | 48 +++++- 8 files changed, 194 insertions(+), 158 deletions(-) diff --git a/ppocr/data/rec/dataset_traversal.py b/ppocr/data/rec/dataset_traversal.py index b46e37da..53c7e87b 100755 --- a/ppocr/data/rec/dataset_traversal.py +++ b/ppocr/data/rec/dataset_traversal.py @@ -40,10 +40,12 @@ class LMDBReader(object): self.image_shape = params['image_shape'] self.loss_type = params['loss_type'] self.max_text_length = params['max_text_length'] - self.num_heads = params['num_heads'] self.mode = params['mode'] self.drop_last = False self.use_tps = False + self.num_heads = None + if "num_heads" in params: + self.num_heads = params['num_heads'] if "tps" in params: self.ues_tps = True self.use_distort = False @@ -134,20 +136,6 @@ class LMDBReader(object): tps=self.use_tps, infer_mode=True) yield norm_img - #elif self.mode == 'eval': - # image_file_list = get_image_file_list(self.infer_img) - # for single_img in image_file_list: - # img = cv2.imread(single_img) - # if img.shape[-1]==1 or len(list(img.shape))==2: - # img = cv2.cvtColor(img, cv2.COLOR_GRAY2BGR) - # norm_img = process_image( - # img=img, - # image_shape=self.image_shape, - # char_ops=self.char_ops, - # tps=self.use_tps, - # infer_mode=True - # ) - # yield norm_img else: lmdb_sets = self.load_hierarchical_lmdb_dataset() if process_id == 0: @@ -169,14 +157,22 @@ class LMDBReader(object): 
outs = [] if self.loss_type == "srn": outs = process_image_srn( - img, self.image_shape, self.num_heads, - self.max_text_length, label, self.char_ops, - self.loss_type) + img=img, + image_shape=self.image_shape, + num_heads=self.num_heads, + max_text_length=self.max_text_length, + label=label, + char_ops=self.char_ops, + loss_type=self.loss_type) else: outs = process_image( - img, self.image_shape, label, self.char_ops, - self.loss_type, self.max_text_length) + img=img, + image_shape=self.image_shape, + label=label, + char_ops=self.char_ops, + loss_type=self.loss_type, + max_text_length=self.max_text_length) if outs is None: continue yield outs @@ -192,8 +188,9 @@ class LMDBReader(object): if len(batch_outs) == self.batch_size: yield batch_outs batch_outs = [] - if len(batch_outs) != 0: - yield batch_outs + if not self.drop_last: + if len(batch_outs) != 0: + yield batch_outs if self.infer_img is None: return batch_iter_reader diff --git a/ppocr/modeling/architectures/rec_model.py b/ppocr/modeling/architectures/rec_model.py index d2e01a43..5eacd5de 100755 --- a/ppocr/modeling/architectures/rec_model.py +++ b/ppocr/modeling/architectures/rec_model.py @@ -58,7 +58,10 @@ class RecModel(object): self.loss_type = global_params['loss_type'] self.image_shape = global_params['image_shape'] self.max_text_length = global_params['max_text_length'] - self.num_heads = global_params["num_heads"] + if "num_heads" in params: + self.num_heads = global_params["num_heads"] + else: + self.num_heads = None def create_feed(self, mode): image_shape = deepcopy(self.image_shape) diff --git a/ppocr/modeling/backbones/rec_resnet_vd.py b/ppocr/modeling/backbones/rec_resnet_vd.py index 2c7cd4c7..bc58c8ac 100755 --- a/ppocr/modeling/backbones/rec_resnet_vd.py +++ b/ppocr/modeling/backbones/rec_resnet_vd.py @@ -32,7 +32,7 @@ class ResNet(): def __init__(self, params): self.layers = params['layers'] self.is_3x3 = True - supported_layers = [18, 34, 50, 101, 152] + supported_layers = [18, 34, 50, 101, 152, 200] assert self.layers in supported_layers, \ "supported layers are {} but input layer is {}".format(supported_layers, self.layers) diff --git a/ppocr/modeling/heads/rec_srn_all_head.py b/ppocr/modeling/heads/rec_srn_all_head.py index bf1f4a44..e1bb955d 100755 --- a/ppocr/modeling/heads/rec_srn_all_head.py +++ b/ppocr/modeling/heads/rec_srn_all_head.py @@ -21,15 +21,12 @@ import math import paddle import paddle.fluid as fluid from paddle.fluid.param_attr import ParamAttr -#from .rec_seq_encoder import SequenceEncoder -#from ..common_functions import get_para_bias_attr import numpy as np from .self_attention.model import wrap_encoder from .self_attention.model import wrap_encoder_forFeature gradient_clip = 10 - class SRNPredict(object): def __init__(self, params): super(SRNPredict, self).__init__() @@ -41,7 +38,6 @@ class SRNPredict(object): self.num_decoder_TUs = params['num_decoder_TUs'] self.hidden_dims = params['hidden_dims'] - def pvam(self, inputs, others): b, c, h, w = inputs.shape @@ -53,52 +49,62 @@ class SRNPredict(object): encoder_word_pos = others["encoder_word_pos"] gsrm_word_pos = others["gsrm_word_pos"] - enc_inputs = [conv_features, encoder_word_pos, None] - word_features = wrap_encoder_forFeature(src_vocab_size=-1, - max_length=t, - n_layer=self.num_encoder_TUs, - n_head=self.num_heads, - d_key= int(self.hidden_dims / self.num_heads), - d_value= int(self.hidden_dims / self.num_heads), - d_model=self.hidden_dims, - d_inner_hid=self.hidden_dims, - prepostprocess_dropout=0.1, - attention_dropout=0.1, - 
relu_dropout=0.1, - preprocess_cmd="n", - postprocess_cmd="da", - weight_sharing=True, - enc_inputs=enc_inputs, - ) - fluid.clip.set_gradient_clip(fluid.clip.GradientClipByValue(gradient_clip)) + word_features = wrap_encoder_forFeature( + src_vocab_size=-1, + max_length=t, + n_layer=self.num_encoder_TUs, + n_head=self.num_heads, + d_key=int(self.hidden_dims / self.num_heads), + d_value=int(self.hidden_dims / self.num_heads), + d_model=self.hidden_dims, + d_inner_hid=self.hidden_dims, + prepostprocess_dropout=0.1, + attention_dropout=0.1, + relu_dropout=0.1, + preprocess_cmd="n", + postprocess_cmd="da", + weight_sharing=True, + enc_inputs=enc_inputs, ) + fluid.clip.set_gradient_clip( + fluid.clip.GradientClipByValue(gradient_clip)) #===== Parallel Visual Attention Module ===== b, t, c = word_features.shape - word_features = fluid.layers.fc(word_features, c, num_flatten_dims=2) + word_features = fluid.layers.fc(word_features, c, num_flatten_dims=2) word_features_ = fluid.layers.reshape(word_features, [-1, 1, t, c]) - word_features_ = fluid.layers.expand(word_features_, [1, self.max_length, 1, 1]) - word_pos_feature = fluid.layers.embedding(gsrm_word_pos, [self.max_length, c]) - word_pos_ = fluid.layers.reshape(word_pos_feature, [-1, self.max_length, 1, c]) + word_features_ = fluid.layers.expand(word_features_, + [1, self.max_length, 1, 1]) + word_pos_feature = fluid.layers.embedding(gsrm_word_pos, + [self.max_length, c]) + word_pos_ = fluid.layers.reshape(word_pos_feature, + [-1, self.max_length, 1, c]) word_pos_ = fluid.layers.expand(word_pos_, [1, 1, t, 1]) - temp = fluid.layers.elementwise_add(word_features_, word_pos_, act='tanh') + temp = fluid.layers.elementwise_add( + word_features_, word_pos_, act='tanh') - attention_weight = fluid.layers.fc(input=temp, size=1, num_flatten_dims=3, bias_attr=False) - attention_weight = fluid.layers.reshape(x=attention_weight, shape=[-1, self.max_length, t]) - attention_weight = fluid.layers.softmax(input=attention_weight, axis=-1) + attention_weight = fluid.layers.fc(input=temp, + size=1, + num_flatten_dims=3, + bias_attr=False) + attention_weight = fluid.layers.reshape( + x=attention_weight, shape=[-1, self.max_length, t]) + attention_weight = fluid.layers.softmax(input=attention_weight, axis=-1) + + pvam_features = fluid.layers.matmul(attention_weight, + word_features) #[b, max_length, c] - pvam_features = fluid.layers.matmul(attention_weight, word_features)#[b, max_length, c] - return pvam_features - + def gsrm(self, pvam_features, others): #===== GSRM Visual-to-semantic embedding block ===== b, t, c = pvam_features.shape - word_out = fluid.layers.fc(input=fluid.layers.reshape(pvam_features, [-1, c]), - size=self.char_num, - act="softmax") + word_out = fluid.layers.fc( + input=fluid.layers.reshape(pvam_features, [-1, c]), + size=self.char_num, + act="softmax") #word_out.stop_gradient = True word_ids = fluid.layers.argmax(word_out, axis=1) word_ids.stop_gradient = True @@ -106,7 +112,7 @@ class SRNPredict(object): #===== GSRM Semantic reasoning block ===== """ - This module is achieved through bi-transformers, + This module is achieved through bi-transformers, ngram_feature1 is the froward one, ngram_fetaure2 is the backward one """ pad_idx = self.char_num @@ -120,7 +126,8 @@ class SRNPredict(object): word1 for forward; word2 for backward """ word1 = fluid.layers.cast(word_ids, "float32") - word1 = fluid.layers.pad(word1, [0, 0, 1, 0, 0, 0], pad_value=1.0 * pad_idx) + word1 = fluid.layers.pad(word1, [0, 0, 1, 0, 0, 0], + pad_value=1.0 * pad_idx) 
word1 = fluid.layers.cast(word1, "int64") word1 = word1[:, :-1, :] word2 = word_ids @@ -132,39 +139,40 @@ class SRNPredict(object): enc_inputs_1 = [word1, gsrm_word_pos, gsrm_slf_attn_bias1] enc_inputs_2 = [word2, gsrm_word_pos, gsrm_slf_attn_bias2] - gsrm_feature1 = wrap_encoder(src_vocab_size=self.char_num + 1, - max_length=self.max_length, - n_layer=self.num_decoder_TUs, - n_head=self.num_heads, - d_key=int(self.hidden_dims / self.num_heads), - d_value=int(self.hidden_dims / self.num_heads), - d_model=self.hidden_dims, - d_inner_hid=self.hidden_dims, - prepostprocess_dropout=0.1, - attention_dropout=0.1, - relu_dropout=0.1, - preprocess_cmd="n", - postprocess_cmd="da", - weight_sharing=True, - enc_inputs=enc_inputs_1, - ) - gsrm_feature2 = wrap_encoder(src_vocab_size=self.char_num + 1, - max_length=self.max_length, - n_layer=self.num_decoder_TUs, - n_head=self.num_heads, - d_key=int(self.hidden_dims / self.num_heads), - d_value=int(self.hidden_dims / self.num_heads), - d_model=self.hidden_dims, - d_inner_hid=self.hidden_dims, - prepostprocess_dropout=0.1, - attention_dropout=0.1, - relu_dropout=0.1, - preprocess_cmd="n", - postprocess_cmd="da", - weight_sharing=True, - enc_inputs=enc_inputs_2, - ) - gsrm_feature2 = fluid.layers.pad(gsrm_feature2, [0, 0, 0, 1, 0, 0], pad_value=0.) + gsrm_feature1 = wrap_encoder( + src_vocab_size=self.char_num + 1, + max_length=self.max_length, + n_layer=self.num_decoder_TUs, + n_head=self.num_heads, + d_key=int(self.hidden_dims / self.num_heads), + d_value=int(self.hidden_dims / self.num_heads), + d_model=self.hidden_dims, + d_inner_hid=self.hidden_dims, + prepostprocess_dropout=0.1, + attention_dropout=0.1, + relu_dropout=0.1, + preprocess_cmd="n", + postprocess_cmd="da", + weight_sharing=True, + enc_inputs=enc_inputs_1, ) + gsrm_feature2 = wrap_encoder( + src_vocab_size=self.char_num + 1, + max_length=self.max_length, + n_layer=self.num_decoder_TUs, + n_head=self.num_heads, + d_key=int(self.hidden_dims / self.num_heads), + d_value=int(self.hidden_dims / self.num_heads), + d_model=self.hidden_dims, + d_inner_hid=self.hidden_dims, + prepostprocess_dropout=0.1, + attention_dropout=0.1, + relu_dropout=0.1, + preprocess_cmd="n", + postprocess_cmd="da", + weight_sharing=True, + enc_inputs=enc_inputs_2, ) + gsrm_feature2 = fluid.layers.pad(gsrm_feature2, [0, 0, 0, 1, 0, 0], + pad_value=0.) 
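        # Data layout: word1 feeds the forward transformer with a right-shifted
        # copy of the decoded characters (pad at the front, drop the last step)
        # and word2 feeds the backward one with a left-shifted copy (drop the
        # first step, pad at the end), so each stream only sees one side of the
        # context. The pad plus the [:, 1:, ] slice applied to gsrm_feature2
        # below re-align the backward features with the forward ones before the
        # two streams are summed into gsrm_features.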
gsrm_feature2 = gsrm_feature2[:, 1:, ] gsrm_features = gsrm_feature1 + gsrm_feature2 @@ -172,10 +180,12 @@ class SRNPredict(object): gsrm_out = fluid.layers.matmul( x=gsrm_features, - y=fluid.default_main_program().global_block().var("src_word_emb_table"), + y=fluid.default_main_program().global_block().var( + "src_word_emb_table"), transpose_y=True) - b,t,c = gsrm_out.shape - gsrm_out = fluid.layers.softmax(input=fluid.layers.reshape(gsrm_out, [-1, c])) + b, t, c = gsrm_out.shape + gsrm_out = fluid.layers.softmax(input=fluid.layers.reshape(gsrm_out, + [-1, c])) return gsrm_features, word_out, gsrm_out @@ -184,19 +194,25 @@ class SRNPredict(object): #===== Visual-Semantic Fusion Decoder Module ===== b, t, c1 = pvam_features.shape b, t, c2 = gsrm_features.shape - combine_features_ = fluid.layers.concat([pvam_features, gsrm_features], axis=2) - img_comb_features_ = fluid.layers.reshape(x=combine_features_, shape=[-1, c1 + c2]) - img_comb_features_map = fluid.layers.fc(input=img_comb_features_, size=c1, act="sigmoid") - img_comb_features_map = fluid.layers.reshape(x=img_comb_features_map, shape=[-1, t, c1]) - combine_features = img_comb_features_map * pvam_features + (1.0 - img_comb_features_map) * gsrm_features - img_comb_features = fluid.layers.reshape(x=combine_features, shape=[-1, c1]) + combine_features_ = fluid.layers.concat( + [pvam_features, gsrm_features], axis=2) + img_comb_features_ = fluid.layers.reshape( + x=combine_features_, shape=[-1, c1 + c2]) + img_comb_features_map = fluid.layers.fc(input=img_comb_features_, + size=c1, + act="sigmoid") + img_comb_features_map = fluid.layers.reshape( + x=img_comb_features_map, shape=[-1, t, c1]) + combine_features = img_comb_features_map * pvam_features + ( + 1.0 - img_comb_features_map) * gsrm_features + img_comb_features = fluid.layers.reshape( + x=combine_features, shape=[-1, c1]) fc_out = fluid.layers.fc(input=img_comb_features, size=self.char_num, act="softmax") return fc_out - def __call__(self, inputs, others, mode=None): pvam_features = self.pvam(inputs, others) @@ -204,15 +220,11 @@ class SRNPredict(object): final_out = self.vsfd(pvam_features, gsrm_features) _, decoded_out = fluid.layers.topk(input=final_out, k=1) - predicts = {'predict': final_out, 'decoded_out': decoded_out, - 'word_out': word_out, 'gsrm_out': gsrm_out} + predicts = { + 'predict': final_out, + 'decoded_out': decoded_out, + 'word_out': word_out, + 'gsrm_out': gsrm_out + } return predicts - - - - - - - - diff --git a/ppocr/modeling/losses/rec_srn_loss.py b/ppocr/modeling/losses/rec_srn_loss.py index 68a480ac..b1ebd86f 100755 --- a/ppocr/modeling/losses/rec_srn_loss.py +++ b/ppocr/modeling/losses/rec_srn_loss.py @@ -35,24 +35,21 @@ class SRNLoss(object): lbl_weight = others['lbl_weight'] casted_label = fluid.layers.cast(x=label, dtype='int64') - cost_word = fluid.layers.cross_entropy(input=word_predict, label=casted_label) - cost_gsrm = fluid.layers.cross_entropy(input=gsrm_predict, label=casted_label) - cost_vsfd = fluid.layers.cross_entropy(input=predict, label=casted_label) + cost_word = fluid.layers.cross_entropy( + input=word_predict, label=casted_label) + cost_gsrm = fluid.layers.cross_entropy( + input=gsrm_predict, label=casted_label) + cost_vsfd = fluid.layers.cross_entropy( + input=predict, label=casted_label) - #cost_word = cost_word * lbl_weight - #cost_gsrm = cost_gsrm * lbl_weight - #cost_vsfd = cost_vsfd * lbl_weight + cost_word = fluid.layers.reshape( + x=fluid.layers.reduce_sum(cost_word), shape=[1]) + cost_gsrm = fluid.layers.reshape( + 
x=fluid.layers.reduce_sum(cost_gsrm), shape=[1]) + cost_vsfd = fluid.layers.reshape( + x=fluid.layers.reduce_sum(cost_vsfd), shape=[1]) - cost_word = fluid.layers.reshape(x=fluid.layers.reduce_sum(cost_word), shape=[1]) - cost_gsrm = fluid.layers.reshape(x=fluid.layers.reduce_sum(cost_gsrm), shape=[1]) - cost_vsfd = fluid.layers.reshape(x=fluid.layers.reduce_sum(cost_vsfd), shape=[1]) + sum_cost = fluid.layers.sum( + [cost_word, cost_vsfd * 2.0, cost_gsrm * 0.15]) - sum_cost = fluid.layers.sum([cost_word, cost_vsfd * 2.0, cost_gsrm * 0.15]) - - #sum_cost = fluid.layers.sum([cost_word * 3.0, cost_vsfd, cost_gsrm * 0.15]) - #sum_cost = cost_word - - #fluid.layers.Print(cost_word,message="word_cost") - #fluid.layers.Print(cost_vsfd,message="img_cost") - return [sum_cost,cost_vsfd,cost_word] - #return [sum_cost, cost_vsfd, cost_word] + return [sum_cost, cost_vsfd, cost_word] diff --git a/ppocr/utils/character.py b/ppocr/utils/character.py index 79d6f5ca..5f2963ac 100755 --- a/ppocr/utils/character.py +++ b/ppocr/utils/character.py @@ -149,38 +149,29 @@ def cal_predicts_accuracy(char_ops, acc = acc_num * 1.0 / img_num return acc, acc_num, img_num + def cal_predicts_accuracy_srn(char_ops, - preds, - labels, - max_text_len, - is_debug=False): + preds, + labels, + max_text_len, + is_debug=False): acc_num = 0 img_num = 0 total_len = preds.shape[0] img_num = int(total_len / max_text_len) - #print (img_num) for i in range(img_num): cur_label = [] cur_pred = [] for j in range(max_text_len): - if labels[j + i * max_text_len] != 37: #0 + if labels[j + i * max_text_len] != 37: #0 cur_label.append(labels[j + i * max_text_len][0]) else: break - if is_debug: - for j in range(max_text_len): - if preds[j + i * max_text_len] != 37: #0 - cur_pred.append(preds[j + i * max_text_len][0]) - else: - break - print ("cur_label: ", cur_label) - print ("cur_pred: ", cur_pred) - - for j in range(max_text_len + 1): - if j < len(cur_label) and preds[j + i * max_text_len][0] != cur_label[j]: + if j < len(cur_label) and preds[j + i * max_text_len][ + 0] != cur_label[j]: break elif j == len(cur_label) and j == max_text_len: acc_num += 1 diff --git a/tools/eval_utils/eval_rec_utils.py b/tools/eval_utils/eval_rec_utils.py index ecdf0aaf..5a653678 100644 --- a/tools/eval_utils/eval_rec_utils.py +++ b/tools/eval_utils/eval_rec_utils.py @@ -123,8 +123,8 @@ def eval_rec_run(exe, config, eval_info_dict, mode): def test_rec_benchmark(exe, config, eval_info_dict): " Evaluate lmdb dataset " - eval_data_list = ['IIIT5k_3000', 'SVT', 'IC03_860', \ - 'IC13_857', 'IC15_1811', 'IC15_2077','SVTP', 'CUTE80'] + eval_data_list = ['IIIT5k_3000', 'SVT', 'IC03_860','IC03_867', \ + 'IC13_857', 'IC13_1015', 'IC15_1811', 'IC15_2077','SVTP', 'CUTE80'] eval_data_dir = config['TestReader']['lmdb_sets_dir'] total_evaluation_data_number = 0 total_correct_number = 0 diff --git a/tools/infer_rec.py b/tools/infer_rec.py index 8cde44d8..21b503cc 100755 --- a/tools/infer_rec.py +++ b/tools/infer_rec.py @@ -64,7 +64,6 @@ def main(): exe = fluid.Executor(place) rec_model = create_module(config['Architecture']['function'])(params=config) - startup_prog = fluid.Program() eval_prog = fluid.Program() with fluid.program_guard(eval_prog, startup_prog): @@ -86,10 +85,36 @@ def main(): for i in range(max_img_num): logger.info("infer_img:%s" % infer_list[i]) img = next(blobs) - predict = exe.run(program=eval_prog, - feed={"image": img}, - fetch_list=fetch_varname_list, - return_numpy=False) + if loss_type != "srn": + predict = exe.run(program=eval_prog, + feed={"image": 
img}, + fetch_list=fetch_varname_list, + return_numpy=False) + else: + encoder_word_pos_list = [] + gsrm_word_pos_list = [] + gsrm_slf_attn_bias1_list = [] + gsrm_slf_attn_bias2_list = [] + encoder_word_pos_list.append(img[1]) + gsrm_word_pos_list.append(img[2]) + gsrm_slf_attn_bias1_list.append(img[3]) + gsrm_slf_attn_bias2_list.append(img[4]) + + encoder_word_pos_list = np.concatenate( + encoder_word_pos_list, axis=0).astype(np.int64) + gsrm_word_pos_list = np.concatenate( + gsrm_word_pos_list, axis=0).astype(np.int64) + gsrm_slf_attn_bias1_list = np.concatenate( + gsrm_slf_attn_bias1_list, axis=0).astype(np.float32) + gsrm_slf_attn_bias2_list = np.concatenate( + gsrm_slf_attn_bias2_list, axis=0).astype(np.float32) + + predict = exe.run(program=eval_prog, \ + feed={'image': img[0], 'encoder_word_pos': encoder_word_pos_list, + 'gsrm_word_pos': gsrm_word_pos_list, 'gsrm_slf_attn_bias1': gsrm_slf_attn_bias1_list, + 'gsrm_slf_attn_bias2': gsrm_slf_attn_bias2_list}, \ + fetch_list=fetch_varname_list, \ + return_numpy=False) if loss_type == "ctc": preds = np.array(predict[0]) preds = preds.reshape(-1) @@ -114,7 +139,18 @@ def main(): score = np.mean(probs[0, 1:end_pos[1]]) preds = preds.reshape(-1) preds_text = char_ops.decode(preds) - + elif loss_type == "srn": + cur_pred = [] + preds = np.array(predict[0]) + preds = preds.reshape(-1) + probs = np.array(predict[1]) + ind = np.argmax(probs, axis=1) + valid_ind = np.where(preds != 37)[0] + if len(valid_ind) == 0: + continue + score = np.mean(probs[valid_ind, ind[valid_ind]]) + preds = preds[:valid_ind[-1] + 1] + preds_text = char_ops.decode(preds) logger.info("\t index: {}".format(preds)) logger.info("\t word : {}".format(preds_text)) logger.info("\t score: {}".format(score)) From d3ed210ac66ef026f3e641b365d1a7e74e43fe47 Mon Sep 17 00:00:00 2001 From: tink2123 Date: Sat, 15 Aug 2020 15:48:59 +0800 Subject: [PATCH 04/11] deleted dataset --- train_data | 1 - 1 file changed, 1 deletion(-) delete mode 120000 train_data diff --git a/train_data b/train_data deleted file mode 120000 index 7c2082ab..00000000 --- a/train_data +++ /dev/null @@ -1 +0,0 @@ -/workspace/PaddleOCR/train_data/ \ No newline at end of file From 9cb30720331cd84265b020be14a36dda3e88d2bf Mon Sep 17 00:00:00 2001 From: tink2123 Date: Sun, 16 Aug 2020 12:53:26 +0800 Subject: [PATCH 05/11] fix bug and update doc --- README_cn.md | 6 +++++- ...am_test_all.yml => rec_r50fpn_vd_none_srn.yml} | 0 ppocr/utils/character.py | 2 +- tools/infer/predict_rec.py | 3 ++- tools/infer/utility.py | 4 ++-- tools/program.py | 15 ++++++++------- 6 files changed, 18 insertions(+), 12 deletions(-) rename configs/rec/{rec_r50fpn_vd_none_srn_pvam_test_all.yml => rec_r50fpn_vd_none_srn.yml} (100%) diff --git a/README_cn.md b/README_cn.md index cc5cb00a..ebfc4b1d 100644 --- a/README_cn.md +++ b/README_cn.md @@ -122,7 +122,10 @@ PaddleOCR开源的文本识别算法列表: - [x] Rosetta([paper](https://arxiv.org/abs/1910.05085)) - [x] STAR-Net([paper](http://www.bmva.org/bmvc/2016/papers/paper043/index.html)) - [x] RARE([paper](https://arxiv.org/abs/1603.03915v1)) -- [ ] SRN([paper](https://arxiv.org/abs/2003.12294))(百度自研, coming soon) +- [x] SRN([paper](https://arxiv.org/abs/2003.12294))(百度自研) + +*备注:* SRN模型使用了数据扰动方法对上述提到对两个训练集进行增广,增广后的数据可以在[百度网盘](todo)上下载。 +原始论文使用两阶段训练平均精度为89.74%,PaddleOCR中使用one-stage训练,平均精度为88.33%。两种预训练权重均在[下载链接](https://paddleocr.bj.bcebos.com/SRN/rec_r50fpn_vd_none_srn.tar)中。 参考[DTRB](https://arxiv.org/abs/1904.01906)文字识别训练和评估流程,使用MJSynth和SynthText两个文字识别数据集训练,在IIIT, SVT, IC03, IC13, IC15, SVTP, 
CUTE数据集上进行评估,算法效果如下: @@ -136,6 +139,7 @@ PaddleOCR开源的文本识别算法列表: |STAR-Net|MobileNetV3|81.56%|rec_mv3_tps_bilstm_ctc|[下载链接](https://paddleocr.bj.bcebos.com/rec_mv3_tps_bilstm_ctc.tar)| |RARE|Resnet34_vd|84.90%|rec_r34_vd_tps_bilstm_attn|[下载链接](https://paddleocr.bj.bcebos.com/rec_r34_vd_tps_bilstm_attn.tar)| |RARE|MobileNetV3|83.32%|rec_mv3_tps_bilstm_attn|[下载链接](https://paddleocr.bj.bcebos.com/rec_mv3_tps_bilstm_attn.tar)| +|SRN|Resnet50_vd_fpn|88.33%|rec_r50fpn_vd_none_srn|[下载链接](https://paddleocr.bj.bcebos.com/SRN/rec_r50fpn_vd_none_srn.tar)| 使用[LSVT](https://github.com/PaddlePaddle/PaddleOCR/blob/develop/doc/doc_ch/datasets.md#1icdar2019-lsvt)街景数据集根据真值将图crop出来30w数据,进行位置校准。此外基于LSVT语料生成500w合成数据训练中文模型,相关配置和预训练文件如下: diff --git a/configs/rec/rec_r50fpn_vd_none_srn_pvam_test_all.yml b/configs/rec/rec_r50fpn_vd_none_srn.yml similarity index 100% rename from configs/rec/rec_r50fpn_vd_none_srn_pvam_test_all.yml rename to configs/rec/rec_r50fpn_vd_none_srn.yml diff --git a/ppocr/utils/character.py b/ppocr/utils/character.py index 5f2963ac..575658ef 100755 --- a/ppocr/utils/character.py +++ b/ppocr/utils/character.py @@ -25,7 +25,7 @@ class CharacterOps(object): def __init__(self, config): self.character_type = config['character_type'] self.loss_type = config['loss_type'] - self.max_text_len = config['max_text_length'] + self.max_text_len = 25 if self.character_type == "en": self.character_str = "0123456789abcdefghijklmnopqrstuvwxyz" dict_character = list(self.character_str) diff --git a/tools/infer/predict_rec.py b/tools/infer/predict_rec.py index b51c49fc..c81b4eb2 100755 --- a/tools/infer/predict_rec.py +++ b/tools/infer/predict_rec.py @@ -40,7 +40,8 @@ class TextRecognizer(object): char_ops_params = { "character_type": args.rec_char_type, "character_dict_path": args.rec_char_dict_path, - "use_space_char": args.use_space_char + "use_space_char": args.use_space_char, + "max_text_length": args.max_text_length } if self.rec_algorithm != "RARE": char_ops_params['loss_type'] = 'ctc' diff --git a/tools/infer/utility.py b/tools/infer/utility.py index fc91880e..fe590c7e 100755 --- a/tools/infer/utility.py +++ b/tools/infer/utility.py @@ -56,8 +56,8 @@ def parse_args(): #params for text recognizer parser.add_argument("--rec_algorithm", type=str, default='CRNN') parser.add_argument("--rec_model_dir", type=str) - parser.add_argument("--rec_image_shape", type=str, default="3, 32, 320") - parser.add_argument("--rec_char_type", type=str, default='ch') + parser.add_argument("--rec_image_shape", type=str, default="1, 64, 320") + parser.add_argument("--rec_char_type", type=str, default='en') parser.add_argument("--rec_batch_num", type=int, default=30) parser.add_argument( "--rec_char_dict_path", diff --git a/tools/program.py b/tools/program.py index 6ebc27cb..354cf8dd 100755 --- a/tools/program.py +++ b/tools/program.py @@ -194,13 +194,14 @@ def build(config, main_prog, startup_prog, mode): global_lr = optimizer._global_learning_rate() fetch_name_list.insert(0, "lr") fetch_varname_list.insert(0, global_lr.name) - if config['Global']["loss_type"] == 'srn': - model_average = fluid.optimizer.ModelAverage( - config['Global']['average_window'], - min_average_window=config['Global'][ - 'min_average_window'], - max_average_window=config['Global'][ - 'max_average_window']) + if "loss_type" in config["Global"]: + if config['Global']["loss_type"] == 'srn': + model_average = fluid.optimizer.ModelAverage( + config['Global']['average_window'], + min_average_window=config['Global'][ + 'min_average_window'], + 

From d179c7c4468da9f0bb38977febda51a9b495715c Mon Sep 17 00:00:00 2001
From: tink2123
Date: Sun, 16 Aug 2020 12:58:05 +0800
Subject: [PATCH 06/11] fix bug

---
 ppocr/utils/character.py | 2 +-
 tools/infer/utility.py   | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/ppocr/utils/character.py b/ppocr/utils/character.py
index 575658ef..5f2963ac 100755
--- a/ppocr/utils/character.py
+++ b/ppocr/utils/character.py
@@ -25,7 +25,7 @@ class CharacterOps(object):
     def __init__(self, config):
         self.character_type = config['character_type']
         self.loss_type = config['loss_type']
-        self.max_text_len = 25
+        self.max_text_len = config['max_text_length']
         if self.character_type == "en":
             self.character_str = "0123456789abcdefghijklmnopqrstuvwxyz"
             dict_character = list(self.character_str)
diff --git a/tools/infer/utility.py b/tools/infer/utility.py
index fe590c7e..fc91880e 100755
--- a/tools/infer/utility.py
+++ b/tools/infer/utility.py
@@ -56,8 +56,8 @@ def parse_args():
     #params for text recognizer
     parser.add_argument("--rec_algorithm", type=str, default='CRNN')
     parser.add_argument("--rec_model_dir", type=str)
-    parser.add_argument("--rec_image_shape", type=str, default="1, 64, 320")
-    parser.add_argument("--rec_char_type", type=str, default='en')
+    parser.add_argument("--rec_image_shape", type=str, default="3, 32, 320")
+    parser.add_argument("--rec_char_type", type=str, default='ch')
     parser.add_argument("--rec_batch_num", type=int, default=30)
     parser.add_argument(
         "--rec_char_dict_path",

From d0d5de7f4ddf02964a39b9645e6fb4efc75a9d2b Mon Sep 17 00:00:00 2001
From: tink2123
Date: Sun, 16 Aug 2020 13:30:25 +0800
Subject: [PATCH 07/11] fix bug

---
 tools/infer/utility.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tools/infer/utility.py b/tools/infer/utility.py
index fc91880e..b0a0ec1f 100755
--- a/tools/infer/utility.py
+++ b/tools/infer/utility.py
@@ -59,6 +59,7 @@ def parse_args():
     parser.add_argument("--rec_image_shape", type=str, default="3, 32, 320")
     parser.add_argument("--rec_char_type", type=str, default='ch')
     parser.add_argument("--rec_batch_num", type=int, default=30)
+    parser.add_argument("--max_text_length", type=int, default=25)
     parser.add_argument(
         "--rec_char_dict_path",
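
Note: after these two fixes, `max_text_length` reaches `CharacterOps` through `char_ops_params` instead of being hard-coded, while the recognition defaults stay CRNN-oriented and SRN users override them on the command line. A minimal sketch of the CharacterOps setup the SRN path now expects (values mirror the SRN recipe in this series; treat the exact combination as an assumption):

    from ppocr.utils.character import CharacterOps

    char_ops = CharacterOps({
        "character_type": "en",     # SRN is limited to the built-in 36-character English set
        "loss_type": "srn",
        "max_text_length": 25,      # exposed on the CLI as --max_text_length in PATCH 07
    })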

From a3b291928b64687b5e6f437636eeeaa2e785f6e5 Mon Sep 17 00:00:00 2001
From: tink2123
Date: Sun, 16 Aug 2020 16:46:22 +0800
Subject: [PATCH 08/11] polish code

---
 doc/doc_ch/config.md                      | 3 +++
 ppocr/modeling/architectures/rec_model.py | 3 +++
 tools/eval_utils/eval_rec_utils.py        | 6 +++---
 3 files changed, 9 insertions(+), 3 deletions(-)

diff --git a/doc/doc_ch/config.md b/doc/doc_ch/config.md
index 5e579096..03fe1b32 100644
--- a/doc/doc_ch/config.md
+++ b/doc/doc_ch/config.md
@@ -32,6 +32,9 @@
 | loss_type | 设置 loss 类型 | ctc | 支持两种loss: ctc / attention |
 | distort | 设置是否使用数据增强 | false | 设置为true时,将在训练时随机进行扰动,支持的扰动操作可阅读[img_tools.py](https://github.com/PaddlePaddle/PaddleOCR/blob/develop/ppocr/data/rec/img_tools.py) |
 | use_space_char | 设置是否识别空格 | false | 仅在 character_type=ch 时支持空格 |
+| average_window | ModelAverage优化器中的窗口长度计算比例 | 0.15 | 目前仅应用于SRN |
+| max_average_window | 平均值计算窗口长度的最大值 | 15625 | 推荐设置为一轮训练中mini-batches的数目 |
+| min_average_window | 平均值计算窗口长度的最小值 | 10000 | \ |
 | reader_yml | 设置reader配置文件 | ./configs/rec/rec_icdar15_reader.yml | \ |
 | pretrain_weights | 加载预训练模型路径 | ./pretrain_models/CRNN/best_accuracy | \ |
 | checkpoints | 加载模型参数路径 | None | 用于中断后加载参数继续训练 |
diff --git a/ppocr/modeling/architectures/rec_model.py b/ppocr/modeling/architectures/rec_model.py
index 5eacd5de..f4e3eea2 100755
--- a/ppocr/modeling/architectures/rec_model.py
+++ b/ppocr/modeling/architectures/rec_model.py
@@ -213,6 +213,9 @@ class RecModel(object):
             predict = predicts['predict']
             if self.loss_type == "ctc":
                 predict = fluid.layers.softmax(predict)
+            if self.loss_type == "srn":
+                logger.infor(
+                    "Warning! SRN does not support export model currently")
             return [image, {'decoded_out': decoded_out, 'predicts': predict}]
         else:
             predict = predicts['predict']
diff --git a/tools/eval_utils/eval_rec_utils.py b/tools/eval_utils/eval_rec_utils.py
index 5a653678..4479d9df 100644
--- a/tools/eval_utils/eval_rec_utils.py
+++ b/tools/eval_utils/eval_rec_utils.py
@@ -69,7 +69,7 @@ def eval_rec_run(exe, config, eval_info_dict, mode):
                 return_numpy=False)
 
             preds = np.array(outs[0])
-            if preds.shape[1] != 1:
+            if config['Global']['loss_type'] == "attention":
                 preds, preds_lod = convert_rec_attention_infer_res(preds)
             else:
                 preds_lod = outs[0].lod()[0]
@@ -123,8 +123,8 @@ def eval_rec_run(exe, config, eval_info_dict, mode):
 
 def test_rec_benchmark(exe, config, eval_info_dict):
     " Evaluate lmdb dataset "
-    eval_data_list = ['IIIT5k_3000', 'SVT', 'IC03_860','IC03_867', \
-        'IC13_857', 'IC13_1015', 'IC15_1811', 'IC15_2077','SVTP', 'CUTE80']
+    eval_data_list = ['IIIT5k_3000', 'SVT', 'IC03_860', 'IC03_867', \
+        'IC13_857', 'IC13_1015', 'IC15_1811', 'IC15_2077', 'SVTP', 'CUTE80']
     eval_data_dir = config['TestReader']['lmdb_sets_dir']
     total_evaluation_data_number = 0
     total_correct_number = 0

From fe8ce9afdfdf9b73d55ee64aaf476b2a4b0c70c1 Mon Sep 17 00:00:00 2001
From: tink2123
Date: Sun, 16 Aug 2020 16:51:24 +0800
Subject: [PATCH 09/11] polish code

---
 ppocr/utils/character.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/ppocr/utils/character.py b/ppocr/utils/character.py
index 5f2963ac..f27e1b85 100755
--- a/ppocr/utils/character.py
+++ b/ppocr/utils/character.py
@@ -30,6 +30,8 @@ class CharacterOps(object):
             self.character_str = "0123456789abcdefghijklmnopqrstuvwxyz"
             dict_character = list(self.character_str)
         elif self.character_type == "ch":
+            if self.loss_type == "srn":
+                raise Exception("SRN can only support in character_type == en")
             character_dict_path = config['character_dict_path']
             add_space = False
             if 'use_space_char' in config:
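
Note: the three window parameters documented in the config table above are exactly what the guarded ModelAverage construction from PATCH 05 reads out of the Global section, and they only take effect when loss_type is srn. A minimal sketch of a Global dict that exercises that branch (values are the documented defaults):

    global_config = {
        "loss_type": "srn",              # ModelAverage is only built for SRN
        "average_window": 0.15,          # window length ratio
        "min_average_window": 10000,
        "max_average_window": 15625,     # recommended: mini-batches per epoch
    }
    use_model_average = ("loss_type" in global_config
                         and global_config["loss_type"] == "srn")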

From ab0acf78b4fb2c23a685c5c3ce17ea78870118ec Mon Sep 17 00:00:00 2001
From: tink2123
Date: Sun, 16 Aug 2020 17:09:17 +0800
Subject: [PATCH 10/11] polish code

---
 ppocr/modeling/architectures/rec_model.py | 14 +++++++++-----
 ppocr/utils/character.py                  |  4 ++--
 2 files changed, 11 insertions(+), 7 deletions(-)

diff --git a/ppocr/modeling/architectures/rec_model.py b/ppocr/modeling/architectures/rec_model.py
index f4e3eea2..91f778ce 100755
--- a/ppocr/modeling/architectures/rec_model.py
+++ b/ppocr/modeling/architectures/rec_model.py
@@ -98,13 +98,15 @@
                 shape=[
                     -1, self.num_heads, self.max_text_length,
                     self.max_text_length
-                ])
+                ],
+                dtype="float32")
             gsrm_slf_attn_bias2 = fluid.data(
                 name="gsrm_slf_attn_bias2",
                 shape=[
                     -1, self.num_heads, self.max_text_length,
                     self.max_text_length
-                ])
+                ],
+                dtype="float32")
             lbl_weight = fluid.layers.data(
                 name="lbl_weight", shape=[-1, 1], dtype='int64')
             label = fluid.data(
@@ -161,13 +163,15 @@
                 shape=[
                     -1, self.num_heads, self.max_text_length,
                     self.max_text_length
-                ])
+                ],
+                dtype="float32")
             gsrm_slf_attn_bias2 = fluid.data(
                 name="gsrm_slf_attn_bias2",
                 shape=[
                     -1, self.num_heads, self.max_text_length,
                     self.max_text_length
-                ])
+                ],
+                dtype="float32")
             feed_list = [
                 image, encoder_word_pos, gsrm_word_pos, gsrm_slf_attn_bias1,
                 gsrm_slf_attn_bias2
@@ -214,7 +218,7 @@
             if self.loss_type == "ctc":
                 predict = fluid.layers.softmax(predict)
             if self.loss_type == "srn":
-                logger.infor(
+                raise Exception(
                     "Warning! SRN does not support export model currently")
             return [image, {'decoded_out': decoded_out, 'predicts': predict}]
         else:
diff --git a/ppocr/utils/character.py b/ppocr/utils/character.py
index f27e1b85..2db0151e 100755
--- a/ppocr/utils/character.py
+++ b/ppocr/utils/character.py
@@ -26,12 +26,12 @@ class CharacterOps(object):
         self.character_type = config['character_type']
         self.loss_type = config['loss_type']
         self.max_text_len = config['max_text_length']
+        if self.loss_type == "srn" and self.character_type == "ch":
+            raise Exception("SRN can only support in character_type == en")
         if self.character_type == "en":
             self.character_str = "0123456789abcdefghijklmnopqrstuvwxyz"
             dict_character = list(self.character_str)
         elif self.character_type == "ch":
-            if self.loss_type == "srn":
-                raise Exception("SRN can only support in character_type == en")
             character_dict_path = config['character_dict_path']
             add_space = False
             if 'use_space_char' in config:

From 9c89310292ad77ad6cfad719219e6d07110e0c1b Mon Sep 17 00:00:00 2001
From: tink2123
Date: Sun, 16 Aug 2020 17:19:00 +0800
Subject: [PATCH 11/11] polish code

---
 ppocr/utils/character.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ppocr/utils/character.py b/ppocr/utils/character.py
index 2db0151e..c7c93fc5 100755
--- a/ppocr/utils/character.py
+++ b/ppocr/utils/character.py
@@ -26,7 +26,7 @@ class CharacterOps(object):
         self.character_type = config['character_type']
         self.loss_type = config['loss_type']
         self.max_text_len = config['max_text_length']
-        if self.loss_type == "srn" and self.character_type == "ch":
+        if self.loss_type == "srn" and self.character_type != "en":
             raise Exception("SRN can only support in character_type == en")
         if self.character_type == "en":
             self.character_str = "0123456789abcdefghijklmnopqrstuvwxyz"