PaddleOCR/ppocr/losses/e2e_pg_loss.py

# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

from paddle import nn
import paddle
import numpy as np
import copy

from .det_basic_loss import DiceLoss


class PGLoss(nn.Layer):
    def __init__(self, eps=1e-6, **kwargs):
        super(PGLoss, self).__init__()
        self.dice_loss = DiceLoss(eps=eps)

    def org_tcl_rois(self, batch_size, pos_lists, pos_masks, label_lists):
        """
        """
        pos_lists_, pos_masks_, label_lists_ = [], [], []
        img_bs = batch_size
        tcl_bs = 64
        ngpu = int(batch_size / img_bs)
        img_ids = np.array(pos_lists, dtype=np.int32)[:, 0, 0].copy()
        pos_lists_split, pos_masks_split, label_lists_split = [], [], []
        for i in range(ngpu):
            pos_lists_split.append([])
            pos_masks_split.append([])
            label_lists_split.append([])

        for i in range(img_ids.shape[0]):
            img_id = img_ids[i]
            gpu_id = int(img_id / img_bs)
            img_id = img_id % img_bs
            pos_list = pos_lists[i].copy()
            pos_list[:, 0] = img_id
            pos_lists_split[gpu_id].append(pos_list)
            pos_masks_split[gpu_id].append(pos_masks[i].copy())
            label_lists_split[gpu_id].append(copy.deepcopy(label_lists[i]))
        # repeat or delete
        for i in range(ngpu):
            vp_len = len(pos_lists_split[i])
            if vp_len <= tcl_bs:
                for j in range(0, tcl_bs - vp_len):
                    pos_list = pos_lists_split[i][j].copy()
                    pos_lists_split[i].append(pos_list)
                    pos_mask = pos_masks_split[i][j].copy()
                    pos_masks_split[i].append(pos_mask)
                    label_list = copy.deepcopy(label_lists_split[i][j])
                    label_lists_split[i].append(label_list)
            else:
                for j in range(0, vp_len - tcl_bs):
                    c_len = len(pos_lists_split[i])
                    pop_id = np.random.permutation(c_len)[0]
                    pos_lists_split[i].pop(pop_id)
                    pos_masks_split[i].pop(pop_id)
                    label_lists_split[i].pop(pop_id)
        # merge
        for i in range(ngpu):
            pos_lists_.extend(pos_lists_split[i])
            pos_masks_.extend(pos_masks_split[i])
            label_lists_.extend(label_lists_split[i])
        return pos_lists_, pos_masks_, label_lists_

    def pre_process(self, label_list, pos_list, pos_mask):
        max_len = 30  # the max texts in a single image
        max_str_len = 50  # the max len in a single text
        pad_num = 36  # padding num
        label_list = label_list.numpy()
        batch, _, _, _ = label_list.shape
        pos_list = pos_list.numpy()
        pos_mask = pos_mask.numpy()
        pos_list_t = []
        pos_mask_t = []
        label_list_t = []
        for i in range(batch):
            for j in range(max_len):
                if pos_mask[i, j].any():
                    pos_list_t.append(pos_list[i][j])
                    pos_mask_t.append(pos_mask[i][j])
                    label_list_t.append(label_list[i][j])
        pos_list, pos_mask, label_list = self.org_tcl_rois(
            batch, pos_list_t, pos_mask_t, label_list_t)
        label = []
        tt = [l.tolist() for l in label_list]
        for i in range(batch):
            k = 0
            for j in range(max_str_len):
                if tt[i][j][0] != pad_num:
                    k += 1
                else:
                    break
            label.append(k)
        label = paddle.to_tensor(label)
        label = paddle.cast(label, dtype='int64')
        pos_list = paddle.to_tensor(pos_list)
        pos_mask = paddle.to_tensor(pos_mask)
        label_list = paddle.squeeze(paddle.to_tensor(label_list), axis=2)
        label_list = paddle.cast(label_list, dtype='int32')
        return pos_list, pos_mask, label_list, label

    def border_loss(self, f_border, l_border, l_score, l_mask):
        l_border_split, l_border_norm = paddle.tensor.split(
            l_border, num_or_sections=[4, 1], axis=1)
        f_border_split = f_border
        b, c, h, w = l_border_norm.shape
        l_border_norm_split = paddle.expand(
            x=l_border_norm, shape=[b, 4 * c, h, w])
        b, c, h, w = l_score.shape
        l_border_score = paddle.expand(x=l_score, shape=[b, 4 * c, h, w])
        b, c, h, w = l_mask.shape
        l_border_mask = paddle.expand(x=l_mask, shape=[b, 4 * c, h, w])
        border_diff = l_border_split - f_border_split
        abs_border_diff = paddle.abs(border_diff)
        border_sign = abs_border_diff < 1.0
        border_sign = paddle.cast(border_sign, dtype='float32')
        border_sign.stop_gradient = True
        border_in_loss = 0.5 * abs_border_diff * abs_border_diff * border_sign + \
                         (abs_border_diff - 0.5) * (1.0 - border_sign)
        border_out_loss = l_border_norm_split * border_in_loss
        border_loss = paddle.sum(border_out_loss * l_border_score * l_border_mask) / \
                      (paddle.sum(l_border_score * l_border_mask) + 1e-5)
        return border_loss

    def direction_loss(self, f_direction, l_direction, l_score, l_mask):
        l_direction_split, l_direction_norm = paddle.tensor.split(
            l_direction, num_or_sections=[2, 1], axis=1)
        f_direction_split = f_direction
        b, c, h, w = l_direction_norm.shape
        l_direction_norm_split = paddle.expand(
            x=l_direction_norm, shape=[b, 2 * c, h, w])
        b, c, h, w = l_score.shape
        l_direction_score = paddle.expand(x=l_score, shape=[b, 2 * c, h, w])
        b, c, h, w = l_mask.shape
        l_direction_mask = paddle.expand(x=l_mask, shape=[b, 2 * c, h, w])
        direction_diff = l_direction_split - f_direction_split
        abs_direction_diff = paddle.abs(direction_diff)
        direction_sign = abs_direction_diff < 1.0
        direction_sign = paddle.cast(direction_sign, dtype='float32')
        direction_sign.stop_gradient = True
        direction_in_loss = 0.5 * abs_direction_diff * abs_direction_diff * direction_sign + \
                            (abs_direction_diff - 0.5) * (1.0 - direction_sign)
        direction_out_loss = l_direction_norm_split * direction_in_loss
        direction_loss = paddle.sum(direction_out_loss * l_direction_score * l_direction_mask) / \
                         (paddle.sum(l_direction_score * l_direction_mask) + 1e-5)
        return direction_loss

    def ctcloss(self, f_char, tcl_pos, tcl_mask, tcl_label, label_t):
        f_char = paddle.transpose(f_char, [0, 2, 3, 1])
        tcl_pos = paddle.reshape(tcl_pos, [-1, 3])
        tcl_pos = paddle.cast(tcl_pos, dtype=int)
        f_tcl_char = paddle.gather_nd(f_char, tcl_pos)
        f_tcl_char = paddle.reshape(f_tcl_char,
                                    [-1, 64, 37])  # len(Lexicon_Table)+1
        f_tcl_char_fg, f_tcl_char_bg = paddle.split(f_tcl_char, [36, 1], axis=2)
        f_tcl_char_bg = f_tcl_char_bg * tcl_mask + (1.0 - tcl_mask) * 20.0
        b, c, l = tcl_mask.shape
        tcl_mask_fg = paddle.expand(x=tcl_mask, shape=[b, c, 36 * l])
        tcl_mask_fg.stop_gradient = True
        f_tcl_char_fg = f_tcl_char_fg * tcl_mask_fg + (1.0 - tcl_mask_fg) * (
            -20.0)
        f_tcl_char_mask = paddle.concat([f_tcl_char_fg, f_tcl_char_bg], axis=2)
        f_tcl_char_ld = paddle.transpose(f_tcl_char_mask, (1, 0, 2))
        N, B, _ = f_tcl_char_ld.shape
        input_lengths = paddle.to_tensor([N] * B, dtype='int64')
        cost = paddle.nn.functional.ctc_loss(
            log_probs=f_tcl_char_ld,
            labels=tcl_label,
            input_lengths=input_lengths,
            label_lengths=label_t,
            blank=36,
            reduction='none')
        cost = cost.mean()
        return cost

    def forward(self, predicts, labels):
        images, tcl_maps, tcl_label_maps, border_maps \
            , direction_maps, training_masks, label_list, pos_list, pos_mask = labels
        # for all the batch_size
        pos_list, pos_mask, label_list, label_t = self.pre_process(
            label_list, pos_list, pos_mask)

        f_score, f_boder, f_direction, f_char = predicts
        score_loss = self.dice_loss(f_score, tcl_maps, training_masks)
        border_loss = self.border_loss(f_boder, border_maps, tcl_maps,
                                       training_masks)
        direction_loss = self.direction_loss(f_direction, direction_maps,
                                             tcl_maps, training_masks)
        ctc_loss = self.ctcloss(f_char, pos_list, pos_mask, label_list, label_t)
        loss_all = score_loss + border_loss + direction_loss + 5 * ctc_loss

        losses = {
            'loss': loss_all,
            "score_loss": score_loss,
            "border_loss": border_loss,
            "direction_loss": direction_loss,
            "ctc_loss": ctc_loss
        }
        return losses
fix config add e2e_ch.md e2e_res_img623_pg 2021-03-10 14:14:21 +08:00			`# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve.`
Add PGNet 2021-03-08 14:15:47 +08:00			`#`
			`# Licensed under the Apache License, Version 2.0 (the "License");`
			`# you may not use this file except in compliance with the License.`
			`# You may obtain a copy of the License at`
			`#`
			`# http://www.apache.org/licenses/LICENSE-2.0`
			`#`
			`# Unless required by applicable law or agreed to in writing, software`
			`# distributed under the License is distributed on an "AS IS" BASIS,`
			`# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.`
			`# See the License for the specific language governing permissions and`
			`# limitations under the License.`

			`from __future__ import absolute_import`
			`from __future__ import division`
			`from __future__ import print_function`

			`from paddle import nn`
			`import paddle`
			`import numpy as np`
			`import copy`

fix config add e2e_ch.md e2e_res_img623_pg 2021-03-10 14:14:21 +08:00			`from .det_basic_loss import DiceLoss`
Add PGNet 2021-03-08 14:15:47 +08:00

			`class PGLoss(nn.Layer):`
fix config add e2e_ch.md e2e_res_img623_pg 2021-03-10 14:14:21 +08:00			`def __init__(self, eps=1e-6, **kwargs):`
Add PGNet 2021-03-08 14:15:47 +08:00			`super(PGLoss, self).__init__()`
			`self.dice_loss = DiceLoss(eps=eps)`

			`def org_tcl_rois(self, batch_size, pos_lists, pos_masks, label_lists):`
			`"""`
			`"""`
			`pos_lists_, pos_masks_, label_lists_ = [], [], []`
			`img_bs = batch_size`
			`tcl_bs = 64`
			`ngpu = int(batch_size / img_bs)`
			`img_ids = np.array(pos_lists, dtype=np.int32)[:, 0, 0].copy()`
			`pos_lists_split, pos_masks_split, label_lists_split = [], [], []`
			`for i in range(ngpu):`
			`pos_lists_split.append([])`
			`pos_masks_split.append([])`
			`label_lists_split.append([])`

			`for i in range(img_ids.shape[0]):`
			`img_id = img_ids[i]`
			`gpu_id = int(img_id / img_bs)`
			`img_id = img_id % img_bs`
			`pos_list = pos_lists[i].copy()`
			`pos_list[:, 0] = img_id`
			`pos_lists_split[gpu_id].append(pos_list)`
			`pos_masks_split[gpu_id].append(pos_masks[i].copy())`
			`label_lists_split[gpu_id].append(copy.deepcopy(label_lists[i]))`
			`# repeat or delete`
			`for i in range(ngpu):`
			`vp_len = len(pos_lists_split[i])`
			`if vp_len <= tcl_bs:`
			`for j in range(0, tcl_bs - vp_len):`
			`pos_list = pos_lists_split[i][j].copy()`
			`pos_lists_split[i].append(pos_list)`
			`pos_mask = pos_masks_split[i][j].copy()`
			`pos_masks_split[i].append(pos_mask)`
			`label_list = copy.deepcopy(label_lists_split[i][j])`
			`label_lists_split[i].append(label_list)`
			`else:`
			`for j in range(0, vp_len - tcl_bs):`
			`c_len = len(pos_lists_split[i])`
			`pop_id = np.random.permutation(c_len)[0]`
			`pos_lists_split[i].pop(pop_id)`
			`pos_masks_split[i].pop(pop_id)`
			`label_lists_split[i].pop(pop_id)`
			`# merge`
			`for i in range(ngpu):`
			`pos_lists_.extend(pos_lists_split[i])`
			`pos_masks_.extend(pos_masks_split[i])`
			`label_lists_.extend(label_lists_split[i])`
			`return pos_lists_, pos_masks_, label_lists_`

			`def pre_process(self, label_list, pos_list, pos_mask):`
fix config add e2e_ch.md e2e_res_img623_pg 2021-03-10 14:14:21 +08:00			`max_len = 30 # the max texts in a single image`
			`max_str_len = 50 # the max len in a single text`
			`pad_num = 36 # padding num`
Add PGNet 2021-03-08 14:15:47 +08:00			`label_list = label_list.numpy()`
fix config add e2e_ch.md e2e_res_img623_pg 2021-03-10 14:14:21 +08:00			`batch, _, _, _ = label_list.shape`
Add PGNet 2021-03-08 14:15:47 +08:00			`pos_list = pos_list.numpy()`
			`pos_mask = pos_mask.numpy()`
			`pos_list_t = []`
			`pos_mask_t = []`
			`label_list_t = []`
fix config add e2e_ch.md e2e_res_img623_pg 2021-03-10 14:14:21 +08:00			`for i in range(batch):`
			`for j in range(max_len):`
Add PGNet 2021-03-08 14:15:47 +08:00			`if pos_mask[i, j].any():`
			`pos_list_t.append(pos_list[i][j])`
			`pos_mask_t.append(pos_mask[i][j])`
			`label_list_t.append(label_list[i][j])`
			`pos_list, pos_mask, label_list = self.org_tcl_rois(`
fix config add e2e_ch.md e2e_res_img623_pg 2021-03-10 14:14:21 +08:00			`batch, pos_list_t, pos_mask_t, label_list_t)`
Add PGNet 2021-03-08 14:15:47 +08:00			`label = []`
			`tt = [l.tolist() for l in label_list]`
fix config add e2e_ch.md e2e_res_img623_pg 2021-03-10 14:14:21 +08:00			`for i in range(batch):`
Add PGNet 2021-03-08 14:15:47 +08:00			`k = 0`
fix config add e2e_ch.md e2e_res_img623_pg 2021-03-10 14:14:21 +08:00			`for j in range(max_str_len):`
			`if tt[i][j][0] != pad_num:`
Add PGNet 2021-03-08 14:15:47 +08:00			`k += 1`
			`else:`
			`break`
			`label.append(k)`
			`label = paddle.to_tensor(label)`
			`label = paddle.cast(label, dtype='int64')`
			`pos_list = paddle.to_tensor(pos_list)`
			`pos_mask = paddle.to_tensor(pos_mask)`
			`label_list = paddle.squeeze(paddle.to_tensor(label_list), axis=2)`
			`label_list = paddle.cast(label_list, dtype='int32')`
			`return pos_list, pos_mask, label_list, label`

			`def border_loss(self, f_border, l_border, l_score, l_mask):`
			`l_border_split, l_border_norm = paddle.tensor.split(`
			`l_border, num_or_sections=[4, 1], axis=1)`
			`f_border_split = f_border`
			`b, c, h, w = l_border_norm.shape`
			`l_border_norm_split = paddle.expand(`
			`x=l_border_norm, shape=[b, 4 * c, h, w])`
			`b, c, h, w = l_score.shape`
			`l_border_score = paddle.expand(x=l_score, shape=[b, 4 * c, h, w])`
			`b, c, h, w = l_mask.shape`
			`l_border_mask = paddle.expand(x=l_mask, shape=[b, 4 * c, h, w])`
			`border_diff = l_border_split - f_border_split`
			`abs_border_diff = paddle.abs(border_diff)`
			`border_sign = abs_border_diff < 1.0`
			`border_sign = paddle.cast(border_sign, dtype='float32')`
			`border_sign.stop_gradient = True`
			`border_in_loss = 0.5 * abs_border_diff * abs_border_diff * border_sign + \`
			`(abs_border_diff - 0.5) * (1.0 - border_sign)`
			`border_out_loss = l_border_norm_split * border_in_loss`
			`border_loss = paddle.sum(border_out_loss * l_border_score * l_border_mask) / \`
			`(paddle.sum(l_border_score * l_border_mask) + 1e-5)`
			`return border_loss`

			`def direction_loss(self, f_direction, l_direction, l_score, l_mask):`
			`l_direction_split, l_direction_norm = paddle.tensor.split(`
			`l_direction, num_or_sections=[2, 1], axis=1)`
			`f_direction_split = f_direction`
			`b, c, h, w = l_direction_norm.shape`
			`l_direction_norm_split = paddle.expand(`
			`x=l_direction_norm, shape=[b, 2 * c, h, w])`
			`b, c, h, w = l_score.shape`
			`l_direction_score = paddle.expand(x=l_score, shape=[b, 2 * c, h, w])`
			`b, c, h, w = l_mask.shape`
			`l_direction_mask = paddle.expand(x=l_mask, shape=[b, 2 * c, h, w])`
			`direction_diff = l_direction_split - f_direction_split`
			`abs_direction_diff = paddle.abs(direction_diff)`
			`direction_sign = abs_direction_diff < 1.0`
			`direction_sign = paddle.cast(direction_sign, dtype='float32')`
			`direction_sign.stop_gradient = True`
			`direction_in_loss = 0.5 * abs_direction_diff * abs_direction_diff * direction_sign + \`
			`(abs_direction_diff - 0.5) * (1.0 - direction_sign)`
			`direction_out_loss = l_direction_norm_split * direction_in_loss`
			`direction_loss = paddle.sum(direction_out_loss * l_direction_score * l_direction_mask) / \`
			`(paddle.sum(l_direction_score * l_direction_mask) + 1e-5)`
			`return direction_loss`

			`def ctcloss(self, f_char, tcl_pos, tcl_mask, tcl_label, label_t):`
			`f_char = paddle.transpose(f_char, [0, 2, 3, 1])`
			`tcl_pos = paddle.reshape(tcl_pos, [-1, 3])`
			`tcl_pos = paddle.cast(tcl_pos, dtype=int)`
			`f_tcl_char = paddle.gather_nd(f_char, tcl_pos)`
			`f_tcl_char = paddle.reshape(f_tcl_char,`
			`[-1, 64, 37]) # len(Lexicon_Table)+1`
			`f_tcl_char_fg, f_tcl_char_bg = paddle.split(f_tcl_char, [36, 1], axis=2)`
			`f_tcl_char_bg = f_tcl_char_bg * tcl_mask + (1.0 - tcl_mask) * 20.0`
			`b, c, l = tcl_mask.shape`
			`tcl_mask_fg = paddle.expand(x=tcl_mask, shape=[b, c, 36 * l])`
			`tcl_mask_fg.stop_gradient = True`
			`f_tcl_char_fg = f_tcl_char_fg * tcl_mask_fg + (1.0 - tcl_mask_fg) * (`
			`-20.0)`
			`f_tcl_char_mask = paddle.concat([f_tcl_char_fg, f_tcl_char_bg], axis=2)`
			`f_tcl_char_ld = paddle.transpose(f_tcl_char_mask, (1, 0, 2))`
			`N, B, _ = f_tcl_char_ld.shape`
			`input_lengths = paddle.to_tensor([N] * B, dtype='int64')`
			`cost = paddle.nn.functional.ctc_loss(`
			`log_probs=f_tcl_char_ld,`
			`labels=tcl_label,`
			`input_lengths=input_lengths,`
			`label_lengths=label_t,`
			`blank=36,`
			`reduction='none')`
			`cost = cost.mean()`
			`return cost`

			`def forward(self, predicts, labels):`
			`images, tcl_maps, tcl_label_maps, border_maps \`
			`, direction_maps, training_masks, label_list, pos_list, pos_mask = labels`
			`# for all the batch_size`
			`pos_list, pos_mask, label_list, label_t = self.pre_process(`
			`label_list, pos_list, pos_mask)`

			`f_score, f_boder, f_direction, f_char = predicts`
			`score_loss = self.dice_loss(f_score, tcl_maps, training_masks)`
			`border_loss = self.border_loss(f_boder, border_maps, tcl_maps,`
			`training_masks)`
			`direction_loss = self.direction_loss(f_direction, direction_maps,`
			`tcl_maps, training_masks)`
			`ctc_loss = self.ctcloss(f_char, pos_list, pos_mask, label_list, label_t)`
			`loss_all = score_loss + border_loss + direction_loss + 5 * ctc_loss`

			`losses = {`
			`'loss': loss_all,`
			`"score_loss": score_loss,`
			`"border_loss": border_loss,`
			`"direction_loss": direction_loss,`
			`"ctc_loss": ctc_loss`
			`}`
			`return losses`