PaddleOCR/ppocr/data/rec/img_tools.py

93 lines
3.0 KiB
Python
Executable File

#Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
#Licensed under the Apache License, Version 2.0 (the "License");
#you may not use this file except in compliance with the License.
#You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
#Unless required by applicable law or agreed to in writing, software
#distributed under the License is distributed on an "AS IS" BASIS,
#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#See the License for the specific language governing permissions and
#limitations under the License.
import math
import cv2
import numpy as np
def get_bounding_box_rect(pos):
    """Return the extremes of two coordinate sequences as a rectangle.

    Args:
        pos: Indexable whose element 0 and element 1 are each a non-empty
            iterable of comparable values. The result layout suggests these
            are the x and y coordinates respectively -- TODO confirm with
            callers.

    Returns:
        list: ``[min(pos[0]), min(pos[1]), max(pos[0]), max(pos[1])]``,
        i.e. ``[left, top, right, bottom]``.
    """
    first_axis = pos[0]
    left, right = min(first_axis), max(first_axis)
    second_axis = pos[1]
    top, bottom = min(second_axis), max(second_axis)
    return [left, top, right, bottom]
def resize_norm_img(img, image_shape):
    """Resize an image to a fixed height, normalize it and zero-pad the width.

    The image is resized to height ``imgH`` with a width that keeps the
    source aspect ratio, capped at ``imgW``. Values are then transformed as
    ``(x / 255 - 0.5) / 0.5`` and written into the left part of a
    zero-filled ``(imgC, imgH, imgW)`` float32 array.

    Args:
        img: Array whose ``shape[0]`` is the height and ``shape[1]`` the
            width. When ``imgC`` is not 1 the resized image is transposed
            with ``(2, 0, 1)``, so it must then have a third (channel) axis.
        image_shape: Sequence of exactly three values ``(imgC, imgH, imgW)``.

    Returns:
        numpy.ndarray: float32 array of shape ``(imgC, imgH, imgW)``; columns
        from ``resized_w`` onward are zero padding.
    """
    imgC, imgH, imgW = image_shape
    src_h, src_w = img.shape[0], img.shape[1]
    aspect = src_w / float(src_h)

    # Aspect-preserving width at the target height, never wider than imgW.
    resized_w = min(int(math.ceil(imgH * aspect)), imgW)

    scaled = cv2.resize(img, (resized_w, imgH)).astype('float32')
    if imgC == 1:
        # Single-channel target: add the leading channel axis by hand.
        chw = (scaled / 255)[np.newaxis, :]
    else:
        # Move the trailing channel axis to the front (HWC -> CHW).
        chw = scaled.transpose((2, 0, 1)) / 255
    chw -= 0.5
    chw /= 0.5

    canvas = np.zeros((imgC, imgH, imgW), dtype=np.float32)
    canvas[:, :, :resized_w] = chw
    return canvas
def get_img_data(value):
    """Decode an in-memory encoded image into an image array.

    Args:
        value: Bytes-like buffer holding an encoded image. A falsy value
            (``None``, ``b""``, ``""``) is treated as "no image".

    Returns:
        The array produced by ``cv2.imdecode`` with ``cv2.IMREAD_COLOR``,
        or ``None`` when ``value`` is falsy or ``cv2.imdecode`` returns
        ``None`` (it could not decode the buffer).
    """
    if not value:
        return None
    # np.frombuffer either returns an array or raises; it never yields None,
    # so no None check is needed on its result.
    imgdata = np.frombuffer(value, dtype='uint8')
    # cv2.IMREAD_COLOR is the named form of the flag value 1. imdecode
    # already returns None on failure, so its result is passed through as-is.
    return cv2.imdecode(imgdata, cv2.IMREAD_COLOR)
def process_image(img,
                  image_shape,
                  label=None,
                  char_ops=None,
                  loss_type=None,
                  max_text_length=None):
    """Normalize an image and, if a label is given, encode it as a target.

    Args:
        img: Image array forwarded to ``resize_norm_img``.
        image_shape: ``(channels, height, width)`` forwarded to
            ``resize_norm_img``.
        label: Optional label. When ``None`` only the image is returned.
        char_ops: Object providing ``encode(label)`` and
            ``get_beg_end_flag_idx(flag)``. Required when ``label`` is
            given.
        loss_type: ``"ctc"`` or ``"attention"``; selects the target layout.
            Only consulted when ``label`` is given and passes the length
            checks.
        max_text_length: Optional upper bound on the encoded label length.
            ``None`` (the default) means no upper bound.

    Returns:
        * ``label is None``: the normalized image with a leading axis of
          length 1 prepended (the bare array, not a tuple).
        * encoded label is empty or longer than ``max_text_length``:
          ``None``.
        * ``loss_type == "ctc"``: ``(norm_img, text)`` with ``text``
          reshaped to a column ``(-1, 1)``.
        * ``loss_type == "attention"``: ``(norm_img, beg_text, end_text)``
          where ``beg_text`` is the text prefixed with the "beg" flag index
          and ``end_text`` is the text suffixed with the "end" flag index,
          both reshaped to ``(-1, 1)``.

    Raises:
        AssertionError: If a valid label is given with an unsupported
            ``loss_type``.
    """
    norm_img = resize_norm_img(img, image_shape)
    norm_img = norm_img[np.newaxis, :]
    if label is None:
        return norm_img

    text = char_ops.encode(label)
    text_len = len(text)
    if text_len == 0:
        return None
    # With the default max_text_length=None, ``len(text) > None`` raises
    # TypeError on Python 3, so None is treated as "no limit".
    if max_text_length is not None and text_len > max_text_length:
        return None

    if loss_type == "ctc":
        # Presumably ``encode`` returns a NumPy array (``reshape`` is called
        # on it directly) -- verify against the char_ops implementation.
        return (norm_img, text.reshape(-1, 1))

    if loss_type == "attention":
        beg_flag_idx = char_ops.get_beg_end_flag_idx("beg")
        end_flag_idx = char_ops.get_beg_end_flag_idx("end")
        beg_text = np.append(beg_flag_idx, text).reshape(-1, 1)
        end_text = np.append(text, end_flag_idx).reshape(-1, 1)
        return (norm_img, beg_text, end_text)

    # Raised explicitly rather than via ``assert False`` so the check is
    # not stripped when Python runs with -O.
    raise AssertionError("Unsupport loss_type %s in process_image"
                         % loss_type)