// PaddleOCR/deploy/ios_demo/ocr_demo/ViewController.mm

//
// Created by lvxiangxiang on 2020/7/10.
// Copyright (c) 2020 baidu. All rights reserved.
//
#import <opencv2/opencv.hpp>
#import <opencv2/imgcodecs/ios.h>
#import <opencv2/videoio/cap_ios.h>
//#import <opencv2/highgui/ios.h>
#import "ViewController.h"
#import "BoxLayer.h"
#include "include/paddle_api.h"
#include "timer.h"
#import "pdocr/ocr_db_post_process.h"
#import "pdocr/ocr_crnn_process.h"
using namespace paddle::lite_api;
using namespace cv;
// One detection result row: the image's batch slot, its clipped bounding
// box in pixel coordinates, the predicted class, and the confidence score.
struct Object {
int batch_id; // index of the image within the input batch (always 0 in detect_object)
cv::Rect rec; // bounding rectangle, clipped to the image bounds
int class_id; // predicted class label id
float prob; // confidence score; detect_object keeps only prob <= 1
};
// File-scope state shared by the pipeline functions below.
std::mutex mtx; // NOTE(review): declared but not locked anywhere in this file -- presumably for guarding the predictors; confirm before removing
std::shared_ptr<PaddlePredictor> net_ocr1; // DB text-detection predictor (created in viewDidAppear)
std::shared_ptr<PaddlePredictor> net_ocr2; // CRNN text-recognition predictor (created in viewDidAppear)
Timer tic; // latency timer used around the detection/recognition run
long long count = 0; // global counter; shadowed by locals named `count` further down
double tensor_mean(const Tensor &tin) {
auto shape = tin.shape();
int64_t size = 1;
for (int i = 0; i < shape.size(); i++) {
size *= shape[i];
}
double mean = 0.;
auto ptr = tin.data<float>();
for (int i = 0; i < size; i++) {
mean += ptr[i];
}
return mean / size;
}
// Resize `img` so its longer side does not exceed `max_size_len`, then snap
// each side onto the 32-pixel grid the DB detector requires (exact multiples
// are kept, tiny sides are clamped up to 32, everything else rounds down past
// the next multiple). The effective height/width scale factors are written
// through `ratio_h` / `ratio_w`.
cv::Mat resize_img_type0(const cv::Mat &img, int max_size_len, float *ratio_h, float *ratio_w) {
    const int src_w = img.cols;
    const int src_h = img.rows;

    // Uniform shrink factor: only shrink when the longer side is too big.
    const int longer_side = src_w >= src_h ? src_w : src_h;
    float shrink = 1.f;
    if (longer_side > max_size_len) {
        shrink = float(max_size_len) / float(longer_side);
    }

    // Snap one scaled dimension onto the 32-pixel grid.
    auto snap_to_32 = [](int v) -> int {
        if (v % 32 == 0) {
            return v;
        }
        if (v / 32 < 1) {
            return 32;
        }
        return (v / 32 - 1) * 32;
    };

    const int dst_h = snap_to_32(int(float(src_h) * shrink));
    const int dst_w = snap_to_32(int(float(src_w) * shrink));

    cv::Mat resized;
    cv::resize(img, resized, cv::Size(dst_w, dst_h));
    *ratio_h = float(dst_h) / float(src_h);
    *ratio_w = float(dst_w) / float(src_w);
    return resized;
}
// Convert `size` interleaved 3-channel float pixels in `din` into planar
// (channel-major) layout in `dout`, applying (v - mean[c]) / scale[c] per
// channel. `dout` must hold at least 3 * size floats.
// The main loop processes 4 pixels per iteration with NEON intrinsics; a
// scalar tail handles the final 0-3 pixels. Note the NEON path multiplies by
// the precomputed reciprocal 1/scale[c] while the tail divides by scale[c],
// so the two paths can differ by float rounding.
void neon_mean_scale(const float *din, float *dout, int size, std::vector<float> mean, std::vector<float> scale) {
float32x4_t vmean0 = vdupq_n_f32(mean[0]);
float32x4_t vmean1 = vdupq_n_f32(mean[1]);
float32x4_t vmean2 = vdupq_n_f32(mean[2]);
// Reciprocals so the vector loop multiplies instead of divides.
float32x4_t vscale0 = vdupq_n_f32(1.f / scale[0]);
float32x4_t vscale1 = vdupq_n_f32(1.f / scale[1]);
float32x4_t vscale2 = vdupq_n_f32(1.f / scale[2]);
// One output pointer per channel plane.
float *dout_c0 = dout;
float *dout_c1 = dout + size;
float *dout_c2 = dout + size * 2;
int i = 0;
// Vector loop: vld3q de-interleaves 4 RGB pixels into three 4-lane vectors.
for (; i < size - 3; i += 4) {
float32x4x3_t vin3 = vld3q_f32(din);
float32x4_t vsub0 = vsubq_f32(vin3.val[0], vmean0);
float32x4_t vsub1 = vsubq_f32(vin3.val[1], vmean1);
float32x4_t vsub2 = vsubq_f32(vin3.val[2], vmean2);
float32x4_t vs0 = vmulq_f32(vsub0, vscale0);
float32x4_t vs1 = vmulq_f32(vsub1, vscale1);
float32x4_t vs2 = vmulq_f32(vsub2, vscale2);
vst1q_f32(dout_c0, vs0);
vst1q_f32(dout_c1, vs1);
vst1q_f32(dout_c2, vs2);
din += 12;
dout_c0 += 4;
dout_c1 += 4;
dout_c2 += 4;
}
// Scalar tail for the remaining pixels.
for (; i < size; i++) {
*(dout_c0++) = (*(din++) - mean[0]) / scale[0];
*(dout_c1++) = (*(din++) - mean[1]) / scale[1];
*(dout_c2++) = (*(din++) - mean[2]) / scale[2];
}
}
// fill tensor with mean and scale, neon speed up
void fill_tensor_with_cvmat(const Mat &img_in, Tensor &tout, int width, int height,
std::vector<float> mean, std::vector<float> scale, bool is_scale) {
if (img_in.channels() == 4) {
cv::cvtColor(img_in, img_in, CV_RGBA2RGB);
}
cv::Mat im;
cv::resize(img_in, im, cv::Size(width, height), 0.f, 0.f);
cv::Mat imgf;
float scale_factor = is_scale ? 1 / 255.f : 1.f;
im.convertTo(imgf, CV_32FC3, scale_factor);
const float *dimg = reinterpret_cast<const float *>(imgf.data);
float *dout = tout.mutable_data<float>();
neon_mean_scale(dimg, dout, width * height, mean, scale);
}
// Parse SSD-style detector output: `count` rows of 6 floats
// [class_id, score, xmin, ymin, xmax, ymax] with normalized coordinates.
// Keeps rows with score > thresh and class_id > 0, clips each box to the
// image, draws it on `image` in red, and returns the kept objects.
// `lod` is unused but retained for interface compatibility.
std::vector<Object> detect_object(const float *data,
                                  int count,
                                  const std::vector<std::vector<uint64_t>> &lod,
                                  const float thresh,
                                  Mat &image) {
    std::vector<Object> rect_out;
    // Robustness: nothing to parse.
    if (data == nullptr || count <= 0) {
        return rect_out;
    }
    // Image dimensions are loop-invariant; read them once.
    const int oriw = image.cols;
    const int orih = image.rows;
    const float *dout = data;
    for (int iw = 0; iw < count; iw++) {
        if (dout[1] > thresh && static_cast<int>(dout[0]) > 0) {
            Object obj;
            int x = static_cast<int>(dout[2] * oriw);
            int y = static_cast<int>(dout[3] * orih);
            int w = static_cast<int>(dout[4] * oriw) - x;
            int h = static_cast<int>(dout[5] * orih) - y;
            cv::Rect rec_clip = cv::Rect(x, y, w, h) & cv::Rect(0, 0, oriw, orih);
            obj.batch_id = 0;
            obj.class_id = static_cast<int>(dout[0]);
            obj.prob = dout[1];
            obj.rec = rec_clip;
            // Drop degenerate boxes and nonsensical (>1) scores.
            if (w > 0 && h > 0 && obj.prob <= 1) {
                rect_out.push_back(obj);
                cv::rectangle(image, rec_clip, cv::Scalar(255, 0, 0));
            }
        }
        dout += 6; // advance to the next 6-float detection row
    }
    return rect_out;
}
// Private class extension: IB outlets, camera wrapper, and pipeline state.
@interface ViewController () <CvVideoCameraDelegate>
@property(weak, nonatomic) IBOutlet UIImageView *imageView; // main view showing the OCR-annotated image
@property(weak, nonatomic) IBOutlet UISwitch *flag_process; // camera processing on/off
@property(weak, nonatomic) IBOutlet UISwitch *flag_video; // continuous video OCR vs one-shot capture
@property(weak, nonatomic) IBOutlet UIImageView *preView; // live camera preview view
@property(weak, nonatomic) IBOutlet UISwitch *flag_back_cam; // back vs front camera selector
@property(weak, nonatomic) IBOutlet UILabel *result; // status / hint text label
@property(nonatomic, strong) CvVideoCamera *videoCamera; // OpenCV camera wrapper feeding processImage:
@property(nonatomic, strong) UIImage *image; // bundled sample image ("ocr.png")
@property(nonatomic) bool flag_init; // true once models are loaded in viewDidAppear:
@property(nonatomic) bool flag_cap_photo; // one-shot capture requested by cap_photo:
@property(nonatomic) std::vector<float> scale; // detector normalization scales (ImageNet-style)
@property(nonatomic) std::vector<float> mean; // detector normalization means (ImageNet-style)
@property(nonatomic) NSArray *labels; // CRNN character dictionary (index -> string)
@property(nonatomic) cv::Mat cvimg; // scratch buffer for the current camera frame
@property(nonatomic, strong) UIImage *ui_img_test; // NOTE(review): appears unused in this file -- confirm before removing
@property(strong, nonatomic) CALayer *boxLayer; // overlay layer hosting one BoxLayer per text line
@end
@implementation ViewController
@synthesize imageView;
// Run the CRNN recognizer (net_ocr2) on one cropped text-line image.
// Returns an OcrData whose label is the decoded string and whose accuracy is
// the mean max-probability over the non-blank timesteps (0 if all blank).
- (OcrData *)paddleOcrRec:(cv::Mat)image {
    OcrData *result = [OcrData new];
    // CRNN preprocessing constants: normalize pixels to roughly [-1, 1].
    std::vector<float> mean = {0.5f, 0.5f, 0.5f};
    std::vector<float> scale = {1 / 0.5f, 1 / 0.5f, 1 / 0.5f};

    cv::Mat crop_img;
    image.copyTo(crop_img);
    cv::Mat resize_img;
    float wh_ratio = float(crop_img.cols) / float(crop_img.rows);
    resize_img = crnn_resize_img(crop_img, wh_ratio);
    resize_img.convertTo(resize_img, CV_32FC3, 1 / 255.f);
    const float *dimg = reinterpret_cast<const float *>(resize_img.data);

    std::unique_ptr<Tensor> input_tensor0(std::move(net_ocr2->GetInput(0)));
    input_tensor0->Resize({1, 3, resize_img.rows, resize_img.cols});
    auto *data0 = input_tensor0->mutable_data<float>();
    neon_mean_scale(dimg, data0, resize_img.rows * resize_img.cols, mean, scale);

    // Run the CRNN predictor.
    net_ocr2->Run();

    // Output 0: decoded character indices; iterate pairs within the lod span.
    std::unique_ptr<const Tensor> output_tensor0(std::move(net_ocr2->GetOutput(0)));
    auto *rec_idx = output_tensor0->data<int>();
    auto rec_idx_lod = output_tensor0->lod();
    auto shape_out = output_tensor0->shape();
    NSMutableString *text = [[NSMutableString alloc] init];
    for (int n = int(rec_idx_lod[0][0]); n < int(rec_idx_lod[0][1] * 2); n += 2) {
        // Skip indices that fall outside the character dictionary.
        if (rec_idx[n] >= self.labels.count) {
            std::cout << "Index " << rec_idx[n] << " out of text dict range!" << std::endl;
            continue;
        }
        [text appendString:self.labels[rec_idx[n]]];
    }
    result.label = text;

    // Output 1: per-timestep class probabilities; average the per-timestep
    // max probability over non-blank timesteps to get a confidence score.
    std::unique_ptr<const Tensor> output_tensor1(std::move(net_ocr2->GetOutput(1)));
    auto *predict_batch = output_tensor1->data<float>();
    auto predict_shape = output_tensor1->shape();
    auto predict_lod = output_tensor1->lod();
    int argmax_idx;
    int blank = predict_shape[1]; // blank label occupies the last class slot (index blank - 1)
    float score = 0.f;
    int valid_count = 0; // renamed from `count`, which shadowed the file-scope global
    float max_value = 0.0f;
    for (int n = predict_lod[0][0]; n < predict_lod[0][1] - 1; n++) {
        argmax_idx = int(argmax(&predict_batch[n * predict_shape[1]], &predict_batch[(n + 1) * predict_shape[1]]));
        max_value = float(*std::max_element(&predict_batch[n * predict_shape[1]], &predict_batch[(n + 1) * predict_shape[1]]));
        if (blank - 1 - argmax_idx > 1e-5) {
            score += max_value;
            valid_count += 1;
        }
    }
    // Bug fix: if every timestep decoded to blank, the original computed
    // 0/0 and stored NaN in accuracy.
    result.accuracy = valid_count > 0 ? score / valid_count : 0.f;
    return result;
}
// Full OCR pipeline on `originImage`:
//  1. DB detection (net_ocr1) produces a probability map; threshold it and
//     extract text-box polygons.
//  2. Crop and rectify each box, then recognize it with paddleOcrRec:.
// Returns an array of OcrData with labels, scores, and polygon corners
// normalized to the original image size, in reverse detection order.
- (NSArray *)ocr_infer:(cv::Mat)originImage {
    int max_side_len = 960;
    float ratio_h{};
    float ratio_w{};
    cv::Mat image;
    cv::cvtColor(originImage, image, cv::COLOR_RGB2BGR);
    cv::Mat img;
    image.copyTo(img);
    img = resize_img_type0(img, max_side_len, &ratio_h, &ratio_w);
    cv::Mat img_fp;
    img.convertTo(img_fp, CV_32FC3, 1.0 / 255.f);

    // Fill the detector input tensor (normalized, planar layout).
    std::unique_ptr<Tensor> input_tensor(net_ocr1->GetInput(0));
    input_tensor->Resize({1, 3, img_fp.rows, img_fp.cols});
    auto *data0 = input_tensor->mutable_data<float>();
    const float *dimg = reinterpret_cast<const float *>(img_fp.data);
    neon_mean_scale(dimg, data0, img_fp.rows * img_fp.cols, self.mean, self.scale);

    tic.clear();
    tic.start();
    net_ocr1->Run();

    // The detector output's last two dims are the probability map H x W.
    std::unique_ptr<const Tensor> output_tensor(std::move(net_ocr1->GetOutput(0)));
    auto *outptr = output_tensor->data<float>();
    auto shape_out = output_tensor->shape();
    int s2 = int(shape_out[2]);
    int s3 = int(shape_out[3]);
    cv::Mat pred_map = cv::Mat::zeros(s2, s3, CV_32F);
    memcpy(pred_map.data, outptr, s2 * s3 * sizeof(float));
    cv::Mat cbuf_map;
    pred_map.convertTo(cbuf_map, CV_8UC1, 255.0f);

    // Binarize at probability 0.1 and extract candidate boxes.
    const double threshold = 0.1 * 255;
    const double maxvalue = 255;
    cv::Mat bit_map;
    cv::threshold(cbuf_map, bit_map, threshold, maxvalue, cv::THRESH_BINARY);
    auto boxes = boxes_from_bitmap(pred_map, bit_map);
    std::vector<std::vector<std::vector<int>>> filter_boxes = filter_tag_det_res(boxes, ratio_h, ratio_w, image);

    // Bug fix: the original used a C variable-length array
    // (cv::Point rook_points[filter_boxes.size()][4]) -- non-standard C++ and
    // a stack-overflow risk for frames with many boxes. Use a vector instead.
    std::vector<std::vector<cv::Point>> rook_points(filter_boxes.size(), std::vector<cv::Point>(4));
    for (size_t n = 0; n < filter_boxes.size(); n++) {
        // Index each box's own point list (the original indexed box 0's size).
        for (size_t m = 0; m < filter_boxes[n].size() && m < 4; m++) {
            rook_points[n][m] = cv::Point(int(filter_boxes[n][m][0]), int(filter_boxes[n][m][1]));
        }
    }

    NSMutableArray *result = [[NSMutableArray alloc] init];
    for (size_t i = 0; i < filter_boxes.size(); i++) {
        cv::Mat crop_img = get_rotate_crop_image(image, filter_boxes[i]);
        OcrData *r = [self paddleOcrRec:crop_img];
        // Normalize the polygon corners so the UI can scale them to any view.
        NSMutableArray *points = [NSMutableArray new];
        for (int jj = 0; jj < 4; ++jj) {
            NSValue *v = [NSValue valueWithCGPoint:CGPointMake(
                    rook_points[i][jj].x / CGFloat(originImage.cols),
                    rook_points[i][jj].y / CGFloat(originImage.rows))];
            [points addObject:v];
        }
        r.polygonPoints = points;
        [result addObject:r];
    }
    NSArray *rec_out = [[result reverseObjectEnumerator] allObjects];
    tic.end();
    std::cout << "infer time: " << tic.get_sum_ms() << "ms" << std::endl;
    return rec_out;
}
// Load the CRNN character dictionary. Each non-empty line is either
// "index:value" or a bare value (then its running position becomes the
// index). Returns an array where slot i holds the character for index i.
- (NSArray *)readLabelsFromFile:(NSString *)labelFilePath {
    NSError *readError = nil;
    NSString *content = [NSString stringWithContentsOfFile:labelFilePath encoding:NSUTF8StringEncoding error:&readError];
    // Robustness: the original passed error:nil and would crash below on a
    // missing/unreadable file (nil content -> empty lines -> ok, but be explicit).
    if (content == nil) {
        NSLog(@"Failed to read label file %@: %@", labelFilePath, readError);
        return @[];
    }
    NSArray *lines = [content componentsSeparatedByCharactersInSet:[NSCharacterSet newlineCharacterSet]];
    // Pre-fill with empty strings so explicit "index:value" lines can land
    // at arbitrary (in-range) positions.
    NSMutableArray *ret = [[NSMutableArray alloc] init];
    for (int i = 0; i < lines.count; ++i) {
        [ret addObject:@""];
    }
    NSUInteger cnt = 0;
    for (id line in lines) {
        NSString *l = [(NSString *) line stringByTrimmingCharactersInSet:[NSCharacterSet whitespaceAndNewlineCharacterSet]];
        if ([l length] == 0)
            continue;
        NSArray *segs = [l componentsSeparatedByString:@":"];
        NSUInteger key;
        NSString *value;
        if ([segs count] != 2) {
            // Bare value: assign the next sequential index.
            key = cnt;
            value = [segs[0] stringByTrimmingCharactersInSet:[NSCharacterSet whitespaceAndNewlineCharacterSet]];
        } else {
            key = [[segs[0] stringByTrimmingCharactersInSet:[NSCharacterSet whitespaceAndNewlineCharacterSet]] integerValue];
            value = [segs[1] stringByTrimmingCharactersInSet:[NSCharacterSet whitespaceAndNewlineCharacterSet]];
        }
        // Bug fix: a malformed index in the file (out of range, or negative
        // wrapped to a huge NSUInteger) previously threw NSRangeException.
        if (key >= ret.count) {
            NSLog(@"Skipping out-of-range label index %lu in %@", (unsigned long)key, labelFilePath);
            continue;
        }
        ret[key] = value;
        cnt += 1;
    }
    return [NSArray arrayWithArray:ret];
}
// One-time heavy initialization: overlay layer, character dictionary, both
// Paddle-Lite predictors, and a first OCR pass on the bundled sample image.
// viewDidAppear: fires on every appearance; the original reloaded both
// models each time, so guard with flag_init and run the setup only once.
- (void)viewDidAppear:(BOOL)animated {
    [super viewDidAppear:animated];
    if (self.flag_init) {
        return;
    }

    // Overlay layer that hosts one BoxLayer sublayer per detected text line.
    self.boxLayer = [[CALayer alloc] init];
    CGRect r = AVMakeRectWithAspectRatioInsideRect(self.imageView.frame.size, self.imageView.bounds);
    std::cout << self.imageView.frame.size.width << "," << self.imageView.frame.size.height << std::endl;
    self.boxLayer.frame = r;
    [self.imageView.layer addSublayer:self.boxLayer];

    // Character dictionary for CRNN decoding.
    NSString *label_file_path = [[NSBundle mainBundle] pathForResource:@"label_list" ofType:@"txt"];
    self.labels = [self readLabelsFromFile:label_file_path];

    // ImageNet-style normalization constants used by the DB detector.
    self.mean = {0.485f, 0.456f, 0.406f};
    self.scale = {1 / 0.229f, 1 / 0.224f, 1 / 0.225f};

    NSString *model1_path = [[NSBundle mainBundle] pathForResource:@"ch_det_mv3_db_opt" ofType:@"nb"];
    NSString *model2_path = [[NSBundle mainBundle] pathForResource:@"ch_rec_mv3_crnn_opt" ofType:@"nb"];
    // Bug fix: if a model is missing from the bundle, [nil UTF8String]
    // returns NULL and std::string(NULL) is undefined behavior.
    if (model1_path == nil || model2_path == nil) {
        NSLog(@"OCR model files not found in app bundle");
        return;
    }
    MobileConfig config;
    config.set_model_from_file(std::string([model1_path UTF8String]));
    net_ocr1 = CreatePaddlePredictor<MobileConfig>(config);
    MobileConfig config2;
    config2.set_model_from_file(std::string([model2_path UTF8String]));
    net_ocr2 = CreatePaddlePredictor<MobileConfig>(config2);

    // Run OCR once on the bundled sample image and draw the results.
    cv::Mat originImage;
    UIImageToMat(self.image, originImage);
    NSArray *rec_out = [self ocr_infer:originImage];
    [self.boxLayer.sublayers makeObjectsPerformSelector:@selector(removeFromSuperlayer)];
    std::cout << self.imageView.image.size.width << "," << self.imageView.image.size.height << std::endl;
    CGFloat h = self.boxLayer.frame.size.height;
    CGFloat w = self.boxLayer.frame.size.width;
    for (OcrData *data in rec_out) {
        BoxLayer *singleBox = [[BoxLayer alloc] init];
        [singleBox renderOcrPolygon:data withHeight:h withWidth:w withLabel:YES];
        [self.boxLayer addSublayer:singleBox];
    }
    self.flag_init = true;
}
// Lightweight UI setup: default switch states, the placeholder image,
// switch callbacks, and camera configuration. Model loading happens later
// in viewDidAppear:.
- (void)viewDidLoad {
    [super viewDidLoad];

    // All toggles start off; nothing runs until the user opts in.
    self.flag_process.on = NO;
    self.flag_back_cam.on = NO;
    self.flag_video.on = NO;
    self.flag_cap_photo = false;

    // Show the bundled sample image as the initial preview.
    self.image = [UIImage imageNamed:@"ocr.png"];
    if (self.image) {
        printf("load image successed\n");
        self.imageView.image = self.image;
    } else {
        printf("load image failed\n");
    }

    [self.flag_process addTarget:self action:@selector(PSwitchValueChanged:) forControlEvents:UIControlEventValueChanged];
    [self.flag_back_cam addTarget:self action:@selector(CSwitchValueChanged:) forControlEvents:UIControlEventValueChanged];

    // Front camera, portrait 1080p at 30 FPS, rendered into preView.
    CvVideoCamera *camera = [[CvVideoCamera alloc] initWithParentView:self.preView];
    camera.delegate = self;
    camera.defaultAVCaptureDevicePosition = AVCaptureDevicePositionFront;
    camera.defaultAVCaptureSessionPreset = AVCaptureSessionPreset1920x1080;
    camera.defaultAVCaptureVideoOrientation = AVCaptureVideoOrientationPortrait;
    camera.rotateVideo = 90;
    camera.defaultFPS = 30;
    self.videoCamera = camera;

    [self.view insertSubview:self.imageView atIndex:0];
}
// Toggle between continuous-video OCR and single-shot photo mode by
// mirroring the sender's state onto the video flag.
- (IBAction)swith_video_photo:(UISwitch *)sender {
    NSLog(@"%@", sender.isOn ? @"video ON" : @"video OFF");
    self.flag_video.on = sender.isOn;
}
// Capture button: request a one-shot OCR pass on the next camera frame.
// Requires the camera (flag_process) to be running first.
- (IBAction)cap_photo:(id)sender {
    if (self.flag_process.isOn) {
        self.flag_cap_photo = true;
    } else {
        self.result.text = @"please turn on the camera firstly";
    }
}
// Camera on/off switch: start or stop the capture session accordingly.
- (void)PSwitchValueChanged:(UISwitch *)sender {
    NSLog(@"%@", sender.isOn ? @"process ON" : @"process OFF");
    if (!sender.isOn) {
        [self.videoCamera stop];
    } else {
        [self.videoCamera start];
    }
}
// Front/back camera switch: restart the capture session (when it is
// running) around the device-position change. The two original branches
// differed only in the target position, so they are folded together.
- (void)CSwitchValueChanged:(UISwitch *)sender {
    NSLog(@"%@", sender.isOn ? @"back ON" : @"back OFF");
    BOOL wasRunning = self.flag_process.isOn;
    if (wasRunning) {
        [self.videoCamera stop];
    }
    self.videoCamera.defaultAVCaptureDevicePosition =
            sender.isOn ? AVCaptureDevicePositionBack : AVCaptureDevicePositionFront;
    if (wasRunning) {
        [self.videoCamera start];
    }
}
// CvVideoCameraDelegate callback, invoked off the main thread for every
// camera frame. Hops to the main queue, runs OCR when processing is enabled
// (continuous video mode or a pending one-shot capture), and renders boxes.
// NOTE(review): the block captures the `image` reference; this assumes the
// frame buffer stays valid until the main-queue block runs -- TODO confirm.
- (void)processImage:(cv::Mat &)image {
    dispatch_async(dispatch_get_main_queue(), ^{
        if (!self.flag_process.isOn || !self.flag_init) {
            return;
        }
        if (!(self.flag_video.isOn || self.flag_cap_photo)) {
            return;
        }
        self.flag_cap_photo = false;
        if (image.channels() == 4) {
            cvtColor(image, self->_cvimg, CV_RGBA2RGB);
        } else {
            // Bug fix: a 3-channel frame previously left _cvimg untouched,
            // so OCR ran on a stale (or empty) image.
            image.copyTo(self->_cvimg);
        }
        NSArray *rec_out = [self ocr_infer:self->_cvimg];
        // Replace the previous frame's overlays.
        [self.boxLayer.sublayers makeObjectsPerformSelector:@selector(removeFromSuperlayer)];
        CGFloat h = self.boxLayer.frame.size.height;
        CGFloat w = self.boxLayer.frame.size.width;
        for (OcrData *data in rec_out) {
            BoxLayer *singleBox = [[BoxLayer alloc] init];
            [singleBox renderOcrPolygon:data withHeight:h withWidth:w withLabel:YES];
            [self.boxLayer addSublayer:singleBox];
        }
        cvtColor(self->_cvimg, self->_cvimg, CV_RGB2BGR);
        self.imageView.image = MatToUIImage(self->_cvimg);
    });
}
// Default memory-warning handler; this controller caches nothing it can
// safely release (the predictors are required for operation).
- (void)didReceiveMemoryWarning {
[super didReceiveMemoryWarning];
// Dispose of any resources that can be recreated.
}
@end