dygraph first commit
This commit is contained in:
parent
10f7e5192d
commit
aad3093a91
|
@ -1,22 +0,0 @@
|
|||
TrainReader:
|
||||
reader_function: ppocr.data.det.dataset_traversal,TrainReader
|
||||
process_function: ppocr.data.det.db_process,DBProcessTrain
|
||||
num_workers: 8
|
||||
img_set_dir: ./train_data/icdar2015/text_localization/
|
||||
label_file_path: ./train_data/icdar2015/text_localization/train_icdar2015_label.txt
|
||||
|
||||
EvalReader:
|
||||
reader_function: ppocr.data.det.dataset_traversal,EvalTestReader
|
||||
process_function: ppocr.data.det.db_process,DBProcessTest
|
||||
img_set_dir: ./train_data/icdar2015/text_localization/
|
||||
label_file_path: ./train_data/icdar2015/text_localization/test_icdar2015_label.txt
|
||||
test_image_shape: [736, 1280]
|
||||
|
||||
TestReader:
|
||||
reader_function: ppocr.data.det.dataset_traversal,EvalTestReader
|
||||
process_function: ppocr.data.det.db_process,DBProcessTest
|
||||
infer_img:
|
||||
img_set_dir: ./train_data/icdar2015/text_localization/
|
||||
label_file_path: ./train_data/icdar2015/text_localization/test_icdar2015_label.txt
|
||||
test_image_shape: [736, 1280]
|
||||
do_eval: True
|
|
@ -1,23 +0,0 @@
|
|||
TrainReader:
|
||||
reader_function: ppocr.data.det.dataset_traversal,TrainReader
|
||||
process_function: ppocr.data.det.east_process,EASTProcessTrain
|
||||
num_workers: 8
|
||||
img_set_dir: ./train_data/icdar2015/text_localization/
|
||||
label_file_path: ./train_data/icdar2015/text_localization/train_icdar2015_label.txt
|
||||
background_ratio: 0.125
|
||||
min_crop_side_ratio: 0.1
|
||||
min_text_size: 10
|
||||
|
||||
EvalReader:
|
||||
reader_function: ppocr.data.det.dataset_traversal,EvalTestReader
|
||||
process_function: ppocr.data.det.east_process,EASTProcessTest
|
||||
img_set_dir: ./train_data/icdar2015/text_localization/
|
||||
label_file_path: ./train_data/icdar2015/text_localization/test_icdar2015_label.txt
|
||||
|
||||
TestReader:
|
||||
reader_function: ppocr.data.det.dataset_traversal,EvalTestReader
|
||||
process_function: ppocr.data.det.east_process,EASTProcessTest
|
||||
infer_img:
|
||||
img_set_dir: ./train_data/icdar2015/text_localization/
|
||||
label_file_path: ./train_data/icdar2015/text_localization/test_icdar2015_label.txt
|
||||
do_eval: True
|
|
@ -1,54 +1,133 @@
|
|||
Global:
|
||||
algorithm: DB
|
||||
use_gpu: true
|
||||
epoch_num: 1200
|
||||
log_smooth_window: 20
|
||||
print_batch_step: 2
|
||||
save_model_dir: ./output/det_db/
|
||||
save_epoch_step: 200
|
||||
save_model_dir: ./output/20201010/
|
||||
save_epoch_step: 1200
|
||||
# evaluation is run every 5000 iterations after the 4000th iteration
|
||||
eval_batch_step: [4000, 5000]
|
||||
train_batch_size_per_card: 16
|
||||
test_batch_size_per_card: 16
|
||||
image_shape: [3, 640, 640]
|
||||
reader_yml: ./configs/det/det_db_icdar15_reader.yml
|
||||
pretrain_weights: ./pretrain_models/MobileNetV3_large_x0_5_pretrained/
|
||||
checkpoints:
|
||||
save_res_path: ./output/det_db/predicts_db.txt
|
||||
eval_batch_step: 8
|
||||
# if pretrained_model was saved in static mode, load_static_weights must be set to True
|
||||
load_static_weights: True
|
||||
cal_metric_during_train: False
|
||||
pretrained_model: /home/zhoujun20/pretrain_models/MobileNetV3_large_x0_5_pretrained
|
||||
checkpoints: #./output/det_db_0.001_DiceLoss_256_pp_config_2.0b_4gpu/best_accuracy
|
||||
save_inference_dir:
|
||||
|
||||
use_visualdl: True
|
||||
infer_img: doc/imgs_en/img_10.jpg
|
||||
save_res_path: ./output/det_db/predicts_db.txt
|
||||
|
||||
Optimizer:
|
||||
name: Adam
|
||||
beta1: 0.9
|
||||
beta2: 0.999
|
||||
learning_rate:
|
||||
# name: Cosine
|
||||
lr: 0.001
|
||||
# warmup_epoch: 0
|
||||
regularizer:
|
||||
name: 'L2'
|
||||
factor: 0
|
||||
|
||||
Architecture:
|
||||
function: ppocr.modeling.architectures.det_model,DetModel
|
||||
|
||||
Backbone:
|
||||
function: ppocr.modeling.backbones.det_mobilenet_v3,MobileNetV3
|
||||
scale: 0.5
|
||||
model_name: large
|
||||
|
||||
Head:
|
||||
function: ppocr.modeling.heads.det_db_head,DBHead
|
||||
model_name: large
|
||||
k: 50
|
||||
inner_channels: 96
|
||||
out_channels: 2
|
||||
type: det
|
||||
algorithm: DB
|
||||
Transform:
|
||||
Backbone:
|
||||
name: MobileNetV3
|
||||
scale: 0.5
|
||||
model_name: large
|
||||
Neck:
|
||||
name: FPN
|
||||
out_channels: 256
|
||||
Head:
|
||||
name: DBHead
|
||||
k: 50
|
||||
|
||||
Loss:
|
||||
function: ppocr.modeling.losses.det_db_loss,DBLoss
|
||||
name: DBLoss
|
||||
balance_loss: true
|
||||
main_loss_type: DiceLoss
|
||||
alpha: 5
|
||||
beta: 10
|
||||
ohem_ratio: 3
|
||||
|
||||
Optimizer:
|
||||
function: ppocr.optimizer,AdamDecay
|
||||
base_lr: 0.001
|
||||
beta1: 0.9
|
||||
beta2: 0.999
|
||||
|
||||
PostProcess:
|
||||
function: ppocr.postprocess.db_postprocess,DBPostProcess
|
||||
name: DBPostProcess
|
||||
thresh: 0.3
|
||||
box_thresh: 0.7
|
||||
box_thresh: 0.6
|
||||
max_candidates: 1000
|
||||
unclip_ratio: 2.0
|
||||
unclip_ratio: 1.5
|
||||
|
||||
Metric:
|
||||
name: DetMetric
|
||||
main_indicator: hmean
|
||||
|
||||
TRAIN:
|
||||
dataset:
|
||||
name: SimpleDataSet
|
||||
data_dir: /home/zhoujun20/detection/
|
||||
file_list:
|
||||
- /home/zhoujun20/detection/train_icdar2015_label.txt # dataset1
|
||||
ratio_list: [1.0]
|
||||
transforms:
|
||||
- DecodeImage: # load image
|
||||
img_mode: BGR
|
||||
channel_first: False
|
||||
- DetLabelEncode: # Class handling label
|
||||
- IaaAugment:
|
||||
augmenter_args:
|
||||
- { 'type': Fliplr, 'args': { 'p': 0.5 } }
|
||||
- { 'type': Affine, 'args': { 'rotate': [ -10,10 ] } }
|
||||
- { 'type': Resize,'args': { 'size': [ 0.5,3 ] } }
|
||||
- EastRandomCropData:
|
||||
size: [ 640,640 ]
|
||||
max_tries: 50
|
||||
keep_ratio: true
|
||||
- MakeBorderMap:
|
||||
shrink_ratio: 0.4
|
||||
thresh_min: 0.3
|
||||
thresh_max: 0.7
|
||||
- MakeShrinkMap:
|
||||
shrink_ratio: 0.4
|
||||
min_text_size: 8
|
||||
- NormalizeImage:
|
||||
scale: 1./255.
|
||||
mean: [ 0.485, 0.456, 0.406 ]
|
||||
std: [ 0.229, 0.224, 0.225 ]
|
||||
order: 'hwc'
|
||||
- ToCHWImage:
|
||||
- keepKeys:
|
||||
keep_keys: ['image','threshold_map','threshold_mask','shrink_map','shrink_mask'] # the dataloader returns a list in this order
|
||||
loader:
|
||||
shuffle: True
|
||||
drop_last: False
|
||||
batch_size: 16
|
||||
num_workers: 6
|
||||
|
||||
EVAL:
|
||||
dataset:
|
||||
name: SimpleDataSet
|
||||
data_dir: /home/zhoujun20/detection/
|
||||
file_list:
|
||||
- /home/zhoujun20/detection/test_icdar2015_label.txt
|
||||
transforms:
|
||||
- DecodeImage: # load image
|
||||
img_mode: BGR
|
||||
channel_first: False
|
||||
- DetLabelEncode: # Class handling label
|
||||
- DetResizeForTest:
|
||||
image_shape: [736,1280]
|
||||
- NormalizeImage:
|
||||
scale: 1./255.
|
||||
mean: [ 0.485, 0.456, 0.406 ]
|
||||
std: [ 0.229, 0.224, 0.225 ]
|
||||
order: 'hwc'
|
||||
- ToCHWImage:
|
||||
- keepKeys:
|
||||
keep_keys: ['image','shape','polys','ignore_tags']
|
||||
loader:
|
||||
shuffle: False
|
||||
drop_last: False
|
||||
batch_size: 1 # must be 1
|
||||
num_workers: 6
|
|
@ -1,45 +0,0 @@
|
|||
Global:
|
||||
algorithm: EAST
|
||||
use_gpu: true
|
||||
epoch_num: 100000
|
||||
log_smooth_window: 20
|
||||
print_batch_step: 5
|
||||
save_model_dir: ./output/det_east/
|
||||
save_epoch_step: 200
|
||||
eval_batch_step: [5000, 5000]
|
||||
train_batch_size_per_card: 16
|
||||
test_batch_size_per_card: 16
|
||||
image_shape: [3, 512, 512]
|
||||
reader_yml: ./configs/det/det_east_icdar15_reader.yml
|
||||
pretrain_weights: ./pretrain_models/MobileNetV3_large_x0_5_pretrained/
|
||||
checkpoints:
|
||||
save_res_path: ./output/det_east/predicts_east.txt
|
||||
save_inference_dir:
|
||||
|
||||
Architecture:
|
||||
function: ppocr.modeling.architectures.det_model,DetModel
|
||||
|
||||
Backbone:
|
||||
function: ppocr.modeling.backbones.det_mobilenet_v3,MobileNetV3
|
||||
scale: 0.5
|
||||
model_name: large
|
||||
|
||||
Head:
|
||||
function: ppocr.modeling.heads.det_east_head,EASTHead
|
||||
model_name: small
|
||||
|
||||
Loss:
|
||||
function: ppocr.modeling.losses.det_east_loss,EASTLoss
|
||||
|
||||
Optimizer:
|
||||
function: ppocr.optimizer,AdamDecay
|
||||
base_lr: 0.001
|
||||
beta1: 0.9
|
||||
beta2: 0.999
|
||||
|
||||
PostProcess:
|
||||
function: ppocr.postprocess.east_postprocess,EASTPostPocess
|
||||
score_thresh: 0.8
|
||||
cover_thresh: 0.1
|
||||
nms_thresh: 0.2
|
||||
|
|
@ -1,53 +1,132 @@
|
|||
Global:
|
||||
algorithm: DB
|
||||
use_gpu: true
|
||||
epoch_num: 1200
|
||||
log_smooth_window: 20
|
||||
print_batch_step: 2
|
||||
save_model_dir: ./output/det_db/
|
||||
save_epoch_step: 200
|
||||
eval_batch_step: [5000, 5000]
|
||||
train_batch_size_per_card: 8
|
||||
test_batch_size_per_card: 16
|
||||
image_shape: [3, 640, 640]
|
||||
reader_yml: ./configs/det/det_db_icdar15_reader.yml
|
||||
pretrain_weights: ./pretrain_models/ResNet50_vd_ssld_pretrained/
|
||||
save_res_path: ./output/det_db/predicts_db.txt
|
||||
checkpoints:
|
||||
save_model_dir: ./output/20201010/
|
||||
save_epoch_step: 1200
|
||||
# evaluation interval in iterations (the list form [4000, 5000] means: every 5000 iterations after the 4000th iteration)
|
||||
eval_batch_step: 8
|
||||
# if pretrained_model was saved in static mode, load_static_weights must be set to True
|
||||
load_static_weights: True
|
||||
cal_metric_during_train: False
|
||||
pretrained_model: /home/zhoujun20/pretrain_models/MobileNetV3_large_x0_5_pretrained
|
||||
checkpoints: #./output/det_db_0.001_DiceLoss_256_pp_config_2.0b_4gpu/best_accuracy
|
||||
save_inference_dir:
|
||||
use_visualdl: True
|
||||
infer_img: doc/imgs_en/img_10.jpg
|
||||
save_res_path: ./output/det_db/predicts_db.txt
|
||||
|
||||
Optimizer:
|
||||
name: Adam
|
||||
beta1: 0.9
|
||||
beta2: 0.999
|
||||
learning_rate:
|
||||
# name: Cosine
|
||||
lr: 0.001
|
||||
# warmup_epoch: 0
|
||||
regularizer:
|
||||
name: 'L2'
|
||||
factor: 0
|
||||
|
||||
Architecture:
|
||||
function: ppocr.modeling.architectures.det_model,DetModel
|
||||
|
||||
Backbone:
|
||||
function: ppocr.modeling.backbones.det_resnet_vd,ResNet
|
||||
layers: 50
|
||||
|
||||
Head:
|
||||
function: ppocr.modeling.heads.det_db_head,DBHead
|
||||
model_name: large
|
||||
k: 50
|
||||
inner_channels: 256
|
||||
out_channels: 2
|
||||
type: det
|
||||
algorithm: DB
|
||||
Transform:
|
||||
Backbone:
|
||||
name: ResNet
|
||||
layers: 50
|
||||
Neck:
|
||||
name: FPN
|
||||
out_channels: 256
|
||||
Head:
|
||||
name: DBHead
|
||||
k: 50
|
||||
|
||||
Loss:
|
||||
function: ppocr.modeling.losses.det_db_loss,DBLoss
|
||||
name: DBLoss
|
||||
balance_loss: true
|
||||
main_loss_type: DiceLoss
|
||||
alpha: 5
|
||||
beta: 10
|
||||
ohem_ratio: 3
|
||||
|
||||
Optimizer:
|
||||
function: ppocr.optimizer,AdamDecay
|
||||
base_lr: 0.001
|
||||
beta1: 0.9
|
||||
beta2: 0.999
|
||||
|
||||
PostProcess:
|
||||
function: ppocr.postprocess.db_postprocess,DBPostProcess
|
||||
name: DBPostProcess
|
||||
thresh: 0.3
|
||||
box_thresh: 0.7
|
||||
box_thresh: 0.6
|
||||
max_candidates: 1000
|
||||
unclip_ratio: 1.5
|
||||
|
||||
|
||||
Metric:
|
||||
name: DetMetric
|
||||
main_indicator: hmean
|
||||
|
||||
TRAIN:
|
||||
dataset:
|
||||
name: SimpleDataSet
|
||||
data_dir: /home/zhoujun20/detection/
|
||||
file_list:
|
||||
- /home/zhoujun20/detection/train_icdar2015_label.txt # dataset1
|
||||
ratio_list: [1.0]
|
||||
transforms:
|
||||
- DecodeImage: # load image
|
||||
img_mode: BGR
|
||||
channel_first: False
|
||||
- DetLabelEncode: # Class handling label
|
||||
- IaaAugment:
|
||||
augmenter_args:
|
||||
- { 'type': Fliplr, 'args': { 'p': 0.5 } }
|
||||
- { 'type': Affine, 'args': { 'rotate': [ -10,10 ] } }
|
||||
- { 'type': Resize,'args': { 'size': [ 0.5,3 ] } }
|
||||
- EastRandomCropData:
|
||||
size: [ 640,640 ]
|
||||
max_tries: 50
|
||||
keep_ratio: true
|
||||
- MakeBorderMap:
|
||||
shrink_ratio: 0.4
|
||||
thresh_min: 0.3
|
||||
thresh_max: 0.7
|
||||
- MakeShrinkMap:
|
||||
shrink_ratio: 0.4
|
||||
min_text_size: 8
|
||||
- NormalizeImage:
|
||||
scale: 1./255.
|
||||
mean: [ 0.485, 0.456, 0.406 ]
|
||||
std: [ 0.229, 0.224, 0.225 ]
|
||||
order: 'hwc'
|
||||
- ToCHWImage:
|
||||
- keepKeys:
|
||||
keep_keys: ['image','threshold_map','threshold_mask','shrink_map','shrink_mask'] # the dataloader returns a list in this order
|
||||
loader:
|
||||
shuffle: True
|
||||
drop_last: False
|
||||
batch_size: 16
|
||||
num_workers: 6
|
||||
|
||||
EVAL:
|
||||
dataset:
|
||||
name: SimpleDataSet
|
||||
data_dir: /home/zhoujun20/detection/
|
||||
file_list:
|
||||
- /home/zhoujun20/detection/test_icdar2015_label.txt
|
||||
transforms:
|
||||
- DecodeImage: # load image
|
||||
img_mode: BGR
|
||||
channel_first: False
|
||||
- DetLabelEncode: # Class handling label
|
||||
- DetResizeForTest:
|
||||
image_shape: [736,1280]
|
||||
- NormalizeImage:
|
||||
scale: 1./255.
|
||||
mean: [ 0.485, 0.456, 0.406 ]
|
||||
std: [ 0.229, 0.224, 0.225 ]
|
||||
order: 'hwc'
|
||||
- ToCHWImage:
|
||||
- keepKeys:
|
||||
keep_keys: ['image','shape','polys','ignore_tags']
|
||||
loader:
|
||||
shuffle: False
|
||||
drop_last: False
|
||||
batch_size: 1 # must be 1
|
||||
num_workers: 6
|
|
@ -1,44 +0,0 @@
|
|||
Global:
|
||||
algorithm: EAST
|
||||
use_gpu: true
|
||||
epoch_num: 100000
|
||||
log_smooth_window: 20
|
||||
print_batch_step: 5
|
||||
save_model_dir: ./output/det_east/
|
||||
save_epoch_step: 200
|
||||
eval_batch_step: [5000, 5000]
|
||||
train_batch_size_per_card: 8
|
||||
test_batch_size_per_card: 16
|
||||
image_shape: [3, 512, 512]
|
||||
reader_yml: ./configs/det/det_east_icdar15_reader.yml
|
||||
pretrain_weights: ./pretrain_models/ResNet50_vd_ssld_pretrained/
|
||||
save_res_path: ./output/det_east/predicts_east.txt
|
||||
checkpoints:
|
||||
save_inference_dir:
|
||||
|
||||
Architecture:
|
||||
function: ppocr.modeling.architectures.det_model,DetModel
|
||||
|
||||
Backbone:
|
||||
function: ppocr.modeling.backbones.det_resnet_vd,ResNet
|
||||
layers: 50
|
||||
|
||||
Head:
|
||||
function: ppocr.modeling.heads.det_east_head,EASTHead
|
||||
model_name: large
|
||||
|
||||
Loss:
|
||||
function: ppocr.modeling.losses.det_east_loss,EASTLoss
|
||||
|
||||
Optimizer:
|
||||
function: ppocr.optimizer,AdamDecay
|
||||
base_lr: 0.001
|
||||
beta1: 0.9
|
||||
beta2: 0.999
|
||||
|
||||
PostProcess:
|
||||
function: ppocr.postprocess.east_postprocess,EASTPostPocess
|
||||
score_thresh: 0.8
|
||||
cover_thresh: 0.1
|
||||
nms_thresh: 0.2
|
||||
|
|
@ -1,50 +0,0 @@
|
|||
Global:
|
||||
algorithm: SAST
|
||||
use_gpu: true
|
||||
epoch_num: 2000
|
||||
log_smooth_window: 20
|
||||
print_batch_step: 2
|
||||
save_model_dir: ./output/det_sast/
|
||||
save_epoch_step: 20
|
||||
eval_batch_step: 5000
|
||||
train_batch_size_per_card: 8
|
||||
test_batch_size_per_card: 8
|
||||
image_shape: [3, 512, 512]
|
||||
reader_yml: ./configs/det/det_sast_icdar15_reader.yml
|
||||
pretrain_weights: ./pretrain_models/ResNet50_vd_ssld_pretrained/
|
||||
save_res_path: ./output/det_sast/predicts_sast.txt
|
||||
checkpoints:
|
||||
save_inference_dir:
|
||||
|
||||
Architecture:
|
||||
function: ppocr.modeling.architectures.det_model,DetModel
|
||||
|
||||
Backbone:
|
||||
function: ppocr.modeling.backbones.det_resnet_vd_sast,ResNet
|
||||
layers: 50
|
||||
|
||||
Head:
|
||||
function: ppocr.modeling.heads.det_sast_head,SASTHead
|
||||
model_name: large
|
||||
only_fpn_up: False
|
||||
# with_cab: False
|
||||
with_cab: True
|
||||
|
||||
Loss:
|
||||
function: ppocr.modeling.losses.det_sast_loss,SASTLoss
|
||||
|
||||
Optimizer:
|
||||
function: ppocr.optimizer,RMSProp
|
||||
base_lr: 0.001
|
||||
decay:
|
||||
function: piecewise_decay
|
||||
boundaries: [30000, 50000, 80000, 100000, 150000]
|
||||
decay_rate: 0.3
|
||||
|
||||
PostProcess:
|
||||
function: ppocr.postprocess.sast_postprocess,SASTPostProcess
|
||||
score_thresh: 0.5
|
||||
sample_pts_num: 2
|
||||
nms_thresh: 0.2
|
||||
expand_scale: 1.0
|
||||
shrink_ratio_of_width: 0.3
|
|
@ -1,50 +0,0 @@
|
|||
Global:
|
||||
algorithm: SAST
|
||||
use_gpu: true
|
||||
epoch_num: 2000
|
||||
log_smooth_window: 20
|
||||
print_batch_step: 2
|
||||
save_model_dir: ./output/det_sast/
|
||||
save_epoch_step: 20
|
||||
eval_batch_step: 5000
|
||||
train_batch_size_per_card: 8
|
||||
test_batch_size_per_card: 1
|
||||
image_shape: [3, 512, 512]
|
||||
reader_yml: ./configs/det/det_sast_totaltext_reader.yml
|
||||
pretrain_weights: ./pretrain_models/ResNet50_vd_ssld_pretrained/
|
||||
save_res_path: ./output/det_sast/predicts_sast.txt
|
||||
checkpoints:
|
||||
save_inference_dir:
|
||||
|
||||
Architecture:
|
||||
function: ppocr.modeling.architectures.det_model,DetModel
|
||||
|
||||
Backbone:
|
||||
function: ppocr.modeling.backbones.det_resnet_vd_sast,ResNet
|
||||
layers: 50
|
||||
|
||||
Head:
|
||||
function: ppocr.modeling.heads.det_sast_head,SASTHead
|
||||
model_name: large
|
||||
only_fpn_up: False
|
||||
# with_cab: False
|
||||
with_cab: True
|
||||
|
||||
Loss:
|
||||
function: ppocr.modeling.losses.det_sast_loss,SASTLoss
|
||||
|
||||
Optimizer:
|
||||
function: ppocr.optimizer,RMSProp
|
||||
base_lr: 0.001
|
||||
decay:
|
||||
function: piecewise_decay
|
||||
boundaries: [30000, 50000, 80000, 100000, 150000]
|
||||
decay_rate: 0.3
|
||||
|
||||
PostProcess:
|
||||
function: ppocr.postprocess.sast_postprocess,SASTPostProcess
|
||||
score_thresh: 0.5
|
||||
sample_pts_num: 6
|
||||
nms_thresh: 0.2
|
||||
expand_scale: 1.2
|
||||
shrink_ratio_of_width: 0.2
|
|
@ -1,24 +0,0 @@
|
|||
TrainReader:
|
||||
reader_function: ppocr.data.det.dataset_traversal,TrainReader
|
||||
process_function: ppocr.data.det.sast_process,SASTProcessTrain
|
||||
num_workers: 8
|
||||
img_set_dir: ./train_data/
|
||||
label_file_path: [./train_data/icdar2013/train_label_json.txt, ./train_data/icdar2015/train_label_json.txt, ./train_data/icdar17_mlt_latin/train_label_json.txt, ./train_data/coco_text_icdar_4pts/train_label_json.txt]
|
||||
data_ratio_list: [0.1, 0.45, 0.3, 0.15]
|
||||
min_crop_side_ratio: 0.3
|
||||
min_crop_size: 24
|
||||
min_text_size: 4
|
||||
max_text_size: 512
|
||||
|
||||
EvalReader:
|
||||
reader_function: ppocr.data.det.dataset_traversal,EvalTestReader
|
||||
process_function: ppocr.data.det.sast_process,SASTProcessTest
|
||||
img_set_dir: ./train_data/icdar2015/text_localization/
|
||||
label_file_path: ./train_data/icdar2015/text_localization/test_icdar2015_label.txt
|
||||
max_side_len: 1536
|
||||
|
||||
TestReader:
|
||||
reader_function: ppocr.data.det.dataset_traversal,EvalTestReader
|
||||
process_function: ppocr.data.det.sast_process,SASTProcessTest
|
||||
infer_img: ./train_data/icdar2015/text_localization/ch4_test_images/img_11.jpg
|
||||
max_side_len: 1536
|
|
@ -1,24 +0,0 @@
|
|||
TrainReader:
|
||||
reader_function: ppocr.data.det.dataset_traversal,TrainReader
|
||||
process_function: ppocr.data.det.sast_process,SASTProcessTrain
|
||||
num_workers: 8
|
||||
img_set_dir: ./train_data/
|
||||
label_file_path: [./train_data/art_latin_icdar_14pt/train_no_tt_test/train_label_json.txt, ./train_data/total_text_icdar_14pt/train_label_json.txt]
|
||||
data_ratio_list: [0.5, 0.5]
|
||||
min_crop_side_ratio: 0.3
|
||||
min_crop_size: 24
|
||||
min_text_size: 4
|
||||
max_text_size: 512
|
||||
|
||||
EvalReader:
|
||||
reader_function: ppocr.data.det.dataset_traversal,EvalTestReader
|
||||
process_function: ppocr.data.det.sast_process,SASTProcessTest
|
||||
img_set_dir: ./train_data/
|
||||
label_file_path: ./train_data/total_text_icdar_14pt/test_label_json.txt
|
||||
max_side_len: 768
|
||||
|
||||
TestReader:
|
||||
reader_function: ppocr.data.det.dataset_traversal,EvalTestReader
|
||||
process_function: ppocr.data.det.sast_process,SASTProcessTest
|
||||
infer_img: ./train_data/afs/total_text/Images/Test/img623.jpg
|
||||
max_side_len: 768
|
|
@ -1,12 +0,0 @@
|
|||
TrainReader:
|
||||
reader_function: ppocr.data.rec.dataset_traversal,LMDBReader
|
||||
num_workers: 8
|
||||
lmdb_sets_dir: ./train_data/data_lmdb_release/training/
|
||||
|
||||
EvalReader:
|
||||
reader_function: ppocr.data.rec.dataset_traversal,LMDBReader
|
||||
lmdb_sets_dir: ./train_data/data_lmdb_release/validation/
|
||||
|
||||
TestReader:
|
||||
reader_function: ppocr.data.rec.dataset_traversal,LMDBReader
|
||||
lmdb_sets_dir: ./train_data/data_lmdb_release/evaluation/
|
|
@ -1,45 +0,0 @@
|
|||
Global:
|
||||
algorithm: CRNN
|
||||
use_gpu: true
|
||||
epoch_num: 3000
|
||||
log_smooth_window: 20
|
||||
print_batch_step: 10
|
||||
save_model_dir: ./output/rec_CRNN
|
||||
save_epoch_step: 3
|
||||
eval_batch_step: 2000
|
||||
train_batch_size_per_card: 128
|
||||
test_batch_size_per_card: 128
|
||||
image_shape: [3, 32, 320]
|
||||
max_text_length: 25
|
||||
character_type: ch
|
||||
character_dict_path: ./ppocr/utils/ppocr_keys_v1.txt
|
||||
loss_type: ctc
|
||||
distort: false
|
||||
use_space_char: false
|
||||
reader_yml: ./configs/rec/rec_chinese_reader.yml
|
||||
pretrain_weights:
|
||||
checkpoints:
|
||||
save_inference_dir:
|
||||
infer_img:
|
||||
|
||||
Architecture:
|
||||
function: ppocr.modeling.architectures.rec_model,RecModel
|
||||
|
||||
Backbone:
|
||||
function: ppocr.modeling.backbones.rec_resnet_vd,ResNet
|
||||
layers: 34
|
||||
|
||||
Head:
|
||||
function: ppocr.modeling.heads.rec_ctc_head,CTCPredict
|
||||
encoder_type: rnn
|
||||
SeqRNN:
|
||||
hidden_size: 256
|
||||
|
||||
Loss:
|
||||
function: ppocr.modeling.losses.rec_ctc_loss,CTCLoss
|
||||
|
||||
Optimizer:
|
||||
function: ppocr.optimizer,AdamDecay
|
||||
base_lr: 0.0005
|
||||
beta1: 0.9
|
||||
beta2: 0.999
|
|
@ -1,46 +0,0 @@
|
|||
Global:
|
||||
algorithm: CRNN
|
||||
use_gpu: true
|
||||
epoch_num: 3000
|
||||
log_smooth_window: 20
|
||||
print_batch_step: 10
|
||||
save_model_dir: ./output/rec_CRNN
|
||||
save_epoch_step: 3
|
||||
eval_batch_step: 2000
|
||||
train_batch_size_per_card: 256
|
||||
test_batch_size_per_card: 256
|
||||
image_shape: [3, 32, 320]
|
||||
max_text_length: 25
|
||||
character_type: ch
|
||||
character_dict_path: ./ppocr/utils/ppocr_keys_v1.txt
|
||||
loss_type: ctc
|
||||
distort: false
|
||||
use_space_char: false
|
||||
reader_yml: ./configs/rec/rec_chinese_reader.yml
|
||||
pretrain_weights:
|
||||
checkpoints:
|
||||
save_inference_dir:
|
||||
infer_img:
|
||||
|
||||
Architecture:
|
||||
function: ppocr.modeling.architectures.rec_model,RecModel
|
||||
|
||||
Backbone:
|
||||
function: ppocr.modeling.backbones.rec_mobilenet_v3,MobileNetV3
|
||||
scale: 0.5
|
||||
model_name: small
|
||||
|
||||
Head:
|
||||
function: ppocr.modeling.heads.rec_ctc_head,CTCPredict
|
||||
encoder_type: rnn
|
||||
SeqRNN:
|
||||
hidden_size: 48
|
||||
|
||||
Loss:
|
||||
function: ppocr.modeling.losses.rec_ctc_loss,CTCLoss
|
||||
|
||||
Optimizer:
|
||||
function: ppocr.optimizer,AdamDecay
|
||||
base_lr: 0.0005
|
||||
beta1: 0.9
|
||||
beta2: 0.999
|
|
@ -1,13 +0,0 @@
|
|||
TrainReader:
|
||||
reader_function: ppocr.data.rec.dataset_traversal,SimpleReader
|
||||
num_workers: 8
|
||||
img_set_dir: ./train_data
|
||||
label_file_path: ./train_data/rec_gt_train.txt
|
||||
|
||||
EvalReader:
|
||||
reader_function: ppocr.data.rec.dataset_traversal,SimpleReader
|
||||
img_set_dir: ./train_data
|
||||
label_file_path: ./train_data/rec_gt_test.txt
|
||||
|
||||
TestReader:
|
||||
reader_function: ppocr.data.rec.dataset_traversal,SimpleReader
|
|
@ -1,13 +0,0 @@
|
|||
TrainReader:
|
||||
reader_function: ppocr.data.rec.dataset_traversal,SimpleReader
|
||||
num_workers: 8
|
||||
img_set_dir: ./train_data/ic15_data
|
||||
label_file_path: ./train_data/ic15_data/rec_gt_train.txt
|
||||
|
||||
EvalReader:
|
||||
reader_function: ppocr.data.rec.dataset_traversal,SimpleReader
|
||||
img_set_dir: ./train_data/ic15_data
|
||||
label_file_path: ./train_data/ic15_data/rec_gt_test.txt
|
||||
|
||||
TestReader:
|
||||
reader_function: ppocr.data.rec.dataset_traversal,SimpleReader
|
|
@ -1,49 +0,0 @@
|
|||
Global:
|
||||
algorithm: CRNN
|
||||
use_gpu: true
|
||||
epoch_num: 1000
|
||||
log_smooth_window: 20
|
||||
print_batch_step: 10
|
||||
save_model_dir: ./output/rec_CRNN
|
||||
save_epoch_step: 300
|
||||
eval_batch_step: 500
|
||||
train_batch_size_per_card: 256
|
||||
test_batch_size_per_card: 256
|
||||
image_shape: [3, 32, 100]
|
||||
max_text_length: 25
|
||||
character_type: en
|
||||
loss_type: ctc
|
||||
distort: true
|
||||
debug: false
|
||||
reader_yml: ./configs/rec/rec_icdar15_reader.yml
|
||||
pretrain_weights: ./pretrain_models/rec_mv3_none_bilstm_ctc/best_accuracy
|
||||
checkpoints:
|
||||
save_inference_dir:
|
||||
infer_img:
|
||||
|
||||
Architecture:
|
||||
function: ppocr.modeling.architectures.rec_model,RecModel
|
||||
|
||||
Backbone:
|
||||
function: ppocr.modeling.backbones.rec_mobilenet_v3,MobileNetV3
|
||||
scale: 0.5
|
||||
model_name: large
|
||||
|
||||
Head:
|
||||
function: ppocr.modeling.heads.rec_ctc_head,CTCPredict
|
||||
encoder_type: rnn
|
||||
SeqRNN:
|
||||
hidden_size: 96
|
||||
|
||||
Loss:
|
||||
function: ppocr.modeling.losses.rec_ctc_loss,CTCLoss
|
||||
|
||||
Optimizer:
|
||||
function: ppocr.optimizer,AdamDecay
|
||||
base_lr: 0.0005
|
||||
beta1: 0.9
|
||||
beta2: 0.999
|
||||
decay:
|
||||
function: cosine_decay
|
||||
step_each_epoch: 20
|
||||
total_epoch: 1000
|
|
@ -1,43 +1,108 @@
|
|||
Global:
|
||||
algorithm: CRNN
|
||||
use_gpu: true
|
||||
epoch_num: 72
|
||||
use_gpu: false
|
||||
epoch_num: 500
|
||||
log_smooth_window: 20
|
||||
print_batch_step: 10
|
||||
save_model_dir: output/rec_CRNN
|
||||
save_epoch_step: 3
|
||||
eval_batch_step: 2000
|
||||
train_batch_size_per_card: 256
|
||||
test_batch_size_per_card: 256
|
||||
image_shape: [3, 32, 100]
|
||||
max_text_length: 25
|
||||
character_type: en
|
||||
loss_type: ctc
|
||||
reader_yml: ./configs/rec/rec_benchmark_reader.yml
|
||||
pretrain_weights:
|
||||
checkpoints:
|
||||
save_model_dir: ./output/rec/test/
|
||||
save_epoch_step: 500
|
||||
# evaluation interval in iterations (the list form [4000, 5000] means: every 5000 iterations after the 4000th iteration)
|
||||
eval_batch_step: 127
|
||||
# if pretrained_model was saved in static mode, load_static_weights must be set to True
|
||||
load_static_weights: True
|
||||
cal_metric_during_train: True
|
||||
pretrained_model:
|
||||
checkpoints: #output/rec/rec_crnn/best_accuracy
|
||||
save_inference_dir:
|
||||
infer_img:
|
||||
|
||||
Architecture:
|
||||
function: ppocr.modeling.architectures.rec_model,RecModel
|
||||
use_visualdl: False
|
||||
infer_img: doc/imgs_words/ch/word_1.jpg
|
||||
# for data or label process
|
||||
max_text_length: 80
|
||||
character_dict_path: ppocr/utils/ppocr_keys_v1.txt
|
||||
character_type: 'ch'
|
||||
use_space_char: False
|
||||
infer_mode: False
|
||||
use_tps: False
|
||||
|
||||
Backbone:
|
||||
function: ppocr.modeling.backbones.rec_mobilenet_v3,MobileNetV3
|
||||
scale: 0.5
|
||||
model_name: large
|
||||
|
||||
Head:
|
||||
function: ppocr.modeling.heads.rec_ctc_head,CTCPredict
|
||||
encoder_type: rnn
|
||||
SeqRNN:
|
||||
hidden_size: 96
|
||||
|
||||
Loss:
|
||||
function: ppocr.modeling.losses.rec_ctc_loss,CTCLoss
|
||||
|
||||
Optimizer:
|
||||
function: ppocr.optimizer,AdamDecay
|
||||
base_lr: 0.001
|
||||
name: Adam
|
||||
beta1: 0.9
|
||||
beta2: 0.999
|
||||
learning_rate:
|
||||
name: Cosine
|
||||
lr: 0.001
|
||||
warmup_epoch: 4
|
||||
regularizer:
|
||||
name: 'L2'
|
||||
factor: 0.00001
|
||||
|
||||
Architecture:
|
||||
type: rec
|
||||
algorithm: CRNN
|
||||
Transform:
|
||||
Backbone:
|
||||
name: MobileNetV3
|
||||
scale: 0.5
|
||||
model_name: small
|
||||
small_stride: [ 1, 2, 2, 2 ]
|
||||
Neck:
|
||||
name: SequenceEncoder
|
||||
encoder_type: fc
|
||||
hidden_size: 96
|
||||
Head:
|
||||
name: CTC
|
||||
fc_decay: 0.00001
|
||||
|
||||
Loss:
|
||||
name: CTCLoss
|
||||
|
||||
PostProcess:
|
||||
name: CTCLabelDecode
|
||||
|
||||
Metric:
|
||||
name: RecMetric
|
||||
main_indicator: acc
|
||||
|
||||
TRAIN:
|
||||
dataset:
|
||||
name: SimpleDataSet
|
||||
data_dir: /home/zhoujun20/rec
|
||||
file_list:
|
||||
- /home/zhoujun20/rec/real_data.txt # dataset1
|
||||
ratio_list: [ 0.4,0.6 ]
|
||||
transforms:
|
||||
- DecodeImage: # load image
|
||||
img_mode: BGR
|
||||
channel_first: False
|
||||
- CTCLabelEncode: # Class handling label
|
||||
- RecAug:
|
||||
- RecResizeImg:
|
||||
image_shape: [ 3,32,320 ]
|
||||
- keepKeys:
|
||||
keep_keys: [ 'image','label','length' ] # the dataloader returns a list in this order
|
||||
loader:
|
||||
batch_size: 256
|
||||
shuffle: True
|
||||
drop_last: True
|
||||
num_workers: 6
|
||||
|
||||
EVAL:
|
||||
dataset:
|
||||
name: SimpleDataSet
|
||||
data_dir: /home/zhoujun20/rec
|
||||
file_list:
|
||||
- /home/zhoujun20/rec/label_val_all.txt
|
||||
transforms:
|
||||
- DecodeImage: # load image
|
||||
img_mode: BGR
|
||||
channel_first: False
|
||||
- CTCLabelEncode: # Class handling label
|
||||
- RecResizeImg:
|
||||
image_shape: [ 3,32,320 ]
|
||||
- keepKeys:
|
||||
keep_keys: [ 'image','label','length' ] # the dataloader returns a list in this order
|
||||
loader:
|
||||
shuffle: False
|
||||
drop_last: False
|
||||
batch_size: 256
|
||||
num_workers: 6
|
||||
|
|
|
@ -0,0 +1,106 @@
|
|||
Global:
|
||||
use_gpu: true
|
||||
epoch_num: 500
|
||||
log_smooth_window: 20
|
||||
print_batch_step: 1
|
||||
save_model_dir: ./output/rec/test/
|
||||
save_epoch_step: 500
|
||||
# evaluation interval in iterations (the list form [4000, 5000] means: every 5000 iterations after the 4000th iteration)
|
||||
eval_batch_step: 1016
|
||||
# if pretrained_model was saved in static mode, load_static_weights must be set to True
|
||||
load_static_weights: True
|
||||
cal_metric_during_train: True
|
||||
pretrained_model:
|
||||
checkpoints: #output/rec/rec_crnn/best_accuracy
|
||||
save_inference_dir:
|
||||
use_visualdl: True
|
||||
infer_img: doc/imgs_words/ch/word_1.jpg
|
||||
# for data or label process
|
||||
max_text_length: 80
|
||||
character_dict_path: /home/zhoujun20/rec/lmdb/dict.txt
|
||||
character_type: 'ch'
|
||||
use_space_char: True
|
||||
infer_mode: False
|
||||
use_tps: False
|
||||
|
||||
|
||||
Optimizer:
|
||||
name: Adam
|
||||
beta1: 0.9
|
||||
beta2: 0.999
|
||||
learning_rate:
|
||||
name: Cosine
|
||||
lr: 0.0005
|
||||
warmup_epoch: 1
|
||||
regularizer:
|
||||
name: 'L2'
|
||||
factor: 0.00001
|
||||
|
||||
Architecture:
|
||||
type: rec
|
||||
algorithm: CRNN
|
||||
Transform:
|
||||
Backbone:
|
||||
name: MobileNetV3
|
||||
scale: 0.5
|
||||
model_name: small
|
||||
small_stride: [ 1, 2, 2, 2 ]
|
||||
Neck:
|
||||
name: SequenceEncoder
|
||||
encoder_type: rnn
|
||||
hidden_size: 48
|
||||
Head:
|
||||
name: CTC
|
||||
fc_decay: 0.00001
|
||||
|
||||
Loss:
|
||||
name: CTCLoss
|
||||
|
||||
PostProcess:
|
||||
name: CTCLabelDecode
|
||||
|
||||
Metric:
|
||||
name: RecMetric
|
||||
main_indicator: acc
|
||||
|
||||
TRAIN:
|
||||
dataset:
|
||||
name: LMDBDateSet
|
||||
file_list:
|
||||
- /home/zhoujun20/rec/lmdb/train # dataset1
|
||||
ratio_list: [ 0.4,0.6 ]
|
||||
transforms:
|
||||
- DecodeImage: # load image
|
||||
img_mode: BGR
|
||||
channel_first: False
|
||||
- CTCLabelEncode: # Class handling label
|
||||
- RecAug:
|
||||
- RecResizeImg:
|
||||
image_shape: [ 3,32,320 ]
|
||||
- keepKeys:
|
||||
keep_keys: [ 'image','label','length' ] # the dataloader returns a list in this order
|
||||
loader:
|
||||
batch_size: 256
|
||||
shuffle: True
|
||||
drop_last: True
|
||||
num_workers: 6
|
||||
|
||||
EVAL:
|
||||
dataset:
|
||||
name: LMDBDateSet
|
||||
file_list:
|
||||
- /home/zhoujun20/rec/lmdb/val
|
||||
transforms:
|
||||
- DecodeImage: # load image
|
||||
img_mode: BGR
|
||||
channel_first: False
|
||||
- CTCLabelEncode: # Class handling label
|
||||
- RecResizeImg:
|
||||
image_shape: [ 3,32,320 ]
|
||||
- keepKeys:
|
||||
keep_keys: [ 'image','label','length' ] # the dataloader returns a list in this order
|
||||
loader:
|
||||
shuffle: False
|
||||
drop_last: False
|
||||
batch_size: 256
|
||||
num_workers: 6
|
|
@ -1,41 +0,0 @@
|
|||
Global:
|
||||
algorithm: Rosetta
|
||||
use_gpu: true
|
||||
epoch_num: 72
|
||||
log_smooth_window: 20
|
||||
print_batch_step: 10
|
||||
save_model_dir: output/rec_Rosetta
|
||||
save_epoch_step: 3
|
||||
eval_batch_step: 2000
|
||||
train_batch_size_per_card: 256
|
||||
test_batch_size_per_card: 256
|
||||
image_shape: [3, 32, 100]
|
||||
max_text_length: 25
|
||||
character_type: en
|
||||
loss_type: ctc
|
||||
reader_yml: ./configs/rec/rec_benchmark_reader.yml
|
||||
pretrain_weights:
|
||||
checkpoints:
|
||||
save_inference_dir:
|
||||
infer_img:
|
||||
|
||||
Architecture:
|
||||
function: ppocr.modeling.architectures.rec_model,RecModel
|
||||
|
||||
Backbone:
|
||||
function: ppocr.modeling.backbones.rec_mobilenet_v3,MobileNetV3
|
||||
scale: 0.5
|
||||
model_name: large
|
||||
|
||||
Head:
|
||||
function: ppocr.modeling.heads.rec_ctc_head,CTCPredict
|
||||
encoder_type: reshape
|
||||
|
||||
Loss:
|
||||
function: ppocr.modeling.losses.rec_ctc_loss,CTCLoss
|
||||
|
||||
Optimizer:
|
||||
function: ppocr.optimizer,AdamDecay
|
||||
base_lr: 0.001
|
||||
beta1: 0.9
|
||||
beta2: 0.999
|
|
@ -1,54 +0,0 @@
|
|||
Global:
|
||||
algorithm: RARE
|
||||
use_gpu: true
|
||||
epoch_num: 72
|
||||
log_smooth_window: 20
|
||||
print_batch_step: 10
|
||||
save_model_dir: output/rec_RARE
|
||||
save_epoch_step: 3
|
||||
eval_batch_step: 2000
|
||||
train_batch_size_per_card: 256
|
||||
test_batch_size_per_card: 256
|
||||
image_shape: [3, 32, 100]
|
||||
max_text_length: 25
|
||||
character_type: en
|
||||
loss_type: attention
|
||||
tps: true
|
||||
reader_yml: ./configs/rec/rec_benchmark_reader.yml
|
||||
pretrain_weights:
|
||||
checkpoints:
|
||||
save_inference_dir:
|
||||
infer_img:
|
||||
|
||||
|
||||
Architecture:
|
||||
function: ppocr.modeling.architectures.rec_model,RecModel
|
||||
|
||||
TPS:
|
||||
function: ppocr.modeling.stns.tps,TPS
|
||||
num_fiducial: 20
|
||||
loc_lr: 0.1
|
||||
model_name: small
|
||||
|
||||
Backbone:
|
||||
function: ppocr.modeling.backbones.rec_mobilenet_v3,MobileNetV3
|
||||
scale: 0.5
|
||||
model_name: large
|
||||
|
||||
Head:
|
||||
function: ppocr.modeling.heads.rec_attention_head,AttentionPredict
|
||||
encoder_type: rnn
|
||||
SeqRNN:
|
||||
hidden_size: 96
|
||||
Attention:
|
||||
decoder_size: 96
|
||||
word_vector_dim: 96
|
||||
|
||||
Loss:
|
||||
function: ppocr.modeling.losses.rec_attention_loss,AttentionLoss
|
||||
|
||||
Optimizer:
|
||||
function: ppocr.optimizer,AdamDecay
|
||||
base_lr: 0.001
|
||||
beta1: 0.9
|
||||
beta2: 0.999
|
|
@ -1,51 +0,0 @@
|
|||
Global:
|
||||
algorithm: STARNet
|
||||
use_gpu: true
|
||||
epoch_num: 72
|
||||
log_smooth_window: 20
|
||||
print_batch_step: 10
|
||||
save_model_dir: output/rec_STARNet
|
||||
save_epoch_step: 3
|
||||
eval_batch_step: 2000
|
||||
train_batch_size_per_card: 256
|
||||
test_batch_size_per_card: 256
|
||||
image_shape: [3, 32, 100]
|
||||
max_text_length: 25
|
||||
character_type: en
|
||||
loss_type: ctc
|
||||
tps: true
|
||||
reader_yml: ./configs/rec/rec_benchmark_reader.yml
|
||||
pretrain_weights:
|
||||
checkpoints:
|
||||
save_inference_dir:
|
||||
infer_img:
|
||||
|
||||
|
||||
Architecture:
|
||||
function: ppocr.modeling.architectures.rec_model,RecModel
|
||||
|
||||
TPS:
|
||||
function: ppocr.modeling.stns.tps,TPS
|
||||
num_fiducial: 20
|
||||
loc_lr: 0.1
|
||||
model_name: small
|
||||
|
||||
Backbone:
|
||||
function: ppocr.modeling.backbones.rec_mobilenet_v3,MobileNetV3
|
||||
scale: 0.5
|
||||
model_name: large
|
||||
|
||||
Head:
|
||||
function: ppocr.modeling.heads.rec_ctc_head,CTCPredict
|
||||
encoder_type: rnn
|
||||
SeqRNN:
|
||||
hidden_size: 96
|
||||
|
||||
Loss:
|
||||
function: ppocr.modeling.losses.rec_ctc_loss,CTCLoss
|
||||
|
||||
Optimizer:
|
||||
function: ppocr.optimizer,AdamDecay
|
||||
base_lr: 0.001
|
||||
beta1: 0.9
|
||||
beta2: 0.999
|
|
@ -1,43 +1,106 @@
|
|||
Global:
|
||||
algorithm: CRNN
|
||||
use_gpu: true
|
||||
epoch_num: 72
|
||||
use_gpu: false
|
||||
epoch_num: 500
|
||||
log_smooth_window: 20
|
||||
print_batch_step: 10
|
||||
save_model_dir: output/rec_CRNN
|
||||
save_epoch_step: 3
|
||||
eval_batch_step: 2000
|
||||
train_batch_size_per_card: 256
|
||||
test_batch_size_per_card: 256
|
||||
image_shape: [3, 32, 100]
|
||||
max_text_length: 25
|
||||
character_type: en
|
||||
loss_type: ctc
|
||||
reader_yml: ./configs/rec/rec_benchmark_reader.yml
|
||||
pretrain_weights:
|
||||
checkpoints:
|
||||
save_model_dir: ./output/rec/test/
|
||||
save_epoch_step: 500
|
||||
# presumably: evaluation is run every `eval_batch_step` iterations (127 below, not a 5000/4000 schedule) - confirm against the training loop
|
||||
eval_batch_step: 127
|
||||
# if pretrained_model was saved in static mode, load_static_weights must be set to True
|
||||
load_static_weights: True
|
||||
cal_metric_during_train: True
|
||||
pretrained_model:
|
||||
checkpoints: #output/rec/rec_crnn/best_accuracy
|
||||
save_inference_dir:
|
||||
infer_img:
|
||||
use_visualdl: False
|
||||
infer_img: doc/imgs_words/ch/word_1.jpg
|
||||
# for data or label process
|
||||
max_text_length: 80
|
||||
character_dict_path: ppocr/utils/ppocr_keys_v1.txt
|
||||
character_type: 'ch'
|
||||
use_space_char: False
|
||||
infer_mode: False
|
||||
use_tps: False
|
||||
|
||||
|
||||
Architecture:
|
||||
function: ppocr.modeling.architectures.rec_model,RecModel
|
||||
|
||||
Backbone:
|
||||
function: ppocr.modeling.backbones.rec_resnet_vd,ResNet
|
||||
layers: 34
|
||||
|
||||
Head:
|
||||
function: ppocr.modeling.heads.rec_ctc_head,CTCPredict
|
||||
encoder_type: rnn
|
||||
SeqRNN:
|
||||
hidden_size: 256
|
||||
|
||||
Loss:
|
||||
function: ppocr.modeling.losses.rec_ctc_loss,CTCLoss
|
||||
|
||||
Optimizer:
|
||||
function: ppocr.optimizer,AdamDecay
|
||||
base_lr: 0.001
|
||||
name: Adam
|
||||
beta1: 0.9
|
||||
beta2: 0.999
|
||||
learning_rate:
|
||||
name: Cosine
|
||||
lr: 0.001
|
||||
warmup_epoch: 4
|
||||
regularizer:
|
||||
name: 'L2'
|
||||
factor: 0.00001
|
||||
|
||||
Architecture:
|
||||
type: rec
|
||||
algorithm: CRNN
|
||||
Transform:
|
||||
Backbone:
|
||||
name: ResNet
|
||||
layers: 200
|
||||
Neck:
|
||||
name: SequenceEncoder
|
||||
encoder_type: fc
|
||||
hidden_size: 96
|
||||
Head:
|
||||
name: CTC
|
||||
fc_decay: 0.00001
|
||||
|
||||
Loss:
|
||||
name: CTCLoss
|
||||
|
||||
PostProcess:
|
||||
name: CTCLabelDecode
|
||||
|
||||
Metric:
|
||||
name: RecMetric
|
||||
main_indicator: acc
|
||||
|
||||
TRAIN:
|
||||
dataset:
|
||||
name: SimpleDataSet
|
||||
data_dir: /home/zhoujun20/rec
|
||||
file_list:
|
||||
- /home/zhoujun20/rec/real_data.txt # dataset1
|
||||
ratio_list: [ 0.4,0.6 ]
|
||||
transforms:
|
||||
- DecodeImage: # load image
|
||||
img_mode: BGR
|
||||
channel_first: False
|
||||
- CTCLabelEncode: # Class handling label
|
||||
- RecAug:
|
||||
- RecResizeImg:
|
||||
image_shape: [ 3,32,320 ]
|
||||
- keepKeys:
|
||||
keep_keys: [ 'image','label','length' ] # the dataloader returns a list in this order
|
||||
loader:
|
||||
batch_size: 256
|
||||
shuffle: True
|
||||
drop_last: True
|
||||
num_workers: 6
|
||||
|
||||
EVAL:
|
||||
dataset:
|
||||
name: SimpleDataSet
|
||||
data_dir: /home/zhoujun20/rec
|
||||
file_list:
|
||||
- /home/zhoujun20/rec/label_val_all.txt
|
||||
transforms:
|
||||
- DecodeImage: # load image
|
||||
img_mode: BGR
|
||||
channel_first: False
|
||||
- CTCLabelEncode: # Class handling label
|
||||
- RecResizeImg:
|
||||
image_shape: [ 3,32,320 ]
|
||||
- keepKeys:
|
||||
keep_keys: [ 'image','label','length' ] # the dataloader returns a list in this order
|
||||
loader:
|
||||
shuffle: False
|
||||
drop_last: False
|
||||
batch_size: 256
|
||||
num_workers: 6
|
||||
|
|
|
@ -1,40 +0,0 @@
|
|||
Global:
|
||||
algorithm: Rosetta
|
||||
use_gpu: true
|
||||
epoch_num: 72
|
||||
log_smooth_window: 20
|
||||
print_batch_step: 10
|
||||
save_model_dir: output/rec_Rosetta
|
||||
save_epoch_step: 3
|
||||
eval_batch_step: 2000
|
||||
train_batch_size_per_card: 256
|
||||
test_batch_size_per_card: 256
|
||||
image_shape: [3, 32, 100]
|
||||
max_text_length: 25
|
||||
character_type: en
|
||||
loss_type: ctc
|
||||
reader_yml: ./configs/rec/rec_benchmark_reader.yml
|
||||
pretrain_weights:
|
||||
checkpoints:
|
||||
save_inference_dir:
|
||||
infer_img:
|
||||
|
||||
Architecture:
|
||||
function: ppocr.modeling.architectures.rec_model,RecModel
|
||||
|
||||
Backbone:
|
||||
function: ppocr.modeling.backbones.rec_resnet_vd,ResNet
|
||||
layers: 34
|
||||
|
||||
Head:
|
||||
function: ppocr.modeling.heads.rec_ctc_head,CTCPredict
|
||||
encoder_type: reshape
|
||||
|
||||
Loss:
|
||||
function: ppocr.modeling.losses.rec_ctc_loss,CTCLoss
|
||||
|
||||
Optimizer:
|
||||
function: ppocr.optimizer,AdamDecay
|
||||
base_lr: 0.001
|
||||
beta1: 0.9
|
||||
beta2: 0.999
|
|
@ -1,53 +0,0 @@
|
|||
Global:
|
||||
algorithm: RARE
|
||||
use_gpu: true
|
||||
epoch_num: 72
|
||||
log_smooth_window: 20
|
||||
print_batch_step: 10
|
||||
save_model_dir: output/rec_RARE
|
||||
save_epoch_step: 3
|
||||
eval_batch_step: 2000
|
||||
train_batch_size_per_card: 256
|
||||
test_batch_size_per_card: 256
|
||||
image_shape: [3, 32, 100]
|
||||
max_text_length: 25
|
||||
character_type: en
|
||||
loss_type: attention
|
||||
tps: true
|
||||
reader_yml: ./configs/rec/rec_benchmark_reader.yml
|
||||
pretrain_weights:
|
||||
checkpoints:
|
||||
save_inference_dir:
|
||||
infer_img:
|
||||
|
||||
|
||||
Architecture:
|
||||
function: ppocr.modeling.architectures.rec_model,RecModel
|
||||
|
||||
TPS:
|
||||
function: ppocr.modeling.stns.tps,TPS
|
||||
num_fiducial: 20
|
||||
loc_lr: 0.1
|
||||
model_name: large
|
||||
|
||||
Backbone:
|
||||
function: ppocr.modeling.backbones.rec_resnet_vd,ResNet
|
||||
layers: 34
|
||||
|
||||
Head:
|
||||
function: ppocr.modeling.heads.rec_attention_head,AttentionPredict
|
||||
encoder_type: rnn
|
||||
SeqRNN:
|
||||
hidden_size: 256
|
||||
Attention:
|
||||
decoder_size: 128
|
||||
word_vector_dim: 128
|
||||
|
||||
Loss:
|
||||
function: ppocr.modeling.losses.rec_attention_loss,AttentionLoss
|
||||
|
||||
Optimizer:
|
||||
function: ppocr.optimizer,AdamDecay
|
||||
base_lr: 0.001
|
||||
beta1: 0.9
|
||||
beta2: 0.999
|
|
@ -1,50 +0,0 @@
|
|||
Global:
|
||||
algorithm: STARNet
|
||||
use_gpu: true
|
||||
epoch_num: 72
|
||||
log_smooth_window: 20
|
||||
print_batch_step: 10
|
||||
save_model_dir: output/rec_STARNet
|
||||
save_epoch_step: 3
|
||||
eval_batch_step: 2000
|
||||
train_batch_size_per_card: 256
|
||||
test_batch_size_per_card: 256
|
||||
image_shape: [3, 32, 100]
|
||||
max_text_length: 25
|
||||
character_type: en
|
||||
loss_type: ctc
|
||||
tps: true
|
||||
reader_yml: ./configs/rec/rec_benchmark_reader.yml
|
||||
pretrain_weights:
|
||||
checkpoints:
|
||||
save_inference_dir:
|
||||
infer_img:
|
||||
|
||||
|
||||
Architecture:
|
||||
function: ppocr.modeling.architectures.rec_model,RecModel
|
||||
|
||||
TPS:
|
||||
function: ppocr.modeling.stns.tps,TPS
|
||||
num_fiducial: 20
|
||||
loc_lr: 0.1
|
||||
model_name: large
|
||||
|
||||
Backbone:
|
||||
function: ppocr.modeling.backbones.rec_resnet_vd,ResNet
|
||||
layers: 34
|
||||
|
||||
Head:
|
||||
function: ppocr.modeling.heads.rec_ctc_head,CTCPredict
|
||||
encoder_type: rnn
|
||||
SeqRNN:
|
||||
hidden_size: 256
|
||||
|
||||
Loss:
|
||||
function: ppocr.modeling.losses.rec_ctc_loss,CTCLoss
|
||||
|
||||
Optimizer:
|
||||
function: ppocr.optimizer,AdamDecay
|
||||
base_lr: 0.001
|
||||
beta1: 0.9
|
||||
beta2: 0.999
|
|
@ -1,49 +0,0 @@
|
|||
Global:
|
||||
algorithm: SRN
|
||||
use_gpu: true
|
||||
epoch_num: 72
|
||||
log_smooth_window: 20
|
||||
print_batch_step: 10
|
||||
save_model_dir: output/rec_pvam_withrotate
|
||||
save_epoch_step: 1
|
||||
eval_batch_step: 8000
|
||||
train_batch_size_per_card: 64
|
||||
test_batch_size_per_card: 1
|
||||
image_shape: [1, 64, 256]
|
||||
max_text_length: 25
|
||||
character_type: en
|
||||
loss_type: srn
|
||||
num_heads: 8
|
||||
average_window: 0.15
|
||||
max_average_window: 15625
|
||||
min_average_window: 10000
|
||||
reader_yml: ./configs/rec/rec_benchmark_reader.yml
|
||||
pretrain_weights:
|
||||
checkpoints:
|
||||
save_inference_dir:
|
||||
infer_img:
|
||||
|
||||
Architecture:
|
||||
function: ppocr.modeling.architectures.rec_model,RecModel
|
||||
|
||||
Backbone:
|
||||
function: ppocr.modeling.backbones.rec_resnet_fpn,ResNet
|
||||
layers: 50
|
||||
|
||||
Head:
|
||||
function: ppocr.modeling.heads.rec_srn_all_head,SRNPredict
|
||||
encoder_type: rnn
|
||||
num_encoder_TUs: 2
|
||||
num_decoder_TUs: 4
|
||||
hidden_dims: 512
|
||||
SeqRNN:
|
||||
hidden_size: 256
|
||||
|
||||
Loss:
|
||||
function: ppocr.modeling.losses.rec_srn_loss,SRNLoss
|
||||
|
||||
Optimizer:
|
||||
function: ppocr.optimizer,AdamDecay
|
||||
base_lr: 0.0001
|
||||
beta1: 0.9
|
||||
beta2: 0.999
|
|
@ -11,3 +11,114 @@
|
|||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
from __future__ import unicode_literals
|
||||
|
||||
import os
|
||||
import sys
|
||||
import numpy as np
|
||||
import paddle
|
||||
|
||||
__dir__ = os.path.dirname(os.path.abspath(__file__))
|
||||
sys.path.append(os.path.abspath(os.path.join(__dir__, '../..')))
|
||||
|
||||
import copy
|
||||
from paddle.io import DataLoader, DistributedBatchSampler, BatchSampler
|
||||
import paddle.distributed as dist
|
||||
|
||||
from ppocr.data.imaug import transform, create_operators
|
||||
|
||||
__all__ = ['build_dataloader', 'transform', 'create_operators']
|
||||
|
||||
|
||||
def build_dataset(config, global_config):
    """Instantiate the dataset class selected by ``config['name']``.

    Args:
        config (dict): dataset section of the configuration. Must contain a
            ``name`` key naming one of the supported dataset classes; the
            remaining entries are handed to the dataset constructor.
        global_config (dict): global section of the configuration, passed
            through unchanged to the dataset constructor.

    Returns:
        The constructed dataset object.

    Raises:
        KeyError: if ``config`` has no ``name`` entry.
        AssertionError: if ``config['name']`` is not a supported dataset.
    """
    from ppocr.data.dataset import SimpleDataSet, LMDBDateSet

    # Explicit name -> class table instead of eval() on a string that comes
    # from a configuration file.
    support_dict = {
        'SimpleDataSet': SimpleDataSet,
        'LMDBDateSet': LMDBDateSet,
    }

    # Work on a shallow copy so the caller's dict keeps its 'name' entry and
    # can be passed to build_dataset() again.
    config = dict(config)
    module_name = config.pop('name')
    assert module_name in support_dict, Exception(
        'DataSet only support {}'.format(list(support_dict)))

    return support_dict[module_name](config, global_config)
|
||||
|
||||
|
||||
def build_dataloader(config, device, distributed=False, global_config=None):
    """Build a batch-balanced data loader from one TRAIN/EVAL config section.

    Args:
        config (dict): section with a ``dataset`` sub-dict (``name``,
            ``file_list``, optional ``ratio_list``, ...) and a ``loader``
            sub-dict. It is deep-copied, so the caller's dict is untouched.
        device: forwarded to ``BatchBalancedDataLoader``.
        distributed (bool): forwarded to ``BatchBalancedDataLoader``.
        global_config (dict): forwarded to every dataset constructor.

    Returns:
        tuple: ``(data_loader, info_dict)`` where ``info_dict`` is taken from
        the dataset built for the last entry of ``file_list``.

    Raises:
        ValueError: if ``file_list`` is empty.
        KeyError: if several files are listed but ``ratio_list`` is missing.
    """
    from ppocr.data.dataset import BatchBalancedDataLoader

    config = copy.deepcopy(config)
    dataset_config = config['dataset']

    file_list = dataset_config.pop('file_list')
    # A bare string would otherwise be iterated character by character.
    if isinstance(file_list, str):
        file_list = [file_list]
    if not file_list:
        raise ValueError("'file_list' must contain at least one entry")

    if len(file_list) == 1:
        ratio_list = [1.0]
    else:
        ratio_list = dataset_config.pop('ratio_list')

    _dataset_list = []
    for file_path in file_list:
        # Every dataset gets its own copy of the settings, so a callee that
        # pops keys from its argument (e.g. 'name') cannot affect the next
        # file in the list.
        per_file_config = copy.deepcopy(dataset_config)
        per_file_config['file_list'] = file_path
        _dataset_list.append(build_dataset(per_file_config, global_config))

    data_loader = BatchBalancedDataLoader(_dataset_list, ratio_list,
                                          distributed, device, config['loader'])
    return data_loader, _dataset_list[-1].info_dict
|
||||
|
||||
|
||||
def test_loader():
    """Smoke-test the TRAIN data loader and print the time taken per batch.

    The config file path is read from the command line through
    ``tools.program.ArgsParser``. The loader is built on the CPU from the
    ``TRAIN`` and ``Global`` sections, then ``len(data_loader)`` batches are
    pulled and, for each one, its index, the length of its first item and the
    seconds elapsed since the previous batch are printed.

    The matplotlib preview that used to follow the timing print sat after an
    unconditional ``continue`` and could never run, so it is not kept here.
    """
    import time
    from tools.program import load_config, ArgsParser

    FLAGS = ArgsParser().parse_args()
    config = load_config(FLAGS.config)

    place = paddle.CPUPlace()
    paddle.disable_static(place)

    data_loader, _ = build_dataloader(
        config['TRAIN'], place, global_config=config['Global'])
    start = time.time()
    num_batches = len(data_loader)
    print(num_batches)
    for epoch in range(1):
        print('epoch {} ****************'.format(epoch))
        for i, batch in enumerate(data_loader):
            # The loop is ended by this counter; `>=` stops after exactly
            # num_batches batches (a `>` test would report one batch extra).
            if i >= num_batches:
                break
            t = time.time() - start
            start = time.time()
            print('{}, batch : {} ,time {}'.format(i, len(batch[0]), t))
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
test_loader()
|
||||
|
|
|
@ -0,0 +1,300 @@
|
|||
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
import copy
|
||||
import numpy as np
|
||||
import os
|
||||
import lmdb
|
||||
import random
|
||||
import signal
|
||||
import paddle
|
||||
from paddle.io import Dataset, DataLoader, DistributedBatchSampler, BatchSampler
|
||||
|
||||
from .imaug import transform, create_operators
|
||||
from ppocr.utils.logging import get_logger
|
||||
|
||||
|
||||
def term_mp(sig_num, frame):
    """Signal handler that reports and then SIGKILLs the whole process group.

    Args:
        sig_num: number of the received signal (not used).
        frame: stack frame that was interrupted (not used).
    """
    own_pid = os.getpid()
    group_id = os.getpgid(os.getpid())
    print("main proc {} exit, kill process group {}".format(own_pid, group_id))
    # Sends SIGKILL to every member of the group, this process included.
    os.killpg(group_id, signal.SIGKILL)
|
||||
|
||||
|
||||
signal.signal(signal.SIGINT, term_mp)
|
||||
signal.signal(signal.SIGTERM, term_mp)
|
||||
|
||||
|
||||
class ModeException(Exception):
    """Error raised for an unsupported run mode.

    The caller's message is extended with the list of supported modes
    (train, valid, test) and the mode value that was given.
    """

    def __init__(self, message='', mode=''):
        supported_hint = ("\nOnly the following 3 modes are supported: "
                          "train, valid, test. Given mode is {}").format(mode)
        super(ModeException, self).__init__(message + supported_hint)
|
||||
|
||||
|
||||
class SampleNumException(Exception):
    """Error raised when the data set is smaller than one batch.

    The caller's message is extended with a fixed explanation that embeds the
    number of samples and the batch size.
    """

    def __init__(self, message='', sample_num=0, batch_size=1):
        detail = ("\nError: The number of the whole data ({}) "
                  "is smaller than the batch_size ({}), and drop_last "
                  "is turnning on, so nothing will feed in program, "
                  "Terminated now. Please reset batch_size to a smaller "
                  "number or feed more data!").format(sample_num, batch_size)
        super(SampleNumException, self).__init__(message + detail)
|
||||
|
||||
|
||||
def get_file_list(file_list, data_dir, delimiter='\t'):
    """Read label files into a list of ``{'img_path', 'label'}`` records.

    Every line is stripped and split on ``delimiter``. A line must yield
    exactly two fields: an image path, which is joined onto ``data_dir``, and
    a label. Lines with any other number of fields are reported through the
    logger and skipped.

    Args:
        file_list (str | list[str]): one label file path, or a list of them.
        data_dir (str): directory that the image paths are joined onto.
        delimiter (str): field separator inside a line, a tab by default.

    Returns:
        list[dict]: one record per valid line, in file order. No shuffling is
        done here.
    """
    if isinstance(file_list, str):
        file_list = [file_list]
    data_source_list = []
    for label_file in file_list:
        # Labels may be non-ASCII text, so the decoding must not depend on
        # the platform's default encoding.
        with open(label_file, encoding='utf-8') as f:
            for raw_line in f:
                line = raw_line.strip()
                try:
                    img_path, label = line.split(delimiter)
                except ValueError:
                    # Wrong number of fields. Skip the line: falling through
                    # would append the previous line's values again, or fail
                    # on undefined names for the very first line.
                    get_logger().warning('label error in {}'.format(line))
                    continue
                data_source_list.append({
                    'img_path': os.path.join(data_dir, img_path),
                    'label': label,
                })
    return data_source_list
|
||||
|
||||
|
||||
class LMDBDateSet(Dataset):
    """Dataset that reads ``image-%09d`` / ``label-%09d`` records from LMDB.

    On construction the database is opened read-only, the indices of all
    records whose label is present and not longer than
    ``global_config['max_text_length']`` are collected and shuffled, and the
    transform operators are created from ``config['transforms']``.

    Attributes set by the constructor: ``env`` (the LMDB environment),
    ``nSamples`` (value of the ``num-samples`` record), ``data_list`` (usable
    record indices), ``ops`` (transform operators) and ``info_dict``
    (``{'character': ...}``).
    """

    def __init__(self, config, global_config):
        """
        Args:
            config (dict): reads ``file_list`` (path of the LMDB directory)
                and ``transforms``.
            global_config (dict): reads ``max_text_length``; also passed to
                ``create_operators``.
        """
        super(LMDBDateSet, self).__init__()
        self.data_list = self.load_lmdb_dataset(
            config['file_list'], global_config['max_text_length'])
        random.shuffle(self.data_list)

        self.ops = create_operators(config['transforms'], global_config)

        # for rec: expose the `character` attribute of the last operator
        # that has one ('' when none does)
        character = ''
        for op in self.ops:
            if hasattr(op, 'character'):
                character = getattr(op, 'character')

        self.info_dict = {'character': character}

    def load_lmdb_dataset(self, data_dir, max_text_length):
        """Open the LMDB at ``data_dir`` and return the usable record indices.

        Args:
            data_dir (str): path handed to ``lmdb.open``.
            max_text_length (int): records with a longer label are left out.

        Returns:
            list[int]: 1-based indices of the records that are kept.

        Raises:
            SystemExit: if ``lmdb.open`` returns a falsy environment.
        """
        self.env = lmdb.open(
            data_dir,
            max_readers=32,
            readonly=True,
            lock=False,
            readahead=False,
            meminit=False)
        if not self.env:
            # Terminate with the message and a non-zero status; exit(0) would
            # report success to the calling shell.
            raise SystemExit('cannot create lmdb from %s' % (data_dir))

        filtered_index_list = []
        with self.env.begin(write=False) as txn:
            self.nSamples = int(txn.get('num-samples'.encode()))
            # record keys are 1-based
            for index in range(1, self.nSamples + 1):
                label = txn.get('label-%09d'.encode() % index)
                if label is None:
                    # A missing label cannot be decoded; leave the record
                    # out, as __getitem__ also treats it as unusable.
                    continue
                if len(label.decode('utf-8')) > max_text_length:
                    continue
                filtered_index_list.append(index)
        return filtered_index_list

    def print_lmdb_sets_info(self, lmdb_sets):
        """Log one ``" <dirpath>:<num_samples>,"`` entry per item of ``lmdb_sets``.

        Args:
            lmdb_sets: sequence of dicts with ``dirpath`` and ``num_samples``.
        """
        lmdb_info_strs = ''.join(
            " %s:%d," % (lmdb_set['dirpath'], lmdb_set['num_samples'])
            for lmdb_set in lmdb_sets)
        logger = get_logger()
        logger.info("DataSummary:" + lmdb_info_strs)
        return

    def __getitem__(self, idx):
        """Return the transformed sample at position ``idx`` of ``data_list``.

        When the label is missing or ``transform`` returns ``None``, another
        randomly chosen position is tried instead.
        """
        idx = self.data_list[idx]
        with self.env.begin(write=False) as txn:
            label = txn.get('label-%09d'.encode() % idx)
            if label is not None:
                imgbuf = txn.get('image-%09d'.encode() % idx)
                data = {'image': imgbuf, 'label': label.decode('utf-8')}
                outs = transform(data, self.ops)
            else:
                outs = None
        if outs is None:
            return self.__getitem__(np.random.randint(self.__len__()))
        return outs

    def __len__(self):
        """Number of usable records."""
        return len(self.data_list)
|
||||
|
||||
|
||||
class SimpleDataSet(Dataset):
    """Dataset built from label files that list image paths and labels.

    The records returned by ``get_file_list`` are shuffled once at
    construction time; each item is produced by reading the image file as raw
    bytes and running the configured transform operators over the record.
    """

    def __init__(self, config, global_config):
        """
        Args:
            config (dict): reads ``file_list``, ``data_dir``, ``transforms``
                and the optional ``delimiter`` (tab when absent).
            global_config (dict): passed to ``create_operators``.
        """
        super(SimpleDataSet, self).__init__()
        self.data_list = get_file_list(
            config['file_list'], config['data_dir'],
            config.get('delimiter', '\t'))
        random.shuffle(self.data_list)

        self.ops = create_operators(config['transforms'], global_config)

        # for rec: the last operator that carries a `character` attribute
        # supplies it; '' when no operator does
        charsets = [
            getattr(op, 'character') for op in self.ops
            if hasattr(op, 'character')
        ]
        self.info_dict = {'character': charsets[-1] if charsets else ''}

    def __getitem__(self, idx):
        """Return the transformed sample at ``idx``.

        When ``transform`` returns ``None`` another randomly chosen index is
        tried instead.
        """
        # deep copy so that the stored record is not modified
        sample = copy.deepcopy(self.data_list[idx])
        with open(sample['img_path'], 'rb') as img_file:
            sample['image'] = img_file.read()
        result = transform(sample, self.ops)
        if result is not None:
            return result
        return self.__getitem__(np.random.randint(self.__len__()))

    def __len__(self):
        """Number of records read from the label files."""
        return len(self.data_list)
|
||||
|
||||
|
||||
class BatchBalancedDataLoader(object):
    """Iterator that draws every batch from several datasets at fixed ratios.

    One ``DataLoader`` is created per dataset with its own share of the total
    batch size. Each ``__next__`` call takes one sub-batch from every loader
    and concatenates them field by field along axis 0. A sub-loader that runs
    out is restarted, so this object does not end on its own; ``__len__``
    reports the batch count of the shortest sub-loader.
    """

    def __init__(self,
                 dataset_list: list,
                 ratio_list: list,
                 distributed,
                 device,
                 loader_args: dict):
        """
        Combine the datasets in ``dataset_list`` according to the matching
        ratios in ``ratio_list`` so that each batch is sampled in those
        proportions.

        :param dataset_list: list of datasets
        :param ratio_list: list of ratios, one per dataset, summing to 1
        :param distributed: when true a DistributedBatchSampler is used,
            otherwise a BatchSampler
        :param device: passed as ``places`` to every DataLoader
        :param loader_args: dataloader configuration; ``batch_size``,
            ``shuffle``, ``drop_last`` and ``num_workers`` are read
        """
        # The ratios are floats, so the sum is compared with a tolerance:
        # several ratios that add up to 1 on paper need not sum to exactly 1.0.
        assert np.isclose(sum(ratio_list), 1.0) and \
            len(dataset_list) == len(ratio_list)

        self.dataset_len = 0
        self.data_loader_list = []
        self.dataloader_iter_list = []

        # Read without pop() so that the caller's dict is left intact.
        all_batch_size = loader_args['batch_size']
        # Every dataset contributes at least one sample per batch.
        batch_size_list = [
            int(max(1.0, all_batch_size * x)) for x in ratio_list
        ]
        # The rounding remainder goes to the dataset with the largest ratio,
        # so that the shares add up to all_batch_size.
        remain_num = all_batch_size - sum(batch_size_list)
        batch_size_list[np.argmax(ratio_list)] += remain_num

        if distributed:
            batch_sampler_class = DistributedBatchSampler
        else:
            batch_sampler_class = BatchSampler

        for _dataset, _batch_size in zip(dataset_list, batch_size_list):
            batch_sampler = batch_sampler_class(
                dataset=_dataset,
                batch_size=_batch_size,
                shuffle=loader_args['shuffle'],
                drop_last=loader_args['drop_last'], )
            _data_loader = DataLoader(
                dataset=_dataset,
                batch_sampler=batch_sampler,
                places=device,
                num_workers=loader_args['num_workers'],
                return_list=True, )
            self.data_loader_list.append(_data_loader)
            self.dataloader_iter_list.append(iter(_data_loader))
            self.dataset_len += len(_dataset)

    def __iter__(self):
        return self

    def __len__(self):
        """Batch count of the shortest sub-loader."""
        return min([len(x) for x in self.data_loader_list])

    def __next__(self):
        """Return one combined batch: a list with one concatenated tensor per field.

        Raises:
            StopIteration: if no sub-loader delivered a batch.
        """
        batch = []
        for i, data_loader_iter in enumerate(self.dataloader_iter_list):
            try:
                batch.append(next(data_loader_iter))
            except StopIteration:
                # This sub-loader is exhausted: restart it and take its
                # first batch.
                self.dataloader_iter_list[i] = iter(self.data_loader_list[i])
                batch.append(next(self.dataloader_iter_list[i]))
            except ValueError:
                # NOTE(review): a ValueError from a sub-loader is skipped
                # silently; the reason is not visible here - confirm it is
                # still needed.
                pass

        if not batch:
            # Nothing was delivered; indexing batch[0] would only raise
            # IndexError, so end the iteration explicitly.
            raise StopIteration

        batch_item_size = len(batch[0])
        return [
            paddle.concat([batch_i[i] for batch_i in batch], axis=0)
            for i in range(batch_item_size)
        ]
|
||||
|
||||
|
||||
def fill_batch(batch):
    """
    2020.09.08: The current paddle version only supports returning data with the same length.
    Therefore, fill in the batches with inconsistent lengths.
    this method is currently only useful for text detection

    Every field (position) of every item is zero-padded along axis 0 to the
    largest length found for that field in the batch. The original lengths of
    an item's fields are appended to the item as one extra list.

    Args:
        batch (list[list[np.ndarray]]): items with the same number of fields.
            The items are modified in place.

    Returns:
        list: the same ``batch`` object.

    Raises:
        AssertionError: if a field is not a ``np.ndarray``.
    """
    if not batch:
        return batch
    keys = list(range(len(batch[0])))
    v_max_len_dict = {k: max(len(item[k]) for item in batch) for k in keys}
    for item in batch:
        length = []
        for k in keys:
            v = item[k]
            length.append(len(v))
            assert isinstance(v, np.ndarray)
            pad_len = v_max_len_dict[k] - len(v)
            if pad_len == 0:
                continue
            # Shape and dtype are taken from the array itself rather than from
            # v[0], so that an empty array (length 0) can be padded as well.
            tmp_array = np.zeros(
                [pad_len] + list(v.shape[1:]), dtype=v.dtype)
            item[k] = np.concatenate([v, tmp_array])
        item.append(length)
    return batch
|
|
@ -1,47 +0,0 @@
|
|||
# -*- coding:utf-8 -*-
|
||||
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
from __future__ import unicode_literals
|
||||
|
||||
import numpy as np
|
||||
import random
|
||||
import cv2
|
||||
import math
|
||||
|
||||
import imgaug
|
||||
import imgaug.augmenters as iaa
|
||||
|
||||
|
||||
def AugmentData(data):
    """Augment ``data['image']`` and ``data['polys']`` with the same transform.

    A sequence of horizontal flip (probability 0.5), rotation in (-10, 10)
    and resize in (0.5, 3) is built and made deterministic, then applied to
    the image and to the points of every polygon.

    Args:
        data (dict): reads ``image`` and ``polys``; both entries are replaced.

    Returns:
        dict: the same ``data`` object.
    """
    image = data['image']
    orig_shape = image.shape

    augmenter = iaa.Sequential([
        iaa.Fliplr(0.5),
        iaa.Affine(rotate=(-10, 10)),
        iaa.Resize((0.5, 3)),
    ]).to_deterministic()

    def may_augment_poly(aug, img_shape, poly):
        # move the polygon's points with the augmenter, return (x, y) tuples
        points = [imgaug.Keypoint(p[0], p[1]) for p in poly]
        on_image = imgaug.KeypointsOnImage(points, shape=img_shape)
        moved = aug.augment_keypoints([on_image])[0].keypoints
        return [(kp.x, kp.y) for kp in moved]

    def may_augment_annotation(aug, data, shape):
        # without an augmenter the annotation is returned untouched
        if aug is None:
            return data
        data['polys'] = np.array(
            [may_augment_poly(aug, shape, poly) for poly in data['polys']])
        return data

    data['image'] = augmenter.augment_image(image)
    return may_augment_annotation(augmenter, data, orig_shape)
|
|
@ -1,167 +0,0 @@
|
|||
#Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
#Licensed under the Apache License, Version 2.0 (the "License");
|
||||
#you may not use this file except in compliance with the License.
|
||||
#You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
#Unless required by applicable law or agreed to in writing, software
|
||||
#distributed under the License is distributed on an "AS IS" BASIS,
|
||||
#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
#See the License for the specific language governing permissions and
|
||||
#limitations under the License.
|
||||
|
||||
import os
|
||||
import sys
|
||||
import math
|
||||
import random
|
||||
import functools
|
||||
import numpy as np
|
||||
import cv2
|
||||
import string
|
||||
from ppocr.utils.utility import initial_logger
|
||||
logger = initial_logger()
|
||||
from ppocr.utils.utility import create_module
|
||||
from ppocr.utils.utility import get_image_file_list
|
||||
import time
|
||||
|
||||
|
||||
class TrainReader(object):
|
||||
def __init__(self, params):
|
||||
self.num_workers = params['num_workers']
|
||||
self.label_file_path = params['label_file_path']
|
||||
print(self.label_file_path)
|
||||
self.use_mul_data = False
|
||||
if isinstance(self.label_file_path, list):
|
||||
self.use_mul_data = True
|
||||
self.data_ratio_list = params['data_ratio_list']
|
||||
self.batch_size = params['train_batch_size_per_card']
|
||||
assert 'process_function' in params,\
|
||||
"absence process_function in Reader"
|
||||
self.process = create_module(params['process_function'])(params)
|
||||
|
||||
def __call__(self, process_id):
|
||||
def sample_iter_reader():
|
||||
with open(self.label_file_path, "rb") as fin:
|
||||
label_infor_list = fin.readlines()
|
||||
img_num = len(label_infor_list)
|
||||
img_id_list = list(range(img_num))
|
||||
random.shuffle(img_id_list)
|
||||
if sys.platform == "win32" and self.num_workers != 1:
|
||||
print("multiprocess is not fully compatible with Windows."
|
||||
"num_workers will be 1.")
|
||||
self.num_workers = 1
|
||||
for img_id in range(process_id, img_num, self.num_workers):
|
||||
label_infor = label_infor_list[img_id_list[img_id]]
|
||||
outs = self.process(label_infor)
|
||||
if outs is None:
|
||||
continue
|
||||
yield outs
|
||||
|
||||
def sample_iter_reader_mul():
|
||||
batch_size = 1000
|
||||
data_source_list = self.label_file_path
|
||||
batch_size_list = list(map(int, [max(1.0, batch_size * x) for x in self.data_ratio_list]))
|
||||
print(self.data_ratio_list, batch_size_list)
|
||||
|
||||
data_filename_list, data_size_list, fetch_record_list = [], [], []
|
||||
for data_source in data_source_list:
|
||||
image_files = open(data_source, "rb").readlines()
|
||||
random.shuffle(image_files)
|
||||
data_filename_list.append(image_files)
|
||||
data_size_list.append(len(image_files))
|
||||
fetch_record_list.append(0)
|
||||
|
||||
image_batch = []
|
||||
# get a batch of img_fns and poly_fns
|
||||
for i in range(0, len(batch_size_list)):
|
||||
bs = batch_size_list[i]
|
||||
ds = data_size_list[i]
|
||||
image_names = data_filename_list[i]
|
||||
fetch_record = fetch_record_list[i]
|
||||
data_path = data_source_list[i]
|
||||
for j in range(fetch_record, fetch_record + bs):
|
||||
index = j % ds
|
||||
image_batch.append(image_names[index])
|
||||
|
||||
if (fetch_record + bs) > ds:
|
||||
fetch_record_list[i] = 0
|
||||
random.shuffle(data_filename_list[i])
|
||||
else:
|
||||
fetch_record_list[i] = fetch_record + bs
|
||||
|
||||
if sys.platform == "win32":
|
||||
print("multiprocess is not fully compatible with Windows."
|
||||
"num_workers will be 1.")
|
||||
self.num_workers = 1
|
||||
|
||||
for label_infor in image_batch:
|
||||
outs = self.process(label_infor)
|
||||
if outs is None:
|
||||
continue
|
||||
yield outs
|
||||
|
||||
def batch_iter_reader():
    """
    Group samples into lists of exactly ``self.batch_size`` items.

    Samples come from ``sample_iter_reader_mul`` when ``self.use_mul_data``
    is set, otherwise from ``sample_iter_reader``. A trailing group that is
    smaller than ``self.batch_size`` is never yielded.
    """
    if self.use_mul_data:
        print("Sample date from multiple datasets!")
        sample_source = sample_iter_reader_mul
    else:
        sample_source = sample_iter_reader

    pending = []
    for sample in sample_source():
        pending.append(sample)
        if len(pending) != self.batch_size:
            continue
        yield pending
        pending = []
|
||||
|
||||
return batch_iter_reader
|
||||
|
||||
|
||||
class EvalTestReader(object):
    """
    Batched image reader used for evaluation and inference.

    ``params`` must contain 'process_function'; calling the instance also
    reads 'test_batch_size_per_card' plus either 'img_set_dir' and
    'label_file_path' (any mode other than "test") or 'infer_img'
    (mode "test").
    """

    def __init__(self, params):
        self.params = params
        assert 'process_function' in params,\
            "absence process_function in EvalTestReader"

    def __call__(self, mode):
        """
        Build the image list for ``mode`` and return a batch generator
        function.

        Each yielded batch is a list of the per-image outputs of the
        configured process function, with the image path appended to each.
        """
        preprocess = create_module(self.params['process_function'])(
            self.params)
        per_batch = self.params['test_batch_size_per_card']

        if mode == "test":
            img_list = get_image_file_list(self.params['infer_img'])
        else:
            img_set_dir = self.params['img_set_dir']
            label_path = self.params['label_file_path']
            with open(label_path, "rb") as fin:
                # the image name is the first tab-separated field of a line
                img_list = [
                    os.path.join(
                        img_set_dir,
                        raw.decode().strip("\n").split("\t")[0])
                    for raw in fin.readlines()
                ]

        def batch_iter_reader():
            collected = []
            for img_path in img_list:
                img = cv2.imread(img_path)
                if img is None:
                    logger.info("{} does not exist!".format(img_path))
                    continue
                if img.ndim == 2 or img.shape[2] == 1:
                    # promote grayscale input to three channels
                    img = cv2.cvtColor(img, cv2.COLOR_GRAY2BGR)
                outs = preprocess(img)
                outs.append(img_path)
                collected.append(outs)
                if len(collected) == per_batch:
                    yield collected
                    collected = []
            # unlike training, the final partial batch is still emitted
            if collected:
                yield collected

        return batch_iter_reader
|
|
@ -1,216 +0,0 @@
|
|||
#copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
|
||||
#
|
||||
#Licensed under the Apache License, Version 2.0 (the "License");
|
||||
#you may not use this file except in compliance with the License.
|
||||
#You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
#Unless required by applicable law or agreed to in writing, software
|
||||
#distributed under the License is distributed on an "AS IS" BASIS,
|
||||
#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
#See the License for the specific language governing permissions and
|
||||
#limitations under the License.
|
||||
|
||||
import math
|
||||
import cv2
|
||||
import numpy as np
|
||||
import json
|
||||
import sys
|
||||
from ppocr.utils.utility import initial_logger, check_and_read_gif
|
||||
logger = initial_logger()
|
||||
|
||||
from .data_augment import AugmentData
|
||||
from .random_crop_data import RandomCropData
|
||||
from .make_shrink_map import MakeShrinkMap
|
||||
from .make_border_map import MakeBorderMap
|
||||
|
||||
|
||||
class DBProcessTrain(object):
    """
    DB pre-process for Train mode

    Turns one label-file line into the arrays returned by ``__call__``:
    the normalized image, shrink map/mask and threshold map/mask.
    """

    def __init__(self, params):
        # NOTE: img_set_dir is concatenated directly with the image name in
        # convert_label_infor, so it has to end with a path separator.
        self.img_set_dir = params['img_set_dir']
        self.image_shape = params['image_shape']

    def order_points_clockwise(self, pts):
        """
        Reorder four (x, y) points using the x+y and y-x heuristics.

        rect[0] is the point with the smallest x+y, rect[2] the largest x+y,
        rect[1] the smallest y-x and rect[3] the largest y-x.
        """
        rect = np.zeros((4, 2), dtype="float32")
        s = pts.sum(axis=1)
        rect[0] = pts[np.argmin(s)]
        rect[2] = pts[np.argmax(s)]
        diff = np.diff(pts, axis=1)
        rect[1] = pts[np.argmin(diff)]
        rect[3] = pts[np.argmax(diff)]
        return rect

    def make_data_dict(self, imgvalue, entry):
        """
        Build the data dict consumed by the augmentation steps.

        args:
            imgvalue(array): decoded image
            entry(list): dicts with 'points' and 'transcription' keys
        return(dict):
            keys 'image', 'shape', 'polys', 'texts', 'ignore_tags'
        """
        boxes = []
        texts = []
        ignores = []
        for rect in entry:
            points = rect['points']
            transcription = rect['transcription']
            try:
                box = self.order_points_clockwise(
                    np.array(points).reshape(-1, 2))
                if cv2.contourArea(box) > 0:
                    boxes.append(box)
                    texts.append(transcription)
                    ignores.append(transcription in ['*', '###'])
            except Exception:
                # Skip labels that cannot be parsed, but do not swallow
                # KeyboardInterrupt / SystemExit like a bare except would.
                print('load label failed!')
        data = {
            'image': imgvalue,
            'shape': [imgvalue.shape[0], imgvalue.shape[1]],
            'polys': np.array(boxes),
            'texts': texts,
            'ignore_tags': ignores,
        }
        return data

    def NormalizeImage(self, data):
        """
        Scale data['image'] to [0, 1], apply the per-channel mean/std and
        move the channel axis first (HWC -> CHW).
        """
        im = data['image']
        img_mean = [0.485, 0.456, 0.406]
        img_std = [0.229, 0.224, 0.225]
        im = im.astype(np.float32, copy=False)
        im = im / 255
        im -= img_mean
        im /= img_std
        channel_swap = (2, 0, 1)
        im = im.transpose(channel_swap)
        data['image'] = im
        return data

    def FilterKeys(self, data):
        """Remove the label-only keys from ``data`` in place and return it."""
        filter_keys = ['polys', 'texts', 'ignore_tags', 'shape']
        for key in filter_keys:
            if key in data:
                del data[key]
        return data

    def convert_label_infor(self, label_infor):
        """
        Split a raw label line (bytes) into (image path, parsed JSON label).

        The line is '<image name>\\t<json>'; a UTF-8 BOM is stripped.
        """
        label_infor = label_infor.decode()
        label_infor = label_infor.encode('utf-8').decode('utf-8-sig')
        substr = label_infor.strip("\n").split("\t")
        img_path = self.img_set_dir + substr[0]
        label = json.loads(substr[1])
        return img_path, label

    def __call__(self, label_infor):
        """
        Run the full training pre-process for one label line.

        return(tuple or None):
            (image, shrink_map, shrink_mask, threshold_map, threshold_mask),
            or None when the image cannot be read.
        """
        img_path, gt_label = self.convert_label_infor(label_infor)
        imgvalue, flag = check_and_read_gif(img_path)
        if not flag:
            imgvalue = cv2.imread(img_path)
        if imgvalue is None:
            logger.info("{} does not exist!".format(img_path))
            return None
        if len(list(imgvalue.shape)) == 2 or imgvalue.shape[2] == 1:
            imgvalue = cv2.cvtColor(imgvalue, cv2.COLOR_GRAY2BGR)
        data = self.make_data_dict(imgvalue, gt_label)
        data = AugmentData(data)
        data = RandomCropData(data, self.image_shape[1:])
        data = MakeShrinkMap(data)
        data = MakeBorderMap(data)
        data = self.NormalizeImage(data)
        data = self.FilterKeys(data)
        return data['image'], data['shrink_map'], data['shrink_mask'], data[
            'threshold_map'], data['threshold_mask']
|
||||
|
||||
|
||||
class DBProcessTest(object):
    """
    DB pre-process for Test mode

    Resizes the image (to a fixed 'test_image_shape' when that key is given,
    otherwise to a multiple of 32 bounded by 'max_side_len'), normalizes it
    and adds a leading batch axis.
    """

    def __init__(self, params):
        super(DBProcessTest, self).__init__()
        self.resize_type = 0
        if 'test_image_shape' in params:
            self.image_shape = params['test_image_shape']
            self.resize_type = 1
        if 'max_side_len' in params:
            self.max_side_len = params['max_side_len']
        else:
            self.max_side_len = 2400

    @staticmethod
    def _snap_to_stride(size, stride=32):
        """
        Snap one side length to a multiple of ``stride``.

        Anything below two strides becomes exactly one stride, exact
        multiples are kept, and every other value is rounded down to a
        multiple and reduced by one further stride.
        """
        if size // stride <= 1:
            # also covers size == 0, which used to pass through unchanged
            return stride
        if size % stride == 0:
            return size
        return (size // stride - 1) * stride

    def _target_size(self, h, w):
        """Return the (resize_h, resize_w) used for an h x w input image."""
        longest = max(h, w)
        # limit the max side
        if longest > self.max_side_len:
            ratio = float(self.max_side_len) / longest
        else:
            ratio = 1.
        resize_h = self._snap_to_stride(int(h * ratio))
        resize_w = self._snap_to_stride(int(w * ratio))
        return resize_h, resize_w

    def resize_image_type0(self, im):
        """
        resize image to a size multiple of 32 which is required by the network
        args:
            img(array): array with shape [h, w, c]
        return(tuple):
            img, (ratio_h, ratio_w)
        raises:
            whatever cv2.resize raises; the shapes are printed first
        """
        h, w, _ = im.shape
        resize_h, resize_w = self._target_size(h, w)
        try:
            im = cv2.resize(im, (int(resize_w), int(resize_h)))
        except Exception:
            # Report the offending shapes and let the error propagate;
            # exiting with status 0 here would mask the failure.
            print(im.shape, resize_w, resize_h)
            raise
        ratio_h = resize_h / float(h)
        ratio_w = resize_w / float(w)
        return im, (ratio_h, ratio_w)

    def resize_image_type1(self, im):
        """Resize to the fixed (h, w) stored in ``self.image_shape``."""
        resize_h, resize_w = self.image_shape
        ori_h, ori_w = im.shape[:2]  # (h, w, c)
        im = cv2.resize(im, (int(resize_w), int(resize_h)))
        ratio_h = float(resize_h) / ori_h
        ratio_w = float(resize_w) / ori_w
        return im, (ratio_h, ratio_w)

    def normalize(self, im):
        """
        Scale to [0, 1], apply the per-channel mean/std and return the image
        with the channel axis first (HWC -> CHW).
        """
        img_mean = [0.485, 0.456, 0.406]
        img_std = [0.229, 0.224, 0.225]
        im = im.astype(np.float32, copy=False)
        im = im / 255
        for channel, (mean, std) in enumerate(zip(img_mean, img_std)):
            im[:, :, channel] -= mean
            im[:, :, channel] /= std
        channel_swap = (2, 0, 1)
        im = im.transpose(channel_swap)
        return im

    def __call__(self, im):
        """
        return(list):
            [image with shape (1, c, h, w), (ratio_h, ratio_w)]
        """
        if self.resize_type == 0:
            im, (ratio_h, ratio_w) = self.resize_image_type0(im)
        else:
            im, (ratio_h, ratio_w) = self.resize_image_type1(im)
        im = self.normalize(im)
        im = im[np.newaxis, :]
        return [im, (ratio_h, ratio_w)]
|
|
@ -1,537 +0,0 @@
|
|||
#copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
|
||||
#
|
||||
#Licensed under the Apache License, Version 2.0 (the "License");
|
||||
#you may not use this file except in compliance with the License.
|
||||
#You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
#Unless required by applicable law or agreed to in writing, software
|
||||
#distributed under the License is distributed on an "AS IS" BASIS,
|
||||
#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
#See the License for the specific language governing permissions and
|
||||
#limitations under the License.
|
||||
|
||||
import math
|
||||
import cv2
|
||||
import numpy as np
|
||||
import json
|
||||
import sys
|
||||
import os
|
||||
|
||||
class EASTProcessTrain(object):
    """
    EAST pre-process for Train mode.

    ``__call__`` turns one label-file line into
    (image, score_map, geo_map, training_mask), or None when the sample is
    rejected.
    """

    def __init__(self, params):
        self.img_set_dir = params['img_set_dir']
        self.random_scale = np.array([0.5, 1, 2.0, 3.0])
        self.background_ratio = params['background_ratio']
        self.min_crop_side_ratio = params['min_crop_side_ratio']
        image_shape = params['image_shape']
        self.input_size = image_shape[1]
        self.min_text_size = params['min_text_size']

    def preprocess(self, im):
        """
        Scale the longer side to ``self.input_size``, normalize, and pad to
        a square placed at the top-left corner.

        return(tuple):
            padded image with shape (1, 3, input_size, input_size), im_scale
        """
        input_size = self.input_size
        im_size_max = np.max(im.shape[0:2])
        im_scale = float(input_size) / float(im_size_max)
        im = cv2.resize(im, None, None, fx=im_scale, fy=im_scale)
        img_mean = [0.485, 0.456, 0.406]
        img_std = [0.229, 0.224, 0.225]
        # reverse the channel order before normalizing
        im = im[:, :, ::-1].astype(np.float32)
        im = im / 255
        im -= img_mean
        im /= img_std
        new_h, new_w, _ = im.shape
        im_padded = np.zeros((input_size, input_size, 3), dtype=np.float32)
        im_padded[:new_h, :new_w, :] = im
        im_padded = im_padded.transpose((2, 0, 1))
        im_padded = im_padded[np.newaxis, :]
        return im_padded, im_scale

    def convert_label_infor(self, label_infor):
        """
        Parse a raw label line (bytes, '<image name>\\t<json>').

        return(tuple):
            image path, float32 boxes, bool ignore tags ('###' -> True),
            list of transcriptions
        """
        label_infor = label_infor.decode()
        label_infor = label_infor.encode('utf-8').decode('utf-8-sig')
        substr = label_infor.strip("\n").split("\t")
        img_path = os.path.join(self.img_set_dir, substr[0])
        label = json.loads(substr[1])
        wordBBs, txts, txt_tags = [], [], []
        for item in label:
            txt = item['transcription']
            wordBBs.append(item['points'])
            txts.append(txt)
            txt_tags.append(txt == '###')
        wordBBs = np.array(wordBBs, dtype=np.float32)
        # The np.bool alias was removed in NumPy 1.24; builtin bool is the
        # same dtype and works on every NumPy version.
        txt_tags = np.array(txt_tags, dtype=bool)
        return img_path, wordBBs, txt_tags, txts

    def rotate_im_poly(self, im, text_polys):
        """
        rotate image with 90 / 180 / 270 degre
        """
        im_w, im_h = im.shape[1], im.shape[0]
        dst_im = im.copy()
        dst_polys = []
        rand_degree_ratio = np.random.rand()
        rand_degree_cnt = 1
        if 0.333 < rand_degree_ratio < 0.666:
            rand_degree_cnt = 2
        elif rand_degree_ratio > 0.666:
            rand_degree_cnt = 3
        for i in range(rand_degree_cnt):
            dst_im = np.rot90(dst_im)
        rot_degree = -90 * rand_degree_cnt
        rot_angle = rot_degree * math.pi / 180.0
        cos_a = math.cos(rot_angle)
        sin_a = math.sin(rot_angle)
        n_poly = text_polys.shape[0]
        cx, cy = 0.5 * im_w, 0.5 * im_h
        ncx, ncy = 0.5 * dst_im.shape[1], 0.5 * dst_im.shape[0]
        for i in range(n_poly):
            wordBB = text_polys[i]
            poly = []
            for j in range(4):
                sx, sy = wordBB[j][0], wordBB[j][1]
                # rotate about the old centre, then move to the new centre
                dx = cos_a * (sx - cx) - sin_a * (sy - cy) + ncx
                dy = sin_a * (sx - cx) + cos_a * (sy - cy) + ncy
                poly.append([dx, dy])
            dst_polys.append(poly)
        dst_polys = np.array(dst_polys, dtype=np.float32)
        return dst_im, dst_polys

    def polygon_area(self, poly):
        """
        compute area of a polygon
        :param poly: four (x, y) points
        :return: signed area; the sign depends on the point order
        """
        edge = [(poly[1][0] - poly[0][0]) * (poly[1][1] + poly[0][1]),
                (poly[2][0] - poly[1][0]) * (poly[2][1] + poly[1][1]),
                (poly[3][0] - poly[2][0]) * (poly[3][1] + poly[2][1]),
                (poly[0][0] - poly[3][0]) * (poly[0][1] + poly[3][1])]
        return np.sum(edge) / 2.

    def check_and_validate_polys(self, polys, tags, img_height, img_width):
        """
        check so that the text poly is in the same direction,
        and also filter some invalid polygons
        :param polys: array of shape (n, 4, 2); clipped to the image in place
        :param tags: ignore flags, one per polygon
        :return: (validated polys, validated tags)
        """
        h, w = img_height, img_width
        if polys.shape[0] == 0:
            # Always return a pair: the caller unpacks two values.
            return polys, tags
        polys[:, :, 0] = np.clip(polys[:, :, 0], 0, w - 1)
        polys[:, :, 1] = np.clip(polys[:, :, 1], 0, h - 1)

        validated_polys = []
        validated_tags = []
        for poly, tag in zip(polys, tags):
            p_area = self.polygon_area(poly)
            #invalid poly
            if abs(p_area) < 1:
                continue
            if p_area > 0:
                #'poly in wrong direction'
                if not tag:
                    tag = True  #reversed cases should be ignore
                poly = poly[(0, 3, 2, 1), :]
            validated_polys.append(poly)
            validated_tags.append(tag)
        return np.array(validated_polys), np.array(validated_tags)

    def draw_img_polys(self, img, polys):
        """
        Debug helper: draw ``polys`` on ``img`` and write the result to
        'tmp_<n>.jpg' in the working directory ('tmp.jpg' is used as an
        intermediate file).
        """
        if len(img.shape) == 4:
            img = np.squeeze(img, axis=0)
        if img.shape[0] == 3:
            img = img.transpose((1, 2, 0))
            img[:, :, 2] += 123.68
            img[:, :, 1] += 116.78
            img[:, :, 0] += 103.94
        cv2.imwrite("tmp.jpg", img)
        img = cv2.imread("tmp.jpg")
        for box in polys:
            box = box.astype(np.int32).reshape((-1, 1, 2))
            cv2.polylines(img, [box], True, color=(255, 255, 0), thickness=2)
        import random
        ino = random.randint(0, 100)
        cv2.imwrite("tmp_%d.jpg" % ino, img)
        return

    @staticmethod
    def _shrink_edges_01_23(poly, r, R):
        """Move the ends of edges (p0, p1) and (p3, p2) inwards, in place."""
        ## p0, p1
        theta = np.arctan2((poly[1][1] - poly[0][1]),
                           (poly[1][0] - poly[0][0]))
        poly[0][0] += R * r[0] * np.cos(theta)
        poly[0][1] += R * r[0] * np.sin(theta)
        poly[1][0] -= R * r[1] * np.cos(theta)
        poly[1][1] -= R * r[1] * np.sin(theta)
        ## p2, p3
        theta = np.arctan2((poly[2][1] - poly[3][1]),
                           (poly[2][0] - poly[3][0]))
        poly[3][0] += R * r[3] * np.cos(theta)
        poly[3][1] += R * r[3] * np.sin(theta)
        poly[2][0] -= R * r[2] * np.cos(theta)
        poly[2][1] -= R * r[2] * np.sin(theta)

    @staticmethod
    def _shrink_edges_03_12(poly, r, R):
        """Move the ends of edges (p0, p3) and (p1, p2) inwards, in place."""
        ## p0, p3
        theta = np.arctan2((poly[3][0] - poly[0][0]),
                           (poly[3][1] - poly[0][1]))
        poly[0][0] += R * r[0] * np.sin(theta)
        poly[0][1] += R * r[0] * np.cos(theta)
        poly[3][0] -= R * r[3] * np.sin(theta)
        poly[3][1] -= R * r[3] * np.cos(theta)
        ## p1, p2
        theta = np.arctan2((poly[2][0] - poly[1][0]),
                           (poly[2][1] - poly[1][1]))
        poly[1][0] += R * r[1] * np.sin(theta)
        poly[1][1] += R * r[1] * np.cos(theta)
        poly[2][0] -= R * r[2] * np.sin(theta)
        poly[2][1] -= R * r[2] * np.cos(theta)

    def shrink_poly(self, poly, r):
        """
        fit a poly inside the origin poly, maybe bugs here...
        used for generate the score map
        :param poly: the text poly (modified in place and returned)
        :param r: r in the paper
        :return: the shrinked poly
        """
        # shrink ratio
        R = 0.3
        # find the longer pair
        dist0 = np.linalg.norm(poly[0] - poly[1])
        dist1 = np.linalg.norm(poly[2] - poly[3])
        dist2 = np.linalg.norm(poly[0] - poly[3])
        dist3 = np.linalg.norm(poly[1] - poly[2])
        # The longer pair of edges is shrunk first; order matters because
        # the second step uses the already-moved points.
        if dist0 + dist1 > dist2 + dist3:
            self._shrink_edges_01_23(poly, r, R)
            self._shrink_edges_03_12(poly, r, R)
        else:
            self._shrink_edges_03_12(poly, r, R)
            self._shrink_edges_01_23(poly, r, R)
        return poly

    def generate_quad(self, im_size, polys, tags):
        """
        Generate quadrangle.

        return(tuple):
            score_map (h, w), geo_map (h, w, 9), training_mask (h, w)
        """
        h, w = im_size
        # NOTE(review): poly_mask is uint8 but stores poly_idx + 1, so
        # images with more than 255 polygons look unsupported - confirm.
        poly_mask = np.zeros((h, w), dtype=np.uint8)
        score_map = np.zeros((h, w), dtype=np.uint8)
        # (x1, y1, ..., x4, y4, short_edge_norm)
        geo_map = np.zeros((h, w, 9), dtype=np.float32)
        # mask used during traning, to ignore some hard areas
        training_mask = np.ones((h, w), dtype=np.uint8)
        for poly_idx, (poly, tag) in enumerate(zip(polys, tags)):
            r = [None, None, None, None]
            for i in range(4):
                dist1 = np.linalg.norm(poly[i] - poly[(i + 1) % 4])
                dist2 = np.linalg.norm(poly[i] - poly[(i - 1) % 4])
                r[i] = min(dist1, dist2)
            # score map
            shrinked_poly = self.shrink_poly(
                poly.copy(), r).astype(np.int32)[np.newaxis, :, :]
            cv2.fillPoly(score_map, shrinked_poly, 1)
            cv2.fillPoly(poly_mask, shrinked_poly, poly_idx + 1)
            # if the poly is too small, then ignore it during training
            poly_h = min(
                np.linalg.norm(poly[0] - poly[3]),
                np.linalg.norm(poly[1] - poly[2]))
            poly_w = min(
                np.linalg.norm(poly[0] - poly[1]),
                np.linalg.norm(poly[2] - poly[3]))
            if min(poly_h, poly_w) < self.min_text_size:
                cv2.fillPoly(training_mask,
                             poly.astype(np.int32)[np.newaxis, :, :], 0)

            if tag:
                cv2.fillPoly(training_mask,
                             poly.astype(np.int32)[np.newaxis, :, :], 0)

            xy_in_poly = np.argwhere(poly_mask == (poly_idx + 1))
            # geo map.
            y_in_poly = xy_in_poly[:, 0]
            x_in_poly = xy_in_poly[:, 1]
            # clamps the caller's polygon in place (poly is a view of polys)
            poly[:, 0] = np.minimum(np.maximum(poly[:, 0], 0), w)
            poly[:, 1] = np.minimum(np.maximum(poly[:, 1], 0), h)
            for pno in range(4):
                geo_channel_beg = pno * 2
                geo_map[y_in_poly, x_in_poly, geo_channel_beg] =\
                    x_in_poly - poly[pno, 0]
                geo_map[y_in_poly, x_in_poly, geo_channel_beg + 1] =\
                    y_in_poly - poly[pno, 1]
            geo_map[y_in_poly, x_in_poly, 8] = \
                1.0 / max(min(poly_h, poly_w), 1.0)
        return score_map, geo_map, training_mask

    def crop_area(self,
                  im,
                  polys,
                  tags,
                  txts,
                  crop_background=False,
                  max_tries=50):
        """
        make random crop from the input image
        :param im: image with shape [h, w, c]
        :param polys: array of shape (n, 4, 2)
        :param tags: ignore flags, one per polygon
        :param txts: transcriptions, one per polygon
        :param crop_background: when True, return the first crop that
            contains no polygon (with empty lists for polys/tags/txts)
        :param max_tries: number of random crops to attempt
        :return: im, polys, tags, txts (the inputs unchanged if no crop
            was accepted)
        """
        h, w, _ = im.shape
        pad_h = h // 10
        pad_w = w // 10
        h_array = np.zeros((h + pad_h * 2), dtype=np.int32)
        w_array = np.zeros((w + pad_w * 2), dtype=np.int32)
        for poly in polys:
            poly = np.round(poly, decimals=0).astype(np.int32)
            minx = np.min(poly[:, 0])
            maxx = np.max(poly[:, 0])
            w_array[minx + pad_w:maxx + pad_w] = 1
            miny = np.min(poly[:, 1])
            maxy = np.max(poly[:, 1])
            h_array[miny + pad_h:maxy + pad_h] = 1
        # ensure the cropped area not across a text
        h_axis = np.where(h_array == 0)[0]
        w_axis = np.where(w_array == 0)[0]
        if len(h_axis) == 0 or len(w_axis) == 0:
            return im, polys, tags, txts

        for i in range(max_tries):
            xx = np.random.choice(w_axis, size=2)
            xmin = np.min(xx) - pad_w
            xmax = np.max(xx) - pad_w
            xmin = np.clip(xmin, 0, w - 1)
            xmax = np.clip(xmax, 0, w - 1)
            yy = np.random.choice(h_axis, size=2)
            ymin = np.min(yy) - pad_h
            ymax = np.max(yy) - pad_h
            ymin = np.clip(ymin, 0, h - 1)
            ymax = np.clip(ymax, 0, h - 1)
            if xmax - xmin < self.min_crop_side_ratio * w or \
               ymax - ymin < self.min_crop_side_ratio * h:
                # area too small
                continue
            if polys.shape[0] != 0:
                poly_axis_in_area = (polys[:, :, 0] >= xmin)\
                    & (polys[:, :, 0] <= xmax)\
                    & (polys[:, :, 1] >= ymin)\
                    & (polys[:, :, 1] <= ymax)
                # keep only polygons with all four corners inside the crop
                selected_polys = np.where(
                    np.sum(poly_axis_in_area, axis=1) == 4)[0]
            else:
                selected_polys = []

            if len(selected_polys) == 0:
                # no text in this area
                if crop_background:
                    im = im[ymin:ymax + 1, xmin:xmax + 1, :]
                    polys = []
                    tags = []
                    txts = []
                    return im, polys, tags, txts
                else:
                    continue

            im = im[ymin:ymax + 1, xmin:xmax + 1, :]
            polys = polys[selected_polys]
            tags = tags[selected_polys]
            txts = [txts[selected_poly] for selected_poly in selected_polys]
            polys[:, :, 0] -= xmin
            polys[:, :, 1] -= ymin
            return im, polys, tags, txts
        return im, polys, tags, txts

    def crop_background_infor(self, im, text_polys, text_tags, text_strs):
        """
        Crop a text-free region and return all-zero score/geo maps for it.

        return: (im, score_map, geo_map, training_mask), or None when the
            crop still contains text.
        """
        im, text_polys, text_tags, text_strs = self.crop_area(
            im, text_polys, text_tags, text_strs, crop_background=True)
        if len(text_polys) > 0:
            return None
        # pad and resize image
        input_size = self.input_size
        im, ratio = self.preprocess(im)
        score_map = np.zeros((input_size, input_size), dtype=np.float32)
        geo_map = np.zeros((input_size, input_size, 9), dtype=np.float32)
        training_mask = np.ones((input_size, input_size), dtype=np.float32)
        return im, score_map, geo_map, training_mask

    def crop_foreground_infor(self, im, text_polys, text_tags, text_strs):
        """
        Crop a region containing text and build its training maps.

        return: (im, score_map, geo_map, training_mask), or None when the
            crop has no polygon or every polygon is tagged as ignored.
        """
        im, text_polys, text_tags, text_strs = self.crop_area(
            im, text_polys, text_tags, text_strs, crop_background=False)
        if text_polys.shape[0] == 0:
            return None
        #continue for all ignore case
        if np.sum((text_tags * 1.0)) >= text_tags.size:
            return None
        # pad and resize image
        im, ratio = self.preprocess(im)
        text_polys[:, :, 0] *= ratio
        text_polys[:, :, 1] *= ratio
        _, _, new_h, new_w = im.shape
        score_map, geo_map, training_mask = self.generate_quad(
            (new_h, new_w), text_polys, text_tags)
        return im, score_map, geo_map, training_mask

    def __call__(self, label_infor):
        """
        Run the full training pre-process for one label line.

        return(tuple or None):
            (im, score_map, geo_map, training_mask) with the three maps
            subsampled by 4 and laid out channel-first, or None when the
            image is unreadable or no usable polygon/crop is found.
        """
        infor = self.convert_label_infor(label_infor)
        im_path, text_polys, text_tags, text_strs = infor
        im = cv2.imread(im_path)
        if im is None:
            return None
        if text_polys.shape[0] == 0:
            return None
        #add rotate cases
        if np.random.rand() < 0.5:
            im, text_polys = self.rotate_im_poly(im, text_polys)
        h, w, _ = im.shape
        text_polys, text_tags = self.check_and_validate_polys(text_polys,
                                                              text_tags, h, w)
        if text_polys.shape[0] == 0:
            return None

        # random scale this image
        rd_scale = np.random.choice(self.random_scale)
        im = cv2.resize(im, dsize=None, fx=rd_scale, fy=rd_scale)
        text_polys *= rd_scale
        if np.random.rand() < self.background_ratio:
            outs = self.crop_background_infor(im, text_polys, text_tags,
                                              text_strs)
        else:
            outs = self.crop_foreground_infor(im, text_polys, text_tags,
                                              text_strs)

        if outs is None:
            return None
        im, score_map, geo_map, training_mask = outs
        score_map = score_map[np.newaxis, ::4, ::4].astype(np.float32)
        # (h, w, 9) -> (9, h, w)
        geo_map = np.swapaxes(geo_map, 1, 2)
        geo_map = np.swapaxes(geo_map, 1, 0)
        geo_map = geo_map[:, ::4, ::4].astype(np.float32)
        training_mask = training_mask[np.newaxis, ::4, ::4]
        training_mask = training_mask.astype(np.float32)
        return im, score_map, geo_map, training_mask
|
||||
|
||||
|
||||
class EASTProcessTest(object):
|
||||
def __init__(self, params):
|
||||
super(EASTProcessTest, self).__init__()
|
||||
self.resize_type = 0
|
||||
if 'test_image_shape' in params:
|
||||
self.image_shape = params['test_image_shape']
|
||||
# print(self.image_shape)
|
||||
self.resize_type = 1
|
||||
if 'max_side_len' in params:
|
||||
self.max_side_len = params['max_side_len']
|
||||
else:
|
||||
self.max_side_len = 2400
|
||||
|
||||
def resize_image_type0(self, im):
|
||||
"""
|
||||
resize image to a size multiple of 32 which is required by the network
|
||||
args:
|
||||
img(array): array with shape [h, w, c]
|
||||
return(tuple):
|
||||
img, (ratio_h, ratio_w)
|
||||
"""
|
||||
max_side_len = self.max_side_len
|
||||
h, w, _ = im.shape
|
||||
|
||||
resize_w = w
|
||||
resize_h = h
|
||||
|
||||
# limit the max side
|
||||
if max(resize_h, resize_w) > max_side_len:
|
||||
if resize_h > resize_w:
|
||||
ratio = float(max_side_len) / resize_h
|
||||
else:
|
||||
ratio = float(max_side_len) / resize_w
|
||||
else:
|
||||
ratio = 1.
|
||||
resize_h = int(resize_h * ratio)
|
||||
resize_w = int(resize_w * ratio)
|
||||
if resize_h % 32 == 0:
|
||||
resize_h = resize_h
|
||||
elif resize_h // 32 <= 1:
|
||||
resize_h = 32
|
||||
else:
|
||||
resize_h = (resize_h // 32 - 1) * 32
|
||||
if resize_w % 32 == 0:
|
||||
resize_w = resize_w
|
||||
elif resize_w // 32 <= 1:
|
||||
resize_w = 32
|
||||
else:
|
||||
resize_w = (resize_w // 32 - 1) * 32
|
||||
try:
|
||||
if int(resize_w) <= 0 or int(resize_h) <= 0:
|
||||
return None, (None, None)
|
||||
im = cv2.resize(im, (int(resize_w), int(resize_h)))
|
||||
except:
|
||||
print(im.shape, resize_w, resize_h)
|
||||
sys.exit(0)
|
||||
ratio_h = resize_h / float(h)
|
||||
ratio_w = resize_w / float(w)
|
||||
return im, (ratio_h, ratio_w)
|
||||
|
||||
def resize_image_type1(self, im):
|
||||
resize_h, resize_w = self.image_shape
|
||||
ori_h, ori_w = im.shape[:2] # (h, w, c)
|
||||
im = cv2.resize(im, (int(resize_w), int(resize_h)))
|
||||
ratio_h = float(resize_h) / ori_h
|
||||
ratio_w = float(resize_w) / ori_w
|
||||
return im, (ratio_h, ratio_w)
|
||||
|
||||
def __call__(self, im):
|
||||
if self.resize_type == 0:
|
||||
im, (ratio_h, ratio_w) = self.resize_image_type0(im)
|
||||
else:
|
||||
im, (ratio_h, ratio_w) = self.resize_image_type1(im)
|
||||
img_mean = [0.485, 0.456, 0.406]
|
||||
img_std = [0.229, 0.224, 0.225]
|
||||
im = im[:, :, ::-1].astype(np.float32)
|
||||
im = im / 255
|
||||
im -= img_mean
|
||||
im /= img_std
|
||||
im = im.transpose((2, 0, 1))
|
||||
im = im[np.newaxis, :]
|
||||
return [im, (ratio_h, ratio_w)]
|
|
@ -1,147 +0,0 @@
|
|||
# -*- coding:utf-8 -*-
|
||||
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
from __future__ import unicode_literals
|
||||
|
||||
import numpy as np
|
||||
import cv2
|
||||
np.seterr(divide='ignore', invalid='ignore')
|
||||
import pyclipper
|
||||
from shapely.geometry import Polygon
|
||||
import sys
|
||||
import warnings
|
||||
warnings.simplefilter("ignore")
|
||||
|
||||
|
||||
def draw_border_map(polygon, canvas, mask, shrink_ratio):
    """
    Paint one polygon's border response into ``canvas`` and its dilated
    footprint into ``mask`` (both are modified in place).

    The polygon is offset outwards by a distance derived from its area,
    perimeter and ``shrink_ratio``; inside the bounding box of the offset
    polygon, canvas is updated with max(canvas, 1 - normalized distance to
    the nearest edge). Polygons with non-positive area are ignored.
    """
    polygon = np.array(polygon)
    assert polygon.ndim == 2
    assert polygon.shape[1] == 2

    shape = Polygon(polygon)
    if polygon_area_is_empty := (shape.area <= 0):
        return
    distance = shape.area * (
        1 - np.power(shrink_ratio, 2)) / shape.length
    offsetter = pyclipper.PyclipperOffset()
    offsetter.AddPath([tuple(vertex) for vertex in polygon],
                      pyclipper.JT_ROUND, pyclipper.ET_CLOSEDPOLYGON)

    padded_polygon = np.array(offsetter.Execute(distance)[0])
    cv2.fillPoly(mask, [padded_polygon.astype(np.int32)], 1.0)

    padded_x = padded_polygon[:, 0]
    padded_y = padded_polygon[:, 1]
    xmin, xmax = padded_x.min(), padded_x.max()
    ymin, ymax = padded_y.min(), padded_y.max()
    width = xmax - xmin + 1
    height = ymax - ymin + 1

    # express the polygon in the bounding-box coordinate frame
    polygon[:, 0] = polygon[:, 0] - xmin
    polygon[:, 1] = polygon[:, 1] - ymin

    xs = np.broadcast_to(
        np.linspace(
            0, width - 1, num=width).reshape(1, width), (height, width))
    ys = np.broadcast_to(
        np.linspace(
            0, height - 1, num=height).reshape(height, 1), (height, width))

    n_points = polygon.shape[0]
    distance_map = np.zeros((n_points, height, width), dtype=np.float32)
    for i in range(n_points):
        nxt = (i + 1) % n_points
        edge_distance = _distance(xs, ys, polygon[i], polygon[nxt])
        distance_map[i] = np.clip(edge_distance / distance, 0, 1)
    distance_map = distance_map.min(axis=0)

    def _clamp(value, size):
        # keep a bounding-box coordinate inside the canvas
        return min(max(0, value), size - 1)

    xmin_valid = _clamp(xmin, canvas.shape[1])
    xmax_valid = _clamp(xmax, canvas.shape[1])
    ymin_valid = _clamp(ymin, canvas.shape[0])
    ymax_valid = _clamp(ymax, canvas.shape[0])
    region = (slice(ymin_valid, ymax_valid + 1),
              slice(xmin_valid, xmax_valid + 1))
    canvas[region] = np.fmax(
        1 - distance_map[ymin_valid - ymin:ymax_valid - ymax + height,
                         xmin_valid - xmin:xmax_valid - xmax + width],
        canvas[region])
|
||||
|
||||
|
||||
def _distance(xs, ys, point_1, point_2):
|
||||
'''
|
||||
compute the distance from point to a line
|
||||
ys: coordinates in the first axis
|
||||
xs: coordinates in the second axis
|
||||
point_1, point_2: (x, y), the end of the line
|
||||
'''
|
||||
height, width = xs.shape[:2]
|
||||
square_distance_1 = np.square(xs - point_1[0]) + np.square(ys - point_1[1])
|
||||
square_distance_2 = np.square(xs - point_2[0]) + np.square(ys - point_2[1])
|
||||
square_distance = np.square(point_1[0] - point_2[0]) + np.square(point_1[
|
||||
1] - point_2[1])
|
||||
|
||||
cosin = (square_distance - square_distance_1 - square_distance_2) / (
|
||||
2 * np.sqrt(square_distance_1 * square_distance_2))
|
||||
square_sin = 1 - np.square(cosin)
|
||||
square_sin = np.nan_to_num(square_sin)
|
||||
result = np.sqrt(square_distance_1 * square_distance_2 * square_sin /
|
||||
square_distance)
|
||||
|
||||
result[cosin <
|
||||
0] = np.sqrt(np.fmin(square_distance_1, square_distance_2))[cosin <
|
||||
0]
|
||||
# self.extend_line(point_1, point_2, result)
|
||||
return result
|
||||
|
||||
|
||||
def extend_line(point_1, point_2, result, shrink_ratio):
    '''
    draw the prolongation of segment point_1-point_2 beyond both of its ends

    point_1, point_2: (x, y), the ends of the segment
    result: image passed to cv2.line and drawn on in place
    shrink_ratio: each end is pushed outwards by (1 + shrink_ratio) times
        the segment vector

    Returns the two far ends (ex_point_1, ex_point_2) as tuples of ints.
    '''

    def _prolong(start, other):
        # step from `start` away from `other`, rounded to integer pixels
        return (
            int(round(start[0] + (start[0] - other[0]) * (1 + shrink_ratio))),
            int(round(start[1] + (start[1] - other[1]) * (1 + shrink_ratio))))

    def _draw(far_end, near_end):
        cv2.line(
            result,
            tuple(far_end),
            tuple(near_end),
            4096.0,
            1,
            lineType=cv2.LINE_AA,
            shift=0)

    ex_point_1 = _prolong(point_1, point_2)
    _draw(ex_point_1, point_1)
    ex_point_2 = _prolong(point_2, point_1)
    _draw(ex_point_2, point_2)
    return ex_point_1, ex_point_2
|
||||
|
||||
|
||||
def MakeBorderMap(data):
    '''
    add 'threshold_map' and 'threshold_mask' to a sample dict

    data: dict with 'image', 'polys' and 'ignore_tags'; polygons whose ignore
        tag is truthy are skipped, the others are drawn with draw_border_map

    The drawn canvas is rescaled linearly from [0, 1] to
    [thresh_min, thresh_max] before being stored. Returns the same dict.
    '''
    shrink_ratio = 0.4
    thresh_min = 0.3
    thresh_max = 0.7

    image = data['image']
    polys = data['polys']
    ignored = data['ignore_tags']

    map_shape = image.shape[:2]
    border = np.zeros(map_shape, dtype=np.float32)
    valid = np.zeros(map_shape, dtype=np.float32)

    for idx in range(len(polys)):
        if not ignored[idx]:
            draw_border_map(
                polys[idx], border, mask=valid, shrink_ratio=shrink_ratio)

    data['threshold_map'] = border * (thresh_max - thresh_min) + thresh_min
    data['threshold_mask'] = valid
    return data
|
|
@ -1,88 +0,0 @@
|
|||
# -*- coding:utf-8 -*-
|
||||
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
from __future__ import unicode_literals
|
||||
|
||||
import numpy as np
|
||||
import cv2
|
||||
from shapely.geometry import Polygon
|
||||
import pyclipper
|
||||
|
||||
|
||||
def validate_polygons(polygons, ignore_tags, h, w):
    '''
    clip polygons to the image and normalise their winding, in place

    polygons (numpy.array, required): of shape (num_instances, num_points, 2)
    ignore_tags: one flag per polygon; set to True when |area| < 1
    h, w: image height and width used for clipping

    Polygons whose polygon_area() is positive get their point order reversed.
    Returns (polygons, ignore_tags).
    '''
    if len(polygons) == 0:
        return polygons, ignore_tags
    assert len(polygons) == len(ignore_tags)

    max_x, max_y = w - 1, h - 1
    for pts in polygons:
        pts[:, 0] = np.clip(pts[:, 0], 0, max_x)
        pts[:, 1] = np.clip(pts[:, 1], 0, max_y)

    for idx in range(len(polygons)):
        signed_area = polygon_area(polygons[idx])
        if abs(signed_area) < 1:
            ignore_tags[idx] = True
        if signed_area > 0:
            polygons[idx] = polygons[idx][::-1, :]
    return polygons, ignore_tags
|
||||
|
||||
|
||||
def polygon_area(polygon):
    '''
    signed area of a polygon (shoelace / trapezoid formula)

    polygon: array-like of shape (num_points, 2) holding (x, y) vertices

    Returns sum((x_next - x) * (y_next + y)) / 2 over all edges as a float.
    The sign depends on the vertex order; validate_polygons reverses
    polygons for which this value is positive.
    '''
    pts = np.asarray(polygon, dtype=np.float64).reshape(-1, 2)
    nxt = np.roll(pts, -1, axis=0)
    # The trapezoid term needs the SUM of the two y values. Using their
    # difference makes every axis-aligned rectangle come out with area 0.
    twice_area = np.sum((nxt[:, 0] - pts[:, 0]) * (nxt[:, 1] + pts[:, 1]))
    return float(twice_area / 2.)
|
||||
|
||||
|
||||
def MakeShrinkMap(data):
    '''
    add 'shrink_map' and 'shrink_mask' to a sample dict

    data: dict with 'image', 'polys' and 'ignore_tags'

    Each usable polygon is shrunk inwards with pyclipper by
    area * (1 - shrink_ratio ** 2) / perimeter and filled with 1 in the
    shrink map. Polygons that are ignored, whose bounding box has a side
    shorter than min_text_size, or that vanish when shrunk are filled with 0
    in the mask and have their ignore tag set to True. Returns the same dict.
    '''
    min_text_size = 8
    shrink_ratio = 0.4

    image = data['image']
    text_polys = data['polys']
    ignore_tags = data['ignore_tags']

    h, w = image.shape[:2]
    text_polys, ignore_tags = validate_polygons(text_polys, ignore_tags, h, w)
    shrink_map = np.zeros((h, w), dtype=np.float32)
    shrink_mask = np.ones((h, w), dtype=np.float32)

    def _mark_ignored(idx, pts):
        # blank the polygon in the mask and flag the instance as ignored
        cv2.fillPoly(shrink_mask, pts.astype(np.int32)[np.newaxis, :, :], 0)
        ignore_tags[idx] = True

    for idx in range(len(text_polys)):
        pts = text_polys[idx]
        box_h = max(pts[:, 1]) - min(pts[:, 1])
        box_w = max(pts[:, 0]) - min(pts[:, 0])
        if ignore_tags[idx] or min(box_h, box_w) < min_text_size:
            _mark_ignored(idx, pts)
            continue

        outline = Polygon(pts)
        offset = outline.area * (
            1 - np.power(shrink_ratio, 2)) / outline.length
        path = [tuple(pt) for pt in text_polys[idx]]
        clipper = pyclipper.PyclipperOffset()
        clipper.AddPath(path, pyclipper.JT_ROUND, pyclipper.ET_CLOSEDPOLYGON)
        shrunk = clipper.Execute(-offset)
        if shrunk == []:
            _mark_ignored(idx, pts)
            continue
        # only the first returned path is used
        core = np.array(shrunk[0]).reshape(-1, 2)
        cv2.fillPoly(shrink_map, [core.astype(np.int32)], 1)

    data['shrink_map'] = shrink_map
    data['shrink_mask'] = shrink_mask
    return data
|
|
@ -1,155 +0,0 @@
|
|||
# -*- coding:utf-8 -*-
|
||||
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
from __future__ import unicode_literals
|
||||
|
||||
import numpy as np
|
||||
import cv2
|
||||
import random
|
||||
|
||||
|
||||
def is_poly_in_rect(poly, x, y, w, h):
    '''
    True when every vertex of poly lies within [x, x + w] x [y, y + h]

    poly: sequence of (x, y) points
    '''
    pts = np.array(poly)
    for axis, (low, extent) in enumerate(((x, w), (y, h))):
        coords = pts[:, axis]
        if coords.min() < low or coords.max() > low + extent:
            return False
    return True
|
||||
|
||||
|
||||
def is_poly_outside_rect(poly, x, y, w, h):
    '''
    True when the bounding box of poly does not overlap
    [x, x + w] x [y, y + h]

    poly: sequence of (x, y) points
    '''
    pts = np.array(poly)
    for axis, (low, extent) in enumerate(((x, w), (y, h))):
        coords = pts[:, axis]
        if coords.max() < low or coords.min() > low + extent:
            return True
    return False
|
||||
|
||||
|
||||
def split_regions(axis):
    '''
    split a sorted 1-D index array into runs of consecutive values

    axis: 1-D numpy array of indices (e.g. the output of np.where(...)[0])

    Returns a list of sub-arrays, one per run, in order. The last run is
    included; the earlier loop-based version only emitted a run when a later
    gap was found and therefore silently dropped the final one.
    An empty input gives an empty list.
    '''
    if axis.shape[0] == 0:
        return []
    # positions where the value is not "previous + 1" start a new run
    breaks = np.flatnonzero(axis[1:] != axis[:-1] + 1) + 1
    return np.split(axis, breaks)
|
||||
|
||||
|
||||
def random_select(axis, max_size):
    '''
    draw two values from axis (with np.random.choice) and return them
    ordered as (smaller, larger), each clipped to [0, max_size - 1]
    '''
    picks = np.random.choice(axis, size=2)
    upper = max_size - 1
    low = np.clip(np.min(picks), 0, upper)
    high = np.clip(np.max(picks), 0, upper)
    return low, high
|
||||
|
||||
|
||||
def region_wise_random_select(regions, max_size):
    '''
    pick two regions at random (the same region may be picked twice), draw
    one value from each, and return the two values as (smaller, larger)

    regions: list of 1-D index arrays, e.g. from split_regions
    max_size: not used by this function; kept for signature compatibility

    Both returned values are Python ints.
    '''
    picked_regions = np.random.choice(len(regions), 2)
    # Index the 1-element draw before converting: int() on an array with
    # ndim > 0 is deprecated in recent NumPy releases.
    values = [
        int(np.random.choice(regions[idx], size=1)[0])
        for idx in picked_regions
    ]
    return min(values), max(values)
|
||||
|
||||
|
||||
def crop_area(im, text_polys, min_crop_side_ratio, max_tries):
    '''
    choose a random crop window whose borders do not cut through text

    im: image of shape (h, w, channels)
    text_polys: iterable of (num_points, 2) polygons to protect
    min_crop_side_ratio: minimum crop width / height as a fraction of w / h
    max_tries: number of random attempts

    Returns (x, y, crop_w, crop_h). Falls back to the whole image
    (0, 0, w, h) when no free row or column exists or no attempt succeeds.
    '''
    h, w, _ = im.shape
    row_used = np.zeros(h, dtype=np.int32)
    col_used = np.zeros(w, dtype=np.int32)
    for pts in text_polys:
        pts = np.round(pts, decimals=0).astype(np.int32)
        col_used[np.min(pts[:, 0]):np.max(pts[:, 0])] = 1
        row_used[np.min(pts[:, 1]):np.max(pts[:, 1])] = 1

    # rows / columns not spanned by any polygon: candidate crop borders
    h_axis = np.where(row_used == 0)[0]
    w_axis = np.where(col_used == 0)[0]
    if len(h_axis) == 0 or len(w_axis) == 0:
        return 0, 0, w, h

    h_regions = split_regions(h_axis)
    w_regions = split_regions(w_axis)

    def _pick(regions, axis, size):
        # draw from separate free regions when there is more than one
        if len(regions) > 1:
            return region_wise_random_select(regions, size)
        return random_select(axis, size)

    for _ in range(max_tries):
        xmin, xmax = _pick(w_regions, w_axis, w)
        ymin, ymax = _pick(h_regions, h_axis, h)

        if xmax - xmin < min_crop_side_ratio * w or ymax - ymin < min_crop_side_ratio * h:
            # area too small
            continue

        # accept the window as soon as one polygon overlaps it
        if any(not is_poly_outside_rect(poly, xmin, ymin, xmax - xmin,
                                        ymax - ymin)
               for poly in text_polys):
            return xmin, ymin, xmax - xmin, ymax - ymin

    return 0, 0, w, h
|
||||
|
||||
|
||||
def RandomCropData(data, size):
    '''
    randomly crop a sample and resize the crop into a size[0] x size[1] image

    data: dict with 'image', 'polys', 'ignore_tags' and 'texts'
    size: (width, height) of the output image

    The crop window comes from crop_area() using the non-ignored polygons.
    The crop is resized with its aspect ratio kept and placed in the top-left
    corner of a zero image. Polygons are moved into crop coordinates and
    those lying entirely outside the resized crop are dropped together with
    their tag and text. Returns the same dict, updated.
    '''
    max_tries = 10
    min_crop_side_ratio = 0.1
    keep_ratio = True

    im = data['image']
    text_polys = data['polys']
    ignore_tags = data['ignore_tags']
    texts = data['texts']
    care_polys = [
        text_polys[idx] for idx, ignored in enumerate(ignore_tags)
        if not ignored
    ]
    # compute the crop region
    crop_x, crop_y, crop_w, crop_h = crop_area(im, care_polys,
                                               min_crop_side_ratio, max_tries)
    # crop the image; keep the aspect ratio and pad the remainder
    scale = min(size[0] / crop_w, size[1] / crop_h)
    h = int(crop_h * scale)
    w = int(crop_w * scale)
    if keep_ratio:
        img = np.zeros((size[1], size[0], im.shape[2]), im.dtype)
        img[:h, :w] = cv2.resize(
            im[crop_y:crop_y + crop_h, crop_x:crop_x + crop_w], (w, h))
    else:
        img = cv2.resize(im[crop_y:crop_y + crop_h, crop_x:crop_x + crop_w],
                         tuple(size))
    # crop the text boxes
    kept_polys = []
    kept_tags = []
    kept_texts = []
    for poly, text, tag in zip(text_polys, texts, ignore_tags):
        moved = ((poly - (crop_x, crop_y)) * scale).tolist()
        if is_poly_outside_rect(moved, 0, 0, w, h):
            continue
        kept_polys.append(moved)
        kept_tags.append(tag)
        kept_texts.append(text)
    data['image'] = img
    data['polys'] = np.array(kept_polys)
    data['ignore_tags'] = kept_tags
    data['texts'] = kept_texts
    return data
|
|
@ -1,781 +0,0 @@
|
|||
#copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
|
||||
#
|
||||
#Licensed under the Apache License, Version 2.0 (the "License");
|
||||
#you may not use this file except in compliance with the License.
|
||||
#You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
#Unless required by applicable law or agreed to in writing, software
|
||||
#distributed under the License is distributed on an "AS IS" BASIS,
|
||||
#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
#See the License for the specific language governing permissions and
|
||||
#limitations under the License.
|
||||
|
||||
import math
|
||||
import cv2
|
||||
import numpy as np
|
||||
import json
|
||||
|
||||
|
||||
class SASTProcessTrain(object):
|
||||
"""
|
||||
SAST process function for training
|
||||
"""
|
||||
def __init__(self, params):
    """
    Copy the settings this processor uses out of ``params``.

    Reads 'img_set_dir', 'min_crop_side_ratio', 'min_crop_size',
    'image_shape', 'min_text_size' and 'max_text_size'. Only the second
    entry of 'image_shape' is kept, as ``self.input_size``.
    """
    for key in ('img_set_dir', 'min_crop_side_ratio', 'min_crop_size'):
        setattr(self, key, params[key])
    self.input_size = params['image_shape'][1]
    for key in ('min_text_size', 'max_text_size'):
        setattr(self, key, params[key])
|
||||
|
||||
def convert_label_infor(self, label_infor):
    """
    Parse one label line of the form ``<image name>\\t<json list>``.

    :param label_infor: the raw line as UTF-8 bytes (a leading BOM is dropped)
    :return: (img_path, wordBBs, txt_tags, txts) where img_path is
        self.img_set_dir + image name, wordBBs is a float32 array built from
        every entry's 'points', txts holds every entry's 'transcription' and
        txt_tags is a bool array that is True where the transcription is '###'
    """
    # 'utf-8-sig' decodes UTF-8 and strips a byte-order mark if one is present
    label_infor = label_infor.decode('utf-8-sig')
    substr = label_infor.strip("\n").split("\t")
    img_path = self.img_set_dir + substr[0]
    label = json.loads(substr[1])

    wordBBs = [box['points'] for box in label]
    txts = [box['transcription'] for box in label]
    txt_tags = [txt == '###' for txt in txts]

    wordBBs = np.array(wordBBs, dtype=np.float32)
    # the np.bool alias was removed from NumPy; the builtin bool is equivalent
    txt_tags = np.array(txt_tags, dtype=bool)
    return img_path, wordBBs, txt_tags, txts
|
||||
|
||||
def quad_area(self, poly):
    """
    Signed area of a four-point polygon.

    :param poly: four (x, y) points
    :return: sum over the four edges of (x_next - x) * (y_next + y),
        divided by 2; the sign depends on the point order
    """
    terms = []
    for cur, nxt in ((0, 1), (1, 2), (2, 3), (3, 0)):
        terms.append(
            (poly[nxt][0] - poly[cur][0]) * (poly[nxt][1] + poly[cur][1]))
    return np.sum(terms) / 2.
|
||||
|
||||
def gen_quad_from_poly(self, poly):
    """
    Generate min area quad from poly.

    The corners of the OpenCV minimum-area rectangle are cyclically shifted
    so that their summed distance to poly[0], poly[n // 2 - 1], poly[n // 2]
    and poly[-1] (n = number of points) is smallest.

    :return: float32 array of shape (4, 2)
    """
    point_num = poly.shape[0]
    # minAreaRect yields (center (x, y), (width, height), angle of rotation)
    box = np.array(cv2.boxPoints(cv2.minAreaRect(poly.astype(np.int32))))
    targets = (poly[0], poly[point_num // 2 - 1], poly[point_num // 2],
               poly[-1])

    best_shift = 0
    best_dist = 1e4  # a shift costing 1e4 or more never replaces shift 0
    for shift in range(4):
        dist = np.linalg.norm(box[(shift + 0) % 4] - targets[0]) + \
            np.linalg.norm(box[(shift + 1) % 4] - targets[1]) + \
            np.linalg.norm(box[(shift + 2) % 4] - targets[2]) + \
            np.linalg.norm(box[(shift + 3) % 4] - targets[3])
        if dist < best_dist:
            best_dist = dist
            best_shift = shift

    min_area_quad = np.zeros((4, 2), dtype=np.float32)
    for corner in range(4):
        min_area_quad[corner] = box[(best_shift + corner) % 4]
    return min_area_quad
|
||||
|
||||
def check_and_validate_polys(self, polys, tags, xxx_todo_changeme):
    """
    check so that the text poly is in the same direction,
    and also filter some invalid polygons

    :param polys: array of shape (num_polys, num_points, 2); clipped to the
        image in place
    :param tags: one ignore flag per polygon
    :param xxx_todo_changeme: (h, w) of the image
    :return: (polys, tags, hv_tags) as arrays. Polygons whose quad has
        |area| < 1 are dropped. Polygons with positive area are reversed
        (keeping point 0 first) and tagged True. hv_tag is 0 when twice the
        summed quad width is smaller than the summed quad height, else 1.
    """
    (h, w) = xxx_todo_changeme
    if polys.shape[0] == 0:
        return polys, np.array([]), np.array([])
    polys[:, :, 0] = np.clip(polys[:, :, 0], 0, w - 1)
    polys[:, :, 1] = np.clip(polys[:, :, 1], 0, h - 1)

    # Reversed point order that keeps point 0 first. Derived from the actual
    # point count instead of a fixed 16-point table, so 4-point quads (and
    # any other size) can be reversed too; identical for 16 points.
    point_num = polys.shape[1]
    reversed_order = [0] + list(range(point_num - 1, 0, -1))

    validated_polys = []
    validated_tags = []
    hv_tags = []
    for poly, tag in zip(polys, tags):
        quad = self.gen_quad_from_poly(poly)
        p_area = self.quad_area(quad)
        if abs(p_area) < 1:
            print('invalid poly')
            continue
        if p_area > 0:
            if not tag:
                print('poly in wrong direction')
                tag = True  # reversed cases should be ignore
            poly = poly[reversed_order, :]
            quad = quad[(0, 3, 2, 1), :]

        len_w = np.linalg.norm(quad[0] - quad[1]) + np.linalg.norm(quad[3] -
                                                                   quad[2])
        len_h = np.linalg.norm(quad[0] - quad[3]) + np.linalg.norm(quad[1] -
                                                                   quad[2])
        hv_tag = 0 if len_w * 2.0 < len_h else 1

        validated_polys.append(poly)
        validated_tags.append(tag)
        hv_tags.append(hv_tag)
    return np.array(validated_polys), np.array(validated_tags), np.array(
        hv_tags)
|
||||
|
||||
def crop_area(self, im, polys, tags, hv_tags, txts, crop_background=False, max_tries=25):
    """
    make random crop from the input image

    :param im: image of shape (h, w, channels)
    :param polys: array of shape (num_polys, num_points, 2)
    :param tags: array of ignore flags, indexed like polys
    :param hv_tags: array indexed like polys
    :param txts: list of transcriptions, indexed like polys
    :param crop_background: when True, a window that contains no selected
        polygon is returned as a crop (with empty annotations)
    :param max_tries: number of random attempts
    :return: (im, polys, tags, hv_tags, txts) for the crop, with polygon
        coordinates shifted to the crop origin; the inputs are returned
        unchanged when no free row/column exists or every attempt fails
    """
    h, w, _ = im.shape
    pad_h = h // 10
    pad_w = w // 10
    row_used = np.zeros((h + pad_h * 2), dtype=np.int32)
    col_used = np.zeros((w + pad_w * 2), dtype=np.int32)
    for poly in polys:
        pts = np.round(poly, decimals=0).astype(np.int32)
        col_used[np.min(pts[:, 0]) + pad_w: np.max(pts[:, 0]) + pad_w] = 1
        row_used[np.min(pts[:, 1]) + pad_h: np.max(pts[:, 1]) + pad_h] = 1

    # ensure the cropped area not across a text
    h_axis = np.where(row_used == 0)[0]
    w_axis = np.where(col_used == 0)[0]
    if len(h_axis) == 0 or len(w_axis) == 0:
        return im, polys, tags, hv_tags, txts

    for _ in range(max_tries):
        xx = np.random.choice(w_axis, size=2)
        xmin = np.clip(np.min(xx) - pad_w, 0, w - 1)
        xmax = np.clip(np.max(xx) - pad_w, 0, w - 1)
        yy = np.random.choice(h_axis, size=2)
        ymin = np.clip(np.min(yy) - pad_h, 0, h - 1)
        ymax = np.clip(np.max(yy) - pad_h, 0, h - 1)

        if xmax - xmin < self.min_crop_size or \
                ymax - ymin < self.min_crop_size:
            # area too small
            continue

        if polys.shape[0] != 0:
            inside = (polys[:, :, 0] >= xmin) & (polys[:, :, 0] <= xmax) \
                & (polys[:, :, 1] >= ymin) & (polys[:, :, 1] <= ymax)
            # NOTE(review): a polygon is selected when exactly 4 of its
            # points are inside; for polygons with more than 4 points this
            # is probably not "all points inside" - confirm the intent.
            selected_polys = np.where(np.sum(inside, axis=1) == 4)[0]
        else:
            selected_polys = []

        if len(selected_polys) == 0:
            # no text in this area
            if not crop_background:
                continue
            return im[ymin: ymax + 1, xmin: xmax + 1, :], \
                polys[selected_polys], tags[selected_polys], hv_tags[selected_polys], []

        cropped = im[ymin: ymax + 1, xmin: xmax + 1, :]
        kept_polys = polys[selected_polys]
        kept_tags = tags[selected_polys]
        kept_hv_tags = hv_tags[selected_polys]
        kept_txts = [txts[idx] for idx in selected_polys]
        # move the kept polygons into crop coordinates
        kept_polys[:, :, 0] -= xmin
        kept_polys[:, :, 1] -= ymin
        return cropped, kept_polys, kept_tags, kept_hv_tags, kept_txts

    return im, polys, tags, hv_tags, txts
|
||||
|
||||
def generate_direction_map(self, poly_quads, direction_map):
    """
    Fill every quad in ``direction_map`` with a 3-value label:
    (dx, dy, 1 / average height), where (dx, dy) is the vector from the
    midpoint of edge 0-3 to the midpoint of edge 1-2, normalised and scaled
    by the average quad width. Averages are floored at 1.0.

    :return: direction_map (drawn on in place)
    """
    widths = []
    heights = []
    for quad in poly_quads:
        widths.append(
            (np.linalg.norm(quad[0] - quad[1]) + np.linalg.norm(quad[2] - quad[3])) / 2.0)
        heights.append(
            (np.linalg.norm(quad[0] - quad[3]) + np.linalg.norm(quad[2] - quad[1])) / 2.0)
    # the 1e-6 keeps the division defined when poly_quads is empty
    norm_width = max(sum(widths) / (len(widths) + 1e-6), 1.0)
    average_height = max(sum(heights) / (len(heights) + 1e-6), 1.0)
    inverse_height = 1.0 / (average_height + 1e-6)

    for quad in poly_quads:
        full_vector = ((quad[1] + quad[2]) - (quad[0] + quad[3])) / 2.0
        direct_vector = full_vector / (np.linalg.norm(full_vector) + 1e-6) * norm_width
        label = tuple(map(float, [direct_vector[0], direct_vector[1], inverse_height]))
        cv2.fillPoly(direction_map, quad.round().astype(np.int32)[np.newaxis, :, :], label)
    return direction_map
|
||||
|
||||
def calculate_average_height(self, poly_quads):
    """
    Average height of the given quads, floored at 1.0.

    The height of one quad is the mean length of its edges 0-3 and 2-1.
    An empty ``poly_quads`` returns 1.0 instead of raising
    ZeroDivisionError, matching what generate_direction_map computes for
    an empty input.
    """
    height_list = [
        (np.linalg.norm(quad[0] - quad[3]) + np.linalg.norm(quad[2] - quad[1])) / 2.0
        for quad in poly_quads
    ]
    if not height_list:
        return 1.0
    return max(sum(height_list) / len(height_list), 1.0)
|
||||
|
||||
def generate_tcl_label(self, hw, polys, tags, ds_ratio,
                       tcl_ratio=0.3, shrink_ratio_of_width=0.15):
    """
    Generate the score map, tbo map and training mask at reduced resolution.

    :param hw: (height, width) of the full-size image
    :param polys: polygons in full-size coordinates (scaled by ds_ratio here)
    :param tags: one ignore flag per polygon
    :param ds_ratio: down-sampling ratio applied to hw and polys
    :return: (score_map (h, w), tbo_map (h, w, 5), training_mask (h, w))

    Polygons whose shorter quad side is outside
    [min_text_size, max_text_size] * ds_ratio are skipped. Ignored polygons
    only get weight 0.15 in the training mask.
    """
    h, w = hw
    h, w = int(h * ds_ratio), int(w * ds_ratio)
    polys = polys * ds_ratio

    score_map = np.zeros((h, w), dtype=np.float32)
    tbo_map = np.zeros((h, w, 5), dtype=np.float32)
    training_mask = np.ones((h, w), dtype=np.float32)

    for poly, tag in zip(polys, tags):
        # generate min_area_quad
        min_area_quad, center_point = self.gen_min_area_quad_from_poly(poly)
        quad_h = 0.5 * (np.linalg.norm(min_area_quad[0] - min_area_quad[3]) +
                        np.linalg.norm(min_area_quad[1] - min_area_quad[2]))
        quad_w = 0.5 * (np.linalg.norm(min_area_quad[0] - min_area_quad[1]) +
                        np.linalg.norm(min_area_quad[2] - min_area_quad[3]))

        if min(quad_h, quad_w) < self.min_text_size * ds_ratio \
                or min(quad_h, quad_w) > self.max_text_size * ds_ratio:
            continue

        if tag:
            cv2.fillPoly(training_mask, poly.astype(np.int32)[np.newaxis, :, :], 0.15)
            continue

        tcl_poly = self.poly2tcl(poly, tcl_ratio)
        tcl_quads = self.poly2quads(tcl_poly)
        poly_quads = self.poly2quads(poly)
        # stcl map
        stcl_quads, quad_index = self.shrink_poly_along_width(
            tcl_quads,
            shrink_ratio_of_width=shrink_ratio_of_width,
            expand_height_ratio=1.0 / tcl_ratio)
        # generate tcl map
        cv2.fillPoly(score_map, np.round(stcl_quads).astype(np.int32), 1.0)

        # generate tbo map
        for idx, quad in enumerate(stcl_quads):
            quad_mask = cv2.fillPoly(
                np.zeros((h, w), dtype=np.float32),
                np.round(quad[np.newaxis, :, :]).astype(np.int32), 1.0)
            tbo_map = self.gen_quad_tbo(poly_quads[quad_index[idx]], quad_mask, tbo_map)
    return score_map, tbo_map, training_mask
|
||||
|
||||
def generate_tvo_and_tco(self, hw, polys, tags, tcl_ratio=0.3, ds_ratio=0.25):
    """
    Generate the tvo map and the tco map at reduced resolution.

    :param hw: (height, width) of the full-size image
    :param polys: polygons in full-size coordinates (scaled by ds_ratio here)
    :param tags: one ignore flag per polygon; flagged polygons are skipped
    :return: (tvo_map (h, w, 9), tco_map (h, w, 3)). Inside each text center
        region the first channels hold pixel coordinate minus the quad
        corner (tvo) or center (tco) coordinate, and the last channel holds
        1 / shorter quad side. Everything outside those regions is zero.
    """
    h, w = hw
    h, w = int(h * ds_ratio), int(w * ds_ratio)
    polys = polys * ds_ratio
    poly_mask = np.zeros((h, w), dtype=np.float32)

    # Per-pixel coordinate grids of shape (h, w). The y grid must be built
    # from arange(h); building it from arange(w) only works when h == w.
    x_grid = np.tile(np.arange(0, w), (h, 1))
    y_grid = np.tile(np.arange(0, h), (w, 1)).T

    tvo_map = np.ones((9, h, w), dtype=np.float32)
    tvo_map[0:-1:2] = x_grid
    tvo_map[1:-1:2] = y_grid
    poly_tv_xy_map = np.zeros((8, h, w), dtype=np.float32)

    # tco map
    tco_map = np.ones((3, h, w), dtype=np.float32)
    tco_map[0] = x_grid
    tco_map[1] = y_grid
    poly_tc_xy_map = np.zeros((2, h, w), dtype=np.float32)

    poly_short_edge_map = np.ones((h, w), dtype=np.float32)

    for poly, poly_tag in zip(polys, tags):
        if poly_tag:
            continue

        # adjust point order for vertical poly
        poly = self.adjust_point(poly)

        # generate min_area_quad
        min_area_quad, center_point = self.gen_min_area_quad_from_poly(poly)
        min_area_quad_h = 0.5 * (np.linalg.norm(min_area_quad[0] - min_area_quad[3]) +
                                 np.linalg.norm(min_area_quad[1] - min_area_quad[2]))
        min_area_quad_w = 0.5 * (np.linalg.norm(min_area_quad[0] - min_area_quad[1]) +
                                 np.linalg.norm(min_area_quad[2] - min_area_quad[3]))

        # text center region of this polygon, as integer points for fillPoly
        tcl_poly = self.poly2tcl(poly, tcl_ratio)
        tcl_pts = np.round(tcl_poly[np.newaxis, :, :]).astype(np.int32)

        # generate poly_tv_xy_map
        for idx in range(4):
            cv2.fillPoly(poly_tv_xy_map[2 * idx], tcl_pts,
                         float(min(max(min_area_quad[idx, 0], 0), w)))
            cv2.fillPoly(poly_tv_xy_map[2 * idx + 1], tcl_pts,
                         float(min(max(min_area_quad[idx, 1], 0), h)))

        # generate poly_tc_xy_map
        for idx in range(2):
            cv2.fillPoly(poly_tc_xy_map[idx], tcl_pts, float(center_point[idx]))

        # generate poly_short_edge_map
        cv2.fillPoly(poly_short_edge_map, tcl_pts,
                     float(max(min(min_area_quad_h, min_area_quad_w), 1.0)))

        # generate poly_mask and training_mask
        cv2.fillPoly(poly_mask, tcl_pts, 1)

    tvo_map *= poly_mask
    tvo_map[:8] -= poly_tv_xy_map
    tvo_map[-1] /= poly_short_edge_map
    tvo_map = tvo_map.transpose((1, 2, 0))

    tco_map *= poly_mask
    tco_map[:2] -= poly_tc_xy_map
    tco_map[-1] /= poly_short_edge_map
    tco_map = tco_map.transpose((1, 2, 0))

    return tvo_map, tco_map
|
||||
|
||||
def adjust_point(self, poly):
    """
    adjust point order.

    Rotates the point list by one position (point 1 becomes point 0) when
    - the polygon has 4 points and 1.5 times the summed length of edges
      0-1 and 2-3 is smaller than the summed length of edges 1-2 and 3-0, or
    - the polygon has more than 4 points and the angle between edge 1->0
      and edge 2->1 exceeds 70 degrees.
    Otherwise the input is returned as is.
    """
    point_num = poly.shape[0]
    rotated_order = np.arange(1, point_num + 1) % point_num

    if point_num == 4:
        edges = [
            np.linalg.norm(poly[k] - poly[(k + 1) % 4]) for k in range(4)
        ]
        if (edges[0] + edges[2]) * 1.5 < (edges[1] + edges[3]):
            return poly[rotated_order, :]
        return poly

    if point_num > 4:
        first_edge = poly[0] - poly[1]
        second_edge = poly[1] - poly[2]
        cos_theta = np.dot(first_edge, second_edge) / (
            np.linalg.norm(first_edge) * np.linalg.norm(second_edge) + 1e-6)
        # rounding keeps the value inside arccos' domain
        theta = np.arccos(np.round(cos_theta, decimals=4))
        if abs(theta) > (70 / 180 * math.pi):
            return poly[rotated_order, :]
    return poly
|
||||
|
||||
def gen_min_area_quad_from_poly(self, poly):
    """
    Generate min area quad from poly.

    A 4-point polygon is returned unchanged together with the mean of its
    points. Otherwise the OpenCV minimum-area rectangle is used: its corners
    are cyclically shifted to lie closest (summed distance) to poly[0],
    poly[n // 2 - 1], poly[n // 2] and poly[-1], and its center is returned.

    :return: (min_area_quad, center_point)
    """
    point_num = poly.shape[0]
    if point_num == 4:
        return poly, np.sum(poly, axis=0) / 4

    # minAreaRect yields (center (x, y), (width, height), angle of rotation)
    rect = cv2.minAreaRect(poly.astype(np.int32))
    center_point = rect[0]
    box = np.array(cv2.boxPoints(rect))
    targets = (poly[0], poly[point_num // 2 - 1], poly[point_num // 2],
               poly[-1])

    best_shift = 0
    best_dist = 1e4  # a shift costing 1e4 or more never replaces shift 0
    for shift in range(4):
        dist = np.linalg.norm(box[(shift + 0) % 4] - targets[0]) + \
            np.linalg.norm(box[(shift + 1) % 4] - targets[1]) + \
            np.linalg.norm(box[(shift + 2) % 4] - targets[2]) + \
            np.linalg.norm(box[(shift + 3) % 4] - targets[3])
        if dist < best_dist:
            best_dist = dist
            best_shift = shift

    min_area_quad = np.zeros((4, 2), dtype=np.float32)
    for corner in range(4):
        min_area_quad[corner] = box[(best_shift + corner) % 4]
    return min_area_quad, center_point
|
||||
|
||||
def shrink_quad_along_width(self, quad, begin_width_ratio=0., end_width_ratio=1.):
    """
    Cut a quad along its 0->1 / 3->2 edges.

    The new points are placed at begin_width_ratio and end_width_ratio of
    the way from point 0 to point 1 and from point 3 to point 2.

    :return: array of 4 points in the same order as the input quad
    """
    ratios = np.array([[begin_width_ratio], [end_width_ratio]], dtype=np.float32)
    # each product has two rows: the point at the begin and at the end ratio
    begin_01, end_01 = quad[0] + (quad[1] - quad[0]) * ratios
    begin_32, end_32 = quad[3] + (quad[2] - quad[3]) * ratios
    return np.array([begin_01, end_01, end_32, begin_32])
|
||||
|
||||
def shrink_poly_along_width(self, quads, shrink_ratio_of_width, expand_height_ratio=1.0):
    """
    shrink poly with given length.

    The chain of quads is shortened at both ends, measured along the 0->1
    edges, by shrink_ratio_of_width * min(left edge length, right edge
    length, total 0->1 length); the left and right edge lengths are first
    multiplied by expand_height_ratio.

    :return: (array of the remaining quads, list of the indices in
        ``quads`` they were taken from)
    """

    def _locate(lengths, target):
        # walk along the edges until `target` length is used up; returns the
        # edge index and the fraction of that edge at which the cut falls
        remaining = target
        for pos, length in enumerate(lengths):
            remaining -= length
            if remaining <= 0.000001:
                return pos, (remaining + length) / length

    upper_edge_list = [np.linalg.norm(quad[0] - quad[1]) for quad in quads]

    # length of left edge and right edge.
    left_length = np.linalg.norm(quads[0][0] - quads[0][3]) * expand_height_ratio
    right_length = np.linalg.norm(quads[-1][1] - quads[-1][2]) * expand_height_ratio

    shrink_length = min(left_length, right_length, sum(upper_edge_list)) * shrink_ratio_of_width
    cut_left = shrink_length
    cut_right = sum(upper_edge_list) - shrink_length

    left_idx, left_ratio = _locate(upper_edge_list, cut_left)
    left_quad = self.shrink_quad_along_width(
        quads[left_idx], begin_width_ratio=left_ratio, end_width_ratio=1)
    right_idx, right_ratio = _locate(upper_edge_list, cut_right)
    right_quad = self.shrink_quad_along_width(
        quads[right_idx], begin_width_ratio=0, end_width_ratio=right_ratio)

    if left_idx == right_idx:
        # both cuts fall in the same quad: combine them into a single quad
        out_quad_list = [[left_quad[0], right_quad[1], right_quad[2], left_quad[3]]]
    else:
        middle = [quads[idx] for idx in range(left_idx + 1, right_idx)]
        out_quad_list = [left_quad] + middle + [right_quad]

    return np.array(out_quad_list), list(range(left_idx, right_idx + 1))
|
||||
|
||||
def vector_angle(self, A, B):
    """
    Calculate the angle between vector AB and x-axis positive direction.

    A, B: (x, y) points. Returns arctan2(dy, dx) in radians.
    """
    delta = np.array([B[1] - A[1], B[0] - A[0]])
    dy, dx = delta
    return np.arctan2(dy, dx)
|
||||
|
||||
def theta_line_cross_point(self, theta, point):
    """
    Calculate the line through given point and angle in ax + by + c =0 form.

    :return: [a, b, c] with a = sin(theta), b = -cos(theta) and
        c = cos(theta) * y - sin(theta) * x
    """
    px, py = point
    cos_t = np.cos(theta)
    sin_t = np.sin(theta)
    offset = cos_t * py - sin_t * px
    return [sin_t, -cos_t, offset]
|
||||
|
||||
def line_cross_two_point(self, A, B):
    """
    Calculate the line through given point A and B in ax + by + c =0 form.

    Built from the direction angle of AB (vector_angle) and the point A
    (theta_line_cross_point).
    """
    return self.theta_line_cross_point(self.vector_angle(A, B), A)
|
||||
|
||||
def average_angle(self, poly):
    """
    Calculate the average angle between left and right edge in given poly.

    poly: four points; the result is the mean of the angles of the vectors
    from point 3 to point 0 and from point 2 to point 1.
    """
    first, second, third, fourth = poly
    angle_3_to_0 = self.vector_angle(fourth, first)
    angle_2_to_1 = self.vector_angle(third, second)
    return (angle_3_to_0 + angle_2_to_1) / 2
|
||||
|
||||
def line_cross_point(self, line1, line2):
    """
    line1 and line2 in 0=ax+by+c form, compute the cross point of line1 and line2

    :return: float32 array [x, y]; when the determinant is exactly 0 a
        message is printed and [0, 0] is returned
    """
    a1, b1, c1 = line1
    a2, b2, c2 = line2
    det = a1 * b2 - a2 * b1

    if det != 0:
        cross_x = (b1 * c2 - b2 * c1) / det
        cross_y = (a2 * c1 - a1 * c2) / det
        return np.array([cross_x, cross_y], dtype=np.float32)

    print('Cross point does not exist')
    return np.array([0, 0], dtype=np.float32)
|
||||
|
||||
def quad2tcl(self, poly, ratio):
    """
    Generate center line by poly clock-wise point. (4, 2)

    Keeps the central ``ratio`` fraction of the quad measured along its
    0->3 and 1->2 edges. Returns 4 points in the input order.
    """
    ratios = np.array([[0.5 - ratio / 2], [0.5 + ratio / 2]], dtype=np.float32)
    # each product has two rows: the point at the lower and upper ratio
    near_03, far_03 = poly[0] + (poly[3] - poly[0]) * ratios
    near_12, far_12 = poly[1] + (poly[2] - poly[1]) * ratios
    return np.array([near_03, near_12, far_12, far_03])
|
||||
|
||||
def poly2tcl(self, poly, ratio):
    """
    Generate center line by poly clock-wise point.

    Point i is paired with point n - 1 - i; both are moved onto the segment
    between them so that the central ``ratio`` fraction remains. The result
    has the shape and dtype of ``poly``.
    """
    ratios = np.array([[0.5 - ratio / 2], [0.5 + ratio / 2]], dtype=np.float32)
    center_poly = np.zeros_like(poly)
    point_num = poly.shape[0]

    for head in range(point_num // 2):
        tail = point_num - 1 - head
        near, far = poly[head] + (poly[tail] - poly[head]) * ratios
        center_poly[head] = near
        center_poly[tail] = far
    return center_poly
|
||||
|
||||
def gen_quad_tbo(self, quad, tcl_mask, tbo_map):
    """
    Generate tbo_map for give quad.

    For every pixel where tcl_mask == 1, a line with the quad's average
    left/right edge angle is drawn through the pixel and intersected with
    the quad's upper (0-1) and lower (3-2) edge lines. The five channels
    written are: upper offset y, upper offset x, lower offset y,
    lower offset x, and 2 / max(min(quad height, quad width), 1).

    :return: tbo_map (written in place)
    """
    # upper and lower line function: ax + by + c = 0;
    up_line = self.line_cross_two_point(quad[0], quad[1])
    lower_line = self.line_cross_two_point(quad[3], quad[2])

    quad_h = 0.5 * (np.linalg.norm(quad[0] - quad[3]) + np.linalg.norm(quad[1] - quad[2]))
    quad_w = 0.5 * (np.linalg.norm(quad[0] - quad[1]) + np.linalg.norm(quad[2] - quad[3]))

    # average angle of left and right line.
    angle = self.average_angle(quad)

    # the same for every pixel of this quad
    size_norm = 1.0 / max(min(quad_h, quad_w), 1.0) * 2

    for y, x in np.argwhere(tcl_mask == 1):
        point = (x, y)
        line = self.theta_line_cross_point(angle, point)
        cross_point_upper = self.line_cross_point(up_line, line)
        cross_point_lower = self.line_cross_point(lower_line, line)
        # offsets are stored as (y, x), i.e. reversed with respect to points
        upper_dx, upper_dy = cross_point_upper - point
        lower_dx, lower_dy = cross_point_lower - point
        tbo_map[y, x, 0] = upper_dy
        tbo_map[y, x, 1] = upper_dx
        tbo_map[y, x, 2] = lower_dy
        tbo_map[y, x, 3] = lower_dx
        tbo_map[y, x, 4] = size_norm
    return tbo_map
|
||||
|
||||
def poly2quads(self, poly):
    """
    Split a polygon into consecutive quadrilaterals.

    Vertex ``i`` is paired with vertex ``n - 1 - i``; every two neighbouring
    pairs form one quad whose points are reordered to
    (pair_k[0], pair_k+1[0], pair_k+1[1], pair_k[1]).

    Args:
        poly: array of polygon vertices, indexed along axis 0.

    Returns:
        An array stacking ``n // 2 - 1`` quads of shape (4, 2); an empty
        array when there are fewer than two pairs.
    """
    n_points = poly.shape[0]
    n_pairs = n_points // 2

    # Opposite-vertex pairs, built once and shared by all quads.
    pairs = np.array(
        [[poly[i], poly[n_points - 1 - i]] for i in range(n_pairs)])

    # reshape and adjust to clock-wise
    quads = [
        pairs[[k, k + 1]].reshape(4, 2)[[0, 2, 3, 1]]
        for k in range(n_pairs - 1)
    ]
    return np.array(quads)
|
||||
|
||||
def extract_polys(self, poly_txt_path):
    """
    Read text_polys, txt_tags, txts from the given txt file.

    Each non-blank line has the form ``x1,y1,x2,y2,...<TAB>transcription``.

    Args:
        poly_txt_path: path of the label file.

    Returns:
        A tuple ``(text_polys, txt_tags, txts)``:
        text_polys is an array of float32 polygons, each reshaped to
        (-1, 2) (an object array when the polygons have different numbers
        of points); txt_tags is a bool array that is True where the
        transcription is ``'###'``; txts is the list of transcriptions.
    """
    text_polys, txt_tags, txts = [], [], []

    with open(poly_txt_path) as f:
        for line in f:
            line = line.strip()
            if not line:
                # Tolerate blank lines (e.g. a trailing newline).
                continue
            # Split on the first tab only so a transcription may contain tabs.
            poly_str, txt = line.split('\t', 1)
            # A list, not ``map``: on Python 3 ``map`` is lazy and cannot be
            # turned into a float array by ``np.array``.
            coords = [float(value) for value in poly_str.split(',')]
            text_polys.append(
                np.array(coords, dtype=np.float32).reshape(-1, 2))
            txts.append(txt)
            txt_tags.append(txt == '###')

    if len({poly.shape for poly in text_polys}) > 1:
        # Ragged polygons cannot be stacked; keep them as an object array.
        polys = np.empty(len(text_polys), dtype=object)
        for idx, poly in enumerate(text_polys):
            polys[idx] = poly
    else:
        polys = np.array(text_polys)

    # ``np.bool`` was removed from NumPy; the builtin ``bool`` is equivalent.
    return polys, np.array(txt_tags, dtype=bool), txts
|
||||
|
||||
def __call__(self, label_infor):
    """
    Build one SAST training sample from a raw label record.

    The image is read, randomly stretched in aspect ratio, capped to a
    longest side of 2048, cropped, rescaled relative to ``self.input_size``,
    optionally blurred / brightened / darkened, pasted at a random position
    on a mean-filled square canvas and finally normalised.

    Args:
        label_infor: raw label record, decoded by ``convert_label_infor``.

    Returns:
        None when the sample has to be dropped (unreadable image, no usable
        polygon, image too small, or only ignored text). Otherwise a tuple
        ``(image, score_map, border_map, training_mask, tvo_map, tco_map)``
        where the image is channel-first with its channel axis reversed, the
        score map and training mask get a leading axis, and the other maps
        are transposed to channel-first.
    """
    im_path, text_polys, text_tags, text_strs = \
        self.convert_label_infor(label_infor)
    im = cv2.imread(im_path)
    if im is None or text_polys.shape[0] == 0:
        return None

    h, w, _ = im.shape
    text_polys, text_tags, hv_tags = self.check_and_validate_polys(
        text_polys, text_tags, (h, w))
    if text_polys.shape[0] == 0:
        return None

    # Random aspect-ratio change that keeps the area fixed.
    asp_scale = np.random.choice(np.arange(1.0, 1.55, 0.1))
    if np.random.rand() < 0.5:
        asp_scale = 1.0 / asp_scale
    asp_scale = math.sqrt(asp_scale)
    asp_wx, asp_hy = asp_scale, 1.0 / asp_scale
    im = cv2.resize(im, dsize=None, fx=asp_wx, fy=asp_hy)
    text_polys[:, :, 0] *= asp_wx
    text_polys[:, :, 1] *= asp_hy

    # Cap the longest side at 2048 pixels.
    h, w, _ = im.shape
    longest_side = max(h, w)
    if longest_side > 2048:
        rd_scale = 2048.0 / longest_side
        im = cv2.resize(im, dsize=None, fx=rd_scale, fy=rd_scale)
        text_polys *= rd_scale
    h, w, _ = im.shape
    if min(h, w) < 16:
        return None

    # Crop without producing pure-background samples.
    im, text_polys, text_tags, hv_tags, text_strs = self.crop_area(
        im, text_polys, text_tags, hv_tags, text_strs,
        crop_background=False)
    if text_polys.shape[0] == 0:
        return None
    # Drop the sample when every remaining text is tagged as ignored.
    if np.sum((text_tags * 1.0)) >= text_tags.size:
        return None
    new_h, new_w, _ = im.shape
    if new_h is None or new_w is None:
        return None

    # Rescale relative to the network input size with a random factor.
    std_ratio = float(self.input_size) / max(new_w, new_h)
    rand_scales = np.array(
        [0.25, 0.375, 0.5, 0.625, 0.75, 0.875, 1.0, 1.0, 1.0, 1.0, 1.0])
    rz_scale = std_ratio * np.random.choice(rand_scales)
    im = cv2.resize(im, dsize=None, fx=rz_scale, fy=rz_scale)
    text_polys[:, :, 0] *= rz_scale
    text_polys[:, :, 1] *= rz_scale

    # Photometric augmentations, each applied with probability 0.05.
    if np.random.rand() < 0.1 * 0.5:
        ks = np.random.permutation(5)[0] + 1
        ks = int(ks / 2) * 2 + 1  # force an odd kernel size
        im = cv2.GaussianBlur(im, ksize=(ks, ks), sigmaX=0, sigmaY=0)
    if np.random.rand() < 0.1 * 0.5:
        im = np.clip(im * (1.0 + np.random.rand() * 0.5), 0.0, 255.0)
    if np.random.rand() < 0.1 * 0.5:
        im = np.clip(im * (1.0 - np.random.rand() * 0.5), 0.0, 255.0)

    # Pad the image to [input_size, input_size].
    new_h, new_w, _ = im.shape
    if min(new_w, new_h) < self.input_size * 0.5:
        return None

    # (channel index, mean, std) for the three image channels.
    channel_stats = ((2, 0.485, 0.229), (1, 0.456, 0.224), (0, 0.406, 0.225))
    im_padded = np.ones(
        (self.input_size, self.input_size, 3), dtype=np.float32)
    for channel, mean, _ in channel_stats:
        # Fill with the mean so the padding becomes 0 after normalisation.
        im_padded[:, :, channel] = mean * 255

    # Random placement of the image inside the canvas.
    del_h = self.input_size - new_h
    del_w = self.input_size - new_w
    sh, sw = 0, 0
    if del_h > 1:
        sh = int(np.random.rand() * del_h)
    if del_w > 1:
        sw = int(np.random.rand() * del_w)

    im_padded[sh:sh + new_h, sw:sw + new_w, :] = im.copy()
    text_polys[:, :, 0] += sw
    text_polys[:, :, 1] += sh

    canvas_size = (self.input_size, self.input_size)
    score_map, border_map, training_mask = self.generate_tcl_label(
        canvas_size, text_polys, text_tags, 0.25)

    # SAST head
    tvo_map, tco_map = self.generate_tvo_and_tco(
        canvas_size, text_polys, text_tags, tcl_ratio=0.3, ds_ratio=0.25)

    for channel, mean, std in channel_stats:
        im_padded[:, :, channel] -= mean * 255
        im_padded[:, :, channel] /= (255.0 * std)
    im_padded = im_padded.transpose((2, 0, 1))

    return (im_padded[::-1, :, :],
            score_map[np.newaxis, :, :],
            border_map.transpose((2, 0, 1)),
            training_mask[np.newaxis, :, :],
            tvo_map.transpose((2, 0, 1)),
            tco_map.transpose((2, 0, 1)))
|
||||
|
||||
|
||||
class SASTProcessTest(object):
    """
    SAST process function for test
    """

    def __init__(self, params):
        """
        Args:
            params: mapping of options; ``max_side_len`` (default 2400) is
                the length the longer image side is scaled to.
        """
        super(SASTProcessTest, self).__init__()
        self.max_side_len = (params['max_side_len']
                             if 'max_side_len' in params else 2400)

    def resize_image(self, im):
        """
        Scale the longer side to ``self.max_side_len`` and round both sides
        up to a multiple of 128 (the stride used by this method).

        :param im: the input image, indexed as (h, w, c)
        :return: the resized image and the (ratio_h, ratio_w) resize ratios
        """
        src_h, src_w, _ = im.shape

        # Fix the longer side
        longer_side = src_h if src_h > src_w else src_w
        ratio = float(self.max_side_len) / longer_side

        max_stride = 128
        dst_h = (int(src_h * ratio) + max_stride - 1) // max_stride * max_stride
        dst_w = (int(src_w * ratio) + max_stride - 1) // max_stride * max_stride

        im = cv2.resize(im, (int(dst_w), int(dst_h)))
        return im, (dst_h / float(src_h), dst_w / float(src_w))

    def __call__(self, im):
        """
        Resize and normalise one image.

        Returns a list ``[im, (ratio_h, ratio_w, src_h, src_w)]`` where
        ``im`` has a leading axis added, is channel-first, and has its
        channel axis reversed relative to the input.
        """
        src_h, src_w, _ = im.shape
        im, (ratio_h, ratio_w) = self.resize_image(im)

        img_mean = [0.485, 0.456, 0.406]
        img_std = [0.229, 0.224, 0.225]

        # Reverse the channel axis and scale to [0, 1].
        im = im[:, :, ::-1].astype(np.float32) / 255
        im -= img_mean
        im /= img_std
        im = im.transpose((2, 0, 1))[np.newaxis, :]
        return [im, (ratio_h, ratio_w, src_h, src_w)]
|
|
@ -0,0 +1,59 @@
|
|||
# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
from __future__ import unicode_literals
|
||||
|
||||
from .iaa_augment import IaaAugment
|
||||
from .make_border_map import MakeBorderMap
|
||||
from .make_shrink_map import MakeShrinkMap
|
||||
from .random_crop_data import EastRandomCropData, PSERandomCrop
|
||||
|
||||
from .rec_img_aug import RecAug, RecResizeImg
|
||||
|
||||
from .operators import *
|
||||
from .label_ops import *
|
||||
|
||||
|
||||
def transform(data, ops=None):
    """
    Apply ``ops`` to ``data`` one after another.

    Args:
        data: the value handed to the first operator.
        ops: iterable of callables; ``None`` means no operator.

    Returns:
        The output of the last operator, or ``None`` as soon as any operator
        returns ``None`` (the remaining operators are not run).
    """
    pipeline = [] if ops is None else ops
    for apply_op in pipeline:
        data = apply_op(data)
        if data is None:
            return None
    return data
|
||||
|
||||
|
||||
def create_operators(op_param_list, global_config=None):
    """
    create operators based on the config

    Args:
        op_param_list(list): a list of single-key dicts, each mapping an
            operator name to its keyword arguments (or None for no
            arguments).
        global_config(dict|None): extra keyword arguments merged into the
            arguments of every operator; on a key clash the global value
            wins.

    Returns:
        list: the instantiated operators, in config order.
    """
    assert isinstance(op_param_list, list), ('operator config should be a list')
    ops = []
    for operator in op_param_list:
        assert isinstance(operator,
                          dict) and len(operator) == 1, "yaml format error"
        op_name = list(operator)[0]
        # Work on a copy: merging ``global_config`` must not leak into the
        # caller's config, which may be reused to build other pipelines.
        param = {} if operator[op_name] is None else dict(operator[op_name])
        if global_config is not None:
            param.update(global_config)
        # NOTE(review): ``eval`` resolves the operator name as an arbitrary
        # expression in this module's namespace; only use trusted configs.
        op = eval(op_name)(**param)
        ops.append(op)
    return ops
|
|
@ -0,0 +1,101 @@
|
|||
# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
from __future__ import unicode_literals
|
||||
|
||||
import numpy as np
|
||||
import imgaug
|
||||
import imgaug.augmenters as iaa
|
||||
|
||||
|
||||
class AugmenterBuilder(object):
    """Build imgaug augmenters from a plain list / dict description."""

    def __init__(self):
        pass

    def build(self, args, root=True):
        """
        Turn ``args`` into an augmenter.

        * ``None`` or an empty container -> ``None``.
        * a list at the root level -> ``iaa.Sequential`` of the augmenters
          built from each item.
        * a nested list ``[name, arg1, ...]`` -> ``iaa.<name>(arg1, ...)``.
        * a dict ``{'type': name, 'args': {...}}`` -> ``iaa.<name>(**args)``.

        List-valued arguments are converted to tuples before the call.

        Raises:
            RuntimeError: for any other kind of non-empty ``args``.
        """
        if args is None or len(args) == 0:
            return None

        if isinstance(args, list):
            if not root:
                factory = getattr(iaa, args[0])
                return factory(
                    *[self.to_tuple_if_list(item) for item in args[1:]])
            children = [self.build(item, root=False) for item in args]
            return iaa.Sequential(children)

        if isinstance(args, dict):
            factory = getattr(iaa, args['type'])
            keyword_args = {
                key: self.to_tuple_if_list(value)
                for key, value in args['args'].items()
            }
            return factory(**keyword_args)

        raise RuntimeError('unknown augmenter arg: ' + str(args))

    def to_tuple_if_list(self, obj):
        """Return ``tuple(obj)`` for a list, any other object unchanged."""
        return tuple(obj) if isinstance(obj, list) else obj
|
||||
|
||||
|
||||
class IaaAugment():
    """Apply an imgaug pipeline to ``data['image']`` and ``data['polys']``."""

    def __init__(self, augmenter_args=None, **kwargs):
        """
        Args:
            augmenter_args: augmenter description understood by
                ``AugmenterBuilder.build``. Defaults to a horizontal flip
                (p=0.5), a rotation in [-10, 10] and a resize in [0.5, 3].
        """
        if augmenter_args is None:
            augmenter_args = [
                {'type': 'Fliplr', 'args': {'p': 0.5}},
                {'type': 'Affine', 'args': {'rotate': [-10, 10]}},
                {'type': 'Resize', 'args': {'size': [0.5, 3]}},
            ]
        self.augmenter = AugmenterBuilder().build(augmenter_args)

    def __call__(self, data):
        """Augment the image and its polygons with the same random draw."""
        image = data['image']
        shape = image.shape

        if not self.augmenter:
            return data

        # Freeze the random parameters so image and polygons stay aligned.
        aug = self.augmenter.to_deterministic()
        data['image'] = aug.augment_image(image)
        return self.may_augment_annotation(aug, data, shape)

    def may_augment_annotation(self, aug, data, shape):
        """Replace ``data['polys']`` by the augmented polygons (no-op when
        ``aug`` is None)."""
        if aug is None:
            return data

        data['polys'] = np.array([
            self.may_augment_poly(aug, shape, poly) for poly in data['polys']
        ])
        return data

    def may_augment_poly(self, aug, img_shape, poly):
        """Return ``poly`` as a list of (x, y) tuples after applying ``aug``."""
        source_points = [imgaug.Keypoint(pt[0], pt[1]) for pt in poly]
        on_image = imgaug.KeypointsOnImage(source_points, shape=img_shape)
        moved_points = aug.augment_keypoints([on_image])[0].keypoints
        return [(pt.x, pt.y) for pt in moved_points]
|
|
@ -0,0 +1,197 @@
|
|||
# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
from __future__ import unicode_literals
|
||||
|
||||
import numpy as np
|
||||
from ppocr.utils.logging import get_logger
|
||||
|
||||
|
||||
class DetLabelEncode(object):
    """Decode a JSON detection label into polygons, texts and ignore tags."""

    def __init__(self, **kwargs):
        pass

    def __call__(self, data):
        """
        Parse ``data['label']`` (a JSON list of objects with ``points`` and
        ``transcription`` entries) and add three keys to ``data``:

        * ``polys``: float32 array built from all ``points``.
        * ``texts``: list of transcriptions.
        * ``ignore_tags``: bool array, True where the transcription is
          ``'*'`` or ``'###'``.

        Returns:
            The same ``data`` dict.
        """
        import json
        annotations = json.loads(data['label'])

        boxes, txts, txt_tags = [], [], []
        for annotation in annotations:
            txt = annotation['transcription']
            boxes.append(annotation['points'])
            txts.append(txt)
            txt_tags.append(txt in ['*', '###'])

        data['polys'] = np.array(boxes, dtype=np.float32)
        data['texts'] = txts
        # ``np.bool`` was removed from NumPy; the builtin ``bool`` is the
        # equivalent dtype.
        data['ignore_tags'] = np.array(txt_tags, dtype=bool)
        return data

    def order_points_clockwise(self, pts):
        """
        Order four points as: smallest x + y, smallest y - x, largest x + y,
        largest y - x.

        Args:
            pts: array of points with (x, y) columns.

        Returns:
            A (4, 2) float32 array with the reordered points.
        """
        rect = np.zeros((4, 2), dtype="float32")
        s = pts.sum(axis=1)
        rect[0] = pts[np.argmin(s)]
        rect[2] = pts[np.argmax(s)]
        diff = np.diff(pts, axis=1)
        rect[1] = pts[np.argmin(diff)]
        rect[3] = pts[np.argmax(diff)]
        return rect
|
||||
|
||||
|
||||
class BaseRecLabelEncode(object):
    """ Convert between text-label and text-index """

    def __init__(self,
                 max_text_length,
                 character_dict_path=None,
                 character_type='ch',
                 use_space_char=False):
        """
        Args:
            max_text_length: labels longer than this are rejected by
                ``encode``.
            character_dict_path: dictionary file, required when
                ``character_type`` is ``'ch'``; the content of all its lines
                is concatenated into the character set.
            character_type: one of ``'ch'``, ``'en'``, ``'en_sensitive'``.
            use_space_char: append a space to the character set (only used
                for ``'ch'``).
        """
        support_character_type = ['ch', 'en', 'en_sensitive']
        # Report the offending argument: ``self.character_str`` does not
        # exist yet at this point, so using it here raised AttributeError
        # instead of the intended AssertionError.
        assert character_type in support_character_type, "Only {} are supported now but get {}".format(
            support_character_type, character_type)

        self.max_text_len = max_text_length
        if character_type == "en":
            self.character_str = "0123456789abcdefghijklmnopqrstuvwxyz"
            dict_character = list(self.character_str)
        elif character_type == "ch":
            self.character_str = ""
            assert character_dict_path is not None, "character_dict_path should not be None when character_type is ch"
            with open(character_dict_path, "rb") as fin:
                entries = [
                    raw.decode('utf-8').strip("\n").strip("\r\n")
                    for raw in fin.readlines()
                ]
            self.character_str = "".join(entries)
            if use_space_char:
                self.character_str += " "
            dict_character = list(self.character_str)
        elif character_type == "en_sensitive":
            # same with ASTER setting (use 94 char).
            import string
            self.character_str = string.printable[:-6]
            dict_character = list(self.character_str)
        self.character_type = character_type
        # Subclasses prepend their special tokens here.
        dict_character = self.add_special_char(dict_character)
        self.dict = {}
        for i, char in enumerate(dict_character):
            self.dict[char] = i
        self.character = dict_character

    def add_special_char(self, dict_character):
        """Hook for subclasses; the base class adds no special character."""
        return dict_character

    def encode(self, text):
        """convert text-label into text-index.

        Characters missing from the dictionary are silently skipped; for the
        ``'en'`` character type the text is lower-cased first.

        input:
            text: the text label of one image.

        output:
            list of character indices, or None when the text is longer than
            ``max_text_len`` or no character of it is in the dictionary.
        """
        if len(text) > self.max_text_len:
            return None
        if self.character_type == "en":
            text = text.lower()
        text_list = [self.dict[char] for char in text if char in self.dict]
        if len(text_list) == 0:
            return None
        return text_list

    def get_ignored_tokens(self):
        return [0]  # for ctc blank
|
||||
|
||||
|
||||
class CTCLabelEncode(BaseRecLabelEncode):
    """ Convert between text-label and text-index """

    def __init__(self,
                 max_text_length,
                 character_dict_path=None,
                 character_type='ch',
                 use_space_char=False,
                 **kwargs):
        super(CTCLabelEncode, self).__init__(
            max_text_length, character_dict_path, character_type,
            use_space_char)

    def __call__(self, data):
        """
        Encode ``data['label']`` in place.

        On success ``data['length']`` holds the number of encoded characters
        and ``data['label']`` the indices padded with 0 up to
        ``max_text_len``. Returns None when the text cannot be encoded.
        """
        indices = self.encode(data['label'])
        if indices is None:
            return None
        n_chars = len(indices)
        data['length'] = np.array(n_chars)
        # Index 0 is the 'blank' token added by add_special_char.
        padding = [0] * (self.max_text_len - n_chars)
        data['label'] = np.array(indices + padding)
        return data

    def add_special_char(self, dict_character):
        """Put the 'blank' token in front of the dictionary."""
        return ['blank'] + dict_character
|
||||
|
||||
|
||||
class AttnLabelEncode(BaseRecLabelEncode):
    """ Convert between text-label and text-index """

    def __init__(self,
                 max_text_length,
                 character_dict_path=None,
                 character_type='ch',
                 use_space_char=False,
                 **kwargs):
        # The special tokens must exist before the base constructor runs:
        # it calls ``add_special_char``, which reads them. Assigning them
        # afterwards made every construction fail with AttributeError.
        self.beg_str = "sos"
        self.end_str = "eos"
        super(AttnLabelEncode,
              self).__init__(max_text_length, character_dict_path,
                             character_type, use_space_char)

    def add_special_char(self, dict_character):
        """Put the begin and end tokens in front of the dictionary."""
        dict_character = [self.beg_str, self.end_str] + dict_character
        return dict_character

    def __call__(self, text):
        """Return the index list for ``text`` (None when it cannot be
        encoded). Unlike ``CTCLabelEncode`` this takes the raw text, not a
        data dict."""
        text = self.encode(text)
        return text

    def get_ignored_tokens(self):
        """Return the indices of the begin and end tokens."""
        beg_idx = self.get_beg_end_flag_idx("beg")
        end_idx = self.get_beg_end_flag_idx("end")
        return [beg_idx, end_idx]

    def get_beg_end_flag_idx(self, beg_or_end):
        """
        Return the dictionary index of the begin (``"beg"``) or end
        (``"end"``) token as a 0-d array.

        Raises:
            AssertionError: for any other value of ``beg_or_end``.
        """
        if beg_or_end == "beg":
            idx = np.array(self.dict[self.beg_str])
        elif beg_or_end == "end":
            idx = np.array(self.dict[self.end_str])
        else:
            # Raised explicitly so the check also runs under ``python -O``.
            raise AssertionError("Unsupport type %s in get_beg_end_flag_idx"
                                 % beg_or_end)
        return idx
|
|
@ -0,0 +1,157 @@
|
|||
# -*- coding:utf-8 -*-
|
||||
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
from __future__ import unicode_literals
|
||||
|
||||
import numpy as np
|
||||
import cv2
|
||||
|
||||
np.seterr(divide='ignore', invalid='ignore')
|
||||
import pyclipper
|
||||
from shapely.geometry import Polygon
|
||||
import sys
|
||||
import warnings
|
||||
|
||||
warnings.simplefilter("ignore")
|
||||
|
||||
__all__ = ['MakeBorderMap']
|
||||
|
||||
|
||||
class MakeBorderMap(object):
    """Add ``threshold_map`` and ``threshold_mask`` to a detection sample."""

    def __init__(self,
                 shrink_ratio=0.4,
                 thresh_min=0.3,
                 thresh_max=0.7,
                 **kwargs):
        self.shrink_ratio = shrink_ratio
        self.thresh_min = thresh_min
        self.thresh_max = thresh_max

    def __call__(self, data: dict) -> dict:
        """
        Draw the border map of every non-ignored polygon of ``data['polys']``.

        ``data['threshold_map']`` is the border map rescaled from [0, 1] to
        [thresh_min, thresh_max]; ``data['threshold_mask']`` marks the
        dilated polygon areas. Both have the image's height and width.
        """
        img = data['image']
        text_polys = data['polys']
        ignore_tags = data['ignore_tags']

        map_shape = img.shape[:2]
        canvas = np.zeros(map_shape, dtype=np.float32)
        mask = np.zeros(map_shape, dtype=np.float32)

        for idx, poly in enumerate(text_polys):
            if ignore_tags[idx]:
                continue
            self.draw_border_map(poly, canvas, mask=mask)
        canvas = canvas * (self.thresh_max - self.thresh_min) + self.thresh_min

        data['threshold_map'] = canvas
        data['threshold_mask'] = mask
        return data

    def draw_border_map(self, polygon, canvas, mask):
        """
        Draw one polygon's border response into ``canvas`` and fill its
        dilated area in ``mask`` (both are updated in place).

        The polygon is dilated by ``area * (1 - shrink_ratio ** 2) / length``;
        inside the dilated bounding box each pixel gets
        ``1 - clip(distance to nearest edge / dilation, 0, 1)`` and the
        canvas keeps the element-wise maximum.
        """
        polygon = np.array(polygon)
        assert polygon.ndim == 2
        assert polygon.shape[1] == 2

        outline = Polygon(polygon)
        if outline.area <= 0:
            return
        distance = outline.area * (
            1 - np.power(self.shrink_ratio, 2)) / outline.length
        offsetter = pyclipper.PyclipperOffset()
        offsetter.AddPath([tuple(pt) for pt in polygon], pyclipper.JT_ROUND,
                          pyclipper.ET_CLOSEDPOLYGON)

        padded_polygon = np.array(offsetter.Execute(distance)[0])
        cv2.fillPoly(mask, [padded_polygon.astype(np.int32)], 1.0)

        # Bounding box of the dilated polygon.
        xmin = padded_polygon[:, 0].min()
        xmax = padded_polygon[:, 0].max()
        ymin = padded_polygon[:, 1].min()
        ymax = padded_polygon[:, 1].max()
        width = xmax - xmin + 1
        height = ymax - ymin + 1

        # Move the polygon into bounding-box coordinates.
        polygon[:, 0] = polygon[:, 0] - xmin
        polygon[:, 1] = polygon[:, 1] - ymin

        xs = np.broadcast_to(
            np.linspace(0, width - 1, num=width).reshape(1, width),
            (height, width))
        ys = np.broadcast_to(
            np.linspace(0, height - 1, num=height).reshape(height, 1),
            (height, width))

        n_edges = polygon.shape[0]
        distance_map = np.zeros((n_edges, height, width), dtype=np.float32)
        for start in range(n_edges):
            end = (start + 1) % n_edges
            edge_distance = self._distance(xs, ys, polygon[start],
                                           polygon[end])
            distance_map[start] = np.clip(edge_distance / distance, 0, 1)
        distance_map = distance_map.min(axis=0)

        # Clamp the bounding box to the canvas.
        max_col = canvas.shape[1] - 1
        max_row = canvas.shape[0] - 1
        xmin_valid = min(max(0, xmin), max_col)
        xmax_valid = min(max(0, xmax), max_col)
        ymin_valid = min(max(0, ymin), max_row)
        ymax_valid = min(max(0, ymax), max_row)

        rows = slice(ymin_valid, ymax_valid + 1)
        cols = slice(xmin_valid, xmax_valid + 1)
        box_rows = slice(ymin_valid - ymin, ymax_valid - ymax + height)
        box_cols = slice(xmin_valid - xmin, xmax_valid - xmax + width)
        canvas[rows, cols] = np.fmax(1 - distance_map[box_rows, box_cols],
                                     canvas[rows, cols])

    def _distance(self, xs, ys, point_1, point_2):
        '''
        compute the distance from point to a line
        ys: coordinates in the first axis
        xs: coordinates in the second axis
        point_1, point_2: (x, y), the end of the line
        '''
        # Unused values, kept so a grid with fewer than two axes still fails
        # here as it always did.
        height, width = xs.shape[:2]
        sq_to_p1 = np.square(xs - point_1[0]) + np.square(ys - point_1[1])
        sq_to_p2 = np.square(xs - point_2[0]) + np.square(ys - point_2[1])
        sq_segment = np.square(point_1[0] - point_2[0]) + np.square(
            point_1[1] - point_2[1])

        sq_product = sq_to_p1 * sq_to_p2
        cosin = (sq_segment - sq_to_p1 - sq_to_p2) / (2 * np.sqrt(sq_product))
        square_sin = np.nan_to_num(1 - np.square(cosin))
        result = np.sqrt(sq_product * square_sin / sq_segment)

        # Where cosin is negative use the distance to the nearer end point.
        negative = cosin < 0
        result[negative] = np.sqrt(np.fmin(sq_to_p1, sq_to_p2))[negative]
        # self.extend_line(point_1, point_2, result)
        return result

    def extend_line(self, point_1, point_2, result, shrink_ratio):
        """
        Draw, into ``result``, the extension of the segment beyond each of
        its end points (length ``(1 + shrink_ratio)`` times the segment) with
        value 4096.0, and return the two extended end points.
        """

        def _extended(origin, other):
            # Point reached by walking from ``origin`` away from ``other``.
            return (int(
                round(origin[0] + (origin[0] - other[0]) * (1 + shrink_ratio))),
                    int(
                        round(origin[1] + (origin[1] - other[1]) * (
                            1 + shrink_ratio))))

        ex_point_1 = _extended(point_1, point_2)
        cv2.line(
            result,
            tuple(ex_point_1),
            tuple(point_1),
            4096.0,
            1,
            lineType=cv2.LINE_AA,
            shift=0)
        ex_point_2 = _extended(point_2, point_1)
        cv2.line(
            result,
            tuple(ex_point_2),
            tuple(point_2),
            4096.0,
            1,
            lineType=cv2.LINE_AA,
            shift=0)
        return ex_point_1, ex_point_2
|
|
@ -0,0 +1,94 @@
|
|||
# -*- coding:utf-8 -*-
|
||||
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
from __future__ import unicode_literals
|
||||
|
||||
import numpy as np
|
||||
import cv2
|
||||
from shapely.geometry import Polygon
|
||||
import pyclipper
|
||||
|
||||
__all__ = ['MakeShrinkMap']
|
||||
|
||||
|
||||
class MakeShrinkMap(object):
    r'''
    Making binary mask from detection data with ICDAR format.
    Typically following the process of class `MakeICDARData`.
    '''

    def __init__(self, min_text_size=8, shrink_ratio=0.4, **kwargs):
        self.min_text_size = min_text_size
        self.shrink_ratio = shrink_ratio

    def __call__(self, data):
        '''
        Add ``shrink_map`` (shrunk text regions set to 1) and ``shrink_mask``
        (0 over ignored, too small or un-shrinkable polygons, 1 elsewhere)
        to ``data``. Both have the image's height and width.

        Entries of the ignore tags are set to True for every polygon that
        ends up masked out.
        '''
        image = data['image']
        text_polys = data['polys']
        ignore_tags = data['ignore_tags']

        h, w = image.shape[:2]
        text_polys, ignore_tags = self.validate_polygons(text_polys,
                                                         ignore_tags, h, w)
        gt = np.zeros((h, w), dtype=np.float32)
        mask = np.ones((h, w), dtype=np.float32)
        for i in range(len(text_polys)):
            polygon = text_polys[i]
            height = max(polygon[:, 1]) - min(polygon[:, 1])
            width = max(polygon[:, 0]) - min(polygon[:, 0])
            if ignore_tags[i] or min(height, width) < self.min_text_size:
                cv2.fillPoly(mask,
                             polygon.astype(np.int32)[np.newaxis, :, :], 0)
                ignore_tags[i] = True
                continue

            polygon_shape = Polygon(polygon)
            # Shrink offset derived from the polygon's area and perimeter.
            distance = polygon_shape.area * (
                1 - np.power(self.shrink_ratio, 2)) / polygon_shape.length
            subject = [tuple(l) for l in text_polys[i]]
            padding = pyclipper.PyclipperOffset()
            padding.AddPath(subject, pyclipper.JT_ROUND,
                            pyclipper.ET_CLOSEDPOLYGON)
            shrinked = padding.Execute(-distance)
            if shrinked == []:
                # Nothing is left after shrinking: treat as ignored.
                cv2.fillPoly(mask,
                             polygon.astype(np.int32)[np.newaxis, :, :], 0)
                ignore_tags[i] = True
                continue
            shrinked = np.array(shrinked[0]).reshape(-1, 2)
            cv2.fillPoly(gt, [shrinked.astype(np.int32)], 1)

        data['shrink_map'] = gt
        data['shrink_mask'] = mask
        return data

    def validate_polygons(self, polygons, ignore_tags, h, w):
        '''
        Clip polygons to the image, flag those whose absolute area is below 1
        as ignored and reverse the point order of polygons with positive
        signed area. ``polygons`` and ``ignore_tags`` are modified in place.

        polygons (numpy.array, required): of shape (num_instances, num_points, 2)
        '''
        if len(polygons) == 0:
            return polygons, ignore_tags
        assert len(polygons) == len(ignore_tags)
        for polygon in polygons:
            polygon[:, 0] = np.clip(polygon[:, 0], 0, w - 1)
            polygon[:, 1] = np.clip(polygon[:, 1], 0, h - 1)

        for i in range(len(polygons)):
            area = self.polygon_area(polygons[i])
            if abs(area) < 1:
                ignore_tags[i] = True
            if area > 0:
                polygons[i] = polygons[i][::-1, :]
        return polygons, ignore_tags

    def polygon_area(self, polygon):
        '''
        Signed polygon area by the trapezoid (shoelace) formula.

        Each edge contributes ``(x_next - x) * (y_next + y) / 2``. The
        previous ``(x_next - x) * (y_next - y)`` term is zero for every
        horizontal or vertical edge, so axis-aligned boxes got area 0 and
        were wrongly marked as ignored by ``validate_polygons``.
        '''
        edge = 0
        for i in range(polygon.shape[0]):
            next_index = (i + 1) % polygon.shape[0]
            edge += (polygon[next_index, 0] - polygon[i, 0]) * (
                polygon[next_index, 1] + polygon[i, 1])

        return edge / 2.
|
|
@ -0,0 +1,185 @@
|
|||
"""
|
||||
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
"""
|
||||
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
from __future__ import unicode_literals
|
||||
|
||||
import sys
|
||||
import six
|
||||
import cv2
|
||||
import numpy as np
|
||||
|
||||
|
||||
class DecodeImage(object):
    """ decode image """

    def __init__(self, img_mode='RGB', channel_first=False, **kwargs):
        """
        Args:
            img_mode: ``'RGB'`` reverses the channel axis of the decoded
                image; ``'GRAY'`` applies a GRAY2BGR conversion.
            channel_first: transpose the result to (c, h, w).
        """
        self.img_mode = img_mode
        self.channel_first = channel_first

    def __call__(self, data):
        """
        Decode the encoded image bytes in ``data['image']`` into an array.

        Returns:
            ``data`` with ``data['image']`` replaced by the decoded array, or
            None when the bytes cannot be decoded.
        """
        img = data['image']
        if six.PY2:
            assert type(img) is str and len(
                img) > 0, "invalid input 'img' in DecodeImage"
        else:
            assert type(img) is bytes and len(
                img) > 0, "invalid input 'img' in DecodeImage"
        img = np.frombuffer(img, dtype='uint8')
        img = cv2.imdecode(img, 1)
        if img is None:
            # Corrupt or unsupported image data: signal "drop this sample"
            # the same way other operators do, instead of crashing below.
            return None
        if self.img_mode == 'GRAY':
            # NOTE(review): the image was decoded with flag 1 (colour), so
            # a GRAY2BGR conversion looks suspicious here - confirm intent.
            img = cv2.cvtColor(img, cv2.COLOR_GRAY2BGR)
        elif self.img_mode == 'RGB':
            # The shape is wrapped in a 1-tuple so the message itself can be
            # formatted (a bare 3-tuple does not fit a single %s).
            assert img.shape[2] == 3, 'invalid shape of image[%s]' % (
                img.shape, )
            img = img[:, :, ::-1]

        if self.channel_first:
            img = img.transpose((2, 0, 1))

        data['image'] = img
        return data
|
||||
|
||||
|
||||
class NormalizeImage(object):
    """ normalize image such as subtract mean, divide std
    """

    def __init__(self, scale=None, mean=None, std=None, order='chw', **kwargs):
        """
        Args:
            scale: multiplier applied before normalisation; a string is
                evaluated as an expression. Defaults to 1 / 255.
            mean: per-channel mean, default [0.485, 0.456, 0.406].
            std: per-channel std, default [0.229, 0.224, 0.225].
            order: ``'chw'`` stores the statistics with shape (3, 1, 1), any
                other value with shape (1, 1, 3).
        """
        if isinstance(scale, str):
            # NOTE(review): eval of a config string - trusted configs only.
            scale = eval(scale)
        if scale is None:
            scale = 1.0 / 255.0
        self.scale = np.float32(scale)

        if mean is None:
            mean = [0.485, 0.456, 0.406]
        if std is None:
            std = [0.229, 0.224, 0.225]

        stats_shape = (3, 1, 1) if order == 'chw' else (1, 1, 3)
        self.mean = np.array(mean).reshape(stats_shape).astype('float32')
        self.std = np.array(std).reshape(stats_shape).astype('float32')

    def __call__(self, data):
        """Replace ``data['image']`` by ``(image * scale - mean) / std``
        computed in float32; PIL images are converted to arrays first."""
        img = data['image']
        from PIL import Image
        if isinstance(img, Image.Image):
            img = np.array(img)

        assert isinstance(img,
                          np.ndarray), "invalid input 'img' in NormalizeImage"
        scaled = img.astype('float32') * self.scale
        data['image'] = (scaled - self.mean) / self.std
        return data
|
||||
|
||||
|
||||
class ToCHWImage(object):
    """ convert hwc image to chw image
    """

    def __init__(self, **kwargs):
        pass

    def __call__(self, data):
        """Transpose ``data['image']`` with axes (2, 0, 1); a PIL image is
        converted to an array first. Returns the same ``data`` dict."""
        from PIL import Image
        image = data['image']
        if isinstance(image, Image.Image):
            image = np.array(image)
        data['image'] = image.transpose((2, 0, 1))
        return data
|
||||
|
||||
|
||||
class keepKeys(object):
    """Turn a data dict into a list holding only the selected values."""

    def __init__(self, keep_keys, **kwargs):
        """
        Args:
            keep_keys: keys to extract, in output order.
        """
        self.keep_keys = keep_keys

    def __call__(self, data):
        """Return ``[data[key] for each key in keep_keys]``."""
        return [data[key] for key in self.keep_keys]
|
||||
|
||||
|
||||
class DetResizeForTest(object):
    """Resize a detection image for evaluation/inference.

    Keyword Args:
        image_shape: ``(height, width)``; when given, the image is resized to
            exactly this shape (``resize_type == 1``).
        limit_side_len: side-length limit used when ``image_shape`` is not
            given (default 736).
        limit_type: ``'max'`` limits the longer side from above; any other
            value (default ``'min'``) limits the shorter side from below.
    """

    def __init__(self, **kwargs):
        super(DetResizeForTest, self).__init__()
        self.resize_type = 0
        if 'image_shape' in kwargs:
            self.image_shape = kwargs['image_shape']
            self.resize_type = 1
        if 'limit_side_len' in kwargs:
            self.limit_side_len = kwargs['limit_side_len']
            self.limit_type = kwargs.get('limit_type', 'min')
        else:
            self.limit_side_len = 736
            self.limit_type = 'min'

    def __call__(self, data):
        """Resize ``data['image']`` and record the original size in ``data['shape']``."""
        img = data['image']

        if self.resize_type == 0:
            img, shape = self.resize_image_type0(img)
        else:
            img, shape = self.resize_image_type1(img)
        data['image'] = img
        data['shape'] = shape
        return data

    def resize_image_type1(self, img):
        """Resize to the fixed ``self.image_shape``.

        Returns:
            tuple: ``(resized_img, np.array([ori_h, ori_w]))``.
        """
        resize_h, resize_w = self.image_shape
        ori_h, ori_w = img.shape[:2]  # (h, w, c)
        img = cv2.resize(img, (int(resize_w), int(resize_h)))
        return img, np.array([ori_h, ori_w])

    def resize_image_type0(self, img):
        """Resize so both sides are multiples of 32, honouring the side limit.

        Args:
            img(array): array with shape [h, w, c]

        Returns:
            tuple: ``(resized_img, np.array([h, w]))`` where ``h, w`` are the
            ORIGINAL height and width.
        """
        limit_side_len = self.limit_side_len
        h, w, _ = img.shape

        if self.limit_type == 'max':
            # Shrink only when the longer side exceeds the limit.
            if max(h, w) > limit_side_len:
                ratio = float(limit_side_len) / max(h, w)
            else:
                ratio = 1.
        else:
            # Enlarge only when the shorter side is below the limit.
            if min(h, w) < limit_side_len:
                ratio = float(limit_side_len) / min(h, w)
            else:
                ratio = 1.
        resize_h = int(h * ratio)
        resize_w = int(w * ratio)

        # Round to the nearest multiple of 32, but never below 32: a side
        # shorter than 16 px would otherwise round to 0 and produce no image.
        resize_h = max(int(round(resize_h / 32) * 32), 32)
        resize_w = max(int(round(resize_w / 32) * 32), 32)

        try:
            img = cv2.resize(img, (int(resize_w), int(resize_h)))
        except Exception:
            # Report the offending sizes, then let the real error propagate
            # instead of exiting the process with a "success" status.
            print(img.shape, resize_w, resize_h)
            raise
        return img, np.array([h, w])
|
|
@ -0,0 +1,210 @@
|
|||
# -*- coding:utf-8 -*-
|
||||
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
from __future__ import unicode_literals
|
||||
|
||||
import numpy as np
|
||||
import cv2
|
||||
import random
|
||||
|
||||
|
||||
def is_poly_in_rect(poly, x, y, w, h):
    """Return True when every vertex of ``poly`` lies within the rectangle.

    The rectangle spans ``[x, x + w]`` horizontally and ``[y, y + h]``
    vertically (bounds inclusive). ``poly`` is a sequence of ``(x, y)`` points.
    """
    pts = np.array(poly)
    return not (pts[:, 0].min() < x or pts[:, 0].max() > x + w or
                pts[:, 1].min() < y or pts[:, 1].max() > y + h)
|
||||
|
||||
|
||||
def is_poly_outside_rect(poly, x, y, w, h):
    """Return True when the bounding box of ``poly`` misses the rectangle.

    The polygon counts as outside when its x-extent lies entirely left or
    right of ``[x, x + w]`` or its y-extent entirely above or below
    ``[y, y + h]``; merely touching the border is not "outside".
    """
    pts = np.array(poly)
    return bool(pts[:, 0].max() < x or pts[:, 0].min() > x + w or
                pts[:, 1].max() < y or pts[:, 1].min() > y + h)
|
||||
|
||||
|
||||
def split_regions(axis):
    """Split a sorted index array into runs of consecutive values.

    Args:
        axis: 1-D integer ``np.ndarray`` (e.g. the free rows/columns returned
            by ``np.where``).

    Returns:
        list of ``np.ndarray``: one array per maximal run where each value is
        the previous one plus 1. The final run is included (previously it was
        silently dropped, so the last free region could never be selected).
        An empty ``axis`` yields ``[]``.
    """
    if axis.shape[0] == 0:
        return []
    # A new run starts wherever the step between neighbours is not exactly 1.
    run_starts = np.where(np.diff(axis) != 1)[0] + 1
    return np.split(axis, run_starts)
|
||||
|
||||
|
||||
def random_select(axis, max_size):
    """Draw two values from ``axis`` (with replacement) as a sorted pair.

    Both values are clipped to ``[0, max_size - 1]``.

    Returns:
        tuple: ``(low, high)`` with ``low <= high``.
    """
    picks = np.clip(np.random.choice(axis, size=2), 0, max_size - 1)
    return np.min(picks), np.max(picks)
|
||||
|
||||
|
||||
def region_wise_random_select(regions, max_size):
    """Pick one value from each of two randomly chosen regions.

    Args:
        regions: list of 1-D index arrays (see ``split_regions``).
        max_size: unused; kept for signature compatibility with
            ``random_select``.

    Returns:
        tuple: ``(low, high)`` Python ints with ``low <= high``. The two
        regions are drawn with replacement, so both values may come from the
        same region.
    """
    picked = []
    for region_idx in np.random.choice(len(regions), 2):
        # Draw a scalar directly: int() on a 1-element array (the old
        # ``size=1`` form) is deprecated in recent NumPy releases.
        picked.append(int(np.random.choice(regions[region_idx])))
    return min(picked), max(picked)
|
||||
|
||||
|
||||
def crop_area(im, text_polys, min_crop_side_ratio, max_tries):
    """Randomly choose a crop rectangle whose borders do not cut through text.

    Args:
        im: image array of shape ``(h, w, c)``.
        text_polys: iterable of ``(N, 2)`` point arrays.
        min_crop_side_ratio: minimum crop width/height as a fraction of the
            image width/height.
        max_tries: number of random attempts before giving up.

    Returns:
        tuple: ``(x, y, crop_w, crop_h)``; the whole image ``(0, 0, w, h)``
        when no free row/column exists or no attempt succeeds.
    """
    h, w, _ = im.shape
    # 1 marks rows/columns covered by the bounding box of some polygon.
    rows_taken = np.zeros(h, dtype=np.int32)
    cols_taken = np.zeros(w, dtype=np.int32)
    for points in text_polys:
        points = np.round(points, decimals=0).astype(np.int32)
        xs, ys = points[:, 0], points[:, 1]
        cols_taken[np.min(xs):np.max(xs)] = 1
        rows_taken[np.min(ys):np.max(ys)] = 1

    # Crop borders may only be placed on rows/columns free of text.
    h_axis = np.where(rows_taken == 0)[0]
    w_axis = np.where(cols_taken == 0)[0]
    if len(h_axis) == 0 or len(w_axis) == 0:
        return 0, 0, w, h

    h_regions = split_regions(h_axis)
    w_regions = split_regions(w_axis)

    for _ in range(max_tries):
        if len(w_regions) > 1:
            xmin, xmax = region_wise_random_select(w_regions, w)
        else:
            xmin, xmax = random_select(w_axis, w)
        if len(h_regions) > 1:
            ymin, ymax = region_wise_random_select(h_regions, h)
        else:
            ymin, ymax = random_select(h_axis, h)

        crop_w, crop_h = xmax - xmin, ymax - ymin
        if crop_w < min_crop_side_ratio * w or crop_h < min_crop_side_ratio * h:
            # area too small
            continue

        # Accept the first candidate that still overlaps at least one polygon.
        if any(not is_poly_outside_rect(poly, xmin, ymin, crop_w, crop_h)
               for poly in text_polys):
            return xmin, ymin, crop_w, crop_h

    return 0, 0, w, h
|
||||
|
||||
|
||||
class EastRandomCropData(object):
    """Random crop that avoids cutting text, followed by a resize to ``size``.

    Args:
        size: target ``(width, height)`` of the output image.
        max_tries: number of random crop attempts (passed to ``crop_area``).
        min_crop_side_ratio: minimum crop side as a fraction of the image side.
        keep_ratio: when True the crop is scaled uniformly and zero-padded at
            the right/bottom; otherwise it is stretched to ``size``.
        **kwargs: ignored.
    """

    def __init__(self,
                 size=(640, 640),
                 max_tries=10,
                 min_crop_side_ratio=0.1,
                 keep_ratio=True,
                 **kwargs):
        self.size = size
        self.max_tries = max_tries
        self.min_crop_side_ratio = min_crop_side_ratio
        self.keep_ratio = keep_ratio

    def __call__(self, data):
        """Crop ``image``/``polys``/``ignore_tags``/``texts`` in ``data``; return ``data``."""
        img = data['image']
        text_polys = data['polys']
        ignore_tags = data['ignore_tags']
        texts = data['texts']
        all_care_polys = [
            text_polys[i] for i, tag in enumerate(ignore_tags) if not tag
        ]
        # Compute the crop region from the polygons that are not ignored.
        crop_x, crop_y, crop_w, crop_h = crop_area(
            img, all_care_polys, self.min_crop_side_ratio, self.max_tries)
        # Uniform scale that fits the crop inside the target size.
        scale = min(self.size[0] / crop_w, self.size[1] / crop_h)
        h = int(crop_h * scale)
        w = int(crop_w * scale)
        if self.keep_ratio:
            # Crop the image, keep the aspect ratio and zero-pad the rest.
            padimg = np.zeros((self.size[1], self.size[0], img.shape[2]),
                              img.dtype)
            padimg[:h, :w] = cv2.resize(
                img[crop_y:crop_y + crop_h, crop_x:crop_x + crop_w], (w, h))
            img = padimg
        else:
            img = cv2.resize(
                img[crop_y:crop_y + crop_h, crop_x:crop_x + crop_w],
                tuple(self.size))
        # Move the text boxes into crop coordinates and drop those that fall
        # completely outside the scaled crop.
        text_polys_crop, ignore_tags_crop, texts_crop = [], [], []
        for poly, text, tag in zip(text_polys, texts, ignore_tags):
            poly = ((poly - (crop_x, crop_y)) * scale).tolist()
            if is_poly_outside_rect(poly, 0, 0, w, h):
                continue
            text_polys_crop.append(poly)
            ignore_tags_crop.append(tag)
            texts_crop.append(text)
        data['image'] = img
        data['polys'] = np.array(text_polys_crop)
        data['ignore_tags'] = ignore_tags_crop
        data['texts'] = texts_crop
        return data
|
||||
|
||||
|
||||
class PSERandomCrop(object):
    """Randomly crop every array in ``data['imgs']`` to the same window.

    Args:
        size: target ``(height, width)`` of the crop.
        **kwargs: ignored.
    """

    def __init__(self, size, **kwargs):
        self.size = size

    def __call__(self, data):
        """Crop all of ``data['imgs']`` to ``self.size`` and return ``data``.

        ``imgs[0]`` provides the reference height/width; ``imgs[2]`` decides
        whether a text-aware crop is attempted and ``imgs[1]`` must contain
        text inside the chosen window for that crop to be accepted.
        """
        imgs = data['imgs']

        h, w = imgs[0].shape[0:2]
        th, tw = self.size
        if w == tw and h == th:
            # Already the target size. Return the sample dict like every other
            # path does (the bare ``imgs`` list used to be returned here).
            return data

        # Text instances exist in the label: crop around them with
        # probability 5/8, otherwise fall back to a uniform random crop.
        if np.max(imgs[2]) > 0 and random.random() > 3 / 8:
            # Top-left corner of the text instances, shifted by the crop size.
            tl = np.min(np.where(imgs[2] > 0), axis=1) - self.size
            tl[tl < 0] = 0
            # Bottom-right corner of the text instances, shifted likewise.
            br = np.max(np.where(imgs[2] > 0), axis=1) - self.size
            br[br < 0] = 0
            # Leave enough room to crop when the bottom-right point is chosen.
            br[0] = min(br[0], h - th)
            br[1] = min(br[1], w - tw)

            for _ in range(50000):
                i = random.randint(tl[0], br[0])
                j = random.randint(tl[1], br[1])
                # Accept the window as soon as imgs[1] has text inside it.
                if imgs[1][i:i + th, j:j + tw].sum() > 0:
                    break
        else:
            i = random.randint(0, h - th)
            j = random.randint(0, w - tw)

        # Apply the same (i, j, th, tw) window to every array.
        for idx in range(len(imgs)):
            if len(imgs[idx].shape) == 3:
                imgs[idx] = imgs[idx][i:i + th, j:j + tw, :]
            else:
                imgs[idx] = imgs[idx][i:i + th, j:j + tw]
        data['imgs'] = imgs
        return data
|
|
@ -1,31 +1,70 @@
|
|||
#copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
|
||||
# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
|
||||
#
|
||||
#Licensed under the Apache License, Version 2.0 (the "License");
|
||||
#you may not use this file except in compliance with the License.
|
||||
#You may obtain a copy of the License at
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
#Unless required by applicable law or agreed to in writing, software
|
||||
#distributed under the License is distributed on an "AS IS" BASIS,
|
||||
#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
#See the License for the specific language governing permissions and
|
||||
#limitations under the License.
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import math
|
||||
import cv2
|
||||
import numpy as np
|
||||
import random
|
||||
from ppocr.utils.utility import initial_logger
|
||||
logger = initial_logger()
|
||||
|
||||
from .text_image_aug import tia_perspective, tia_stretch, tia_distort
|
||||
|
||||
|
||||
def get_bounding_box_rect(pos):
    """Return ``[left, top, right, bottom]`` of a point set.

    ``pos[0]`` holds the x coordinates and ``pos[1]`` the y coordinates.
    """
    xs, ys = pos[0], pos[1]
    return [min(xs), min(ys), max(xs), max(ys)]
|
||||
class RecAug(object):
    """Recognition augmentation: run ``warp`` on ``data['image']``."""

    def __init__(self, **kwargs):
        # No configuration; keyword arguments are accepted and ignored.
        pass

    def __call__(self, data):
        """Replace ``data['image']`` with ``warp(image, 10)`` and return ``data``."""
        data['image'] = warp(data['image'], 10)
        return data
|
||||
|
||||
|
||||
class RecResizeImg(object):
    """Resize and normalize a recognition image to ``image_shape``.

    Args:
        image_shape: target shape handed to the resize helpers.
        infer_mode: True when running inference.
        character_type: character set name; ``'ch'`` enables the Chinese
            resize path in inference mode.
        use_tps: when True the Chinese resize path is never used.
        **kwargs: ignored.
    """

    def __init__(self,
                 image_shape,
                 infer_mode=False,
                 character_type='ch',
                 use_tps=False,
                 **kwargs):
        self.image_shape = image_shape
        self.infer_mode = infer_mode
        self.character_type = character_type
        self.use_tps = use_tps

    def __call__(self, data):
        """Store the resized/normalized image in ``data['image']``; return ``data``."""
        img = data['image']
        chinese_infer = (self.infer_mode and self.character_type == "ch" and
                         not self.use_tps)
        if chinese_infer:
            data['image'] = resize_norm_img_chinese(img, self.image_shape)
        else:
            data['image'] = resize_norm_img(img, self.image_shape)
        return data
|
||||
|
||||
|
||||
def resize_norm_img(img, image_shape):
|
||||
|
@ -77,19 +116,6 @@ def resize_norm_img_chinese(img, image_shape):
|
|||
return padding_im
|
||||
|
||||
|
||||
def get_img_data(value):
    """Decode an encoded image buffer into a colour image array.

    Args:
        value: bytes-like object holding an encoded image.

    Returns:
        The image decoded by ``cv2.imdecode(..., 1)``, or ``None`` when
        ``value`` is empty/None or the buffer cannot be decoded.
    """
    if not value:
        return None
    # np.frombuffer never returns None, so the former check on its result was
    # dead code; cv2.imdecode already yields None for undecodable data.
    imgdata = np.frombuffer(value, dtype='uint8')
    return cv2.imdecode(imgdata, 1)
|
||||
|
||||
|
||||
def flag():
|
||||
"""
|
||||
flag
|
||||
|
@ -196,6 +222,9 @@ class Config:
|
|||
self.h = h
|
||||
|
||||
self.perspective = True
|
||||
self.stretch = True
|
||||
self.distort = True
|
||||
|
||||
self.crop = True
|
||||
self.affine = False
|
||||
self.reverse = True
|
||||
|
@ -299,168 +328,39 @@ def warp(img, ang):
|
|||
config.make(w, h, ang)
|
||||
new_img = img
|
||||
|
||||
prob = 0.4
|
||||
|
||||
if config.distort:
|
||||
img_height, img_width = img.shape[0:2]
|
||||
if random.random() <= prob and img_height >= 20 and img_width >= 20:
|
||||
new_img = tia_distort(new_img, random.randint(3, 6))
|
||||
|
||||
if config.stretch:
|
||||
img_height, img_width = img.shape[0:2]
|
||||
if random.random() <= prob and img_height >= 20 and img_width >= 20:
|
||||
new_img = tia_stretch(new_img, random.randint(3, 6))
|
||||
|
||||
if config.perspective:
|
||||
tp = random.randint(1, 100)
|
||||
if tp >= 50:
|
||||
warpR, (r1, c1), ratio, dst = get_warpR(config)
|
||||
new_w = int(np.max(dst[:, 0])) - int(np.min(dst[:, 0]))
|
||||
new_img = cv2.warpPerspective(
|
||||
new_img,
|
||||
warpR, (int(new_w * ratio), h),
|
||||
borderMode=config.borderMode)
|
||||
if random.random() <= prob:
|
||||
new_img = tia_perspective(new_img)
|
||||
|
||||
if config.crop:
|
||||
img_height, img_width = img.shape[0:2]
|
||||
tp = random.randint(1, 100)
|
||||
if tp >= 50 and img_height >= 20 and img_width >= 20:
|
||||
if random.random() <= prob and img_height >= 20 and img_width >= 20:
|
||||
new_img = get_crop(new_img)
|
||||
if config.affine:
|
||||
warpT = get_warpAffine(config)
|
||||
new_img = cv2.warpAffine(
|
||||
new_img, warpT, (w, h), borderMode=config.borderMode)
|
||||
|
||||
if config.blur:
|
||||
tp = random.randint(1, 100)
|
||||
if tp >= 50:
|
||||
if random.random() <= prob:
|
||||
new_img = blur(new_img)
|
||||
if config.color:
|
||||
tp = random.randint(1, 100)
|
||||
if tp >= 50:
|
||||
if random.random() <= prob:
|
||||
new_img = cvtColor(new_img)
|
||||
if config.jitter:
|
||||
new_img = jitter(new_img)
|
||||
if config.noise:
|
||||
tp = random.randint(1, 100)
|
||||
if tp >= 50:
|
||||
if random.random() <= prob:
|
||||
new_img = add_gasuss_noise(new_img)
|
||||
if config.reverse:
|
||||
tp = random.randint(1, 100)
|
||||
if tp >= 50:
|
||||
if random.random() <= prob:
|
||||
new_img = 255 - new_img
|
||||
return new_img
|
||||
|
||||
|
||||
def process_image(img,
                  image_shape,
                  label=None,
                  char_ops=None,
                  loss_type=None,
                  max_text_length=None,
                  tps=None,
                  infer_mode=False,
                  distort=False):
    """Prepare one recognition sample (image and, optionally, encoded label).

    Returns:
        * ``norm_img`` alone when ``label`` is None;
        * ``None`` when the encoded label is empty or longer than
          ``max_text_length``;
        * ``(norm_img, text)`` for ``loss_type == "ctc"``;
        * ``(norm_img, beg_text, end_text)`` for ``loss_type == "attention"``.
    """
    if distort:
        img = warp(img, 10)

    chinese_infer = infer_mode and char_ops.character_type == "ch" and not tps
    if chinese_infer:
        norm_img = resize_norm_img_chinese(img, image_shape)
    else:
        norm_img = resize_norm_img(img, image_shape)
    # Add a leading batch axis.
    norm_img = norm_img[np.newaxis, :]

    if label is None:
        return (norm_img)

    # char_num = char_ops.get_char_num()
    text = char_ops.encode(label)
    if len(text) == 0 or len(text) > max_text_length:
        logger.info(
            "Warning in ppocr/data/rec/img_tools.py: Wrong data type."
            "Excepted string with length between 1 and {}, but "
            "got '{}'. Label is '{}'".format(max_text_length,
                                             len(text), label))
        return None

    if loss_type == "ctc":
        return (norm_img, text.reshape(-1, 1))

    if loss_type == "attention":
        beg_flag_idx = char_ops.get_beg_end_flag_idx("beg")
        end_flag_idx = char_ops.get_beg_end_flag_idx("end")
        # Decoder input starts with <beg>; decoder target ends with <end>.
        beg_text = np.append(beg_flag_idx, text).reshape(-1, 1)
        end_text = np.append(text, end_flag_idx).reshape(-1, 1)
        return (norm_img, beg_text, end_text)

    assert False, "Unsupport loss_type %s in process_image"\
        % loss_type
    return (norm_img)
|
||||
|
||||
def resize_norm_img_srn(img, image_shape):
    """Resize ``img`` for SRN and paste its grayscale version on a black canvas.

    The height becomes ``imgH``; the width becomes ``imgH``, ``2 * imgH`` or
    ``3 * imgH`` for width/height ratios up to 1, 2 or 3, and ``imgW``
    otherwise.

    Returns:
        ``float32`` array of shape ``(1, imgH, imgW)``.
    """
    imgC, imgH, imgW = image_shape
    canvas = np.zeros((imgH, imgW))
    src_h = img.shape[0]
    src_w = img.shape[1]

    # First aspect-ratio bucket (1x, 2x, 3x) that fits; otherwise full width.
    target_w = imgW
    for factor in (1, 2, 3):
        if src_w <= src_h * factor:
            target_w = imgH * factor
            break
    resized = cv2.resize(img, (target_w, imgH))

    gray = cv2.cvtColor(np.asarray(resized), cv2.COLOR_BGR2GRAY)
    canvas[:, 0:gray.shape[1]] = gray
    canvas = canvas[:, :, np.newaxis]

    row, col, _ = canvas.shape
    return np.reshape(canvas, (1, row, col)).astype(np.float32)
|
||||
|
||||
def srn_other_inputs(image_shape,
                     num_heads,
                     max_text_length,
                     char_num):
    """Build the auxiliary SRN inputs that do not depend on image content.

    Returns:
        list: ``[lbl_weight, encoder_word_pos, gsrm_word_pos,
        gsrm_slf_attn_bias1, gsrm_slf_attn_bias2]`` where

        * ``lbl_weight`` is ``(max_text_length, 1)`` int64 filled with
          ``char_num - 1``;
        * ``encoder_word_pos`` is ``(1, feature_dim, 1)`` int64 positions with
          ``feature_dim = int((imgH / 8) * (imgW / 8))``;
        * ``gsrm_word_pos`` is ``(1, max_text_length, 1)`` int64 positions;
        * the two biases are ``(1, num_heads, L, L)`` arrays holding ``-1e9``
          strictly above (bias1) / strictly below (bias2) the diagonal.
    """
    imgC, imgH, imgW = image_shape
    feature_dim = int((imgH / 8) * (imgW / 8))

    encoder_word_pos = np.arange(feature_dim, dtype='int64').reshape(
        (1, feature_dim, 1))
    gsrm_word_pos = np.arange(max_text_length, dtype='int64').reshape(
        (1, max_text_length, 1))

    lbl_weight = np.full((max_text_length, 1), int(char_num - 1), dtype='int64')

    ones = np.ones((1, max_text_length, max_text_length))
    bias_shape = [-1, 1, max_text_length, max_text_length]
    head_tiling = [1, num_heads, 1, 1]

    upper = np.triu(ones, 1).reshape(bias_shape)
    gsrm_slf_attn_bias1 = np.tile(upper, head_tiling) * [-1e9]

    lower = np.tril(ones, -1).reshape(bias_shape)
    gsrm_slf_attn_bias2 = np.tile(lower, head_tiling) * [-1e9]

    return [lbl_weight, encoder_word_pos, gsrm_word_pos, gsrm_slf_attn_bias1, gsrm_slf_attn_bias2]
|
||||
|
||||
def process_image_srn(img,
                      image_shape,
                      num_heads,
                      max_text_length,
                      label=None,
                      char_ops=None,
                      loss_type=None):
    """Prepare one SRN sample: normalized image plus the auxiliary inputs.

    Returns:
        * without ``label``: ``(norm_img, encoder_word_pos, gsrm_word_pos,
          gsrm_slf_attn_bias1, gsrm_slf_attn_bias2)``;
        * ``None`` when the encoded label is empty or longer than
          ``max_text_length``;
        * for ``loss_type == "srn"``: ``(norm_img, text, encoder_word_pos,
          gsrm_word_pos, gsrm_slf_attn_bias1, gsrm_slf_attn_bias2,
          lbl_weight)``.
    """
    # Add a leading batch axis.
    norm_img = resize_norm_img_srn(img, image_shape)[np.newaxis, :]
    char_num = char_ops.get_char_num()

    (lbl_weight, encoder_word_pos, gsrm_word_pos, gsrm_slf_attn_bias1,
     gsrm_slf_attn_bias2) = srn_other_inputs(image_shape, num_heads,
                                             max_text_length, char_num)

    if label is not None:
        text = char_ops.encode(label)
        if len(text) == 0 or len(text) > max_text_length:
            return None

        if loss_type == "srn":
            # Pad the label with the last class index and set the weight of
            # every real character position to 1.
            text_padded = [int(char_num - 1)] * max_text_length
            for pos, char_idx in enumerate(text):
                text_padded[pos] = char_idx
                lbl_weight[pos] = [1.0]
            text = np.array(text_padded).reshape(-1, 1)
            return (norm_img, text, encoder_word_pos, gsrm_word_pos, gsrm_slf_attn_bias1, gsrm_slf_attn_bias2, lbl_weight)

        assert False, "Unsupport loss_type %s in process_image"\
            % loss_type

    return (norm_img, encoder_word_pos, gsrm_word_pos, gsrm_slf_attn_bias1, gsrm_slf_attn_bias2)
|
8
ppocr/data/rec/__init__.py → ppocr/data/imaug/text_image_aug/__init__.py
Executable file → Normal file
8
ppocr/data/rec/__init__.py → ppocr/data/imaug/text_image_aug/__init__.py
Executable file → Normal file
|
@ -1,13 +1,17 @@
|
|||
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
|
||||
# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from .augment import tia_perspective, tia_distort, tia_stretch
|
||||
|
||||
__all__ = ['tia_distort', 'tia_stretch', 'tia_perspective']
|
|
@ -0,0 +1,116 @@
|
|||
# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import numpy as np
|
||||
from .warp_mls import WarpMLS
|
||||
|
||||
|
||||
def tia_distort(src, segment=4):
    """Randomly distort ``src`` by jittering control points, warped via WarpMLS.

    The width is split into ``segment`` vertical strips. The four corners are
    pulled inwards and the top/bottom point of every interior strip boundary
    is moved in both x and y by random amounts bounded by
    ``thresh = (img_w // segment) // 3``.
    """
    img_h, img_w = src.shape[:2]

    cut = img_w // segment
    thresh = cut // 3
    half_thresh = thresh * 0.5
    rand = np.random.randint

    src_pts = [[0, 0], [img_w, 0], [img_w, img_h], [0, img_h]]
    # Corners move inwards by [0, thresh) in each coordinate.
    dst_pts = [
        [rand(thresh), rand(thresh)],
        [img_w - rand(thresh), rand(thresh)],
        [img_w - rand(thresh), img_h - rand(thresh)],
        [rand(thresh), img_h - rand(thresh)],
    ]

    for cut_idx in np.arange(1, segment, 1):
        x = cut * cut_idx
        src_pts.append([x, 0])
        src_pts.append([x, img_h])
        # Interior points are jittered symmetrically around their position.
        dst_pts.append(
            [x + rand(thresh) - half_thresh, rand(thresh) - half_thresh])
        dst_pts.append(
            [x + rand(thresh) - half_thresh,
             img_h + rand(thresh) - half_thresh])

    return WarpMLS(src, src_pts, dst_pts, img_w, img_h).generate()
|
||||
|
||||
|
||||
def tia_stretch(src, segment=4):
    """Randomly stretch ``src`` horizontally, warped via WarpMLS.

    The corners stay fixed; every interior strip boundary (top and bottom
    point together) is shifted along x by one random amount bounded by
    ``thresh = (img_w // segment) * 4 // 5``.
    """
    img_h, img_w = src.shape[:2]

    cut = img_w // segment
    thresh = cut * 4 // 5
    half_thresh = thresh * 0.5

    corners = [[0, 0], [img_w, 0], [img_w, img_h], [0, img_h]]
    src_pts = [list(pt) for pt in corners]
    dst_pts = [list(pt) for pt in corners]

    for cut_idx in np.arange(1, segment, 1):
        move = np.random.randint(thresh) - half_thresh
        x = cut * cut_idx
        src_pts.append([x, 0])
        src_pts.append([x, img_h])
        dst_pts.append([x + move, 0])
        dst_pts.append([x + move, img_h])

    return WarpMLS(src, src_pts, dst_pts, img_w, img_h).generate()
|
||||
|
||||
|
||||
def tia_perspective(src):
    """Apply a random perspective-like warp to ``src`` via WarpMLS.

    Each corner keeps its x coordinate; the top corners move down and the
    bottom corners move up by random amounts in ``[0, img_h // 2)``.
    """
    img_h, img_w = src.shape[:2]

    thresh = img_h // 2
    rand = np.random.randint

    src_pts = [[0, 0], [img_w, 0], [img_w, img_h], [0, img_h]]
    dst_pts = [
        [0, rand(thresh)],
        [img_w, rand(thresh)],
        [img_w, img_h - rand(thresh)],
        [0, img_h - rand(thresh)],
    ]

    return WarpMLS(src, src_pts, dst_pts, img_w, img_h).generate()
|
|
@ -0,0 +1,164 @@
|
|||
# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import numpy as np
|
||||
|
||||
|
||||
class WarpMLS:
    """Image warp driven by control points (moving-least-squares style).

    Displacements are computed on a coarse grid (``grid_size`` pixels apart)
    in ``calc_delta`` and bilinearly interpolated for every pixel in
    ``gen_img``.

    Args:
        src: source image, ``(h, w)`` or ``(h, w, c)``.
        src_pts: control points in the source image, ``[x, y]`` each.
        dst_pts: matching control points in the destination image.
        dst_w: destination width.
        dst_h: destination height.
        trans_ratio: factor applied to the interpolated displacements.
    """

    def __init__(self, src, src_pts, dst_pts, dst_w, dst_h, trans_ratio=1.):
        self.src = src
        self.src_pts = src_pts
        self.dst_pts = dst_pts
        self.pt_count = len(self.dst_pts)
        self.dst_w = dst_w
        self.dst_h = dst_h
        self.trans_ratio = trans_ratio
        self.grid_size = 100
        # Per-pixel x / y displacement; only grid nodes are filled in.
        self.rdx = np.zeros((self.dst_h, self.dst_w))
        self.rdy = np.zeros((self.dst_h, self.dst_w))

    @staticmethod
    def __bilinear_interp(x, y, v11, v12, v21, v22):
        """Bilinear blend of four values; ``x`` weights v2*, ``y`` weights v*2."""
        return (v11 * (1 - y) + v12 * y) * (1 - x) + (v21 *
                                                      (1 - y) + v22 * y) * x

    def generate(self):
        """Compute the grid displacements and return the warped image."""
        self.calc_delta()
        return self.gen_img()

    def calc_delta(self):
        """Fill ``rdx`` / ``rdy`` at every grid node of the destination image."""
        w = np.zeros(self.pt_count, dtype=np.float32)

        if self.pt_count < 2:
            return

        i = 0
        while 1:
            # Snap the first overshooting column to the last valid column.
            if self.dst_w <= i < self.dst_w + self.grid_size - 1:
                i = self.dst_w - 1
            elif i >= self.dst_w:
                break

            j = 0
            while 1:
                # Same snapping for rows.
                if self.dst_h <= j < self.dst_h + self.grid_size - 1:
                    j = self.dst_h - 1
                elif j >= self.dst_h:
                    break

                sw = 0
                swp = np.zeros(2, dtype=np.float32)
                swq = np.zeros(2, dtype=np.float32)
                new_pt = np.zeros(2, dtype=np.float32)
                cur_pt = np.array([i, j], dtype=np.float32)

                # Index of the control point that coincides with this grid
                # node, if any. Tracked explicitly: inferring it from the
                # final loop index misclassified a hit on the LAST control
                # point as "no match".
                matched = None
                for k in range(self.pt_count):
                    if i == self.dst_pts[k][0] and j == self.dst_pts[k][1]:
                        matched = k
                        break

                    # Inverse squared distance weight.
                    w[k] = 1. / (
                        (i - self.dst_pts[k][0]) * (i - self.dst_pts[k][0]) +
                        (j - self.dst_pts[k][1]) * (j - self.dst_pts[k][1]))

                    sw += w[k]
                    swp = swp + w[k] * np.array(self.dst_pts[k])
                    swq = swq + w[k] * np.array(self.src_pts[k])

                if matched is None:
                    # Weighted centroids of destination / source points.
                    pstar = 1 / sw * swp
                    qstar = 1 / sw * swq

                    miu_s = 0
                    for k in range(self.pt_count):
                        pt_i = self.dst_pts[k] - pstar
                        miu_s += w[k] * np.sum(pt_i * pt_i)

                    cur_pt -= pstar
                    cur_pt_j = np.array([-cur_pt[1], cur_pt[0]])

                    for k in range(self.pt_count):
                        pt_i = self.dst_pts[k] - pstar
                        pt_j = np.array([-pt_i[1], pt_i[0]])

                        tmp_pt = np.zeros(2, dtype=np.float32)
                        tmp_pt[0] = np.sum(pt_i * cur_pt) * self.src_pts[k][0] - \
                            np.sum(pt_j * cur_pt) * self.src_pts[k][1]
                        tmp_pt[1] = -np.sum(pt_i * cur_pt_j) * self.src_pts[k][0] + \
                            np.sum(pt_j * cur_pt_j) * self.src_pts[k][1]
                        tmp_pt *= (w[k] / miu_s)
                        new_pt += tmp_pt

                    new_pt += qstar
                else:
                    # A control point maps exactly onto its source point.
                    new_pt = self.src_pts[matched]

                self.rdx[j, i] = new_pt[0] - i
                self.rdy[j, i] = new_pt[1] - j

                j += self.grid_size
            i += self.grid_size

    def gen_img(self):
        """Sample the source image at the displaced positions (uint8 output)."""
        src_h, src_w = self.src.shape[:2]
        dst = np.zeros_like(self.src, dtype=np.float32)

        for i in np.arange(0, self.dst_h, self.grid_size):
            for j in np.arange(0, self.dst_w, self.grid_size):
                ni = i + self.grid_size
                nj = j + self.grid_size
                w = h = self.grid_size
                # The last cell in each direction is clipped to the image.
                if ni >= self.dst_h:
                    ni = self.dst_h - 1
                    h = ni - i + 1
                if nj >= self.dst_w:
                    nj = self.dst_w - 1
                    w = nj - j + 1

                di = np.reshape(np.arange(h), (-1, 1))
                dj = np.reshape(np.arange(w), (1, -1))
                delta_x = self.__bilinear_interp(
                    di / h, dj / w, self.rdx[i, j], self.rdx[i, nj],
                    self.rdx[ni, j], self.rdx[ni, nj])
                delta_y = self.__bilinear_interp(
                    di / h, dj / w, self.rdy[i, j], self.rdy[i, nj],
                    self.rdy[ni, j], self.rdy[ni, nj])
                nx = j + dj + delta_x * self.trans_ratio
                ny = i + di + delta_y * self.trans_ratio
                nx = np.clip(nx, 0, src_w - 1)
                ny = np.clip(ny, 0, src_h - 1)
                nxi = np.array(np.floor(nx), dtype=np.int32)
                nyi = np.array(np.floor(ny), dtype=np.int32)
                nxi1 = np.array(np.ceil(nx), dtype=np.int32)
                nyi1 = np.array(np.ceil(ny), dtype=np.int32)

                if len(self.src.shape) == 3:
                    # A trailing axis lets the weights broadcast over any
                    # number of channels (previously hard-coded to 3).
                    x = np.expand_dims(ny - nyi, axis=-1)
                    y = np.expand_dims(nx - nxi, axis=-1)
                else:
                    x = ny - nyi
                    y = nx - nxi
                dst[i:i + h, j:j + w] = self.__bilinear_interp(
                    x, y, self.src[nyi, nxi], self.src[nyi, nxi1],
                    self.src[nyi1, nxi], self.src[nyi1, nxi1])

        dst = np.clip(dst, 0, 255)
        dst = np.array(dst, dtype=np.uint8)

        return dst
|
|
@ -1,77 +0,0 @@
|
|||
#copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
|
||||
#
|
||||
#Licensed under the Apache License, Version 2.0 (the "License");
|
||||
#you may not use this file except in compliance with the License.
|
||||
#You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
#Unless required by applicable law or agreed to in writing, software
|
||||
#distributed under the License is distributed on an "AS IS" BASIS,
|
||||
#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
#See the License for the specific language governing permissions and
|
||||
#limitations under the License.
|
||||
|
||||
import os
|
||||
import random
|
||||
import numpy as np
|
||||
|
||||
import paddle
|
||||
from ppocr.utils.utility import create_module
|
||||
from copy import deepcopy
|
||||
|
||||
from .rec.img_tools import process_image
|
||||
import cv2
|
||||
|
||||
import sys
|
||||
import signal
|
||||
|
||||
|
||||
# handle terminate reader process, do not print stack frame
|
||||
def _reader_quit(signum, frame):
    """Signal handler: announce that the reader process exits, then exit.

    ``signum`` and ``frame`` are the standard signal-handler arguments and
    are not used.
    """
    print("Reader process exit.")
    sys.exit()
|
||||
|
||||
|
||||
def _term_group(sig_num, frame):
    """Signal handler: send SIGKILL to the whole process group of this process.

    ``sig_num`` and ``frame`` are the standard signal-handler arguments and
    are not used.
    """
    print('pid {} terminated, terminate group '
          '{}...'.format(os.getpid(), os.getpgrp()))
    # Kills every process in the current process group, including this one.
    os.killpg(os.getpgid(os.getpid()), signal.SIGKILL)
|
||||
|
||||
|
||||
# Installed at import time: SIGTERM exits this process via _reader_quit,
# SIGINT kills the whole process group via _term_group.
signal.signal(signal.SIGTERM, _reader_quit)
signal.signal(signal.SIGINT, _term_group)
|
||||
|
||||
|
||||
def reader_main(config=None, mode=None):
    """Create the data reader for the requested mode.

    The reader section matching ``mode`` is copied, tagged with the mode,
    overlaid with the ``Global`` settings and handed to the reader class
    named by its ``reader_function`` entry.

    Args:
        config (dict): full configuration holding ``Global`` plus the
            ``TrainReader`` / ``EvalReader`` / ``TestReader`` sections.
        mode (str): one of "train", "eval" or "test".

    Returns:
        For "train": a multiprocess reader built from ``num_workers``
        per-process readers (a single reader on Windows). Otherwise the
        reader produced by calling the reader object with ``mode``.
    """
    assert mode in ["train", "eval", "test"], \
        "Nonsupport mode:{}".format(mode)
    shared = config['Global']
    section_by_mode = {
        "train": 'TrainReader',
        "eval": 'EvalReader',
        "test": 'TestReader',
    }
    params = deepcopy(config[section_by_mode[mode]])
    params['mode'] = mode
    # Global settings are applied last, so they win on key clashes.
    params.update(shared)
    reader = create_module(params['reader_function'])(params)

    if mode != "train":
        return reader(mode)
    if sys.platform == "win32":
        # No multiprocess reading on Windows: use a single reader.
        return reader(0)
    workers = [
        reader(worker_id) for worker_id in range(params['num_workers'])
    ]
    return paddle.reader.multiprocess_reader(workers, False)
|
|
@ -1,335 +0,0 @@
|
|||
#copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
|
||||
#
|
||||
#Licensed under the Apache License, Version 2.0 (the "License");
|
||||
#you may not use this file except in compliance with the License.
|
||||
#You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
#Unless required by applicable law or agreed to in writing, software
|
||||
#distributed under the License is distributed on an "AS IS" BASIS,
|
||||
#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
#See the License for the specific language governing permissions and
|
||||
#limitations under the License.
|
||||
|
||||
import os
|
||||
import sys
|
||||
import math
|
||||
import random
|
||||
import numpy as np
|
||||
import cv2
|
||||
|
||||
import string
|
||||
import lmdb
|
||||
|
||||
from ppocr.utils.utility import initial_logger
|
||||
from ppocr.utils.utility import get_image_file_list
|
||||
logger = initial_logger()
|
||||
|
||||
from .img_tools import process_image, process_image_srn, get_img_data
|
||||
|
||||
|
||||
class LMDBReader(object):
    """Recognition data reader backed by one or more LMDB datasets.

    When the mode is not 'train' and ``infer_img`` is set, samples are read
    from image files instead of LMDB.
    """

    def __init__(self, params):
        """Read the reader settings from ``params``.

        Args:
            params (dict): reader settings. Keys read here: ``mode``,
                ``num_workers`` (train only), ``lmdb_sets_dir``,
                ``char_ops``, ``image_shape``, ``loss_type``,
                ``max_text_length``, ``infer_img``,
                ``train_batch_size_per_card`` (train) or
                ``test_batch_size_per_card`` (otherwise), plus the optional
                ``num_heads``, ``tps`` and ``distort`` (``distort`` also
                reads ``use_gpu``).
        """
        if params['mode'] != 'train':
            self.num_workers = 1
        else:
            self.num_workers = params['num_workers']
        self.lmdb_sets_dir = params['lmdb_sets_dir']
        self.char_ops = params['char_ops']
        self.image_shape = params['image_shape']
        self.loss_type = params['loss_type']
        self.max_text_length = params['max_text_length']
        self.mode = params['mode']
        self.drop_last = False
        self.use_tps = False
        self.num_heads = None
        if "num_heads" in params:
            self.num_heads = params['num_heads']
        if "tps" in params:
            # Fixed typo: this used to assign ``self.ues_tps``, so the
            # ``use_tps`` flag read at inference time always stayed False.
            self.use_tps = True
        self.use_distort = False
        if "distort" in params:
            self.use_distort = params['distort'] and params['use_gpu']
            if not params['use_gpu']:
                logger.info(
                    "Distort operation can only support in GPU. Distort will be set to False."
                )
        if params['mode'] == 'train':
            self.batch_size = params['train_batch_size_per_card']
            self.drop_last = True
        else:
            self.batch_size = params['test_batch_size_per_card']
            self.drop_last = False
            self.use_distort = False
        self.infer_img = params['infer_img']

    def load_hierarchical_lmdb_dataset(self):
        """Open every leaf directory under ``lmdb_sets_dir`` as an LMDB set.

        Returns:
            dict: maps a running index (0, 1, ...) to a dict with the keys
            ``dirpath``, ``env``, ``txn`` and ``num_samples``.
        """
        lmdb_sets = {}
        dataset_idx = 0
        for dirpath, dirnames, filenames in os.walk(self.lmdb_sets_dir + '/'):
            # Only directories without sub-directories are opened.
            if not dirnames:
                env = lmdb.open(
                    dirpath,
                    max_readers=32,
                    readonly=True,
                    lock=False,
                    readahead=False,
                    meminit=False)
                txn = env.begin(write=False)
                num_samples = int(txn.get('num-samples'.encode()))
                lmdb_sets[dataset_idx] = {
                    "dirpath": dirpath,
                    "env": env,
                    "txn": txn,
                    "num_samples": num_samples,
                }
                dataset_idx += 1
        return lmdb_sets

    def print_lmdb_sets_info(self, lmdb_sets):
        """Log the directory and sample count of every opened LMDB set."""
        lmdb_info_strs = []
        for dataset_idx in range(len(lmdb_sets)):
            tmp_str = " %s:%d," % (lmdb_sets[dataset_idx]['dirpath'],
                                   lmdb_sets[dataset_idx]['num_samples'])
            lmdb_info_strs.append(tmp_str)
        lmdb_info_strs = ''.join(lmdb_info_strs)
        logger.info("DataSummary:" + lmdb_info_strs)
        return

    def close_lmdb_dataset(self, lmdb_sets):
        """Close the LMDB environment of every set in ``lmdb_sets``."""
        for dataset_idx in lmdb_sets:
            lmdb_sets[dataset_idx]['env'].close()
        return

    def get_lmdb_sample_info(self, txn, index):
        """Fetch one (image, label) pair from an LMDB transaction.

        Args:
            txn: LMDB read transaction.
            index (int): sample index used in the ``label-%09d`` and
                ``image-%09d`` keys.

        Returns:
            ``(img, label)``, or None when the label is missing or
            ``get_img_data`` returns None for the image buffer.
        """
        label_key = 'label-%09d'.encode() % index
        label = txn.get(label_key)
        if label is None:
            return None
        label = label.decode('utf-8')
        img_key = 'image-%09d'.encode() % index
        imgbuf = txn.get(img_key)
        img = get_img_data(imgbuf)
        if img is None:
            return None
        return img, label

    def __call__(self, process_id):
        """Build the reader generator function for one worker.

        Args:
            process_id: index of the worker; forced to 0 outside of
                training.

        Returns:
            A generator function yielding batches (lists of at most
            ``batch_size`` samples) when ``infer_img`` is None, otherwise a
            generator function yielding single samples.
        """
        if self.mode != 'train':
            process_id = 0

        def sample_iter_reader():
            if self.mode != 'train' and self.infer_img is not None:
                image_file_list = get_image_file_list(self.infer_img)
                for single_img in image_file_list:
                    img = cv2.imread(single_img)
                    if img.shape[-1] == 1 or len(list(img.shape)) == 2:
                        img = cv2.cvtColor(img, cv2.COLOR_GRAY2BGR)
                    if self.loss_type == 'srn':
                        norm_img = process_image_srn(
                            img=img,
                            image_shape=self.image_shape,
                            num_heads=self.num_heads,
                            max_text_length=self.max_text_length)
                    else:
                        norm_img = process_image(
                            img=img,
                            image_shape=self.image_shape,
                            char_ops=self.char_ops,
                            tps=self.use_tps,
                            infer_mode=True)
                    yield norm_img
                return

            lmdb_sets = self.load_hierarchical_lmdb_dataset()
            try:
                if process_id == 0:
                    self.print_lmdb_sets_info(lmdb_sets)
                # Worker k starts at sample 1 + k and advances by
                # num_workers, so the workers read interleaved indices.
                cur_index_sets = [1 + process_id] * len(lmdb_sets)
                while True:
                    finish_read_num = 0
                    for dataset_idx in range(len(lmdb_sets)):
                        cur_index = cur_index_sets[dataset_idx]
                        if cur_index > lmdb_sets[dataset_idx]['num_samples']:
                            finish_read_num += 1
                            continue
                        sample_info = self.get_lmdb_sample_info(
                            lmdb_sets[dataset_idx]['txn'], cur_index)
                        cur_index_sets[dataset_idx] += self.num_workers
                        if sample_info is None:
                            continue
                        img, label = sample_info
                        if self.loss_type == "srn":
                            outs = process_image_srn(
                                img=img,
                                image_shape=self.image_shape,
                                num_heads=self.num_heads,
                                max_text_length=self.max_text_length,
                                label=label,
                                char_ops=self.char_ops,
                                loss_type=self.loss_type)
                        else:
                            outs = process_image(
                                img=img,
                                image_shape=self.image_shape,
                                label=label,
                                char_ops=self.char_ops,
                                loss_type=self.loss_type,
                                max_text_length=self.max_text_length)
                        if outs is None:
                            continue
                        yield outs
                    if finish_read_num == len(lmdb_sets):
                        break
            finally:
                # Also runs when the generator is closed early or an error
                # is raised, so the LMDB environments are never leaked.
                self.close_lmdb_dataset(lmdb_sets)

        def batch_iter_reader():
            batch_outs = []
            for outs in sample_iter_reader():
                batch_outs.append(outs)
                if len(batch_outs) == self.batch_size:
                    yield batch_outs
                    batch_outs = []
            # The trailing, incomplete batch is kept unless drop_last is set.
            if not self.drop_last:
                if len(batch_outs) != 0:
                    yield batch_outs

        if self.infer_img is None:
            return batch_iter_reader
        return sample_iter_reader
|
||||
|
||||
|
||||
class SimpleReader(object):
    """Recognition data reader driven by a tab-separated label file.

    Each label line is ``<relative image path>\\t<label>``. When the mode is
    not 'train' and ``infer_img`` is set, samples are read from image files
    instead.
    """

    def __init__(self, params):
        """Read the reader settings from ``params``.

        Args:
            params (dict): reader settings. Keys read here: ``mode``,
                ``num_workers`` (train only), ``img_set_dir`` and
                ``label_file_path`` (unless mode is 'test'), ``use_gpu``,
                ``char_ops``, ``image_shape``, ``loss_type``,
                ``max_text_length``, ``infer_img``,
                ``train_batch_size_per_card`` (train) or
                ``test_batch_size_per_card`` (otherwise), plus the optional
                ``num_heads``, ``tps`` and ``distort``.
        """
        if params['mode'] != 'train':
            self.num_workers = 1
        else:
            self.num_workers = params['num_workers']
        if params['mode'] != 'test':
            self.img_set_dir = params['img_set_dir']
            self.label_file_path = params['label_file_path']
        self.use_gpu = params['use_gpu']
        self.char_ops = params['char_ops']
        self.image_shape = params['image_shape']
        self.loss_type = params['loss_type']
        self.max_text_length = params['max_text_length']
        self.mode = params['mode']
        self.infer_img = params['infer_img']
        self.use_tps = False
        # Default to None (as LMDBReader does) so the attribute always
        # exists, even when the configuration has no "num_heads".
        self.num_heads = None
        if "num_heads" in params:
            self.num_heads = params['num_heads']
        if "tps" in params:
            self.use_tps = True
        self.use_distort = False
        if "distort" in params:
            self.use_distort = params['distort'] and params['use_gpu']
            if not params['use_gpu']:
                logger.info(
                    "Distort operation can only support in GPU.Distort will be set to False."
                )
        if params['mode'] == 'train':
            self.batch_size = params['train_batch_size_per_card']
            self.drop_last = True
        else:
            self.batch_size = params['test_batch_size_per_card']
            self.drop_last = False
            self.use_distort = False

    def __call__(self, process_id):
        """Build the reader generator function for one worker.

        Args:
            process_id: index of the worker; forced to 0 outside of
                training.

        Returns:
            A generator function yielding batches (lists of at most
            ``batch_size`` samples) when ``infer_img`` is None, otherwise a
            generator function yielding single samples.

        Raises:
            Exception: (from the generator) when the label file holds fewer
                lines than batch_size * device count * num_workers.
        """
        if self.mode != 'train':
            process_id = 0

        def get_device_num():
            # GPU: number of comma-separated entries in CUDA_VISIBLE_DEVICES
            # (a single device when unset). CPU: the CPU_NUM variable.
            if self.use_gpu:
                gpus = os.environ.get("CUDA_VISIBLE_DEVICES", '1')
                gpu_num = len(gpus.split(','))
                return gpu_num
            else:
                cpu_num = os.environ.get("CPU_NUM", 1)
                return int(cpu_num)

        def sample_iter_reader():
            if self.mode != 'train' and self.infer_img is not None:
                image_file_list = get_image_file_list(self.infer_img)
                for single_img in image_file_list:
                    img = cv2.imread(single_img)
                    if img.shape[-1] == 1 or len(list(img.shape)) == 2:
                        img = cv2.cvtColor(img, cv2.COLOR_GRAY2BGR)
                    if self.loss_type == 'srn':
                        norm_img = process_image_srn(
                            img=img,
                            image_shape=self.image_shape,
                            char_ops=self.char_ops,
                            num_heads=self.num_heads,
                            max_text_length=self.max_text_length)
                    else:
                        norm_img = process_image(
                            img=img,
                            image_shape=self.image_shape,
                            char_ops=self.char_ops,
                            tps=self.use_tps,
                            infer_mode=True)
                    yield norm_img
                return

            with open(self.label_file_path, "rb") as fin:
                label_infor_list = fin.readlines()
            img_num = len(label_infor_list)
            img_id_list = list(range(img_num))
            # NOTE(review): every worker shuffles on its own; whether the
            # workers end up with disjoint shards depends on how the random
            # state is shared between processes -- confirm.
            random.shuffle(img_id_list)
            if sys.platform == "win32" and self.num_workers != 1:
                print("multiprocess is not fully compatible with Windows."
                      "num_workers will be 1.")
                self.num_workers = 1
            required_num = (
                self.batch_size * get_device_num() * self.num_workers)
            if required_num > img_num:
                raise Exception(
                    "The number of the whole data ({}) is smaller than the batch_size * devices_num * num_workers ({})".
                    format(img_num, required_num))
            for img_id in range(process_id, img_num, self.num_workers):
                label_infor = label_infor_list[img_id_list[img_id]]
                substr = label_infor.decode('utf-8').strip("\n").split("\t")
                if len(substr) < 2:
                    # A line without a tab has no label; skip it instead of
                    # crashing the worker with an IndexError.
                    logger.info("Skip malformed label line: {!r}".format(
                        label_infor))
                    continue
                img_path = self.img_set_dir + "/" + substr[0]
                img = cv2.imread(img_path)
                if img is None:
                    logger.info("{} does not exist!".format(img_path))
                    continue
                if img.shape[-1] == 1 or len(list(img.shape)) == 2:
                    img = cv2.cvtColor(img, cv2.COLOR_GRAY2BGR)

                label = substr[1]
                if self.loss_type == "srn":
                    outs = process_image_srn(
                        img=img,
                        image_shape=self.image_shape,
                        num_heads=self.num_heads,
                        max_text_length=self.max_text_length,
                        label=label,
                        char_ops=self.char_ops,
                        loss_type=self.loss_type)
                else:
                    outs = process_image(
                        img=img,
                        image_shape=self.image_shape,
                        label=label,
                        char_ops=self.char_ops,
                        loss_type=self.loss_type,
                        max_text_length=self.max_text_length,
                        distort=self.use_distort)
                if outs is None:
                    continue
                yield outs

        def batch_iter_reader():
            batch_outs = []
            for outs in sample_iter_reader():
                batch_outs.append(outs)
                if len(batch_outs) == self.batch_size:
                    yield batch_outs
                    batch_outs = []
            # The trailing, incomplete batch is kept unless drop_last is set.
            if not self.drop_last:
                if len(batch_outs) != 0:
                    yield batch_outs

        if self.infer_img is None:
            return batch_iter_reader
        return sample_iter_reader
|
|
@ -0,0 +1,72 @@
|
|||
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
__all__ = ['DetMetric']
|
||||
|
||||
from .eval_det_iou import DetectionIoUEvaluator
|
||||
|
||||
|
||||
class DetMetric(object):
    """Accumulates per-image detection results and combines them on demand."""

    def __init__(self, main_indicator='hmean', **kwargs):
        self.evaluator = DetectionIoUEvaluator()
        self.main_indicator = main_indicator
        self.reset()

    def __call__(self, preds, batch, **kwargs):
        '''
        Evaluate one batch and store one result per image.

        batch: a list produced by dataloaders.
            image: np.ndarray of shape (N, C, H, W).
            ratio_list: np.ndarray of shape(N,2)
            polygons: np.ndarray of shape (N, K, 4, 2), the polygons of objective regions.
            ignore_tags: np.ndarray of shape (N, K), indicates whether a region is ignorable or not.
        preds: a list of dict produced by post process
            points: np.ndarray of shape (N, K, 4, 2), the polygons of objective regions.
        '''
        # Only the ground-truth polygons (index 2) and their ignore flags
        # (index 3) are read from the batch.
        gt_batch = batch[2]
        ignore_batch = batch[3]
        for prediction, gt_polygons, ignore_tags in zip(preds, gt_batch,
                                                        ignore_batch):
            # ground truth entries for this image
            ground_truth = []
            for polygon, ignored in zip(gt_polygons, ignore_tags):
                ground_truth.append({
                    'points': polygon,
                    'text': '',
                    'ignore': ignored
                })
            # detected entries for this image
            detections = []
            for polygon in prediction['points']:
                detections.append({'points': polygon, 'text': ''})
            image_result = self.evaluator.evaluate_image(ground_truth,
                                                         detections)
            self.results.append(image_result)

    def get_metric(self):
        """
        Combine all stored results, clear them, and return the metrics, e.g.
        {
            'precision': 0,
            'recall': 0,
            'hmean': 0
        }
        """
        combined = self.evaluator.combine_results(self.results)
        self.reset()
        return combined

    def reset(self):
        """Forget every stored per-image result."""
        self.results = []
|
|
@ -0,0 +1,59 @@
|
|||
# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import Levenshtein
|
||||
|
||||
|
||||
class RecMetric(object):
    """Accumulates accuracy and normalized edit distance for recognition."""

    def __init__(self, main_indicator='acc', **kwargs):
        self.main_indicator = main_indicator
        self.reset()

    def __call__(self, pred_label, *args, **kwargs):
        """Score one batch and add it to the running totals.

        Args:
            pred_label: pair ``(preds, labels)``; both are iterated as
                sequences of 2-item entries whose first item is the text.

        Returns:
            dict: ``acc`` and ``norm_edit_dis`` of this batch alone. Both
            are 0.0 when the batch holds no sample.
        """
        preds, labels = pred_label
        correct_num = 0
        all_num = 0
        norm_edit_dis = 0.0
        for (pred, pred_conf), (target, _) in zip(preds, labels):
            # The trailing 1 keeps the denominator non-zero when both
            # strings are empty (their distance is 0 in that case).
            norm_edit_dis += Levenshtein.distance(pred, target) / max(
                len(pred), len(target), 1)
            if pred == target:
                correct_num += 1
            all_num += 1
        self.correct_num += correct_num
        self.all_num += all_num
        self.norm_edit_dis += norm_edit_dis
        return self._summarize(correct_num, all_num, norm_edit_dis)

    def get_metric(self):
        """
        Return the metrics over everything accumulated so far, then reset:
        {
            'acc': 0,
            'norm_edit_dis': 0,
        }
        Both values are 0.0 when nothing was accumulated.
        """
        metrics = self._summarize(self.correct_num, self.all_num,
                                  self.norm_edit_dis)
        self.reset()
        return metrics

    @staticmethod
    def _summarize(correct_num, all_num, norm_edit_dis):
        """Turn raw counters into the metric dict, guarding against 0 samples."""
        if all_num == 0:
            # No evidence at all: report zeros instead of dividing by zero.
            return {'acc': 0.0, 'norm_edit_dis': 0.0}
        return {
            'acc': correct_num / all_num,
            'norm_edit_dis': 1 - norm_edit_dis / all_num
        }

    def reset(self):
        """Clear the running totals."""
        self.correct_num = 0
        self.all_num = 0
        self.norm_edit_dis = 0
|
|
@ -0,0 +1,36 @@
|
|||
# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
from __future__ import unicode_literals
|
||||
|
||||
import copy
|
||||
|
||||
__all__ = ['build_metric']
|
||||
|
||||
|
||||
def build_metric(config):
    """Instantiate the metric named by ``config['name']``.

    Args:
        config (dict): must contain ``name`` ('DetMetric' or 'RecMetric');
            every other entry is passed to the metric constructor as a
            keyword argument. The dict itself is not modified.

    Returns:
        The constructed metric object.

    Raises:
        AssertionError: if ``name`` is not a supported metric.
    """
    from .DetMetric import DetMetric
    from .RecMetric import RecMetric

    support_dict = ['DetMetric', 'RecMetric']
    # Explicit dispatch table instead of eval(): the name comes from the
    # configuration and must never be evaluated as code.
    metric_classes = {'DetMetric': DetMetric, 'RecMetric': RecMetric}

    config = copy.deepcopy(config)
    module_name = config.pop('name')
    if module_name not in metric_classes:
        # Raised explicitly (same exception type as the former assert) so
        # the check still happens when Python runs with -O.
        raise AssertionError('metric only support {}'.format(support_dict))
    module_class = metric_classes[module_name](**config)
    return module_class
|
|
@ -88,8 +88,8 @@ class DetectionIoUEvaluator(object):
|
|||
points = gt[n]['points']
|
||||
# transcription = gt[n]['text']
|
||||
dontCare = gt[n]['ignore']
|
||||
# points = Polygon(points)
|
||||
# points = points.buffer(0)
|
||||
# points = Polygon(points)
|
||||
# points = points.buffer(0)
|
||||
if not Polygon(points).is_valid or not Polygon(points).is_simple:
|
||||
continue
|
||||
|
||||
|
@ -105,8 +105,8 @@ class DetectionIoUEvaluator(object):
|
|||
|
||||
for n in range(len(pred)):
|
||||
points = pred[n]['points']
|
||||
# points = Polygon(points)
|
||||
# points = points.buffer(0)
|
||||
# points = Polygon(points)
|
||||
# points = points.buffer(0)
|
||||
if not Polygon(points).is_valid or not Polygon(points).is_simple:
|
||||
continue
|
||||
|
|
@ -11,3 +11,16 @@
|
|||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import copy
|
||||
from .losses import build_loss
|
||||
|
||||
__all__ = ['build_model', 'build_loss']
|
||||
|
||||
|
||||
def build_model(config):
    """Build a ``Model`` from a private deep copy of ``config``."""
    from .architectures import Model

    # The copy keeps the caller's configuration untouched.
    return Model(copy.deepcopy(config))
|
||||
|
|
|
@ -11,3 +11,6 @@
|
|||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from .model import Model
|
||||
__all__ = ['Model']
|
|
@ -1,146 +0,0 @@
|
|||
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
from paddle import fluid
|
||||
|
||||
from ppocr.utils.utility import create_module
|
||||
from ppocr.utils.utility import initial_logger
|
||||
logger = initial_logger()
|
||||
from copy import deepcopy
|
||||
|
||||
|
||||
class DetModel(object):
    def __init__(self, params):
        """
        Build the backbone, head and loss of the text detection model.
        args:
            params (dict): configuration holding the "Global", "Backbone",
                "Head" and "Loss" sections.
        """
        shared = params['Global']
        self.algorithm = shared['algorithm']

        def build(section):
            # Each section is copied before the Global settings are merged
            # into it, so ``params`` itself is left untouched.
            settings = deepcopy(params[section])
            settings.update(shared)
            return create_module(settings['function'])(params=settings)

        self.backbone = build("Backbone")
        self.head = build("Head")
        self.loss = build("Loss")

        self.image_shape = shared['image_shape']

    def create_feed(self, mode):
        """
        Declare the input variables and, for training, the data loader.
        args:
            mode (str): 'train' for training or else for evaluation
        return: (image, corresponding label, dataloader); label and
            dataloader are None unless mode is 'train'
        """
        shape = deepcopy(self.image_shape)
        if not (shape[1] % 4 == 0 and shape[2] % 4 == 0):
            raise Exception("The size of the image must be divisible by 4, "
                            "received image shape is {}, please reset the "
                            "Global.image_shape in the yml file".format(
                                shape))

        def float_input(name, dims):
            return fluid.layers.data(name=name, shape=dims, dtype='float32')

        image = float_input('image', shape)
        if mode != "train":
            return image, None, None

        if self.algorithm == "EAST":
            # EAST targets are declared at a quarter of the input size.
            h, w = int(shape[1] // 4), int(shape[2] // 4)
            score = float_input('score', [1, h, w])
            geo = float_input('geo', [9, h, w])
            mask = float_input('mask', [1, h, w])
            feed_list = [image, score, geo, mask]
            labels = {'score': score, 'geo': geo, 'mask': mask}
        elif self.algorithm == "DB":
            # DB maps use the spatial dimensions of the input image.
            shrink_map = float_input('shrink_map', shape[1:])
            shrink_mask = float_input('shrink_mask', shape[1:])
            threshold_map = float_input('threshold_map', shape[1:])
            threshold_mask = float_input('threshold_mask', shape[1:])
            feed_list = [
                image, shrink_map, shrink_mask, threshold_map, threshold_mask
            ]
            labels = {
                'shrink_map': shrink_map,
                'shrink_mask': shrink_mask,
                'threshold_map': threshold_map,
                'threshold_mask': threshold_mask,
            }
        elif self.algorithm == "SAST":
            # SAST targets are declared with a fixed 128x128 size.
            input_score = float_input('score', [1, 128, 128])
            input_border = float_input('border', [5, 128, 128])
            input_mask = float_input('mask', [1, 128, 128])
            input_tvo = float_input('tvo', [9, 128, 128])
            input_tco = float_input('tco', [3, 128, 128])
            feed_list = [
                image, input_score, input_border, input_mask, input_tvo,
                input_tco
            ]
            labels = {
                'input_score': input_score,
                'input_border': input_border,
                'input_mask': input_mask,
                'input_tvo': input_tvo,
                'input_tco': input_tco,
            }
        loader = fluid.io.DataLoader.from_generator(
            feed_list=feed_list,
            capacity=64,
            use_double_buffer=True,
            iterable=False)
        return image, labels, loader

    def __call__(self, mode):
        """
        Run the forward pass of the module.
        args:
            mode (str): 'train' for training; 'export' for inference,
                others for evaluation
        return: (loader, losses) for 'train', [image, predicts] for
            'export', (loader, predicts) otherwise
        """
        image, labels, loader = self.create_feed(mode)
        features = self.backbone(image)
        # Only the DB head receives the mode.
        if self.algorithm == "DB":
            predicts = self.head(features, mode)
        else:
            predicts = self.head(features)
        if mode == "train":
            return loader, self.loss(predicts, labels)
        if mode == "export":
            return [image, predicts]
        return loader, predicts
|
|
@ -0,0 +1,129 @@
|
|||
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import os, sys
|
||||
|
||||
# Directory containing this file, appended to the import search path.
__dir__ = os.path.dirname(os.path.abspath(__file__))
sys.path.append(__dir__)
# NOTE(review): hard-coded, developer-specific absolute path; it will not
# exist on other machines. Presumably meant to be the project root --
# consider deriving it from __dir__ instead.
sys.path.append('/home/zhoujun20/PaddleOCR')
|
||||
|
||||
import paddle
|
||||
from paddle import nn
|
||||
from ppocr.modeling.transform import build_transform
|
||||
from ppocr.modeling.backbones import build_backbone
|
||||
from ppocr.modeling.necks import build_neck
|
||||
from ppocr.modeling.heads import build_head
|
||||
|
||||
__all__ = ['Model']
|
||||
|
||||
|
||||
class Model(nn.Layer):
    def __init__(self, config):
        """
        Assemble an OCR network from its configured parts.
        args:
            config (dict): architecture settings. Reads 'algorithm', 'type',
                the optional 'in_channels' (default 3), the optional
                'Transform' and 'Neck' sections and the 'Backbone' and
                'Head' sections. 'in_channels' is written into each section
                that gets built.
        """
        super(Model, self).__init__()
        algorithm = config['algorithm']
        self.type = config['type']
        self.model_name = '{}_{}'.format(self.type, algorithm)

        channels = config.get('in_channels', 3)

        # Transform (optional): e.g. TPS for recognition; detection and
        # classification normally leave it unset.
        transform_cfg = config.get('Transform')
        self.use_transform = transform_cfg is not None
        if self.use_transform:
            transform_cfg['in_channels'] = channels
            self.transform = build_transform(transform_cfg)
            channels = self.transform.out_channels

        # Backbone: always built.
        backbone_cfg = config["Backbone"]
        backbone_cfg['in_channels'] = channels
        self.backbone = build_backbone(backbone_cfg, self.type)
        channels = self.backbone.out_channels

        # Neck (optional): cnn/rnn/reshape for recognition, FPN-like for
        # detection, none for classification.
        neck_cfg = config.get('Neck')
        self.use_neck = neck_cfg is not None
        if self.use_neck:
            neck_cfg['in_channels'] = channels
            self.neck = build_neck(neck_cfg)
            channels = self.neck.out_channels

        # Head: always built.
        head_cfg = config["Head"]
        head_cfg['in_channels'] = channels
        self.head = build_head(head_cfg)

    def forward(self, x):
        """Run transform (if any), backbone, neck (if any) and head in order."""
        out = self.transform(x) if self.use_transform else x
        out = self.backbone(out)
        if self.use_neck:
            out = self.neck(out)
        return self.head(out)
|
||||
|
||||
|
||||
def check_static():
    """Manual check: run the dygraph model and compare with a saved output.

    Loads the DB detection config, builds ``Model`` in dygraph mode, loads
    pretrained weights, runs one all-zero 640x640 input through the model
    and prints the mean difference to an output array loaded from disk.

    NOTE(review): the weight and ``.npy`` paths are absolute paths on the
    author's machine, so this only runs there as written.
    """
    import numpy as np
    from ppocr.utils.save_load import load_dygraph_pretrain
    from ppocr.utils.logging import get_logger
    from tools import program

    config = program.load_config('configs/det/det_r50_vd_db.yml')

    # import cv2
    # data = cv2.imread('doc/imgs/1.jpg')
    # data = normalize(data)
    logger = get_logger()
    # All-zero input batch of shape (1, 3, 640, 640).
    data = np.zeros((1, 3, 640, 640), dtype=np.float32)
    paddle.disable_static()

    config['Architecture']['in_channels'] = 3
    # NOTE(review): 6624 head output channels with a detection config looks
    # like a leftover from a recognition check -- confirm.
    config['Architecture']["Head"]['out_channels'] = 6624
    model = Model(config['Architecture'])
    model.eval()
    load_dygraph_pretrain(
        model,
        logger,
        '/Users/zhoujun20/Desktop/code/PaddleOCR/db/db',
        load_static_weights=True)
    x = paddle.to_variable(data)
    y = model(x)
    # NOTE(review): ``y`` is iterated here but used as a single tensor
    # (``y.numpy()``) below -- confirm which form the head returns.
    for y1 in y:
        print(y1.shape)
    #
    # # from matplotlib import pyplot as plt
    # # plt.imshow(y.numpy())
    # # plt.show()
    static_out = np.load('/Users/zhoujun20/Desktop/code/PaddleOCR/db/db.npy')
    diff = y.numpy() - static_out
    print(y.shape, static_out.shape, diff.mean())
|
||||
|
||||
|
||||
# Run the manual consistency check when this file is executed directly.
if __name__ == '__main__':
    check_static()
|
|
@ -1,228 +0,0 @@
|
|||
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
from paddle import fluid
|
||||
|
||||
from ppocr.utils.utility import create_module
|
||||
from ppocr.utils.utility import initial_logger
|
||||
logger = initial_logger()
|
||||
from copy import deepcopy
|
||||
|
||||
|
||||
class RecModel(object):
    """Text recognition network assembled from config-selected modules.

    The constructor instantiates an optional TPS module, a backbone, a head
    and a loss through ``create_module``. Calling the instance declares the
    feed variables for the requested mode and chains those modules together.
    """

    def __init__(self, params):
        """Instantiate the sub-modules described by ``params``.

        Args:
            params(dict): configuration holding a 'Global' section, the
                'Backbone', 'Head' and 'Loss' sections and an optional 'TPS'
                section. Each non-global section is merged with 'Global' and
                must then contain a 'function' entry for ``create_module``.
        """
        super(RecModel, self).__init__()
        global_params = params['Global']
        char_num = global_params['char_ops'].get_char_num()
        # Written back into the shared 'Global' section so that every
        # sub-module built below receives it through its merged parameters.
        global_params['char_num'] = char_num
        self.char_type = global_params['character_type']
        self.infer_img = global_params['infer_img']

        if "TPS" in params:
            self.tps = self._build_module(params["TPS"], global_params)
        else:
            self.tps = None
        self.backbone = self._build_module(params["Backbone"], global_params)
        self.head = self._build_module(params["Head"], global_params)
        self.loss = self._build_module(params["Loss"], global_params)

        self.loss_type = global_params['loss_type']
        self.image_shape = global_params['image_shape']
        self.max_text_length = global_params['max_text_length']
        if "num_heads" in global_params:
            self.num_heads = global_params["num_heads"]
        else:
            self.num_heads = None

    @staticmethod
    def _build_module(section_params, global_params):
        """Merge one config section with the global one and instantiate it."""
        module_params = deepcopy(section_params)
        # Global entries win over section entries that share a key.
        module_params.update(global_params)
        return create_module(module_params['function'])(params=module_params)

    def _create_srn_feeds(self, image_shape):
        """Declare the four auxiliary SRN inputs used in every mode.

        Args:
            image_shape(list): image feed shape; its last two entries size
                the ``encoder_word_pos`` input.

        Returns:
            tuple: (encoder_word_pos, gsrm_word_pos, gsrm_slf_attn_bias1,
            gsrm_slf_attn_bias2).
        """
        encoder_word_pos = fluid.data(
            name="encoder_word_pos",
            shape=[
                -1, int((image_shape[-2] / 8) * (image_shape[-1] / 8)),
                1
            ],
            dtype="int64")
        gsrm_word_pos = fluid.data(
            name="gsrm_word_pos",
            shape=[-1, self.max_text_length, 1],
            dtype="int64")
        gsrm_slf_attn_bias1 = fluid.data(
            name="gsrm_slf_attn_bias1",
            shape=[
                -1, self.num_heads, self.max_text_length,
                self.max_text_length
            ],
            dtype="float32")
        gsrm_slf_attn_bias2 = fluid.data(
            name="gsrm_slf_attn_bias2",
            shape=[
                -1, self.num_heads, self.max_text_length,
                self.max_text_length
            ],
            dtype="float32")
        return (encoder_word_pos, gsrm_word_pos, gsrm_slf_attn_bias1,
                gsrm_slf_attn_bias2)

    def create_feed(self, mode):
        """Declare the input variables for ``mode``.

        Args:
            mode(str): "train" declares image and label feeds plus a data
                loader; any other value declares only the image (and, for
                the "srn" loss type, the auxiliary SRN inputs).

        Returns:
            tuple: (image, labels, loader). ``loader`` is None outside train
            mode; ``labels`` is None outside train mode unless the loss type
            is "srn".
        """
        image_shape = deepcopy(self.image_shape)
        image_shape.insert(0, -1)
        if mode == "train":
            image = fluid.data(name='image', shape=image_shape, dtype='float32')
            if self.loss_type == "attention":
                label_in = fluid.data(
                    name='label_in',
                    shape=[None, 1],
                    dtype='int32',
                    lod_level=1)
                label_out = fluid.data(
                    name='label_out',
                    shape=[None, 1],
                    dtype='int32',
                    lod_level=1)
                feed_list = [image, label_in, label_out]
                labels = {'label_in': label_in, 'label_out': label_out}
            elif self.loss_type == "srn":
                (encoder_word_pos, gsrm_word_pos, gsrm_slf_attn_bias1,
                 gsrm_slf_attn_bias2) = self._create_srn_feeds(image_shape)
                lbl_weight = fluid.layers.data(
                    name="lbl_weight", shape=[-1, 1], dtype='int64')
                label = fluid.data(
                    name='label', shape=[-1, 1], dtype='int32', lod_level=1)
                feed_list = [
                    image, label, encoder_word_pos, gsrm_word_pos,
                    gsrm_slf_attn_bias1, gsrm_slf_attn_bias2, lbl_weight
                ]
                labels = {
                    'label': label,
                    'encoder_word_pos': encoder_word_pos,
                    'gsrm_word_pos': gsrm_word_pos,
                    'gsrm_slf_attn_bias1': gsrm_slf_attn_bias1,
                    'gsrm_slf_attn_bias2': gsrm_slf_attn_bias2,
                    'lbl_weight': lbl_weight
                }
            else:
                label = fluid.data(
                    name='label', shape=[None, 1], dtype='int32', lod_level=1)
                feed_list = [image, label]
                labels = {'label': label}
            loader = fluid.io.DataLoader.from_generator(
                feed_list=feed_list,
                capacity=64,
                use_double_buffer=True,
                iterable=False)
        else:
            labels = None
            loader = None
            if self.char_type == "ch" and self.infer_img:
                # Leave the last image dimension unspecified for inference.
                image_shape[-1] = -1
                if self.tps is not None:
                    logger.info(
                        "WARNRNG!!!\n"
                        "TPS does not support variable shape in chinese!"
                        "We set img_shape to be the same , it may affect the inference effect"
                    )
                    # Fall back to the configured fixed shape, restoring the
                    # batch dimension that was inserted at the top of this
                    # method (the plain copy alone would drop it).
                    image_shape = deepcopy(self.image_shape)
                    image_shape.insert(0, -1)
            image = fluid.data(name='image', shape=image_shape, dtype='float32')
            if self.loss_type == "srn":
                (encoder_word_pos, gsrm_word_pos, gsrm_slf_attn_bias1,
                 gsrm_slf_attn_bias2) = self._create_srn_feeds(image_shape)
                labels = {
                    'encoder_word_pos': encoder_word_pos,
                    'gsrm_word_pos': gsrm_word_pos,
                    'gsrm_slf_attn_bias1': gsrm_slf_attn_bias1,
                    'gsrm_slf_attn_bias2': gsrm_slf_attn_bias2
                }
        return image, labels, loader

    def __call__(self, mode):
        """Build the network for ``mode``.

        Args:
            mode(str): "train", "export", or any other value for
                evaluation/inference.

        Returns:
            "train": ``(loader, outputs)`` where ``outputs`` holds the loss
            value(s), 'decoded_out' and 'label'.
            "export": ``[image, {'decoded_out': ..., 'predicts': ...}]``.
            otherwise: ``(loader, {'decoded_out': ..., 'predicts': ...})``.

        Raises:
            Exception: in "export" mode when the loss type is "srn".
        """
        image, labels, loader = self.create_feed(mode)
        if self.tps is None:
            inputs = image
        else:
            inputs = self.tps(image)
        conv_feas = self.backbone(inputs)
        predicts = self.head(conv_feas, labels, mode)
        decoded_out = predicts['decoded_out']
        if mode == "train":
            # Build the loss exactly once; it is unpacked below for SRN
            # instead of being computed a second time.
            loss = self.loss(predicts, labels)
            if self.loss_type == "attention":
                label = labels['label_out']
            else:
                label = labels['label']
            if self.loss_type == 'srn':
                total_loss, img_loss, word_loss = loss
                outputs = {
                    'total_loss': total_loss,
                    'img_loss': img_loss,
                    'word_loss': word_loss,
                    'decoded_out': decoded_out,
                    'label': label
                }
            else:
                outputs = {
                    'total_loss': loss,
                    'decoded_out': decoded_out,
                    'label': label
                }
            return loader, outputs

        elif mode == "export":
            predict = predicts['predict']
            if self.loss_type == "ctc":
                predict = fluid.layers.softmax(predict)
            if self.loss_type == "srn":
                raise Exception(
                    "Warning! SRN does not support export model currently")
            return [image, {'decoded_out': decoded_out, 'predicts': predict}]
        else:
            predict = predicts['predict']
            if self.loss_type == "ctc":
                predict = fluid.layers.softmax(predict)
            return loader, {'decoded_out': decoded_out, 'predicts': predict}
|
|
@ -11,3 +11,26 @@
|
|||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
__all__ = ['build_backbone']
|
||||
|
||||
|
||||
def build_backbone(config, model_type):
    """Instantiate the backbone named in ``config`` for ``model_type``.

    Args:
        config(dict): backbone settings. The 'name' entry selects the class
            and every other entry is forwarded to its constructor as a
            keyword argument. The dict itself is left unmodified.
        model_type(str): 'det' or 'rec'; selects which implementations are
            imported.

    Returns:
        The constructed backbone instance.

    Raises:
        NotImplementedError: if ``model_type`` is neither 'det' nor 'rec', or
            if the requested name is listed as supported but no class for it
            is imported here.
        AssertionError: if the requested name is not in the supported list.
    """
    if model_type == 'det':
        from .det_mobilenet_v3 import MobileNetV3
        from .det_resnet_vd import ResNet

        support_dict = ['MobileNetV3', 'ResNet', 'ResNet_SAST']
    elif model_type == 'rec':
        from .rec_mobilenet_v3 import MobileNetV3
        from .rec_resnet_vd import ResNet
        support_dict = ['MobileNetV3', 'ResNet', 'ResNet_FPN']
    else:
        raise NotImplementedError

    # Explicit name -> class table instead of eval() on a config string.
    available = {'MobileNetV3': MobileNetV3, 'ResNet': ResNet}

    # Work on a shallow copy so the caller's config keeps its 'name' entry
    # and can be reused to build another backbone.
    kwargs = dict(config)
    module_name = kwargs.pop('name')
    assert module_name in support_dict, Exception(
        'when model typs is {}, backbone only support {}'.format(model_type,
                                                                 support_dict))
    if module_name not in available:
        # Listed as supported above, but no implementation is imported here.
        raise NotImplementedError(
            'backbone {} is not available for model type {}'.format(
                module_name, model_type))
    module_class = available[module_name](**kwargs)
    return module_class
|
||||
|
|
|
@ -1,40 +1,48 @@
|
|||
#copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
|
||||
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
#Licensed under the Apache License, Version 2.0 (the "License");
|
||||
#you may not use this file except in compliance with the License.
|
||||
#You may obtain a copy of the License at
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
#Unless required by applicable law or agreed to in writing, software
|
||||
#distributed under the License is distributed on an "AS IS" BASIS,
|
||||
#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
#See the License for the specific language governing permissions and
|
||||
#limitations under the License.
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import paddle.fluid as fluid
|
||||
from paddle.fluid.initializer import MSRA
|
||||
from paddle.fluid.param_attr import ParamAttr
|
||||
import paddle
|
||||
from paddle import nn
|
||||
import paddle.nn.functional as F
|
||||
from paddle import ParamAttr
|
||||
|
||||
__all__ = ['MobileNetV3']
|
||||
|
||||
|
||||
class MobileNetV3():
|
||||
def __init__(self, params):
|
||||
def make_divisible(v, divisor=8, min_value=None):
    """Round ``v`` to a multiple of ``divisor``, never below ``min_value``.

    Args:
        v: value to round (typically a scaled channel count).
        divisor: the result is a multiple of this value.
        min_value: lower bound of the result; defaults to ``divisor``.

    Returns:
        The rounded value, bumped up by one ``divisor`` step whenever plain
        rounding would fall below 90% of ``v``.
    """
    lower_bound = divisor if min_value is None else min_value
    # Round half up to the nearest multiple of ``divisor``.
    rounded = int(v + divisor / 2) // divisor * divisor
    result = max(lower_bound, rounded)
    if result < 0.9 * v:
        result += divisor
    return result
|
||||
|
||||
|
||||
class MobileNetV3(nn.Layer):
|
||||
def __init__(self, in_channels=3, model_name='large', scale=0.5, **kwargs):
|
||||
"""
|
||||
the MobilenetV3 backbone network for detection module.
|
||||
Args:
|
||||
params(dict): the super parameters for build network
|
||||
"""
|
||||
self.scale = params['scale']
|
||||
model_name = params['model_name']
|
||||
self.inplanes = 16
|
||||
super(MobileNetV3, self).__init__()
|
||||
if model_name == "large":
|
||||
self.cfg = [
|
||||
cfg = [
|
||||
# k, exp, c, se, nl, s,
|
||||
[3, 16, 16, False, 'relu', 1],
|
||||
[3, 64, 24, False, 'relu', 2],
|
||||
|
@ -52,10 +60,9 @@ class MobileNetV3():
|
|||
[5, 960, 160, True, 'hard_swish', 1],
|
||||
[5, 960, 160, True, 'hard_swish', 1],
|
||||
]
|
||||
self.cls_ch_squeeze = 960
|
||||
self.cls_ch_expand = 1280
|
||||
cls_ch_squeeze = 960
|
||||
elif model_name == "small":
|
||||
self.cfg = [
|
||||
cfg = [
|
||||
# k, exp, c, se, nl, s,
|
||||
[3, 16, 16, True, 'relu', 2],
|
||||
[3, 72, 24, False, 'relu', 2],
|
||||
|
@ -69,183 +76,203 @@ class MobileNetV3():
|
|||
[5, 576, 96, True, 'hard_swish', 1],
|
||||
[5, 576, 96, True, 'hard_swish', 1],
|
||||
]
|
||||
self.cls_ch_squeeze = 576
|
||||
self.cls_ch_expand = 1280
|
||||
cls_ch_squeeze = 576
|
||||
else:
|
||||
raise NotImplementedError("mode[" + model_name +
|
||||
"_model] is not implemented!")
|
||||
|
||||
supported_scale = [0.35, 0.5, 0.75, 1.0, 1.25]
|
||||
assert self.scale in supported_scale, \
|
||||
"supported scale are {} but input scale is {}".format(supported_scale, self.scale)
|
||||
|
||||
def __call__(self, input):
|
||||
scale = self.scale
|
||||
inplanes = self.inplanes
|
||||
cfg = self.cfg
|
||||
cls_ch_squeeze = self.cls_ch_squeeze
|
||||
cls_ch_expand = self.cls_ch_expand
|
||||
#conv1
|
||||
conv = self.conv_bn_layer(
|
||||
input,
|
||||
filter_size=3,
|
||||
num_filters=self.make_divisible(inplanes * scale),
|
||||
assert scale in supported_scale, \
|
||||
"supported scale are {} but input scale is {}".format(supported_scale, scale)
|
||||
inplanes = 16
|
||||
# conv1
|
||||
self.conv = ConvBNLayer(
|
||||
in_channels=in_channels,
|
||||
out_channels=make_divisible(inplanes * scale),
|
||||
kernel_size=3,
|
||||
stride=2,
|
||||
padding=1,
|
||||
num_groups=1,
|
||||
groups=1,
|
||||
if_act=True,
|
||||
act='hard_swish',
|
||||
name='conv1')
|
||||
|
||||
self.stages = []
|
||||
self.out_channels = []
|
||||
block_list = []
|
||||
i = 0
|
||||
inplanes = self.make_divisible(inplanes * scale)
|
||||
outs = []
|
||||
for layer_cfg in cfg:
|
||||
if layer_cfg[5] == 2 and i > 2:
|
||||
outs.append(conv)
|
||||
conv = self.residual_unit(
|
||||
input=conv,
|
||||
num_in_filter=inplanes,
|
||||
num_mid_filter=self.make_divisible(scale * layer_cfg[1]),
|
||||
num_out_filter=self.make_divisible(scale * layer_cfg[2]),
|
||||
act=layer_cfg[4],
|
||||
stride=layer_cfg[5],
|
||||
filter_size=layer_cfg[0],
|
||||
use_se=layer_cfg[3],
|
||||
name='conv' + str(i + 2))
|
||||
inplanes = self.make_divisible(scale * layer_cfg[2])
|
||||
inplanes = make_divisible(inplanes * scale)
|
||||
for (k, exp, c, se, nl, s) in cfg:
|
||||
if s == 2 and i > 2:
|
||||
self.out_channels.append(inplanes)
|
||||
self.stages.append(nn.Sequential(*block_list))
|
||||
block_list = []
|
||||
block_list.append(
|
||||
ResidualUnit(
|
||||
in_channels=inplanes,
|
||||
mid_channels=make_divisible(scale * exp),
|
||||
out_channels=make_divisible(scale * c),
|
||||
kernel_size=k,
|
||||
stride=s,
|
||||
use_se=se,
|
||||
act=nl,
|
||||
name="conv" + str(i + 2)))
|
||||
inplanes = make_divisible(scale * c)
|
||||
i += 1
|
||||
block_list.append(
|
||||
ConvBNLayer(
|
||||
in_channels=inplanes,
|
||||
out_channels=make_divisible(scale * cls_ch_squeeze),
|
||||
kernel_size=1,
|
||||
stride=1,
|
||||
padding=0,
|
||||
groups=1,
|
||||
if_act=True,
|
||||
act='hard_swish',
|
||||
name='conv_last'))
|
||||
|
||||
conv = self.conv_bn_layer(
|
||||
input=conv,
|
||||
filter_size=1,
|
||||
num_filters=self.make_divisible(scale * cls_ch_squeeze),
|
||||
stride=1,
|
||||
padding=0,
|
||||
num_groups=1,
|
||||
if_act=True,
|
||||
act='hard_swish',
|
||||
name='conv_last')
|
||||
outs.append(conv)
|
||||
return outs
|
||||
self.stages.append(nn.Sequential(*block_list))
|
||||
self.out_channels.append(make_divisible(scale * cls_ch_squeeze))
|
||||
for i, stage in enumerate(self.stages):
|
||||
self.add_sublayer(sublayer=stage, name="stage{}".format(i))
|
||||
|
||||
def conv_bn_layer(self,
|
||||
input,
|
||||
filter_size,
|
||||
num_filters,
|
||||
stride,
|
||||
padding,
|
||||
num_groups=1,
|
||||
if_act=True,
|
||||
act=None,
|
||||
name=None,
|
||||
use_cudnn=True,
|
||||
res_last_bn_init=False):
|
||||
conv = fluid.layers.conv2d(
|
||||
input=input,
|
||||
num_filters=num_filters,
|
||||
filter_size=filter_size,
|
||||
def forward(self, x):
|
||||
x = self.conv(x)
|
||||
out_list = []
|
||||
for stage in self.stages:
|
||||
x = stage(x)
|
||||
out_list.append(x)
|
||||
return out_list
|
||||
|
||||
|
||||
class ConvBNLayer(nn.Layer):
|
||||
def __init__(self,
|
||||
in_channels,
|
||||
out_channels,
|
||||
kernel_size,
|
||||
stride,
|
||||
padding,
|
||||
groups=1,
|
||||
if_act=True,
|
||||
act=None,
|
||||
name=None):
|
||||
super(ConvBNLayer, self).__init__()
|
||||
self.if_act = if_act
|
||||
self.act = act
|
||||
self.conv = nn.Conv2d(
|
||||
in_channels=in_channels,
|
||||
out_channels=out_channels,
|
||||
kernel_size=kernel_size,
|
||||
stride=stride,
|
||||
padding=padding,
|
||||
groups=num_groups,
|
||||
act=None,
|
||||
use_cudnn=use_cudnn,
|
||||
param_attr=ParamAttr(name=name + '_weights'),
|
||||
groups=groups,
|
||||
weight_attr=ParamAttr(name=name + '_weights'),
|
||||
bias_attr=False)
|
||||
bn_name = name + '_bn'
|
||||
bn = fluid.layers.batch_norm(
|
||||
input=conv,
|
||||
param_attr=ParamAttr(
|
||||
name=bn_name + "_scale",
|
||||
regularizer=fluid.regularizer.L2DecayRegularizer(
|
||||
regularization_coeff=0.0)),
|
||||
bias_attr=ParamAttr(
|
||||
name=bn_name + "_offset",
|
||||
regularizer=fluid.regularizer.L2DecayRegularizer(
|
||||
regularization_coeff=0.0)),
|
||||
moving_mean_name=bn_name + '_mean',
|
||||
moving_variance_name=bn_name + '_variance')
|
||||
if if_act:
|
||||
if act == 'relu':
|
||||
bn = fluid.layers.relu(bn)
|
||||
elif act == 'hard_swish':
|
||||
bn = fluid.layers.hard_swish(bn)
|
||||
return bn
|
||||
|
||||
def make_divisible(self, v, divisor=8, min_value=None):
|
||||
if min_value is None:
|
||||
min_value = divisor
|
||||
new_v = max(min_value, int(v + divisor / 2) // divisor * divisor)
|
||||
if new_v < 0.9 * v:
|
||||
new_v += divisor
|
||||
return new_v
|
||||
self.bn = nn.BatchNorm(
|
||||
num_channels=out_channels,
|
||||
act=None,
|
||||
param_attr=ParamAttr(name=name + "_bn_scale"),
|
||||
bias_attr=ParamAttr(name=name + "_bn_offset"),
|
||||
moving_mean_name=name + "_bn_mean",
|
||||
moving_variance_name=name + "_bn_variance")
|
||||
|
||||
def se_block(self, input, num_out_filter, ratio=4, name=None):
|
||||
num_mid_filter = num_out_filter // ratio
|
||||
pool = fluid.layers.pool2d(
|
||||
input=input, pool_type='avg', global_pooling=True, use_cudnn=False)
|
||||
conv1 = fluid.layers.conv2d(
|
||||
input=pool,
|
||||
filter_size=1,
|
||||
num_filters=num_mid_filter,
|
||||
act='relu',
|
||||
param_attr=ParamAttr(name=name + '_1_weights'),
|
||||
bias_attr=ParamAttr(name=name + '_1_offset'))
|
||||
conv2 = fluid.layers.conv2d(
|
||||
input=conv1,
|
||||
filter_size=1,
|
||||
num_filters=num_out_filter,
|
||||
act='hard_sigmoid',
|
||||
param_attr=ParamAttr(name=name + '_2_weights'),
|
||||
bias_attr=ParamAttr(name=name + '_2_offset'))
|
||||
scale = fluid.layers.elementwise_mul(x=input, y=conv2, axis=0)
|
||||
return scale
|
||||
def forward(self, x):
|
||||
x = self.conv(x)
|
||||
x = self.bn(x)
|
||||
if self.if_act:
|
||||
if self.act == "relu":
|
||||
x = F.relu(x)
|
||||
elif self.act == "hard_swish":
|
||||
x = F.hard_swish(x)
|
||||
else:
|
||||
print("The activation function is selected incorrectly.")
|
||||
exit()
|
||||
return x
|
||||
|
||||
def residual_unit(self,
|
||||
input,
|
||||
num_in_filter,
|
||||
num_mid_filter,
|
||||
num_out_filter,
|
||||
stride,
|
||||
filter_size,
|
||||
act=None,
|
||||
use_se=False,
|
||||
name=None):
|
||||
|
||||
conv0 = self.conv_bn_layer(
|
||||
input=input,
|
||||
filter_size=1,
|
||||
num_filters=num_mid_filter,
|
||||
class ResidualUnit(nn.Layer):
|
||||
def __init__(self,
|
||||
in_channels,
|
||||
mid_channels,
|
||||
out_channels,
|
||||
kernel_size,
|
||||
stride,
|
||||
use_se,
|
||||
act=None,
|
||||
name=''):
|
||||
super(ResidualUnit, self).__init__()
|
||||
self.if_shortcut = stride == 1 and in_channels == out_channels
|
||||
self.if_se = use_se
|
||||
|
||||
self.expand_conv = ConvBNLayer(
|
||||
in_channels=in_channels,
|
||||
out_channels=mid_channels,
|
||||
kernel_size=1,
|
||||
stride=1,
|
||||
padding=0,
|
||||
if_act=True,
|
||||
act=act,
|
||||
name=name + '_expand')
|
||||
|
||||
conv1 = self.conv_bn_layer(
|
||||
input=conv0,
|
||||
filter_size=filter_size,
|
||||
num_filters=num_mid_filter,
|
||||
name=name + "_expand")
|
||||
self.bottleneck_conv = ConvBNLayer(
|
||||
in_channels=mid_channels,
|
||||
out_channels=mid_channels,
|
||||
kernel_size=kernel_size,
|
||||
stride=stride,
|
||||
padding=int((filter_size - 1) // 2),
|
||||
padding=int((kernel_size - 1) // 2),
|
||||
groups=mid_channels,
|
||||
if_act=True,
|
||||
act=act,
|
||||
num_groups=num_mid_filter,
|
||||
use_cudnn=False,
|
||||
name=name + '_depthwise')
|
||||
if use_se:
|
||||
conv1 = self.se_block(
|
||||
input=conv1, num_out_filter=num_mid_filter, name=name + '_se')
|
||||
|
||||
conv2 = self.conv_bn_layer(
|
||||
input=conv1,
|
||||
filter_size=1,
|
||||
num_filters=num_out_filter,
|
||||
name=name + "_depthwise")
|
||||
if self.if_se:
|
||||
self.mid_se = SEModule(mid_channels, name=name + "_se")
|
||||
self.linear_conv = ConvBNLayer(
|
||||
in_channels=mid_channels,
|
||||
out_channels=out_channels,
|
||||
kernel_size=1,
|
||||
stride=1,
|
||||
padding=0,
|
||||
if_act=False,
|
||||
name=name + '_linear',
|
||||
res_last_bn_init=True)
|
||||
if num_in_filter != num_out_filter or stride != 1:
|
||||
return conv2
|
||||
else:
|
||||
return fluid.layers.elementwise_add(x=input, y=conv2, act=None)
|
||||
act=None,
|
||||
name=name + "_linear")
|
||||
|
||||
def forward(self, inputs):
|
||||
x = self.expand_conv(inputs)
|
||||
x = self.bottleneck_conv(x)
|
||||
if self.if_se:
|
||||
x = self.mid_se(x)
|
||||
x = self.linear_conv(x)
|
||||
if self.if_shortcut:
|
||||
x = paddle.elementwise_add(inputs, x)
|
||||
return x
|
||||
|
||||
|
||||
class SEModule(nn.Layer):
    """Squeeze-and-excitation gate: rescales the input by a learned factor.

    The input is globally average-pooled, passed through two 1x1
    convolutions (ReLU after the first, hard-sigmoid after the second) and
    the result is multiplied back onto the input.
    """

    def __init__(self, in_channels, reduction=4, name=""):
        """
        Args:
            in_channels(int): channel count of the input and of the gate.
            reduction(int): the first convolution outputs
                ``in_channels // reduction`` channels.
            name(str): prefix for the parameter names of both convolutions.
        """
        super(SEModule, self).__init__()
        squeezed_channels = in_channels // reduction

        self.avg_pool = nn.Pool2D(
            pool_type="avg", global_pooling=True, use_cudnn=False)

        # 1x1 convolution down to the squeezed width.
        self.conv1 = nn.Conv2d(
            in_channels=in_channels,
            out_channels=squeezed_channels,
            kernel_size=1,
            stride=1,
            padding=0,
            weight_attr=ParamAttr(name=name + "_1_weights"),
            bias_attr=ParamAttr(name=name + "_1_offset"))

        # 1x1 convolution back up to the input width.
        self.conv2 = nn.Conv2d(
            in_channels=squeezed_channels,
            out_channels=in_channels,
            kernel_size=1,
            stride=1,
            padding=0,
            weight_attr=ParamAttr(name=name + "_2_weights"),
            bias_attr=ParamAttr(name=name + "_2_offset"))

    def forward(self, inputs):
        """Return ``inputs`` multiplied by the computed gate."""
        gate = self.avg_pool(inputs)
        gate = F.relu(self.conv1(gate))
        gate = F.hard_sigmoid(self.conv2(gate))
        return inputs * gate
|
||||
|
|
|
@ -1,252 +1,329 @@
|
|||
#copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
|
||||
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
#Licensed under the Apache License, Version 2.0 (the "License");
|
||||
#you may not use this file except in compliance with the License.
|
||||
#You may obtain a copy of the License at
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
#Unless required by applicable law or agreed to in writing, software
|
||||
#distributed under the License is distributed on an "AS IS" BASIS,
|
||||
#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
#See the License for the specific language governing permissions and
|
||||
#limitations under the License.
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import paddle.fluid as fluid
|
||||
from paddle.fluid.param_attr import ParamAttr
|
||||
from paddle import nn
|
||||
from paddle.nn import functional as F
|
||||
from paddle import ParamAttr
|
||||
|
||||
__all__ = ["ResNet"]
|
||||
|
||||
|
||||
class ResNet(object):
|
||||
def __init__(self, params):
|
||||
class ResNet(nn.Layer):
|
||||
def __init__(self, in_channels=3, layers=50, **kwargs):
|
||||
"""
|
||||
the Resnet backbone network for detection module.
|
||||
Args:
|
||||
params(dict): the super parameters for network build
|
||||
"""
|
||||
self.layers = params['layers']
|
||||
supported_layers = [18, 34, 50, 101, 152]
|
||||
assert self.layers in supported_layers, \
|
||||
"supported layers are {} but input layer is {}".format(supported_layers, self.layers)
|
||||
self.is_3x3 = True
|
||||
super(ResNet, self).__init__()
|
||||
supported_layers = {
|
||||
18: {
|
||||
'depth': [2, 2, 2, 2],
|
||||
'block_class': BasicBlock
|
||||
},
|
||||
34: {
|
||||
'depth': [3, 4, 6, 3],
|
||||
'block_class': BasicBlock
|
||||
},
|
||||
50: {
|
||||
'depth': [3, 4, 6, 3],
|
||||
'block_class': BottleneckBlock
|
||||
},
|
||||
101: {
|
||||
'depth': [3, 4, 23, 3],
|
||||
'block_class': BottleneckBlock
|
||||
},
|
||||
152: {
|
||||
'depth': [3, 8, 36, 3],
|
||||
'block_class': BottleneckBlock
|
||||
},
|
||||
200: {
|
||||
'depth': [3, 12, 48, 3],
|
||||
'block_class': BottleneckBlock
|
||||
}
|
||||
}
|
||||
assert layers in supported_layers, \
|
||||
"supported layers are {} but input layer is {}".format(supported_layers.keys(), layers)
|
||||
is_3x3 = True
|
||||
|
||||
depth = supported_layers[layers]['depth']
|
||||
block_class = supported_layers[layers]['block_class']
|
||||
|
||||
def __call__(self, input):
|
||||
layers = self.layers
|
||||
is_3x3 = self.is_3x3
|
||||
if layers == 18:
|
||||
depth = [2, 2, 2, 2]
|
||||
elif layers == 34 or layers == 50:
|
||||
depth = [3, 4, 6, 3]
|
||||
elif layers == 101:
|
||||
depth = [3, 4, 23, 3]
|
||||
elif layers == 152:
|
||||
depth = [3, 8, 36, 3]
|
||||
elif layers == 200:
|
||||
depth = [3, 12, 48, 3]
|
||||
num_filters = [64, 128, 256, 512]
|
||||
outs = []
|
||||
|
||||
conv = []
|
||||
if is_3x3 == False:
|
||||
conv = self.conv_bn_layer(
|
||||
input=input,
|
||||
num_filters=64,
|
||||
filter_size=7,
|
||||
stride=2,
|
||||
act='relu')
|
||||
conv.append(
|
||||
ConvBNLayer(
|
||||
in_channels=in_channels,
|
||||
out_channels=64,
|
||||
kernel_size=7,
|
||||
stride=2,
|
||||
act='relu'))
|
||||
else:
|
||||
conv = self.conv_bn_layer(
|
||||
input=input,
|
||||
num_filters=32,
|
||||
filter_size=3,
|
||||
stride=2,
|
||||
act='relu',
|
||||
name='conv1_1')
|
||||
conv = self.conv_bn_layer(
|
||||
input=conv,
|
||||
num_filters=32,
|
||||
filter_size=3,
|
||||
stride=1,
|
||||
act='relu',
|
||||
name='conv1_2')
|
||||
conv = self.conv_bn_layer(
|
||||
input=conv,
|
||||
num_filters=64,
|
||||
filter_size=3,
|
||||
stride=1,
|
||||
act='relu',
|
||||
name='conv1_3')
|
||||
|
||||
conv = fluid.layers.pool2d(
|
||||
input=conv,
|
||||
pool_size=3,
|
||||
pool_stride=2,
|
||||
pool_padding=1,
|
||||
pool_type='max')
|
||||
|
||||
if layers >= 50:
|
||||
for block in range(len(depth)):
|
||||
for i in range(depth[block]):
|
||||
if layers in [101, 152, 200] and block == 2:
|
||||
conv.append(
|
||||
ConvBNLayer(
|
||||
in_channels=3,
|
||||
out_channels=32,
|
||||
kernel_size=3,
|
||||
stride=2,
|
||||
act='relu',
|
||||
name='conv1_1'))
|
||||
conv.append(
|
||||
ConvBNLayer(
|
||||
in_channels=32,
|
||||
out_channels=32,
|
||||
kernel_size=3,
|
||||
stride=1,
|
||||
act='relu',
|
||||
name='conv1_2'))
|
||||
conv.append(
|
||||
ConvBNLayer(
|
||||
in_channels=32,
|
||||
out_channels=64,
|
||||
kernel_size=3,
|
||||
stride=1,
|
||||
act='relu',
|
||||
name='conv1_3'))
|
||||
self.conv1 = nn.Sequential(*conv)
|
||||
self.pool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
|
||||
self.stages = []
|
||||
self.out_channels = []
|
||||
in_ch = 64
|
||||
for block_index in range(len(depth)):
|
||||
block_list = []
|
||||
for i in range(depth[block_index]):
|
||||
if layers >= 50:
|
||||
if layers in [101, 152, 200] and block_index == 2:
|
||||
if i == 0:
|
||||
conv_name = "res" + str(block + 2) + "a"
|
||||
conv_name = "res" + str(block_index + 2) + "a"
|
||||
else:
|
||||
conv_name = "res" + str(block + 2) + "b" + str(i)
|
||||
conv_name = "res" + str(block_index +
|
||||
2) + "b" + str(i)
|
||||
else:
|
||||
conv_name = "res" + str(block + 2) + chr(97 + i)
|
||||
conv = self.bottleneck_block(
|
||||
input=conv,
|
||||
num_filters=num_filters[block],
|
||||
stride=2 if i == 0 and block != 0 else 1,
|
||||
if_first=block == i == 0,
|
||||
name=conv_name)
|
||||
outs.append(conv)
|
||||
else:
|
||||
for block in range(len(depth)):
|
||||
for i in range(depth[block]):
|
||||
conv_name = "res" + str(block + 2) + chr(97 + i)
|
||||
conv = self.basic_block(
|
||||
input=conv,
|
||||
num_filters=num_filters[block],
|
||||
stride=2 if i == 0 and block != 0 else 1,
|
||||
if_first=block == i == 0,
|
||||
name=conv_name)
|
||||
outs.append(conv)
|
||||
return outs
|
||||
conv_name = "res" + str(block_index + 2) + chr(97 + i)
|
||||
else:
|
||||
conv_name = "res" + str(block_index + 2) + chr(97 + i)
|
||||
block_list.append(
|
||||
block_class(
|
||||
in_channels=in_ch,
|
||||
out_channels=num_filters[block_index],
|
||||
stride=2 if i == 0 and block_index != 0 else 1,
|
||||
if_first=block_index == i == 0,
|
||||
name=conv_name))
|
||||
in_ch = block_list[-1].out_channels
|
||||
self.out_channels.append(in_ch)
|
||||
self.stages.append(nn.Sequential(*block_list))
|
||||
for i, stage in enumerate(self.stages):
|
||||
self.add_sublayer(sublayer=stage, name="stage{}".format(i))
|
||||
|
||||
def conv_bn_layer(self,
|
||||
input,
|
||||
num_filters,
|
||||
filter_size,
|
||||
stride=1,
|
||||
groups=1,
|
||||
act=None,
|
||||
name=None):
|
||||
conv = fluid.layers.conv2d(
|
||||
input=input,
|
||||
num_filters=num_filters,
|
||||
filter_size=filter_size,
|
||||
def forward(self, x):
|
||||
x = self.conv1(x)
|
||||
x = self.pool(x)
|
||||
out_list = []
|
||||
for stage in self.stages:
|
||||
x = stage(x)
|
||||
out_list.append(x)
|
||||
return out_list
|
||||
|
||||
|
||||
class ConvBNLayer(nn.Layer):
|
||||
def __init__(self,
|
||||
in_channels,
|
||||
out_channels,
|
||||
kernel_size,
|
||||
stride=1,
|
||||
groups=1,
|
||||
act=None,
|
||||
name=None):
|
||||
super(ConvBNLayer, self).__init__()
|
||||
self.conv = nn.Conv2d(
|
||||
in_channels=in_channels,
|
||||
out_channels=out_channels,
|
||||
kernel_size=kernel_size,
|
||||
stride=stride,
|
||||
padding=(filter_size - 1) // 2,
|
||||
padding=(kernel_size - 1) // 2,
|
||||
groups=groups,
|
||||
act=None,
|
||||
param_attr=ParamAttr(name=name + "_weights"),
|
||||
weight_attr=ParamAttr(name=name + "_weights"),
|
||||
bias_attr=False)
|
||||
if name == "conv1":
|
||||
bn_name = "bn_" + name
|
||||
else:
|
||||
bn_name = "bn" + name[3:]
|
||||
return fluid.layers.batch_norm(
|
||||
input=conv,
|
||||
self.bn = nn.BatchNorm(
|
||||
num_channels=out_channels,
|
||||
act=act,
|
||||
param_attr=ParamAttr(name=bn_name + '_scale'),
|
||||
bias_attr=ParamAttr(bn_name + '_offset'),
|
||||
moving_mean_name=bn_name + '_mean',
|
||||
moving_variance_name=bn_name + '_variance')
|
||||
param_attr=ParamAttr(name=bn_name + "_scale"),
|
||||
bias_attr=ParamAttr(name=bn_name + "_offset"),
|
||||
moving_mean_name=bn_name + "_mean",
|
||||
moving_variance_name=bn_name + "_variance")
|
||||
|
||||
def conv_bn_layer_new(self,
|
||||
input,
|
||||
num_filters,
|
||||
filter_size,
|
||||
stride=1,
|
||||
groups=1,
|
||||
act=None,
|
||||
name=None):
|
||||
pool = fluid.layers.pool2d(
|
||||
input=input,
|
||||
pool_size=2,
|
||||
pool_stride=2,
|
||||
pool_padding=0,
|
||||
pool_type='avg',
|
||||
ceil_mode=True)
|
||||
def __call__(self, x):
|
||||
x = self.conv(x)
|
||||
x = self.bn(x)
|
||||
return x
|
||||
|
||||
conv = fluid.layers.conv2d(
|
||||
input=pool,
|
||||
num_filters=num_filters,
|
||||
filter_size=filter_size,
|
||||
|
||||
class ConvBNLayerNew(nn.Layer):
|
||||
def __init__(self,
|
||||
in_channels,
|
||||
out_channels,
|
||||
kernel_size,
|
||||
stride=1,
|
||||
groups=1,
|
||||
act=None,
|
||||
name=None):
|
||||
super(ConvBNLayerNew, self).__init__()
|
||||
self.pool = nn.AvgPool2d(
|
||||
kernel_size=2, stride=2, padding=0, ceil_mode=True)
|
||||
|
||||
self.conv = nn.Conv2d(
|
||||
in_channels=in_channels,
|
||||
out_channels=out_channels,
|
||||
kernel_size=kernel_size,
|
||||
stride=1,
|
||||
padding=(filter_size - 1) // 2,
|
||||
padding=(kernel_size - 1) // 2,
|
||||
groups=groups,
|
||||
act=None,
|
||||
param_attr=ParamAttr(name=name + "_weights"),
|
||||
weight_attr=ParamAttr(name=name + "_weights"),
|
||||
bias_attr=False)
|
||||
if name == "conv1":
|
||||
bn_name = "bn_" + name
|
||||
else:
|
||||
bn_name = "bn" + name[3:]
|
||||
return fluid.layers.batch_norm(
|
||||
input=conv,
|
||||
self.bn = nn.BatchNorm(
|
||||
num_channels=out_channels,
|
||||
act=act,
|
||||
param_attr=ParamAttr(name=bn_name + '_scale'),
|
||||
bias_attr=ParamAttr(bn_name + '_offset'),
|
||||
moving_mean_name=bn_name + '_mean',
|
||||
moving_variance_name=bn_name + '_variance')
|
||||
param_attr=ParamAttr(name=bn_name + "_scale"),
|
||||
bias_attr=ParamAttr(name=bn_name + "_offset"),
|
||||
moving_mean_name=bn_name + "_mean",
|
||||
moving_variance_name=bn_name + "_variance")
|
||||
|
||||
def shortcut(self, input, ch_out, stride, name, if_first=False):
|
||||
ch_in = input.shape[1]
|
||||
if ch_in != ch_out or stride != 1:
|
||||
def __call__(self, x):
|
||||
x = self.pool(x)
|
||||
x = self.conv(x)
|
||||
x = self.bn(x)
|
||||
return x
|
||||
|
||||
|
||||
class ShortCut(nn.Layer):
|
||||
def __init__(self, in_channels, out_channels, stride, name, if_first=False):
|
||||
super(ShortCut, self).__init__()
|
||||
self.use_conv = True
|
||||
if in_channels != out_channels or stride != 1:
|
||||
if if_first:
|
||||
return self.conv_bn_layer(input, ch_out, 1, stride, name=name)
|
||||
self.conv = ConvBNLayer(
|
||||
in_channels, out_channels, 1, stride, name=name)
|
||||
else:
|
||||
return self.conv_bn_layer_new(
|
||||
input, ch_out, 1, stride, name=name)
|
||||
self.conv = ConvBNLayerNew(
|
||||
in_channels, out_channels, 1, stride, name=name)
|
||||
elif if_first:
|
||||
return self.conv_bn_layer(input, ch_out, 1, stride, name=name)
|
||||
self.conv = ConvBNLayer(
|
||||
in_channels, out_channels, 1, stride, name=name)
|
||||
else:
|
||||
return input
|
||||
self.use_conv = False
|
||||
|
||||
def bottleneck_block(self, input, num_filters, stride, name, if_first):
|
||||
conv0 = self.conv_bn_layer(
|
||||
input=input,
|
||||
num_filters=num_filters,
|
||||
filter_size=1,
|
||||
def forward(self, x):
|
||||
if self.use_conv:
|
||||
x = self.conv(x)
|
||||
return x
|
||||
|
||||
|
||||
class BottleneckBlock(nn.Layer):
|
||||
def __init__(self, in_channels, out_channels, stride, name, if_first):
|
||||
super(BottleneckBlock, self).__init__()
|
||||
self.conv0 = ConvBNLayer(
|
||||
in_channels=in_channels,
|
||||
out_channels=out_channels,
|
||||
kernel_size=1,
|
||||
act='relu',
|
||||
name=name + "_branch2a")
|
||||
conv1 = self.conv_bn_layer(
|
||||
input=conv0,
|
||||
num_filters=num_filters,
|
||||
filter_size=3,
|
||||
self.conv1 = ConvBNLayer(
|
||||
in_channels=out_channels,
|
||||
out_channels=out_channels,
|
||||
kernel_size=3,
|
||||
stride=stride,
|
||||
act='relu',
|
||||
name=name + "_branch2b")
|
||||
conv2 = self.conv_bn_layer(
|
||||
input=conv1,
|
||||
num_filters=num_filters * 4,
|
||||
filter_size=1,
|
||||
self.conv2 = ConvBNLayer(
|
||||
in_channels=out_channels,
|
||||
out_channels=out_channels * 4,
|
||||
kernel_size=1,
|
||||
act=None,
|
||||
name=name + "_branch2c")
|
||||
|
||||
short = self.shortcut(
|
||||
input,
|
||||
num_filters * 4,
|
||||
stride,
|
||||
self.short = ShortCut(
|
||||
in_channels=in_channels,
|
||||
out_channels=out_channels * 4,
|
||||
stride=stride,
|
||||
if_first=if_first,
|
||||
name=name + "_branch1")
|
||||
self.out_channels = out_channels * 4
|
||||
|
||||
return fluid.layers.elementwise_add(x=short, y=conv2, act='relu')
|
||||
def forward(self, x):
|
||||
y = self.conv0(x)
|
||||
y = self.conv1(y)
|
||||
y = self.conv2(y)
|
||||
y = y + self.short(x)
|
||||
y = F.relu(y)
|
||||
return y
|
||||
|
||||
def basic_block(self, input, num_filters, stride, name, if_first):
|
||||
conv0 = self.conv_bn_layer(
|
||||
input=input,
|
||||
num_filters=num_filters,
|
||||
filter_size=3,
|
||||
|
||||
class BasicBlock(nn.Layer):
|
||||
def __init__(self, in_channels, out_channels, stride, name, if_first):
|
||||
super(BasicBlock, self).__init__()
|
||||
self.conv0 = ConvBNLayer(
|
||||
in_channels=in_channels,
|
||||
out_channels=out_channels,
|
||||
kernel_size=3,
|
||||
act='relu',
|
||||
stride=stride,
|
||||
name=name + "_branch2a")
|
||||
conv1 = self.conv_bn_layer(
|
||||
input=conv0,
|
||||
num_filters=num_filters,
|
||||
filter_size=3,
|
||||
self.conv1 = ConvBNLayer(
|
||||
in_channels=out_channels,
|
||||
out_channels=out_channels,
|
||||
kernel_size=3,
|
||||
act=None,
|
||||
name=name + "_branch2b")
|
||||
short = self.shortcut(
|
||||
input,
|
||||
num_filters,
|
||||
stride,
|
||||
self.short = ShortCut(
|
||||
in_channels=in_channels,
|
||||
out_channels=out_channels,
|
||||
stride=stride,
|
||||
if_first=if_first,
|
||||
name=name + "_branch1")
|
||||
return fluid.layers.elementwise_add(x=short, y=conv1, act='relu')
|
||||
self.out_channels = out_channels
|
||||
|
||||
def forward(self, x):
|
||||
y = self.conv0(x)
|
||||
y = self.conv1(y)
|
||||
y = y + self.short(x)
|
||||
return F.relu(y)
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # Manual smoke test: push an all-zero batch through an 18-layer ResNet in
    # dygraph mode and print the shape of every value the network yields.
    import paddle

    paddle.disable_static()
    dummy = paddle.to_variable(paddle.zeros([1, 3, 640, 640]))
    print(dummy.shape)

    backbone = ResNet(layers=18)
    for feature_map in backbone(dummy):
        print(feature_map.shape)
|
||||
|
|
|
@ -1,274 +0,0 @@
|
|||
#copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
|
||||
#
|
||||
#Licensed under the Apache License, Version 2.0 (the "License");
|
||||
#you may not use this file except in compliance with the License.
|
||||
#You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
#Unless required by applicable law or agreed to in writing, software
|
||||
#distributed under the License is distributed on an "AS IS" BASIS,
|
||||
#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
#See the License for the specific language governing permissions and
|
||||
#limitations under the License.
|
||||
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import paddle.fluid as fluid
|
||||
from paddle.fluid.param_attr import ParamAttr
|
||||
|
||||
__all__ = ["ResNet"]
|
||||
|
||||
|
||||
class ResNet(object):
    """Static-graph ResNet backbone assembled from ``fluid.layers`` operators.

    Calling an instance on an input variable builds the network and returns a
    dict of intermediate feature maps keyed ``'block_0'`` (the raw input),
    ``'block_1'`` (stem output) and ``'block_<k + 2>'`` for residual stage
    ``k``.
    """

    # Number of residual blocks per stage for every supported depth.  The
    # 34/50 entry has a fifth stage; the others use only four.
    _DEPTHS = {
        18: [2, 2, 2, 2],
        34: [3, 4, 6, 3, 3],
        50: [3, 4, 6, 3, 3],
        101: [3, 4, 23, 3],
        152: [3, 8, 36, 3],
    }
    # Base filter count of each stage (indexed by stage number).
    _NUM_FILTERS = [64, 128, 256, 512, 512]

    def __init__(self, params):
        """
        the Resnet backbone network for detection module.
        Args:
            params(dict): the super parameters for network build
        """
        self.layers = params['layers']
        supported_layers = [18, 34, 50, 101, 152]
        assert self.layers in supported_layers, \
            "supported layers are {} but input layer is {}".format(supported_layers, self.layers)
        self.is_3x3 = True

    def __call__(self, input):
        """Build the backbone on ``input`` and return the feature-map dict."""
        layers = self.layers
        depth = self._DEPTHS[layers]
        num_filters = self._NUM_FILTERS

        blocks = {'block_0': input}

        if not self.is_3x3:
            # Single 7x7 stem.  It must be named: conv_bn_layer derives all
            # parameter names from ``name`` and fails on ``None``.  "conv1"
            # is the name the batch-norm naming rule special-cases.
            conv = self.conv_bn_layer(
                input=input,
                num_filters=64,
                filter_size=7,
                stride=2,
                act='relu',
                name='conv1')
        else:
            # Stem made of three stacked 3x3 convolutions.
            conv = input
            stem_cfg = (('conv1_1', 32, 2), ('conv1_2', 32, 1),
                        ('conv1_3', 64, 1))
            for stem_name, stem_filters, stem_stride in stem_cfg:
                conv = self.conv_bn_layer(
                    input=conv,
                    num_filters=stem_filters,
                    filter_size=3,
                    stride=stem_stride,
                    act='relu',
                    name=stem_name)
        blocks['block_1'] = conv

        conv = fluid.layers.pool2d(
            input=conv,
            pool_size=3,
            pool_stride=2,
            pool_padding=1,
            pool_type='max')

        # Depths >= 50 use bottleneck blocks, shallower nets use basic blocks.
        build_block = (self.bottleneck_block
                       if layers >= 50 else self.basic_block)
        for block, num_blocks in enumerate(depth):
            for i in range(num_blocks):
                conv = build_block(
                    input=conv,
                    num_filters=num_filters[block],
                    # Only the first block of stages after the first one
                    # downsamples.
                    stride=2 if i == 0 and block != 0 else 1,
                    if_first=block == i == 0,
                    name=self._block_name(layers, block, i))
            blocks['block_' + str(block + 2)] = conv
        return blocks

    @staticmethod
    def _block_name(layers, block, i):
        """Return the parameter-name prefix of block ``i`` in stage ``block``."""
        if layers in [101, 152, 200] and block == 2:
            # The long third stage of the deep variants is numbered
            # (a, b1, b2, ...) instead of lettered.
            if i == 0:
                return "res" + str(block + 2) + "a"
            return "res" + str(block + 2) + "b" + str(i)
        return "res" + str(block + 2) + chr(97 + i)

    @staticmethod
    def _bn_name(name):
        """Derive the batch-norm parameter prefix from a conv layer name."""
        if name == "conv1":
            return "bn_" + name
        # Otherwise replace the first three characters of the name with "bn".
        return "bn" + name[3:]

    def _conv_bn(self, input, num_filters, filter_size, stride, groups, act,
                 name):
        """Bias-free conv2d followed by batch norm carrying the activation."""
        conv = fluid.layers.conv2d(
            input=input,
            num_filters=num_filters,
            filter_size=filter_size,
            stride=stride,
            padding=(filter_size - 1) // 2,
            groups=groups,
            act=None,
            param_attr=ParamAttr(name=name + "_weights"),
            bias_attr=False)
        bn_name = self._bn_name(name)
        return fluid.layers.batch_norm(
            input=conv,
            act=act,
            param_attr=ParamAttr(name=bn_name + '_scale'),
            bias_attr=ParamAttr(name=bn_name + '_offset'),
            moving_mean_name=bn_name + '_mean',
            moving_variance_name=bn_name + '_variance')

    def conv_bn_layer(self,
                      input,
                      num_filters,
                      filter_size,
                      stride=1,
                      groups=1,
                      act=None,
                      name=None):
        """Convolution + batch norm.

        Args:
            input: variable to convolve.
            num_filters: number of output channels.
            filter_size: square kernel size; padding is
                ``(filter_size - 1) // 2``.
            stride: convolution stride.
            groups: convolution groups.
            act: activation applied by the batch-norm op, or None.
            name: required prefix for all parameter names.
        """
        return self._conv_bn(input, num_filters, filter_size, stride, groups,
                             act, name)

    def conv_bn_layer_new(self,
                          input,
                          num_filters,
                          filter_size,
                          stride=1,
                          groups=1,
                          act=None,
                          name=None):
        """2x2 average pool followed by a stride-1 convolution + batch norm.

        ``stride`` is accepted for signature parity with ``conv_bn_layer``
        but is not used: the pool always has stride 2 and the convolution
        always has stride 1.
        """
        pool = fluid.layers.pool2d(
            input=input,
            pool_size=2,
            pool_stride=2,
            pool_padding=0,
            pool_type='avg',
            ceil_mode=True)
        return self._conv_bn(pool, num_filters, filter_size, 1, groups, act,
                             name)

    def shortcut(self, input, ch_out, stride, name, if_first=False):
        """Return the residual-branch partner for ``input``.

        A 1x1 projection is used when the channel count or stride changes, or
        when ``if_first`` is set; otherwise ``input`` is returned unchanged.
        """
        ch_in = input.shape[1]
        if ch_in != ch_out or stride != 1:
            if if_first:
                return self.conv_bn_layer(input, ch_out, 1, stride, name=name)
            return self.conv_bn_layer_new(
                input, ch_out, 1, stride, name=name)
        if if_first:
            return self.conv_bn_layer(input, ch_out, 1, stride, name=name)
        return input

    def bottleneck_block(self, input, num_filters, stride, name, if_first):
        """1x1 -> 3x3 -> 1x1 residual block; output has 4 * num_filters channels."""
        conv0 = self.conv_bn_layer(
            input=input,
            num_filters=num_filters,
            filter_size=1,
            act='relu',
            name=name + "_branch2a")
        conv1 = self.conv_bn_layer(
            input=conv0,
            num_filters=num_filters,
            filter_size=3,
            stride=stride,
            act='relu',
            name=name + "_branch2b")
        conv2 = self.conv_bn_layer(
            input=conv1,
            num_filters=num_filters * 4,
            filter_size=1,
            act=None,
            name=name + "_branch2c")

        short = self.shortcut(
            input,
            num_filters * 4,
            stride,
            if_first=if_first,
            name=name + "_branch1")

        return fluid.layers.elementwise_add(x=short, y=conv2, act='relu')

    def basic_block(self, input, num_filters, stride, name, if_first):
        """Two 3x3 convolutions plus shortcut; the first conv carries the stride."""
        conv0 = self.conv_bn_layer(
            input=input,
            num_filters=num_filters,
            filter_size=3,
            act='relu',
            stride=stride,
            name=name + "_branch2a")
        conv1 = self.conv_bn_layer(
            input=conv0,
            num_filters=num_filters,
            filter_size=3,
            act=None,
            name=name + "_branch2b")
        short = self.shortcut(
            input,
            num_filters,
            stride,
            if_first=if_first,
            name=name + "_branch1")
        return fluid.layers.elementwise_add(x=short, y=conv1, act='relu')
|
|
@ -1,53 +1,49 @@
|
|||
#copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
|
||||
# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
|
||||
#
|
||||
#Licensed under the Apache License, Version 2.0 (the "License");
|
||||
#you may not use this file except in compliance with the License.
|
||||
#You may obtain a copy of the License at
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
#Unless required by applicable law or agreed to in writing, software
|
||||
#distributed under the License is distributed on an "AS IS" BASIS,
|
||||
#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
#See the License for the specific language governing permissions and
|
||||
#limitations under the License.
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
from paddle import nn
|
||||
|
||||
import paddle.fluid as fluid
|
||||
from paddle.fluid.initializer import MSRA
|
||||
from paddle.fluid.param_attr import ParamAttr
|
||||
from ppocr.modeling.backbones.det_mobilenet_v3 import ResidualUnit, ConvBNLayer, make_divisible
|
||||
|
||||
__all__ = [
|
||||
'MobileNetV3', 'MobileNetV3_small_x0_35', 'MobileNetV3_small_x0_5',
|
||||
'MobileNetV3_small_x0_75', 'MobileNetV3_small_x1_0',
|
||||
'MobileNetV3_small_x1_25', 'MobileNetV3_large_x0_35',
|
||||
'MobileNetV3_large_x0_5', 'MobileNetV3_large_x0_75',
|
||||
'MobileNetV3_large_x1_0', 'MobileNetV3_large_x1_25'
|
||||
]
|
||||
__all__ = ['MobileNetV3']
|
||||
|
||||
|
||||
class MobileNetV3():
|
||||
def __init__(self, params):
|
||||
self.scale = params.get("scale", 0.5)
|
||||
model_name = params.get("model_name", "small")
|
||||
large_stride = params.get("large_stride", [1, 2, 2, 2])
|
||||
small_stride = params.get("small_stride", [2, 2, 2, 2])
|
||||
class MobileNetV3(nn.Layer):
|
||||
def __init__(self,
|
||||
in_channels=3,
|
||||
model_name='small',
|
||||
scale=0.5,
|
||||
large_stride=None,
|
||||
small_stride=None,
|
||||
**kwargs):
|
||||
super(MobileNetV3, self).__init__()
|
||||
if small_stride is None:
|
||||
small_stride = [2, 2, 2, 2]
|
||||
if large_stride is None:
|
||||
large_stride = [1, 2, 2, 2]
|
||||
|
||||
assert isinstance(large_stride, list), "large_stride type must " \
|
||||
"be list but got {}".format(type(large_stride))
|
||||
"be list but got {}".format(type(large_stride))
|
||||
assert isinstance(small_stride, list), "small_stride type must " \
|
||||
"be list but got {}".format(type(small_stride))
|
||||
"be list but got {}".format(type(small_stride))
|
||||
assert len(large_stride) == 4, "large_stride length must be " \
|
||||
"4 but got {}".format(len(large_stride))
|
||||
"4 but got {}".format(len(large_stride))
|
||||
assert len(small_stride) == 4, "small_stride length must be " \
|
||||
"4 but got {}".format(len(small_stride))
|
||||
"4 but got {}".format(len(small_stride))
|
||||
|
||||
self.inplanes = 16
|
||||
if model_name == "large":
|
||||
self.cfg = [
|
||||
cfg = [
|
||||
# k, exp, c, se, nl, s,
|
||||
[3, 16, 16, False, 'relu', large_stride[0]],
|
||||
[3, 64, 24, False, 'relu', (large_stride[1], 1)],
|
||||
|
@ -65,10 +61,9 @@ class MobileNetV3():
|
|||
[5, 960, 160, True, 'hard_swish', 1],
|
||||
[5, 960, 160, True, 'hard_swish', 1],
|
||||
]
|
||||
self.cls_ch_squeeze = 960
|
||||
self.cls_ch_expand = 1280
|
||||
cls_ch_squeeze = 960
|
||||
elif model_name == "small":
|
||||
self.cfg = [
|
||||
cfg = [
|
||||
# k, exp, c, se, nl, s,
|
||||
[3, 16, 16, True, 'relu', (small_stride[0], 1)],
|
||||
[3, 72, 24, False, 'relu', (small_stride[1], 1)],
|
||||
|
@ -82,186 +77,72 @@ class MobileNetV3():
|
|||
[5, 576, 96, True, 'hard_swish', 1],
|
||||
[5, 576, 96, True, 'hard_swish', 1],
|
||||
]
|
||||
self.cls_ch_squeeze = 576
|
||||
self.cls_ch_expand = 1280
|
||||
cls_ch_squeeze = 576
|
||||
else:
|
||||
raise NotImplementedError("mode[" + model_name +
|
||||
"_model] is not implemented!")
|
||||
|
||||
supported_scale = [0.35, 0.5, 0.75, 1.0, 1.25]
|
||||
assert self.scale in supported_scale, \
|
||||
"supported scales are {} but input scale is {}".format(supported_scale, self.scale)
|
||||
assert scale in supported_scale, \
|
||||
"supported scales are {} but input scale is {}".format(supported_scale, scale)
|
||||
|
||||
def __call__(self, input):
|
||||
scale = self.scale
|
||||
inplanes = self.inplanes
|
||||
cfg = self.cfg
|
||||
cls_ch_squeeze = self.cls_ch_squeeze
|
||||
cls_ch_expand = self.cls_ch_expand
|
||||
#conv1
|
||||
conv = self.conv_bn_layer(
|
||||
input,
|
||||
filter_size=3,
|
||||
num_filters=self.make_divisible(inplanes * scale),
|
||||
inplanes = 16
|
||||
# conv1
|
||||
self.conv1 = ConvBNLayer(
|
||||
in_channels=in_channels,
|
||||
out_channels=make_divisible(inplanes * scale),
|
||||
kernel_size=3,
|
||||
stride=2,
|
||||
padding=1,
|
||||
num_groups=1,
|
||||
groups=1,
|
||||
if_act=True,
|
||||
act='hard_swish',
|
||||
name='conv1')
|
||||
i = 0
|
||||
inplanes = self.make_divisible(inplanes * scale)
|
||||
for layer_cfg in cfg:
|
||||
conv = self.residual_unit(
|
||||
input=conv,
|
||||
num_in_filter=inplanes,
|
||||
num_mid_filter=self.make_divisible(scale * layer_cfg[1]),
|
||||
num_out_filter=self.make_divisible(scale * layer_cfg[2]),
|
||||
act=layer_cfg[4],
|
||||
stride=layer_cfg[5],
|
||||
filter_size=layer_cfg[0],
|
||||
use_se=layer_cfg[3],
|
||||
name='conv' + str(i + 2))
|
||||
inplanes = self.make_divisible(scale * layer_cfg[2])
|
||||
block_list = []
|
||||
inplanes = make_divisible(inplanes * scale)
|
||||
for (k, exp, c, se, nl, s) in cfg:
|
||||
block_list.append(
|
||||
ResidualUnit(
|
||||
in_channels=inplanes,
|
||||
mid_channels=make_divisible(scale * exp),
|
||||
out_channels=make_divisible(scale * c),
|
||||
kernel_size=k,
|
||||
stride=s,
|
||||
use_se=se,
|
||||
act=nl,
|
||||
name='conv' + str(i + 2)))
|
||||
inplanes = make_divisible(scale * c)
|
||||
i += 1
|
||||
self.blocks = nn.Sequential(*block_list)
|
||||
|
||||
conv = self.conv_bn_layer(
|
||||
input=conv,
|
||||
filter_size=1,
|
||||
num_filters=self.make_divisible(scale * cls_ch_squeeze),
|
||||
self.conv2 = ConvBNLayer(
|
||||
in_channels=inplanes,
|
||||
out_channels=make_divisible(scale * cls_ch_squeeze),
|
||||
kernel_size=1,
|
||||
stride=1,
|
||||
padding=0,
|
||||
num_groups=1,
|
||||
groups=1,
|
||||
if_act=True,
|
||||
act='hard_swish',
|
||||
name='conv_last')
|
||||
|
||||
conv = fluid.layers.pool2d(
|
||||
input=conv,
|
||||
pool_size=2,
|
||||
pool_stride=2,
|
||||
pool_padding=0,
|
||||
pool_type='max')
|
||||
return conv
|
||||
self.pool = nn.MaxPool2d(kernel_size=2, stride=2, padding=0)
|
||||
self.out_channels = make_divisible(scale * cls_ch_squeeze)
|
||||
|
||||
def conv_bn_layer(self,
                  input,
                  filter_size,
                  num_filters,
                  stride,
                  padding,
                  num_groups=1,
                  if_act=True,
                  act=None,
                  name=None,
                  use_cudnn=True,
                  res_last_bn_init=False):
    """Bias-free conv2d + batch norm, optionally followed by an activation.

    Parameters are named ``<name>_weights`` for the convolution and
    ``<name>_bn_{scale,offset,mean,variance}`` for the batch norm.  When
    ``if_act`` is true, ``act`` selects ``'relu'`` or ``'hard_swish'``; any
    other value leaves the batch-norm output untouched.
    ``res_last_bn_init`` is accepted but not read by this method.
    """
    conv_out = fluid.layers.conv2d(
        input=input,
        num_filters=num_filters,
        filter_size=filter_size,
        stride=stride,
        padding=padding,
        groups=num_groups,
        act=None,
        use_cudnn=use_cudnn,
        param_attr=ParamAttr(name=name + '_weights'),
        bias_attr=False)

    prefix = name + '_bn'
    # Scale and offset each get their own zero-coefficient L2 regularizer.
    scale_attr = ParamAttr(
        name=prefix + "_scale",
        regularizer=fluid.regularizer.L2DecayRegularizer(
            regularization_coeff=0.0))
    offset_attr = ParamAttr(
        name=prefix + "_offset",
        regularizer=fluid.regularizer.L2DecayRegularizer(
            regularization_coeff=0.0))
    out = fluid.layers.batch_norm(
        input=conv_out,
        param_attr=scale_attr,
        bias_attr=offset_attr,
        moving_mean_name=prefix + '_mean',
        moving_variance_name=prefix + '_variance')

    if not if_act:
        return out
    if act == 'relu':
        return fluid.layers.relu(out)
    if act == 'hard_swish':
        return fluid.layers.hard_swish(out)
    return out
|
||||
def forward(self, x):
    """Run ``x`` through conv1, blocks, conv2 and pool, in that order."""
    for stage in (self.conv1, self.blocks, self.conv2, self.pool):
        x = stage(x)
    return x
|
||||
|
||||
def make_divisible(self, v, divisor=8, min_value=None):
    """Round ``v`` to a multiple of ``divisor``, never below ``min_value``.

    ``min_value`` defaults to ``divisor``.  If the rounded result is less
    than 90% of ``v``, one more ``divisor`` is added.
    """
    floor = divisor if min_value is None else min_value
    rounded = int(v + divisor / 2) // divisor * divisor
    result = rounded if rounded > floor else floor
    if result < 0.9 * v:
        result += divisor
    return result
|
||||
|
||||
def se_block(self, input, num_out_filter, ratio=4, name=None):
    """Squeeze-and-excitation gate applied to ``input``.

    Globally average-pools ``input``, passes it through a 1x1 conv with
    ``num_out_filter // ratio`` channels (relu) and a 1x1 conv with
    ``num_out_filter`` channels (hard_sigmoid), then multiplies ``input`` by
    the result.  Parameters are named ``<name>_{1,2}_{weights,offset}``.
    """

    def pointwise(x, channels, activation, tag):
        # 1x1 convolution with named weight and bias parameters.
        return fluid.layers.conv2d(
            input=x,
            filter_size=1,
            num_filters=channels,
            act=activation,
            param_attr=ParamAttr(name=name + tag + '_weights'),
            bias_attr=ParamAttr(name=name + tag + '_offset'))

    squeezed = fluid.layers.pool2d(
        input=input, pool_type='avg', global_pooling=True, use_cudnn=False)
    reduced = pointwise(squeezed, num_out_filter // ratio, 'relu', '_1')
    gate = pointwise(reduced, num_out_filter, 'hard_sigmoid', '_2')
    return fluid.layers.elementwise_mul(x=input, y=gate, axis=0)
|
||||
|
||||
def residual_unit(self,
                  input,
                  num_in_filter,
                  num_mid_filter,
                  num_out_filter,
                  stride,
                  filter_size,
                  act=None,
                  use_se=False,
                  name=None):
    """Expand (1x1) -> depthwise (k x k) -> optional SE -> linear (1x1).

    The input is added back to the output only when ``num_in_filter`` equals
    ``num_out_filter`` and ``stride == 1``.
    """
    expanded = self.conv_bn_layer(
        input=input,
        filter_size=1,
        num_filters=num_mid_filter,
        stride=1,
        padding=0,
        if_act=True,
        act=act,
        name=name + '_expand')

    # Depthwise: one group per channel, cuDNN disabled.
    features = self.conv_bn_layer(
        input=expanded,
        filter_size=filter_size,
        num_filters=num_mid_filter,
        stride=stride,
        padding=int((filter_size - 1) // 2),
        if_act=True,
        act=act,
        num_groups=num_mid_filter,
        use_cudnn=False,
        name=name + '_depthwise')
    if use_se:
        features = self.se_block(
            input=features, num_out_filter=num_mid_filter, name=name + '_se')

    projected = self.conv_bn_layer(
        input=features,
        filter_size=1,
        num_filters=num_out_filter,
        stride=1,
        padding=0,
        if_act=False,
        name=name + '_linear',
        res_last_bn_init=True)

    if num_in_filter == num_out_filter and stride == 1:
        return fluid.layers.elementwise_add(x=input, y=projected, act=None)
    return projected
|
||||
if __name__ == '__main__':
    # Manual smoke test: run an all-zero batch through the "small" model in
    # dygraph mode and print the output shape.
    import paddle

    paddle.disable_static()
    dummy = paddle.to_variable(paddle.zeros((1, 3, 32, 320)))
    model = MobileNetV3(model_name='small', small_stride=[1, 2, 2, 2])
    print(model(dummy).shape)
|
||||
|
|
|
@ -1,246 +0,0 @@
|
|||
#copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
|
||||
#
|
||||
#Licensed under the Apache License, Version 2.0 (the "License");
|
||||
#you may not use this file except in compliance with the License.
|
||||
#You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
#Unless required by applicable law or agreed to in writing, software
|
||||
#distributed under the License is distributed on an "AS IS" BASIS,
|
||||
#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
#See the License for the specific language governing permissions and
|
||||
#limitations under the License.
|
||||
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import math
|
||||
|
||||
import paddle
|
||||
import paddle.fluid as fluid
|
||||
from paddle.fluid.param_attr import ParamAttr
|
||||
|
||||
# Only ``ResNet`` is defined in this module.  Listing names that do not exist
# (ResNet18, ResNet34, ...) makes ``from <module> import *`` raise
# AttributeError, so the export list is limited to what is actually here.
__all__ = ["ResNet"]

# Module-wide switch passed as ``trainable`` to the parameter attributes below
# and to the conv / batch-norm parameters created by ResNet.conv_bn_layer.
Trainable = True
# Shared attribute for the layers that are created without explicit names.
w_nolr = fluid.ParamAttr(trainable=Trainable)
# Stored on every ResNet instance as ``self.params``.
train_parameters = {
    "input_size": [3, 224, 224],
    "input_mean": [0.485, 0.456, 0.406],
    "input_std": [0.229, 0.224, 0.225],
    "learning_strategy": {
        "name": "piecewise_decay",
        "batch_size": 256,
        "epochs": [30, 60, 90],
        "steps": [0.1, 0.01, 0.001, 0.0001]
    }
}
|
||||
|
||||
|
||||
class ResNet():
    """Static-graph ResNet backbone with a small top-down fusion head.

    Calling an instance builds the residual stages with ``fluid.layers`` ops,
    then merges the last collected feature map with the two before it and
    returns a single 512-channel feature map.
    """

    def __init__(self, params):
        """Read the network depth from ``params['layers']``."""
        self.layers = params['layers']
        self.params = train_parameters

    def __call__(self, input):
        """Build the network on ``input`` and return the fused feature map."""
        layers = self.layers
        supported_layers = [18, 34, 50, 101, 152]
        assert layers in supported_layers, \
            "supported layers are {} but input layer is {}".format(supported_layers, layers)

        if layers == 18:
            depth = [2, 2, 2, 2]
        elif layers == 34 or layers == 50:
            depth = [3, 4, 6, 3]
        elif layers == 101:
            depth = [3, 4, 23, 3]
        elif layers == 152:
            depth = [3, 8, 36, 3]
        stride_list = [(2, 2), (2, 2), (1, 1), (1, 1)]
        num_filters = [64, 128, 256, 512]

        conv = self.conv_bn_layer(
            input=input,
            num_filters=64,
            filter_size=7,
            stride=2,
            act='relu',
            name="conv1")

        # NOTE(review): one feature map is collected per stage (after its last
        # block), matching how F[-1], F[-2], F[-3] are consumed below -
        # confirm against the original file's indentation.
        F = []
        if layers >= 50:
            for block in range(len(depth)):
                for i in range(depth[block]):
                    if layers in [101, 152] and block == 2:
                        if i == 0:
                            conv_name = "res" + str(block + 2) + "a"
                        else:
                            conv_name = "res" + str(block + 2) + "b" + str(i)
                    else:
                        conv_name = "res" + str(block + 2) + chr(97 + i)
                    conv = self.bottleneck_block(
                        input=conv,
                        num_filters=num_filters[block],
                        stride=stride_list[block] if i == 0 else 1,
                        name=conv_name)
                F.append(conv)
        else:
            for block in range(len(depth)):
                for i in range(depth[block]):
                    conv_name = "res" + str(block + 2) + chr(97 + i)

                    if i == 0 and block != 0:
                        stride = (2, 1)
                    else:
                        stride = (1, 1)

                    # basic_block's parameter is ``is_first``; the former
                    # ``if_first=`` keyword raised TypeError for layers 18/34.
                    conv = self.basic_block(
                        input=conv,
                        num_filters=num_filters[block],
                        stride=stride,
                        is_first=block == i == 0,
                        name=conv_name)
                F.append(conv)

        base = F[-1]
        for i in [-2, -3]:
            # Unpacking also requires the feature map to be rank 4.
            _, c, dim2, dim3 = F[i].shape
            if (dim2, dim3) != base.shape[2:]:
                # Spatial sizes differ: upsample ``base`` by 2 before fusing.
                base = fluid.layers.conv2d_transpose(
                    input=base,
                    num_filters=c,
                    filter_size=4,
                    stride=2,
                    padding=1,
                    act=None,
                    param_attr=w_nolr,
                    bias_attr=w_nolr)
                base = fluid.layers.batch_norm(
                    base, act="relu", param_attr=w_nolr, bias_attr=w_nolr)
            base = fluid.layers.concat([base, F[i]], axis=1)
            base = fluid.layers.conv2d(
                base,
                num_filters=c,
                filter_size=1,
                param_attr=w_nolr,
                bias_attr=w_nolr)
            base = fluid.layers.conv2d(
                base,
                num_filters=c,
                filter_size=3,
                padding=1,
                param_attr=w_nolr,
                bias_attr=w_nolr)
            base = fluid.layers.batch_norm(
                base, act="relu", param_attr=w_nolr, bias_attr=w_nolr)

        base = fluid.layers.conv2d(
            base,
            num_filters=512,
            filter_size=1,
            bias_attr=w_nolr,
            param_attr=w_nolr)

        return base

    def conv_bn_layer(self,
                      input,
                      num_filters,
                      filter_size,
                      stride=1,
                      groups=1,
                      act=None,
                      name=None):
        """Bias-free conv2d + batch norm with parameters named after ``name``.

        When ``stride`` is exactly the tuple ``(1, 1)`` the convolution uses a
        2x2 kernel with dilation 2 instead of ``filter_size``; padding is
        always computed from ``filter_size``.
        """
        dilated = stride == (1, 1)
        conv = fluid.layers.conv2d(
            input=input,
            num_filters=num_filters,
            filter_size=2 if dilated else filter_size,
            dilation=2 if dilated else 1,
            stride=stride,
            padding=(filter_size - 1) // 2,
            groups=groups,
            act=None,
            param_attr=ParamAttr(
                name=name + "_weights", trainable=Trainable),
            bias_attr=False,
            name=name + '.conv2d.output.1')

        if name == "conv1":
            bn_name = "bn_" + name
        else:
            # Replace the first three characters of the name with "bn".
            bn_name = "bn" + name[3:]
        return fluid.layers.batch_norm(
            input=conv,
            act=act,
            name=bn_name + '.output.1',
            param_attr=ParamAttr(
                name=bn_name + '_scale', trainable=Trainable),
            bias_attr=ParamAttr(
                name=bn_name + '_offset', trainable=Trainable),
            moving_mean_name=bn_name + '_mean',
            moving_variance_name=bn_name + '_variance')

    def shortcut(self, input, ch_out, stride, is_first, name):
        """Return ``input`` unchanged or projected by a 1x1 conv + batch norm.

        The projection is used when the channel count changes, when ``stride``
        is not equal to the int ``1`` (any tuple stride qualifies, including
        ``(1, 1)``), or when ``is_first`` is set.
        """
        ch_in = input.shape[1]
        if ch_in != ch_out or stride != 1 or is_first:
            if stride == (1, 1):
                # Pass an int stride so conv_bn_layer does not switch to its
                # dilated 2x2 kernel for this 1x1 projection.
                return self.conv_bn_layer(input, ch_out, 1, 1, name=name)
            return self.conv_bn_layer(input, ch_out, 1, stride, name=name)
        return input

    def bottleneck_block(self, input, num_filters, stride, name):
        """1x1 -> 3x3 -> 1x1 residual block; output has 4 * num_filters channels."""
        conv0 = self.conv_bn_layer(
            input=input,
            num_filters=num_filters,
            filter_size=1,
            act='relu',
            name=name + "_branch2a")
        conv1 = self.conv_bn_layer(
            input=conv0,
            num_filters=num_filters,
            filter_size=3,
            stride=stride,
            act='relu',
            name=name + "_branch2b")
        conv2 = self.conv_bn_layer(
            input=conv1,
            num_filters=num_filters * 4,
            filter_size=1,
            act=None,
            name=name + "_branch2c")

        short = self.shortcut(
            input,
            num_filters * 4,
            stride,
            is_first=False,
            name=name + "_branch1")

        return fluid.layers.elementwise_add(
            x=short, y=conv2, act='relu', name=name + ".add.output.5")

    def basic_block(self, input, num_filters, stride, is_first, name):
        """Two 3x3 convolutions plus shortcut; the first conv carries the stride."""
        conv0 = self.conv_bn_layer(
            input=input,
            num_filters=num_filters,
            filter_size=3,
            act='relu',
            stride=stride,
            name=name + "_branch2a")
        conv1 = self.conv_bn_layer(
            input=conv0,
            num_filters=num_filters,
            filter_size=3,
            act=None,
            name=name + "_branch2b")
        short = self.shortcut(
            input, num_filters, stride, is_first, name=name + "_branch1")
        return fluid.layers.elementwise_add(x=short, y=conv1, act='relu')
|
|
@ -1,271 +1,312 @@
|
|||
#copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
|
||||
# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
|
||||
#
|
||||
#Licensed under the Apache License, Version 2.0 (the "License");
|
||||
#you may not use this file except in compliance with the License.
|
||||
#You may obtain a copy of the License at
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
#Unless required by applicable law or agreed to in writing, software
|
||||
#distributed under the License is distributed on an "AS IS" BASIS,
|
||||
#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
#See the License for the specific language governing permissions and
|
||||
#limitations under the License.
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import math
|
||||
from paddle import nn, ParamAttr
|
||||
from paddle.nn import functional as F
|
||||
|
||||
import paddle
|
||||
import paddle.fluid as fluid
|
||||
from paddle.fluid.param_attr import ParamAttr
|
||||
|
||||
__all__ = [
|
||||
"ResNet", "ResNet18_vd", "ResNet34_vd", "ResNet50_vd", "ResNet101_vd",
|
||||
"ResNet152_vd", "ResNet200_vd"
|
||||
]
|
||||
__all__ = ["ResNet"]
|
||||
|
||||
|
||||
class ResNet():
|
||||
def __init__(self, params):
|
||||
self.layers = params['layers']
|
||||
self.is_3x3 = True
|
||||
supported_layers = [18, 34, 50, 101, 152, 200]
|
||||
assert self.layers in supported_layers, \
|
||||
"supported layers are {} but input layer is {}".format(supported_layers, self.layers)
|
||||
class ResNet(nn.Layer):
|
||||
def __init__(self, in_channels=3, layers=34):
|
||||
super(ResNet, self).__init__()
|
||||
supported_layers = {
|
||||
18: {
|
||||
'depth': [2, 2, 2, 2],
|
||||
'block_class': BasicBlock
|
||||
},
|
||||
34: {
|
||||
'depth': [3, 4, 6, 3],
|
||||
'block_class': BasicBlock
|
||||
},
|
||||
50: {
|
||||
'depth': [3, 4, 6, 3],
|
||||
'block_class': BottleneckBlock
|
||||
},
|
||||
101: {
|
||||
'depth': [3, 4, 23, 3],
|
||||
'block_class': BottleneckBlock
|
||||
},
|
||||
152: {
|
||||
'depth': [3, 8, 36, 3],
|
||||
'block_class': BottleneckBlock
|
||||
},
|
||||
200: {
|
||||
'depth': [3, 12, 48, 3],
|
||||
'block_class': BottleneckBlock
|
||||
}
|
||||
}
|
||||
assert layers in supported_layers, \
|
||||
"supported layers are {} but input layer is {}".format(supported_layers.keys(), layers)
|
||||
is_3x3 = True
|
||||
|
||||
def __call__(self, input):
|
||||
is_3x3 = self.is_3x3
|
||||
layers = self.layers
|
||||
|
||||
if layers == 18:
|
||||
depth = [2, 2, 2, 2]
|
||||
elif layers == 34 or layers == 50:
|
||||
depth = [3, 4, 6, 3]
|
||||
elif layers == 101:
|
||||
depth = [3, 4, 23, 3]
|
||||
elif layers == 152:
|
||||
depth = [3, 8, 36, 3]
|
||||
elif layers == 200:
|
||||
depth = [3, 12, 48, 3]
|
||||
num_filters = [64, 128, 256, 512]
|
||||
depth = supported_layers[layers]['depth']
|
||||
block_class = supported_layers[layers]['block_class']
|
||||
conv = []
|
||||
if is_3x3 == False:
|
||||
conv = self.conv_bn_layer(
|
||||
input=input,
|
||||
num_filters=64,
|
||||
filter_size=7,
|
||||
stride=1,
|
||||
act='relu')
|
||||
conv.append(
|
||||
ConvBNLayer(
|
||||
in_channels=in_channels,
|
||||
out_channels=64,
|
||||
kernel_size=7,
|
||||
stride=1,
|
||||
act='relu'))
|
||||
else:
|
||||
conv = self.conv_bn_layer(
|
||||
input=input,
|
||||
num_filters=32,
|
||||
filter_size=3,
|
||||
stride=1,
|
||||
act='relu',
|
||||
name='conv1_1')
|
||||
conv = self.conv_bn_layer(
|
||||
input=conv,
|
||||
num_filters=32,
|
||||
filter_size=3,
|
||||
stride=1,
|
||||
act='relu',
|
||||
name='conv1_2')
|
||||
conv = self.conv_bn_layer(
|
||||
input=conv,
|
||||
num_filters=64,
|
||||
filter_size=3,
|
||||
stride=1,
|
||||
act='relu',
|
||||
name='conv1_3')
|
||||
conv.append(
|
||||
ConvBNLayer(
|
||||
in_channels=in_channels,
|
||||
out_channels=32,
|
||||
kernel_size=3,
|
||||
stride=1,
|
||||
act='relu',
|
||||
name='conv1_1'))
|
||||
conv.append(
|
||||
ConvBNLayer(
|
||||
in_channels=32,
|
||||
out_channels=32,
|
||||
kernel_size=3,
|
||||
stride=1,
|
||||
act='relu',
|
||||
name='conv1_2'))
|
||||
conv.append(
|
||||
ConvBNLayer(
|
||||
in_channels=32,
|
||||
out_channels=64,
|
||||
kernel_size=3,
|
||||
stride=1,
|
||||
act='relu',
|
||||
name='conv1_3'))
|
||||
self.conv1 = nn.Sequential(*conv)
|
||||
|
||||
conv = fluid.layers.pool2d(
|
||||
input=conv,
|
||||
pool_size=3,
|
||||
pool_stride=2,
|
||||
pool_padding=1,
|
||||
pool_type='max')
|
||||
self.pool = nn.MaxPool2d(
|
||||
kernel_size=3,
|
||||
stride=2,
|
||||
padding=1, )
|
||||
|
||||
if layers >= 50:
|
||||
for block in range(len(depth)):
|
||||
for i in range(depth[block]):
|
||||
if layers in [101, 152, 200] and block == 2:
|
||||
block_list = []
|
||||
in_ch = 64
|
||||
for block_index in range(len(depth)):
|
||||
for i in range(depth[block_index]):
|
||||
if layers >= 50:
|
||||
if layers in [101, 152, 200] and block_index == 2:
|
||||
if i == 0:
|
||||
conv_name = "res" + str(block + 2) + "a"
|
||||
conv_name = "res" + str(block_index + 2) + "a"
|
||||
else:
|
||||
conv_name = "res" + str(block + 2) + "b" + str(i)
|
||||
conv_name = "res" + str(block_index +
|
||||
2) + "b" + str(i)
|
||||
else:
|
||||
conv_name = "res" + str(block + 2) + chr(97 + i)
|
||||
|
||||
if i == 0 and block != 0:
|
||||
stride = (2, 1)
|
||||
else:
|
||||
stride = (1, 1)
|
||||
|
||||
conv = self.bottleneck_block(
|
||||
input=conv,
|
||||
num_filters=num_filters[block],
|
||||
conv_name = "res" + str(block_index + 2) + chr(97 + i)
|
||||
else:
|
||||
conv_name = "res" + str(block_index + 2) + chr(97 + i)
|
||||
if i == 0 and block_index != 0:
|
||||
stride = (2, 1)
|
||||
else:
|
||||
stride = (1, 1)
|
||||
block_list.append(
|
||||
block_class(
|
||||
in_channels=in_ch,
|
||||
out_channels=num_filters[block_index],
|
||||
stride=stride,
|
||||
if_first=block == i == 0,
|
||||
name=conv_name)
|
||||
else:
|
||||
for block in range(len(depth)):
|
||||
for i in range(depth[block]):
|
||||
conv_name = "res" + str(block + 2) + chr(97 + i)
|
||||
if_first=block_index == i == 0,
|
||||
name=conv_name))
|
||||
in_ch = block_list[-1].out_channels
|
||||
self.block_list = nn.Sequential(*block_list)
|
||||
self.add_sublayer(sublayer=self.block_list, name="block_list")
|
||||
self.pool_out = nn.MaxPool2d(kernel_size=2, stride=2, padding=0)
|
||||
self.out_channels = in_ch
|
||||
|
||||
if i == 0 and block != 0:
|
||||
stride = (2, 1)
|
||||
else:
|
||||
stride = (1, 1)
|
||||
def forward(self, x):
|
||||
x = self.conv1(x)
|
||||
x = self.pool(x)
|
||||
x = self.block_list(x)
|
||||
x = self.pool_out(x)
|
||||
return x
|
||||
|
||||
conv = self.basic_block(
|
||||
input=conv,
|
||||
num_filters=num_filters[block],
|
||||
stride=stride,
|
||||
if_first=block == i == 0,
|
||||
name=conv_name)
|
||||
|
||||
conv = fluid.layers.pool2d(
|
||||
input=conv,
|
||||
pool_size=2,
|
||||
pool_stride=2,
|
||||
pool_padding=0,
|
||||
pool_type='max')
|
||||
|
||||
return conv
|
||||
|
||||
def conv_bn_layer(self,
|
||||
input,
|
||||
num_filters,
|
||||
filter_size,
|
||||
stride=1,
|
||||
groups=1,
|
||||
act=None,
|
||||
name=None):
|
||||
conv = fluid.layers.conv2d(
|
||||
input=input,
|
||||
num_filters=num_filters,
|
||||
filter_size=filter_size,
|
||||
class ConvBNLayer(nn.Layer):
|
||||
def __init__(self,
|
||||
in_channels,
|
||||
out_channels,
|
||||
kernel_size,
|
||||
stride=1,
|
||||
groups=1,
|
||||
act=None,
|
||||
name=None):
|
||||
super(ConvBNLayer, self).__init__()
|
||||
self.conv = nn.Conv2d(
|
||||
in_channels=in_channels,
|
||||
out_channels=out_channels,
|
||||
kernel_size=kernel_size,
|
||||
stride=stride,
|
||||
padding=(filter_size - 1) // 2,
|
||||
padding=(kernel_size - 1) // 2,
|
||||
groups=groups,
|
||||
act=None,
|
||||
param_attr=ParamAttr(name=name + "_weights"),
|
||||
weight_attr=ParamAttr(name=name + "_weights"),
|
||||
bias_attr=False)
|
||||
if name == "conv1":
|
||||
bn_name = "bn_" + name
|
||||
else:
|
||||
bn_name = "bn" + name[3:]
|
||||
return fluid.layers.batch_norm(
|
||||
input=conv,
|
||||
self.bn = nn.BatchNorm(
|
||||
num_channels=out_channels,
|
||||
act=act,
|
||||
param_attr=ParamAttr(name=bn_name + '_scale'),
|
||||
bias_attr=ParamAttr(bn_name + '_offset'),
|
||||
moving_mean_name=bn_name + '_mean',
|
||||
moving_variance_name=bn_name + '_variance')
|
||||
param_attr=ParamAttr(name=bn_name + "_scale"),
|
||||
bias_attr=ParamAttr(name=bn_name + "_offset"),
|
||||
moving_mean_name=bn_name + "_mean",
|
||||
moving_variance_name=bn_name + "_variance")
|
||||
|
||||
def conv_bn_layer_new(self,
|
||||
input,
|
||||
num_filters,
|
||||
filter_size,
|
||||
stride=1,
|
||||
groups=1,
|
||||
act=None,
|
||||
name=None):
|
||||
pool = fluid.layers.pool2d(
|
||||
input=input,
|
||||
pool_size=stride,
|
||||
pool_stride=stride,
|
||||
pool_padding=0,
|
||||
pool_type='avg',
|
||||
ceil_mode=True)
|
||||
def __call__(self, x):
|
||||
x = self.conv(x)
|
||||
x = self.bn(x)
|
||||
return x
|
||||
|
||||
conv = fluid.layers.conv2d(
|
||||
input=pool,
|
||||
num_filters=num_filters,
|
||||
filter_size=filter_size,
|
||||
|
||||
class ConvBNLayerNew(nn.Layer):
|
||||
def __init__(self,
|
||||
in_channels,
|
||||
out_channels,
|
||||
kernel_size,
|
||||
stride=1,
|
||||
groups=1,
|
||||
act=None,
|
||||
name=None):
|
||||
super(ConvBNLayerNew, self).__init__()
|
||||
self.pool = nn.AvgPool2d(
|
||||
kernel_size=stride, stride=stride, padding=0, ceil_mode=True)
|
||||
|
||||
self.conv = nn.Conv2d(
|
||||
in_channels=in_channels,
|
||||
out_channels=out_channels,
|
||||
kernel_size=kernel_size,
|
||||
stride=1,
|
||||
padding=(filter_size - 1) // 2,
|
||||
padding=(kernel_size - 1) // 2,
|
||||
groups=groups,
|
||||
act=None,
|
||||
param_attr=ParamAttr(name=name + "_weights"),
|
||||
weight_attr=ParamAttr(name=name + "_weights"),
|
||||
bias_attr=False)
|
||||
|
||||
if name == "conv1":
|
||||
bn_name = "bn_" + name
|
||||
else:
|
||||
bn_name = "bn" + name[3:]
|
||||
return fluid.layers.batch_norm(
|
||||
input=conv,
|
||||
self.bn = nn.BatchNorm(
|
||||
num_channels=out_channels,
|
||||
act=act,
|
||||
param_attr=ParamAttr(name=bn_name + '_scale'),
|
||||
bias_attr=ParamAttr(bn_name + '_offset'),
|
||||
moving_mean_name=bn_name + '_mean',
|
||||
moving_variance_name=bn_name + '_variance')
|
||||
param_attr=ParamAttr(name=bn_name + "_scale"),
|
||||
bias_attr=ParamAttr(name=bn_name + "_offset"),
|
||||
moving_mean_name=bn_name + "_mean",
|
||||
moving_variance_name=bn_name + "_variance")
|
||||
|
||||
def shortcut(self, input, ch_out, stride, name, if_first=False):
|
||||
ch_in = input.shape[1]
|
||||
if ch_in != ch_out or stride[0] != 1:
|
||||
def __call__(self, x):
|
||||
x = self.pool(x)
|
||||
x = self.conv(x)
|
||||
x = self.bn(x)
|
||||
return x
|
||||
|
||||
|
||||
class ShortCut(nn.Layer):
|
||||
def __init__(self, in_channels, out_channels, stride, name, if_first=False):
|
||||
super(ShortCut, self).__init__()
|
||||
self.use_conv = True
|
||||
|
||||
if in_channels != out_channels or stride[0] != 1:
|
||||
if if_first:
|
||||
return self.conv_bn_layer(input, ch_out, 1, stride, name=name)
|
||||
self.conv = ConvBNLayer(
|
||||
in_channels, out_channels, 1, stride, name=name)
|
||||
else:
|
||||
return self.conv_bn_layer_new(
|
||||
input, ch_out, 1, stride, name=name)
|
||||
self.conv = ConvBNLayerNew(
|
||||
in_channels, out_channels, 1, stride, name=name)
|
||||
elif if_first:
|
||||
return self.conv_bn_layer(input, ch_out, 1, stride, name=name)
|
||||
self.conv = ConvBNLayer(
|
||||
in_channels, out_channels, 1, stride, name=name)
|
||||
else:
|
||||
return input
|
||||
self.use_conv = False
|
||||
|
||||
def bottleneck_block(self, input, num_filters, stride, name, if_first):
|
||||
conv0 = self.conv_bn_layer(
|
||||
input=input,
|
||||
num_filters=num_filters,
|
||||
filter_size=1,
|
||||
def forward(self, x):
|
||||
if self.use_conv:
|
||||
x = self.conv(x)
|
||||
return x
|
||||
|
||||
|
||||
class BottleneckBlock(nn.Layer):
|
||||
def __init__(self, in_channels, out_channels, stride, name, if_first):
|
||||
super(BottleneckBlock, self).__init__()
|
||||
self.conv0 = ConvBNLayer(
|
||||
in_channels=in_channels,
|
||||
out_channels=out_channels,
|
||||
kernel_size=1,
|
||||
act='relu',
|
||||
name=name + "_branch2a")
|
||||
conv1 = self.conv_bn_layer(
|
||||
input=conv0,
|
||||
num_filters=num_filters,
|
||||
filter_size=3,
|
||||
self.conv1 = ConvBNLayer(
|
||||
in_channels=out_channels,
|
||||
out_channels=out_channels,
|
||||
kernel_size=3,
|
||||
stride=stride,
|
||||
act='relu',
|
||||
name=name + "_branch2b")
|
||||
conv2 = self.conv_bn_layer(
|
||||
input=conv1,
|
||||
num_filters=num_filters * 4,
|
||||
filter_size=1,
|
||||
self.conv2 = ConvBNLayer(
|
||||
in_channels=out_channels,
|
||||
out_channels=out_channels * 4,
|
||||
kernel_size=1,
|
||||
act=None,
|
||||
name=name + "_branch2c")
|
||||
|
||||
short = self.shortcut(
|
||||
input,
|
||||
num_filters * 4,
|
||||
stride,
|
||||
self.short = ShortCut(
|
||||
in_channels=in_channels,
|
||||
out_channels=out_channels * 4,
|
||||
stride=stride,
|
||||
if_first=if_first,
|
||||
name=name + "_branch1")
|
||||
self.out_channels = out_channels * 4
|
||||
|
||||
return fluid.layers.elementwise_add(x=short, y=conv2, act='relu')
|
||||
def forward(self, x):
|
||||
y = self.conv0(x)
|
||||
y = self.conv1(y)
|
||||
y = self.conv2(y)
|
||||
y = y + self.short(x)
|
||||
y = F.relu(y)
|
||||
return y
|
||||
|
||||
def basic_block(self, input, num_filters, stride, name, if_first):
|
||||
conv0 = self.conv_bn_layer(
|
||||
input=input,
|
||||
num_filters=num_filters,
|
||||
filter_size=3,
|
||||
|
||||
class BasicBlock(nn.Layer):
|
||||
def __init__(self, in_channels, out_channels, stride, name, if_first):
|
||||
super(BasicBlock, self).__init__()
|
||||
self.conv0 = ConvBNLayer(
|
||||
in_channels=in_channels,
|
||||
out_channels=out_channels,
|
||||
kernel_size=3,
|
||||
act='relu',
|
||||
stride=stride,
|
||||
name=name + "_branch2a")
|
||||
conv1 = self.conv_bn_layer(
|
||||
input=conv0,
|
||||
num_filters=num_filters,
|
||||
filter_size=3,
|
||||
self.conv1 = ConvBNLayer(
|
||||
in_channels=out_channels,
|
||||
out_channels=out_channels,
|
||||
kernel_size=3,
|
||||
act=None,
|
||||
name=name + "_branch2b")
|
||||
short = self.shortcut(
|
||||
input,
|
||||
num_filters,
|
||||
stride,
|
||||
self.short = ShortCut(
|
||||
in_channels=in_channels,
|
||||
out_channels=out_channels,
|
||||
stride=stride,
|
||||
if_first=if_first,
|
||||
name=name + "_branch1")
|
||||
return fluid.layers.elementwise_add(x=short, y=conv1, act='relu')
|
||||
self.out_channels = out_channels
|
||||
|
||||
def forward(self, x):
|
||||
y = self.conv0(x)
|
||||
y = self.conv1(y)
|
||||
y = y + self.short(x)
|
||||
return F.relu(y)
|
||||
|
|
|
@ -1,95 +0,0 @@
|
|||
#copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
|
||||
#
|
||||
#Licensed under the Apache License, Version 2.0 (the "License");
|
||||
#you may not use this file except in compliance with the License.
|
||||
#You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
#Unless required by applicable law or agreed to in writing, software
|
||||
#distributed under the License is distributed on an "AS IS" BASIS,
|
||||
#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
#See the License for the specific language governing permissions and
|
||||
#limitations under the License.
|
||||
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import paddle
|
||||
import paddle.fluid as fluid
|
||||
from paddle.fluid.param_attr import ParamAttr
|
||||
import math
|
||||
|
||||
|
||||
def get_para_bias_attr(l2_decay, k, name):
    """Return ``[weight_attr, bias_attr]`` sharing one regularizer/initializer.

    Both attributes use L2 decay with coefficient ``l2_decay`` and a uniform
    initializer over ``[-1/sqrt(k), 1/sqrt(k)]``. They are named
    ``name + "_w_attr"`` and ``name + "_b_attr"`` respectively.

    Args:
        l2_decay: coefficient passed to ``fluid.regularizer.L2Decay``.
        k: value whose inverse square root bounds the uniform initializer.
        name: prefix for the two attribute names.
    """
    decay = fluid.regularizer.L2Decay(l2_decay)
    bound = 1.0 / math.sqrt(k * 1.0)
    uniform = fluid.initializer.Uniform(-bound, bound)

    return [
        fluid.ParamAttr(
            regularizer=decay, initializer=uniform, name=name + suffix)
        for suffix in ("_w_attr", "_b_attr")
    ]
|
||||
|
||||
|
||||
def conv_bn_layer(input,
                  num_filters,
                  filter_size,
                  stride=1,
                  groups=1,
                  act=None,
                  name=None):
    """Apply a bias-free 2-D convolution followed by batch normalization.

    Args:
        input: variable fed to ``fluid.layers.conv2d``.
        num_filters: number of convolution output channels.
        filter_size: integer kernel size; the padding used is
            ``(filter_size - 1) // 2``.
        stride: convolution stride.
        groups: convolution group count.
        act: activation handed to the batch-norm layer (the convolution
            itself never gets one).
        name: base name. When given, the convolution weight is named
            ``name + "_weights"`` and the batch-norm parameters
            ``"bn_" + name + "_scale" / "_offset" / "_mean" / "_variance"``.
            When ``None`` (the default) no explicit names are passed and
            naming is left to the framework defaults.

    Returns:
        The output of ``fluid.layers.batch_norm``.
    """
    # The signature advertises name=None, but concatenating onto None raised
    # TypeError. Explicit names are therefore only built when one is given.
    conv_naming = {}
    if name is not None:
        conv_naming = dict(
            param_attr=ParamAttr(name=name + "_weights"),
            name=name + '.conv2d')

    conv = fluid.layers.conv2d(
        input=input,
        num_filters=num_filters,
        filter_size=filter_size,
        stride=stride,
        padding=(filter_size - 1) // 2,
        groups=groups,
        act=None,
        bias_attr=False,
        **conv_naming)

    bn_naming = {}
    if name is not None:
        bn_name = "bn_" + name
        bn_naming = dict(
            name=bn_name + '.output',
            param_attr=ParamAttr(name=bn_name + '_scale'),
            bias_attr=ParamAttr(name=bn_name + '_offset'),
            moving_mean_name=bn_name + '_mean',
            moving_variance_name=bn_name + '_variance')

    return fluid.layers.batch_norm(input=conv, act=act, **bn_naming)
|
||||
|
||||
|
||||
def deconv_bn_layer(input,
                    num_filters,
                    filter_size=4,
                    stride=2,
                    act='relu',
                    name=None):
    """Apply a bias-free transposed convolution followed by batch norm.

    Args:
        input: variable fed to ``fluid.layers.conv2d_transpose``.
        num_filters: number of output channels.
        filter_size: kernel size of the transposed convolution.
        stride: stride of the transposed convolution.
        act: activation handed to the batch-norm layer (the transposed
            convolution itself never gets one).
        name: base name. When given, the weight is named
            ``name + "_weights"`` and the batch-norm parameters
            ``"bn_" + name + "_scale" / "_offset" / "_mean" / "_variance"``.
            When ``None`` (the default) no explicit names are passed and
            naming is left to the framework defaults.

    Returns:
        The output of ``fluid.layers.batch_norm``.
    """
    # The signature advertises name=None, but concatenating onto None raised
    # TypeError. Explicit names are therefore only built when one is given.
    deconv_naming = {}
    if name is not None:
        deconv_naming = dict(
            param_attr=ParamAttr(name=name + "_weights"),
            name=name + '.deconv2d')

    deconv = fluid.layers.conv2d_transpose(
        input=input,
        num_filters=num_filters,
        filter_size=filter_size,
        stride=stride,
        padding=1,
        act=None,
        bias_attr=False,
        **deconv_naming)

    bn_naming = {}
    if name is not None:
        bn_name = "bn_" + name
        bn_naming = dict(
            name=bn_name + '.output',
            param_attr=ParamAttr(name=bn_name + '_scale'),
            bias_attr=ParamAttr(name=bn_name + '_offset'),
            moving_mean_name=bn_name + '_mean',
            moving_variance_name=bn_name + '_variance')

    return fluid.layers.batch_norm(input=deconv, act=act, **bn_naming)
|
||||
|
||||
|
||||
def create_tmp_var(program, name, dtype, shape, lod_level=0):
    """Create a variable in ``program``'s current block and return it.

    ``name``, ``dtype``, ``shape`` and ``lod_level`` are forwarded unchanged
    to the block's ``create_var``.
    """
    target_block = program.current_block()
    var_spec = dict(name=name, dtype=dtype, shape=shape, lod_level=lod_level)
    return target_block.create_var(**var_spec)
|
|
@ -11,3 +11,20 @@
|
|||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
__all__ = ['build_head']
|
||||
|
||||
|
||||
def build_head(config):
    """Instantiate the head selected by ``config['name']``.

    Args:
        config: mapping holding the head class name under ``'name'``; every
            other entry is passed to the class constructor as a keyword
            argument. The mapping itself is left unmodified.

    Returns:
        An instance of the requested head class.

    Raises:
        KeyError: if ``config`` has no ``'name'`` entry.
        AssertionError: if the requested name is not a supported head.
    """
    # det head
    from .det_db_head import DBHead

    # rec head
    from .rec_ctc_head import CTC

    # Explicit registry: the class is looked up, never eval()'d, so a string
    # coming from configuration can only ever select one of these classes.
    support_dict = {'DBHead': DBHead, 'CTC': CTC}

    # Work on a copy so the caller's config keeps its 'name' entry and can
    # be reused for a second build.
    head_kwargs = dict(config)
    module_name = head_kwargs.pop('name')

    # Raised explicitly (not via `assert`) so the check survives `python -O`;
    # the exception type and message match the previous behavior.
    if module_name not in support_dict:
        raise AssertionError('head only support {}'.format(
            list(support_dict)))

    return support_dict[module_name](**head_kwargs)
|
||||
|
|
|
@ -1,27 +1,98 @@
|
|||
#copyright (c) 2019 PaddlePaddle Authors. All Rights Reserve.
|
||||
# copyright (c) 2019 PaddlePaddle Authors. All Rights Reserve.
|
||||
#
|
||||
#Licensed under the Apache License, Version 2.0 (the "License");
|
||||
#you may not use this file except in compliance with the License.
|
||||
#You may obtain a copy of the License at
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
#Unless required by applicable law or agreed to in writing, software
|
||||
#distributed under the License is distributed on an "AS IS" BASIS,
|
||||
#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
#See the License for the specific language governing permissions and
|
||||
#limitations under the License.
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import math
|
||||
|
||||
import paddle.fluid as fluid
|
||||
import paddle
|
||||
from paddle import nn
|
||||
import paddle.nn.functional as F
|
||||
from paddle import ParamAttr
|
||||
|
||||
|
||||
class DBHead(object):
|
||||
def get_bias_attr(k, name):
    """Return a bias ``ParamAttr`` named ``name + "_b_attr"``.

    The attribute uses a uniform initializer over
    ``[-1/sqrt(k), 1/sqrt(k)]``.

    Args:
        k: value whose inverse square root bounds the initializer.
        name: prefix of the attribute name.
    """
    bound = 1.0 / math.sqrt(k * 1.0)
    return ParamAttr(
        initializer=paddle.nn.initializer.Uniform(-bound, bound),
        name=name + "_b_attr")
|
||||
|
||||
|
||||
class Head(nn.Layer):
    """Convolutional branch producing a single-channel sigmoid map.

    Layers, in order: a bias-free 3x3 convolution down to
    ``in_channels // 4`` channels, batch norm + ReLU, a kernel-2/stride-2
    transposed convolution, batch norm + ReLU, and a second
    kernel-2/stride-2 transposed convolution down to one channel.

    Args:
        in_channels: channel count of the input feature map.
        name_list: parameter-name prefixes. Entries 0-4 name, in order, the
            convolution, first batch norm, first transposed convolution,
            second batch norm and second transposed convolution; the last
            entry prefixes the two transposed-convolution bias names.
    """

    def __init__(self, in_channels, name_list):
        super(Head, self).__init__()
        reduced = in_channels // 4

        self.conv1 = nn.Conv2d(
            in_channels=in_channels,
            out_channels=reduced,
            kernel_size=3,
            padding=1,
            weight_attr=ParamAttr(name=name_list[0] + '.w_0'),
            bias_attr=False)
        self.conv_bn1 = self._relu_batch_norm(reduced, name_list[1])
        self.conv2 = self._upsample(
            channels_in=reduced,
            channels_out=reduced,
            weight_prefix=name_list[2],
            bias_attr=get_bias_attr(reduced, name_list[-1] + "conv2"))
        self.conv_bn2 = self._relu_batch_norm(reduced, name_list[3])
        self.conv3 = self._upsample(
            channels_in=reduced,
            channels_out=1,
            weight_prefix=name_list[4],
            bias_attr=get_bias_attr(reduced, name_list[-1] + "conv3"))

    @staticmethod
    def _relu_batch_norm(num_channels, prefix):
        """Batch norm with ReLU, scale initialized to 1.0 and offset to 1e-4."""
        constant = paddle.nn.initializer.Constant
        return nn.BatchNorm(
            num_channels=num_channels,
            param_attr=ParamAttr(
                name=prefix + '.w_0', initializer=constant(value=1.0)),
            bias_attr=ParamAttr(
                name=prefix + '.b_0', initializer=constant(value=1e-4)),
            moving_mean_name=prefix + '.w_1',
            moving_variance_name=prefix + '.w_2',
            act='relu')

    @staticmethod
    def _upsample(channels_in, channels_out, weight_prefix, bias_attr):
        """Kernel-2, stride-2 transposed convolution with MSRA-initialized weights."""
        return nn.ConvTranspose2d(
            in_channels=channels_in,
            out_channels=channels_out,
            kernel_size=2,
            stride=2,
            weight_attr=ParamAttr(
                name=weight_prefix + '.w_0',
                initializer=paddle.nn.initializer.MSRA(uniform=False)),
            bias_attr=bias_attr)

    def forward(self, x):
        """Run ``x`` through the five layers in order and apply a sigmoid."""
        stages = (self.conv1, self.conv_bn1, self.conv2, self.conv_bn2,
                  self.conv3)
        for stage in stages:
            x = stage(x)
        return F.sigmoid(x)
|
||||
|
||||
|
||||
class DBHead(nn.Layer):
|
||||
"""
|
||||
Differentiable Binarization (DB) for text detection:
|
||||
see https://arxiv.org/abs/1911.08947
|
||||
|
@ -29,177 +100,29 @@ class DBHead(object):
|
|||
params(dict): super parameters for build DB network
|
||||
"""
|
||||
|
||||
def __init__(self, params):
|
||||
self.k = params['k']
|
||||
self.inner_channels = params['inner_channels']
|
||||
self.C, self.H, self.W = params['image_shape']
|
||||
print(self.C, self.H, self.W)
|
||||
|
||||
def binarize(self, x):
|
||||
conv1 = fluid.layers.conv2d(
|
||||
input=x,
|
||||
num_filters=self.inner_channels // 4,
|
||||
filter_size=3,
|
||||
padding=1,
|
||||
param_attr=fluid.initializer.MSRAInitializer(uniform=False),
|
||||
bias_attr=False)
|
||||
conv_bn1 = fluid.layers.batch_norm(
|
||||
input=conv1,
|
||||
param_attr=fluid.initializer.ConstantInitializer(value=1.0),
|
||||
bias_attr=fluid.initializer.ConstantInitializer(value=1e-4),
|
||||
act="relu")
|
||||
conv2 = fluid.layers.conv2d_transpose(
|
||||
input=conv_bn1,
|
||||
num_filters=self.inner_channels // 4,
|
||||
filter_size=2,
|
||||
stride=2,
|
||||
param_attr=fluid.initializer.MSRAInitializer(uniform=False),
|
||||
bias_attr=self._get_bias_attr(0.0004, conv_bn1.shape[1], "conv2"),
|
||||
act=None)
|
||||
conv_bn2 = fluid.layers.batch_norm(
|
||||
input=conv2,
|
||||
param_attr=fluid.initializer.ConstantInitializer(value=1.0),
|
||||
bias_attr=fluid.initializer.ConstantInitializer(value=1e-4),
|
||||
act="relu")
|
||||
conv3 = fluid.layers.conv2d_transpose(
|
||||
input=conv_bn2,
|
||||
num_filters=1,
|
||||
filter_size=2,
|
||||
stride=2,
|
||||
param_attr=fluid.initializer.MSRAInitializer(uniform=False),
|
||||
bias_attr=self._get_bias_attr(0.0004, conv_bn2.shape[1], "conv3"),
|
||||
act=None)
|
||||
out = fluid.layers.sigmoid(conv3)
|
||||
return out
|
||||
|
||||
def thresh(self, x):
|
||||
conv1 = fluid.layers.conv2d(
|
||||
input=x,
|
||||
num_filters=self.inner_channels // 4,
|
||||
filter_size=3,
|
||||
padding=1,
|
||||
param_attr=fluid.initializer.MSRAInitializer(uniform=False),
|
||||
bias_attr=False)
|
||||
conv_bn1 = fluid.layers.batch_norm(
|
||||
input=conv1,
|
||||
param_attr=fluid.initializer.ConstantInitializer(value=1.0),
|
||||
bias_attr=fluid.initializer.ConstantInitializer(value=1e-4),
|
||||
act="relu")
|
||||
conv2 = fluid.layers.conv2d_transpose(
|
||||
input=conv_bn1,
|
||||
num_filters=self.inner_channels // 4,
|
||||
filter_size=2,
|
||||
stride=2,
|
||||
param_attr=fluid.initializer.MSRAInitializer(uniform=False),
|
||||
bias_attr=self._get_bias_attr(0.0004, conv_bn1.shape[1], "conv2"),
|
||||
act=None)
|
||||
conv_bn2 = fluid.layers.batch_norm(
|
||||
input=conv2,
|
||||
param_attr=fluid.initializer.ConstantInitializer(value=1.0),
|
||||
bias_attr=fluid.initializer.ConstantInitializer(value=1e-4),
|
||||
act="relu")
|
||||
conv3 = fluid.layers.conv2d_transpose(
|
||||
input=conv_bn2,
|
||||
num_filters=1,
|
||||
filter_size=2,
|
||||
stride=2,
|
||||
param_attr=fluid.initializer.MSRAInitializer(uniform=False),
|
||||
bias_attr=self._get_bias_attr(0.0004, conv_bn2.shape[1], "conv3"),
|
||||
act=None)
|
||||
out = fluid.layers.sigmoid(conv3)
|
||||
return out
|
||||
|
||||
def _get_bias_attr(self, l2_decay, k, name, gradient_clip=None):
|
||||
regularizer = fluid.regularizer.L2Decay(l2_decay)
|
||||
stdv = 1.0 / math.sqrt(k * 1.0)
|
||||
initializer = fluid.initializer.Uniform(-stdv, stdv)
|
||||
bias_attr = fluid.ParamAttr(
|
||||
regularizer=regularizer,
|
||||
initializer=initializer,
|
||||
name=name + "_b_attr")
|
||||
return bias_attr
|
||||
def __init__(self, in_channels, k=50, **kwargs):
|
||||
super(DBHead, self).__init__()
|
||||
self.k = k
|
||||
binarize_name_list = [
|
||||
'conv2d_56', 'batch_norm_47', 'conv2d_transpose_0', 'batch_norm_48',
|
||||
'conv2d_transpose_1', 'binarize'
|
||||
]
|
||||
thresh_name_list = [
|
||||
'conv2d_57', 'batch_norm_49', 'conv2d_transpose_2', 'batch_norm_50',
|
||||
'conv2d_transpose_3', 'thresh'
|
||||
]
|
||||
self.binarize = Head(in_channels, binarize_name_list)
|
||||
self.thresh = Head(in_channels, thresh_name_list)
|
||||
|
||||
def step_function(self, x, y):
|
||||
return fluid.layers.reciprocal(1 + fluid.layers.exp(-self.k * (x - y)))
|
||||
return paddle.reciprocal(1 + paddle.exp(-self.k * (x - y)))
|
||||
|
||||
def __call__(self, conv_features, mode="train"):
|
||||
c2, c3, c4, c5 = conv_features
|
||||
param_attr = fluid.initializer.MSRAInitializer(uniform=False)
|
||||
in5 = fluid.layers.conv2d(
|
||||
input=c5,
|
||||
num_filters=self.inner_channels,
|
||||
filter_size=1,
|
||||
param_attr=param_attr,
|
||||
bias_attr=False)
|
||||
in4 = fluid.layers.conv2d(
|
||||
input=c4,
|
||||
num_filters=self.inner_channels,
|
||||
filter_size=1,
|
||||
param_attr=param_attr,
|
||||
bias_attr=False)
|
||||
in3 = fluid.layers.conv2d(
|
||||
input=c3,
|
||||
num_filters=self.inner_channels,
|
||||
filter_size=1,
|
||||
param_attr=param_attr,
|
||||
bias_attr=False)
|
||||
in2 = fluid.layers.conv2d(
|
||||
input=c2,
|
||||
num_filters=self.inner_channels,
|
||||
filter_size=1,
|
||||
param_attr=param_attr,
|
||||
bias_attr=False)
|
||||
def forward(self, x):
|
||||
shrink_maps = self.binarize(x)
|
||||
if not self.training:
|
||||
return shrink_maps
|
||||
|
||||
out4 = fluid.layers.elementwise_add(
|
||||
x=fluid.layers.resize_nearest(
|
||||
input=in5, scale=2), y=in4) # 1/16
|
||||
out3 = fluid.layers.elementwise_add(
|
||||
x=fluid.layers.resize_nearest(
|
||||
input=out4, scale=2), y=in3) # 1/8
|
||||
out2 = fluid.layers.elementwise_add(
|
||||
x=fluid.layers.resize_nearest(
|
||||
input=out3, scale=2), y=in2) # 1/4
|
||||
|
||||
p5 = fluid.layers.conv2d(
|
||||
input=in5,
|
||||
num_filters=self.inner_channels // 4,
|
||||
filter_size=3,
|
||||
padding=1,
|
||||
param_attr=param_attr,
|
||||
bias_attr=False)
|
||||
p5 = fluid.layers.resize_nearest(input=p5, scale=8)
|
||||
p4 = fluid.layers.conv2d(
|
||||
input=out4,
|
||||
num_filters=self.inner_channels // 4,
|
||||
filter_size=3,
|
||||
padding=1,
|
||||
param_attr=param_attr,
|
||||
bias_attr=False)
|
||||
p4 = fluid.layers.resize_nearest(input=p4, scale=4)
|
||||
p3 = fluid.layers.conv2d(
|
||||
input=out3,
|
||||
num_filters=self.inner_channels // 4,
|
||||
filter_size=3,
|
||||
padding=1,
|
||||
param_attr=param_attr,
|
||||
bias_attr=False)
|
||||
p3 = fluid.layers.resize_nearest(input=p3, scale=2)
|
||||
p2 = fluid.layers.conv2d(
|
||||
input=out2,
|
||||
num_filters=self.inner_channels // 4,
|
||||
filter_size=3,
|
||||
padding=1,
|
||||
param_attr=param_attr,
|
||||
bias_attr=False)
|
||||
|
||||
fuse = fluid.layers.concat(input=[p5, p4, p3, p2], axis=1)
|
||||
shrink_maps = self.binarize(fuse)
|
||||
if mode != "train":
|
||||
return {"maps": shrink_maps}
|
||||
threshold_maps = self.thresh(fuse)
|
||||
threshold_maps = self.thresh(x)
|
||||
binary_maps = self.step_function(shrink_maps, threshold_maps)
|
||||
y = fluid.layers.concat(
|
||||
input=[shrink_maps, threshold_maps, binary_maps], axis=1)
|
||||
predicts = {}
|
||||
predicts['maps'] = y
|
||||
return predicts
|
||||
y = paddle.concat([shrink_maps, threshold_maps, binary_maps], axis=1)
|
||||
return y
|
||||
|
|
|
@ -1,117 +0,0 @@
|
|||
#copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
|
||||
#
|
||||
#Licensed under the Apache License, Version 2.0 (the "License");
|
||||
#you may not use this file except in compliance with the License.
|
||||
#You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
#Unless required by applicable law or agreed to in writing, software
|
||||
#distributed under the License is distributed on an "AS IS" BASIS,
|
||||
#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
#See the License for the specific language governing permissions and
|
||||
#limitations under the License.
|
||||
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import paddle.fluid as fluid
|
||||
from ..common_functions import conv_bn_layer, deconv_bn_layer
|
||||
from collections import OrderedDict
|
||||
|
||||
|
||||
class EASTHead(object):
    """
    EAST: An Efficient and Accurate Scene Text Detector
        see arxiv: https://arxiv.org/abs/1704.03155
    args:
        params(dict): the super parameters for network build;
            ``params['model_name']`` selects the channel widths
            ("large" uses the wider configuration).
    """

    def __init__(self, params):
        self.model_name = params['model_name']

    def unet_fusion(self, inputs):
        """Fuse four feature maps, starting from the last one in ``inputs``.

        At each of the four steps the current map (concatenated on axis 1
        with the previous step's output, except at the first step) goes
        through a 3x3 conv+BN+ReLU named ``unet_h_<i>``. Steps 0-2 then
        apply ``deconv_bn_layer`` and step 3 another 3x3 conv+BN+ReLU,
        both named ``unet_g_<i>``. The step-3 output is returned.
        """
        levels = inputs[::-1]
        width = 128 if self.model_name == "large" else 64

        fused = None
        for step in range(4):
            if step == 0:
                merged = levels[step]
            else:
                merged = fluid.layers.concat([fused, levels[step]], axis=1)
            merged = conv_bn_layer(
                input=merged,
                num_filters=width,
                filter_size=3,
                stride=1,
                act='relu',
                name="unet_h_%d" % (step))

            if step > 2:
                fused = conv_bn_layer(
                    input=merged,
                    num_filters=width,
                    filter_size=3,
                    stride=1,
                    act='relu',
                    name="unet_g_%d" % (step))
            else:
                # can be replaced with unpool
                fused = deconv_bn_layer(
                    input=merged,
                    num_filters=width,
                    name="unet_g_%d" % (step))
        return fused

    def detector_header(self, f_common):
        """Compute the score and geometry maps from the fused features.

        Two 3x3 conv+BN+ReLU layers (``det_head1``, ``det_head2``) feed two
        1x1 conv+BN branches: ``f_score`` (1 channel, sigmoid) and ``f_geo``
        (8 channels, sigmoid rescaled to the range (-800, 800)).

        Returns:
            Tuple ``(f_score, f_geo)``.
        """
        if self.model_name == "large":
            first_width, second_width = 128, 64
        else:
            first_width, second_width = 64, 32
        score_channels, geo_channels = 1, 8

        shared = f_common
        for head_name, head_width in (("det_head1", first_width),
                                      ("det_head2", second_width)):
            shared = conv_bn_layer(
                input=shared,
                num_filters=head_width,
                filter_size=3,
                stride=1,
                act='relu',
                name=head_name)

        # f_score
        score = fluid.layers.sigmoid(
            conv_bn_layer(
                input=shared,
                num_filters=score_channels,
                filter_size=1,
                stride=1,
                act=None,
                name="f_score"))

        # f_geo
        geo = conv_bn_layer(
            input=shared,
            num_filters=geo_channels,
            filter_size=1,
            stride=1,
            act=None,
            name="f_geo")
        geo = (fluid.layers.sigmoid(geo) - 0.5) * 2 * 800
        return score, geo

    def __call__(self, inputs):
        """Return an ``OrderedDict`` with keys ``'f_score'`` and ``'f_geo'``."""
        score, geo = self.detector_header(self.unet_fusion(inputs))
        return OrderedDict([('f_score', score), ('f_geo', geo)])
|
|
@ -1,228 +0,0 @@
|
|||
#copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
|
||||
#
|
||||
#Licensed under the Apache License, Version 2.0 (the "License");
|
||||
#you may not use this file except in compliance with the License.
|
||||
#You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
#Unless required by applicable law or agreed to in writing, software
|
||||
#distributed under the License is distributed on an "AS IS" BASIS,
|
||||
#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
#See the License for the specific language governing permissions and
|
||||
#limitations under the License.
|
||||
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import paddle.fluid as fluid
|
||||
from ..common_functions import conv_bn_layer, deconv_bn_layer
|
||||
from collections import OrderedDict
|
||||
|
||||
|
||||
class SASTHead(object):
    """
    SAST text detection head.
        see arxiv: https://arxiv.org/abs/1908.05498
    args:
        params(dict): the super parameters for network build. Reads
            params['model_name'] and params['with_cab'] (whether the
            cross-attention block is applied to the fused feature).
    """

    def __init__(self, params):
        self.model_name = params['model_name']
        self.with_cab = params['with_cab']

    def FPN_Up_Fusion(self, blocks):
        """
        Top-down fusion over blocks['block_6'] ... blocks['block_2'].

        Each block first gets a 1x1 lateral conv; the running feature is then
        repeatedly added to the next lateral, refined with a 3x3 conv and
        upsampled with a deconv. Returns the fused feature with 128 channels.
        """
        f = [blocks['block_6'], blocks['block_5'], blocks['block_4'],
             blocks['block_3'], blocks['block_2']]
        num_outputs = [256, 256, 192, 192, 128]

        # Lateral 1x1 convs, created in the same order as the inputs.
        h = []
        for i in range(5):
            h.append(conv_bn_layer(input=f[i], num_filters=num_outputs[i],
                filter_size=1, stride=1, act=None, name='fpn_up_h'+str(i)))

        g = deconv_bn_layer(input=h[0], num_filters=num_outputs[1], act=None, name='fpn_up_g0')
        for i in range(1, 4):
            g = fluid.layers.elementwise_add(x=g, y=h[i])
            g = fluid.layers.relu(g)
            g = conv_bn_layer(input=g, num_filters=num_outputs[i],
                filter_size=3, stride=1, act='relu', name='fpn_up_g%d_1'%i)
            g = deconv_bn_layer(input=g, num_filters=num_outputs[i + 1], act=None, name='fpn_up_g%d_2'%i)

        g = fluid.layers.elementwise_add(x=g, y=h[4])
        g = fluid.layers.relu(g)
        g = conv_bn_layer(input=g, num_filters=num_outputs[4],
            filter_size=3, stride=1, act='relu', name='fpn_up_fusion_1')
        g = conv_bn_layer(input=g, num_filters=num_outputs[4],
            filter_size=1, stride=1, act=None, name='fpn_up_fusion_2')

        return g

    def FPN_Down_Fusion(self, blocks):
        """
        Bottom-up fusion over blocks['block_0'], blocks['block_1'] and
        blocks['block_2'].

        Each block first gets a 3x3 lateral conv; the running feature is
        brought to the next block's size with a stride-2 conv before being
        added to it. Returns the fused feature with 128 channels.
        """
        f = [blocks['block_0'], blocks['block_1'], blocks['block_2']]
        num_outputs = [32, 64, 128]

        h = []
        for i in range(3):
            h.append(conv_bn_layer(input=f[i], num_filters=num_outputs[i],
                filter_size=3, stride=1, act=None, name='fpn_down_h'+str(i)))

        g = conv_bn_layer(input=h[0], num_filters=num_outputs[1], filter_size=3, stride=2, act=None, name='fpn_down_g0')
        for i in range(1, 2):
            g = fluid.layers.elementwise_add(x=g, y=h[i])
            g = fluid.layers.relu(g)
            g = conv_bn_layer(input=g, num_filters=num_outputs[i], filter_size=3, stride=1, act='relu', name='fpn_down_g%d_1'%i)
            g = conv_bn_layer(input=g, num_filters=num_outputs[i+1], filter_size=3, stride=2, act=None, name='fpn_down_g%d_2'%i)

        g = fluid.layers.elementwise_add(x=g, y=h[2])
        g = fluid.layers.relu(g)
        g = conv_bn_layer(input=g, num_filters=num_outputs[2],
            filter_size=3, stride=1, act='relu', name='fpn_down_fusion_1')
        g = conv_bn_layer(input=g, num_filters=num_outputs[2],
            filter_size=1, stride=1, act=None, name='fpn_down_fusion_2')
        return g

    def _head_branch(self, f_common, prefix, num_out):
        """Build one 4-conv prediction branch named '<prefix>1'..'<prefix>4'."""
        x = conv_bn_layer(input=f_common, num_filters=64, filter_size=1, stride=1, act='relu', name=prefix + '1')
        x = conv_bn_layer(input=x, num_filters=64, filter_size=3, stride=1, act='relu', name=prefix + '2')
        x = conv_bn_layer(input=x, num_filters=128, filter_size=1, stride=1, act='relu', name=prefix + '3')
        # The last conv deliberately leaves `act` at conv_bn_layer's default.
        x = conv_bn_layer(input=x, num_filters=num_out, filter_size=3, stride=1, name=prefix + '4')
        return x

    def SAST_Header1(self, f_common):
        """Detector header: returns (f_score, f_border).

        f_score has 1 channel and is passed through a sigmoid; f_border has
        4 channels and is returned without a final activation here.
        """
        #f_score
        f_score = self._head_branch(f_common, 'f_score', 1)
        f_score = fluid.layers.sigmoid(f_score)

        #f_border
        f_border = self._head_branch(f_common, 'f_border', 4)

        return f_score, f_border

    def SAST_Header2(self, f_common):
        """Detector header: returns (f_tvo, f_tco) with 8 and 2 channels."""
        #f_tvo
        f_tvo = self._head_branch(f_common, 'f_tvo', 8)

        #f_tco
        f_tco = self._head_branch(f_common, 'f_tco', 2)

        return f_tvo, f_tco

    def cross_attention(self, f_common):
        """
        Cross attention block: attends separately along each of the two
        spatial axes of `f_common` and fuses the two results.

        For each axis the theta/phi/g projections are flattened so that the
        other spatial axis is folded into the batch, scaled dot-product
        attention is applied, and a 1x1-conv shortcut of `f_common` is added.
        The two branches are concatenated on axis 1 and reduced to 128
        channels. Comments on axis meaning below assume an NCHW input --
        TODO confirm against conv_bn_layer.
        """
        attn_channels = 128
        f_shape = fluid.layers.shape(f_common)
        f_theta = conv_bn_layer(input=f_common, num_filters=attn_channels, filter_size=1, stride=1, act='relu', name='f_theta')
        f_phi = conv_bn_layer(input=f_common, num_filters=attn_channels, filter_size=1, stride=1, act='relu', name='f_phi')
        f_g = conv_bn_layer(input=f_common, num_filters=attn_channels, filter_size=1, stride=1, act='relu', name='f_g')

        ### horizon
        fh_theta = f_theta
        fh_phi = f_phi
        fh_g = f_g
        #flatten: channels last, then fold dim 2 into the batch
        fh_theta = fluid.layers.transpose(fh_theta, [0, 2, 3, 1])
        fh_theta = fluid.layers.reshape(fh_theta, [f_shape[0] * f_shape[2], f_shape[3], attn_channels])
        fh_phi = fluid.layers.transpose(fh_phi, [0, 2, 3, 1])
        fh_phi = fluid.layers.reshape(fh_phi, [f_shape[0] * f_shape[2], f_shape[3], attn_channels])
        fh_g = fluid.layers.transpose(fh_g, [0, 2, 3, 1])
        fh_g = fluid.layers.reshape(fh_g, [f_shape[0] * f_shape[2], f_shape[3], attn_channels])
        #correlation
        fh_attn = fluid.layers.matmul(fh_theta, fluid.layers.transpose(fh_phi, [0, 2, 1]))
        #scale
        fh_attn = fh_attn / (attn_channels ** 0.5)
        fh_attn = fluid.layers.softmax(fh_attn)
        #weighted sum
        fh_weight = fluid.layers.matmul(fh_attn, fh_g)
        fh_weight = fluid.layers.reshape(fh_weight, [f_shape[0], f_shape[2], f_shape[3], attn_channels])
        fh_weight = fluid.layers.transpose(fh_weight, [0, 3, 1, 2])
        fh_weight = conv_bn_layer(input=fh_weight, num_filters=attn_channels, filter_size=1, stride=1, name='fh_weight')
        #short cut
        fh_sc = conv_bn_layer(input=f_common, num_filters=attn_channels, filter_size=1, stride=1, name='fh_sc')
        f_h = fluid.layers.relu(fh_weight + fh_sc)

        ### vertical: same as above with the two spatial axes swapped first
        fv_theta = fluid.layers.transpose(f_theta, [0, 1, 3, 2])
        fv_phi = fluid.layers.transpose(f_phi, [0, 1, 3, 2])
        fv_g = fluid.layers.transpose(f_g, [0, 1, 3, 2])
        #flatten
        fv_theta = fluid.layers.transpose(fv_theta, [0, 2, 3, 1])
        fv_theta = fluid.layers.reshape(fv_theta, [f_shape[0] * f_shape[3], f_shape[2], attn_channels])
        fv_phi = fluid.layers.transpose(fv_phi, [0, 2, 3, 1])
        fv_phi = fluid.layers.reshape(fv_phi, [f_shape[0] * f_shape[3], f_shape[2], attn_channels])
        fv_g = fluid.layers.transpose(fv_g, [0, 2, 3, 1])
        fv_g = fluid.layers.reshape(fv_g, [f_shape[0] * f_shape[3], f_shape[2], attn_channels])
        #correlation
        fv_attn = fluid.layers.matmul(fv_theta, fluid.layers.transpose(fv_phi, [0, 2, 1]))
        #scale
        fv_attn = fv_attn / (attn_channels ** 0.5)
        fv_attn = fluid.layers.softmax(fv_attn)
        #weighted sum
        fv_weight = fluid.layers.matmul(fv_attn, fv_g)
        fv_weight = fluid.layers.reshape(fv_weight, [f_shape[0], f_shape[3], f_shape[2], attn_channels])
        # [0, 3, 2, 1] also undoes the spatial swap done before flattening.
        fv_weight = fluid.layers.transpose(fv_weight, [0, 3, 2, 1])
        fv_weight = conv_bn_layer(input=fv_weight, num_filters=attn_channels, filter_size=1, stride=1, name='fv_weight')
        #short cut
        fv_sc = conv_bn_layer(input=f_common, num_filters=attn_channels, filter_size=1, stride=1, name='fv_sc')
        f_v = fluid.layers.relu(fv_weight + fv_sc)

        ### merge both directions
        f_attn = fluid.layers.concat([f_h, f_v], axis=1)
        f_attn = conv_bn_layer(input=f_attn, num_filters=attn_channels, filter_size=1, stride=1, act='relu', name='f_attn')
        return f_attn

    def __call__(self, blocks, with_cab=False):
        """
        Build the SAST head on top of the backbone `blocks`.

        args:
            blocks(dict): backbone outputs keyed 'block_0' ... 'block_6'.
            with_cab(bool): kept for interface compatibility only; it is
                NOT read. Whether the cross-attention block runs is decided
                by params['with_cab'] given to the constructor.
        returns:
            OrderedDict with keys 'f_score', 'f_border', 'f_tvo', 'f_tco'.
        """
        #down fpn
        f_down = self.FPN_Down_Fusion(blocks)
        #up fpn
        f_up = self.FPN_Up_Fusion(blocks)
        #fusion
        f_common = fluid.layers.elementwise_add(x=f_down, y=f_up)
        f_common = fluid.layers.relu(f_common)

        if self.with_cab:
            # enhance f_common with the cross-attention block
            f_common = self.cross_attention(f_common)

        f_score, f_border = self.SAST_Header1(f_common)
        f_tvo, f_tco = self.SAST_Header2(f_common)

        predicts = OrderedDict()
        predicts['f_score'] = f_score
        predicts['f_border'] = f_border
        predicts['f_tvo'] = f_tvo
        predicts['f_tco'] = f_tco
        return predicts
|
|
@ -1,237 +0,0 @@
|
|||
#copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
|
||||
#
|
||||
#Licensed under the Apache License, Version 2.0 (the "License");
|
||||
#you may not use this file except in compliance with the License.
|
||||
#You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
#Unless required by applicable law or agreed to in writing, software
|
||||
#distributed under the License is distributed on an "AS IS" BASIS,
|
||||
#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
#See the License for the specific language governing permissions and
|
||||
#limitations under the License.
|
||||
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import math
|
||||
|
||||
import paddle
|
||||
import paddle.fluid as fluid
|
||||
import paddle.fluid.layers as layers
|
||||
from .rec_seq_encoder import SequenceEncoder
|
||||
import numpy as np
|
||||
|
||||
|
||||
class AttentionPredict(object):
    """
    Attention-based recognition head: a sequence encoder followed by a GRU
    decoder with additive attention.

    args:
        params(dict): reads 'char_num', 'encoder_type', 'max_text_length',
            params['Attention']['decoder_size'] and
            params['Attention']['word_vector_dim']; the whole dict is also
            forwarded to SequenceEncoder.
    """

    def __init__(self, params):
        super(AttentionPredict, self).__init__()
        self.char_num = params['char_num']
        self.encoder = SequenceEncoder(params)
        self.decoder_size = params['Attention']['decoder_size']
        self.word_vector_dim = params['Attention']['word_vector_dim']
        self.encoder_type = params['encoder_type']
        self.max_length = params['max_text_length']

    def simple_attention(self, encoder_vec, encoder_proj, decoder_state,
                         decoder_size):
        """
        Additive attention: returns the attention-weighted sum of
        `encoder_vec` for the given `decoder_state`.

        The fc layers use fixed names ("decoder_state_proj_fc",
        "attention_weights_fc"), so repeated calls refer to the same layer
        names -- presumably to share weights between the train and infer
        graphs; verify against the caller.
        """
        decoder_state_proj = layers.fc(input=decoder_state,
                                       size=decoder_size,
                                       bias_attr=False,
                                       name="decoder_state_proj_fc")
        # Repeat the projected decoder state to line up with encoder_proj.
        decoder_state_expand = layers.sequence_expand(
            x=decoder_state_proj, y=encoder_proj)
        concated = layers.elementwise_add(encoder_proj, decoder_state_expand)
        concated = layers.tanh(x=concated)
        # One unnormalised score per encoder step.
        attention_weights = layers.fc(input=concated,
                                      size=1,
                                      act=None,
                                      bias_attr=False,
                                      name="attention_weights_fc")
        attention_weights = layers.sequence_softmax(input=attention_weights)
        weigths_reshape = layers.reshape(x=attention_weights, shape=[-1])
        # Scale each encoder step by its weight, then sum over the sequence.
        scaled = layers.elementwise_mul(
            x=encoder_vec, y=weigths_reshape, axis=0)
        context = layers.sequence_pool(input=scaled, pool_type='sum')
        return context

    def gru_decoder_with_attention(self, target_embedding, encoder_vec,
                                   encoder_proj, decoder_boot, decoder_size,
                                   char_num):
        """
        Training-time decoder: steps a DynamicRNN over `target_embedding`,
        and at every step emits a softmax over `char_num` classes.

        `decoder_boot` initialises the GRU hidden state. Returns the RNN
        output (the per-step softmax results).
        """
        rnn = layers.DynamicRNN()
        with rnn.block():
            current_word = rnn.step_input(target_embedding)
            encoder_vec = rnn.static_input(encoder_vec)
            encoder_proj = rnn.static_input(encoder_proj)
            hidden_mem = rnn.memory(init=decoder_boot, need_reorder=True)
            context = self.simple_attention(encoder_vec, encoder_proj,
                                            hidden_mem, decoder_size)
            # GRU input = projection of context + projection of current word;
            # size is decoder_size * 3 as passed to gru_unit below.
            fc_1 = layers.fc(input=context,
                             size=decoder_size * 3,
                             bias_attr=False,
                             name="rnn_fc1")
            fc_2 = layers.fc(input=current_word,
                             size=decoder_size * 3,
                             bias_attr=False,
                             name="rnn_fc2")
            decoder_inputs = fc_1 + fc_2
            h, _, _ = layers.gru_unit(
                input=decoder_inputs, hidden=hidden_mem, size=decoder_size * 3)
            rnn.update_memory(hidden_mem, h)
            out = layers.fc(input=h,
                            size=char_num,
                            bias_attr=True,
                            act='softmax',
                            name="rnn_out_fc")
            rnn.output(out)
        return rnn()

    def gru_attention_infer(self, decoder_boot, max_length, char_num,
                            word_vector_dim, encoded_vector, encoded_proj,
                            decoder_size):
        """
        Inference-time decoder: runs a While loop for at most `max_length`
        steps, taking the top-1 class at every step (beam_size is fixed to 1).

        Returns:
            (full_ids, full_scores): each starts as a single column filled
            with 1 and gets one column appended per decoding step, so the
            first column is that initial filler, not a prediction.
        """
        init_state = decoder_boot
        beam_size = 1
        array_len = layers.fill_constant(
            shape=[1], dtype='int64', value=max_length)
        counter = layers.zeros(shape=[1], dtype='int64', force_cpu=True)

        # fill the first element with init_state
        state_array = layers.create_array('float32')
        layers.array_write(init_state, array=state_array, i=counter)

        # ids, scores as memory
        ids_array = layers.create_array('int64')
        scores_array = layers.create_array('float32')
        rois_shape = layers.shape(init_state)
        # First dim of init_state plus one, used as the exclusive end of the
        # range below, i.e. lod_level = [0, 1, ..., dim0].
        batch_size = layers.slice(
            rois_shape, axes=[0], starts=[0], ends=[1]) + 1
        lod_level = layers.range(
            start=0, end=batch_size, step=1, dtype=batch_size.dtype)

        # Initial ids are all 0 and carry lod_level twice (reset + append).
        init_ids = layers.fill_constant_batch_size_like(
            input=init_state, shape=[-1, 1], value=0, dtype='int64')
        init_ids = layers.lod_reset(init_ids, lod_level)
        init_ids = layers.lod_append(init_ids, lod_level)

        init_scores = layers.fill_constant_batch_size_like(
            input=init_state, shape=[-1, 1], value=1, dtype='float32')
        init_scores = layers.lod_reset(init_scores, init_ids)
        layers.array_write(init_ids, array=ids_array, i=counter)
        layers.array_write(init_scores, array=scores_array, i=counter)

        # Accumulators for the returned ids / scores (see docstring).
        full_ids = fluid.layers.fill_constant_batch_size_like(
            input=init_state, shape=[-1, 1], dtype='int64', value=1)
        full_scores = fluid.layers.fill_constant_batch_size_like(
            input=init_state, shape=[-1, 1], dtype='float32', value=1)

        cond = layers.less_than(x=counter, y=array_len)
        while_op = layers.While(cond=cond)
        with while_op.block():
            pre_ids = layers.array_read(array=ids_array, i=counter)
            pre_state = layers.array_read(array=state_array, i=counter)
            pre_score = layers.array_read(array=scores_array, i=counter)
            pre_ids_emb = layers.embedding(
                input=pre_ids,
                size=[char_num, word_vector_dim],
                dtype='float32')

            context = self.simple_attention(encoded_vector, encoded_proj,
                                            pre_state, decoder_size)

            # expand the recursive_sequence_lengths of pre_state
            # to be the same with pre_score
            pre_state_expanded = layers.sequence_expand(pre_state, pre_score)
            context_expanded = layers.sequence_expand(context, pre_score)

            # Same layer names as in gru_decoder_with_attention.
            fc_1 = layers.fc(input=context_expanded,
                             size=decoder_size * 3,
                             bias_attr=False,
                             name="rnn_fc1")

            fc_2 = layers.fc(input=pre_ids_emb,
                             size=decoder_size * 3,
                             bias_attr=False,
                             name="rnn_fc2")

            decoder_inputs = fc_1 + fc_2
            current_state, _, _ = layers.gru_unit(
                input=decoder_inputs,
                hidden=pre_state_expanded,
                size=decoder_size * 3)
            current_state_with_lod = layers.lod_reset(
                x=current_state, y=pre_score)
            # use score to do beam search
            current_score = layers.fc(input=current_state_with_lod,
                                      size=char_num,
                                      bias_attr=True,
                                      act='softmax',
                                      name="rnn_out_fc")
            topk_scores, topk_indices = layers.topk(current_score, k=beam_size)

            # Append this step's result as a new column and write it back
            # into the accumulator defined outside the loop.
            new_ids = fluid.layers.concat([full_ids, topk_indices], axis=1)
            fluid.layers.assign(new_ids, full_ids)

            new_scores = fluid.layers.concat([full_scores, topk_scores], axis=1)
            fluid.layers.assign(new_scores, full_scores)

            # counter is advanced before the writes below, so this step's
            # state/ids/scores land in the slot read by the next iteration.
            layers.increment(x=counter, value=1, in_place=True)

            # update the memories
            layers.array_write(current_state, array=state_array, i=counter)
            layers.array_write(topk_indices, array=ids_array, i=counter)
            layers.array_write(topk_scores, array=scores_array, i=counter)

            # update the break condition:
            # up to the max length or all candidates of
            # source sentences have ended.
            length_cond = layers.less_than(x=counter, y=array_len)
            finish_cond = layers.logical_not(layers.is_empty(x=topk_indices))
            layers.logical_and(x=length_cond, y=finish_cond, out=cond)
        return full_ids, full_scores

    def __call__(self, inputs, labels=None, mode=None):
        """
        Build the recognition head.

        args:
            inputs: features passed to the sequence encoder.
            labels(dict): required when mode == "train"; must provide
                'label_in' and 'label_out'.
            mode(str): "train" builds the teacher-forced decoder, any other
                value builds the inference loop.
        returns:
            dict with keys 'predict' and 'decoded_out'.
        """
        encoder_features = self.encoder(inputs)
        char_num = self.char_num
        word_vector_dim = self.word_vector_dim
        decoder_size = self.decoder_size

        if self.encoder_type == "reshape":
            encoder_input = encoder_features
            encoded_vector = encoder_features
        else:
            # Non-"reshape" encoders return a list; element 1 is used to boot
            # the decoder and all elements are concatenated on axis 1.
            encoder_input = encoder_features[1]
            encoded_vector = layers.concat(encoder_features, axis=1)
        encoded_proj = layers.fc(input=encoded_vector,
                                 size=decoder_size,
                                 bias_attr=False,
                                 name="encoded_proj_fc")
        # Initial decoder state comes from the first step of encoder_input.
        backward_first = layers.sequence_pool(
            input=encoder_input, pool_type='first')
        decoder_boot = layers.fc(input=backward_first,
                                 size=decoder_size,
                                 bias_attr=False,
                                 act="relu",
                                 name='decoder_boot')

        if mode == "train":
            label_in = labels['label_in']
            label_out = labels['label_out']
            label_in = layers.cast(x=label_in, dtype='int64')
            trg_embedding = layers.embedding(
                input=label_in,
                size=[char_num, word_vector_dim],
                dtype='float32')
            predict = self.gru_decoder_with_attention(
                trg_embedding, encoded_vector, encoded_proj, decoder_boot,
                decoder_size, char_num)
            _, decoded_out = layers.topk(input=predict, k=1)
            decoded_out = layers.lod_reset(decoded_out, y=label_out)
            predicts = {'predict':predict, 'decoded_out':decoded_out}
        else:
            ids, predict = self.gru_attention_infer(
                decoder_boot, self.max_length, char_num, word_vector_dim,
                encoded_vector, encoded_proj, decoder_size)
            predicts = {'predict':predict, 'decoded_out':ids}
        return predicts
|
|
@ -1,16 +1,16 @@
|
|||
#copyright (c) 2019 PaddlePaddle Authors. All Rights Reserve.
|
||||
# copyright (c) 2019 PaddlePaddle Authors. All Rights Reserve.
|
||||
#
|
||||
#Licensed under the Apache License, Version 2.0 (the "License");
|
||||
#you may not use this file except in compliance with the License.
|
||||
#You may obtain a copy of the License at
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
#Unless required by applicable law or agreed to in writing, software
|
||||
#distributed under the License is distributed on an "AS IS" BASIS,
|
||||
#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
#See the License for the specific language governing permissions and
|
||||
#limitations under the License.
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
|
@ -19,34 +19,33 @@ from __future__ import print_function
|
|||
import math
|
||||
|
||||
import paddle
|
||||
import paddle.fluid as fluid
|
||||
from paddle.fluid.param_attr import ParamAttr
|
||||
from .rec_seq_encoder import SequenceEncoder
|
||||
from ..common_functions import get_para_bias_attr
|
||||
import numpy as np
|
||||
from paddle import ParamAttr, nn
|
||||
|
||||
|
||||
class CTCPredict(object):
    """
    Static-graph CTC recognition head: sequence encoder, one fc layer and a
    greedy CTC decoder.

    args:
        params(dict): reads 'char_num', 'encoder_type' and the optional
            'fc_decay' (L2 decay for the fc layer, default 0.0004); the
            whole dict is also forwarded to SequenceEncoder.
    """

    def __init__(self, params):
        super(CTCPredict, self).__init__()
        self.char_num = params['char_num']
        self.encoder = SequenceEncoder(params)
        self.encoder_type = params['encoder_type']
        self.fc_decay = params.get("fc_decay", 0.0004)

    def __call__(self, inputs, labels=None, mode=None):
        """
        Build the head on `inputs`.

        `labels` and `mode` are accepted but not read here.

        returns:
            dict with 'predict' (fc output with char_num + 1 classes, the
            extra one being the CTC blank) and 'decoded_out' (greedy decode).
        """
        encoder_features = self.encoder(inputs)
        if self.encoder_type != "reshape":
            # Non-"reshape" encoders return a list of features.
            encoder_features = fluid.layers.concat(encoder_features, axis=1)
        name = "ctc_fc"
        para_attr, bias_attr = get_para_bias_attr(
            l2_decay=self.fc_decay, k=encoder_features.shape[1], name=name)
        predict = fluid.layers.fc(input=encoder_features,
                                  size=self.char_num + 1,
                                  param_attr=para_attr,
                                  bias_attr=bias_attr,
                                  name=name)
        decoded_out = fluid.layers.ctc_greedy_decoder(
            input=predict, blank=self.char_num)
        predicts = {'predict': predict, 'decoded_out': decoded_out}
        # Hand the result back to the caller; without this the head
        # evaluated to None.
        return predicts


def get_para_bias_attr(l2_decay, k, name):
    """
    Create the weight and bias ParamAttr for a fully-connected layer.

    Both use L2 regularisation with coefficient `l2_decay` and a uniform
    initialiser on [-1/sqrt(k), 1/sqrt(k)].

    args:
        l2_decay(float): L2 regularisation coefficient.
        k(int): value used to scale the initialiser (callers in this file
            pass the layer's input width).
        name(str): prefix; attrs are named name + "_w_attr" / "_b_attr".
    returns:
        list: [weight_attr, bias_attr]
    """
    regularizer = paddle.fluid.regularizer.L2Decay(l2_decay)
    stdv = 1.0 / math.sqrt(k * 1.0)
    initializer = nn.initializer.Uniform(-stdv, stdv)
    weight_attr = ParamAttr(
        regularizer=regularizer, initializer=initializer, name=name + "_w_attr")
    bias_attr = ParamAttr(
        regularizer=regularizer, initializer=initializer, name=name + "_b_attr")
    return [weight_attr, bias_attr]
|
||||
|
||||
class CTC(nn.Layer):
    """
    Dygraph CTC head: a single Linear layer mapping `in_channels` features
    to `out_channels` class scores.

    args:
        in_channels(int): input feature width.
        out_channels(int): number of output classes.
        fc_decay(float): L2 decay applied to the Linear weight and bias.
        **kwargs: accepted and ignored.
    """

    def __init__(self, in_channels, out_channels, fc_decay=1e-5, **kwargs):
        super(CTC, self).__init__()
        fc_attrs = get_para_bias_attr(
            l2_decay=fc_decay, k=in_channels, name='ctc_fc')
        w_attr, b_attr = fc_attrs
        self.fc = nn.Linear(
            in_channels,
            out_channels,
            weight_attr=w_attr,
            bias_attr=b_attr,
            name='ctc_fc')
        self.out_channels = out_channels

    def forward(self, x, labels=None):
        """Return the Linear layer's output for `x`; `labels` is not read."""
        return self.fc(x)
|
||||
|
|
|
@ -1,100 +0,0 @@
|
|||
#copyright (c) 2019 PaddlePaddle Authors. All Rights Reserve.
|
||||
#
|
||||
#Licensed under the Apache License, Version 2.0 (the "License");
|
||||
#you may not use this file except in compliance with the License.
|
||||
#You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
#Unless required by applicable law or agreed to in writing, software
|
||||
#distributed under the License is distributed on an "AS IS" BASIS,
|
||||
#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
#See the License for the specific language governing permissions and
|
||||
#limitations under the License.
|
||||
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import math
|
||||
import paddle.fluid as fluid
|
||||
import paddle.fluid.layers as layers
|
||||
|
||||
|
||||
class EncoderWithReshape(object):
    """Turn a feature map into a sequence with `im2sequence`.

    `params` is accepted for a uniform constructor signature but not read.
    """

    def __init__(self, params):
        super(EncoderWithReshape, self).__init__()

    def __call__(self, inputs):
        """Slice `inputs` with a window that spans all of dim 2 and is 1 wide."""
        # Window covers the full extent of dim 2 and moves one step at a time.
        window = [inputs.shape[2], 1]
        step = [1, 1]
        return layers.im2sequence(
            input=inputs,
            stride=step,
            filter_size=window,
            name="sliced_feature")
|
||||
|
||||
|
||||
class EncoderWithRNN(object):
    """Two stacked fc+LSTM stages, run once forward and once reversed.

    args:
        params(dict): reads params['SeqRNN']['hidden_size'].
    """

    def __init__(self, params):
        super(EncoderWithRNN, self).__init__()
        self.rnn_hidden_size = params['SeqRNN']['hidden_size']

    def __call__(self, inputs):
        """Return [forward_features, reversed_features] for `inputs`."""
        name_prefix = "lstm"
        # Both the fc layers and dynamic_lstm are sized hidden_size * 4.
        gate_size = self.rnn_hidden_size * 4

        lstm_list = []
        for no in (1, 2):
            # Direction 1 runs forward, direction 2 runs reversed.
            is_reverse = no != 1
            seq = inputs
            for stage in (1, 2):
                fc_name = "%s_st%d_fc%d" % (name_prefix, stage, no)
                projected = layers.fc(
                    input=seq,
                    size=gate_size,
                    param_attr=fluid.ParamAttr(name=fc_name + "_w"),
                    bias_attr=fluid.ParamAttr(name=fc_name + "_b"),
                    name=fc_name)
                out_name = "%s_st%d_out%d" % (name_prefix, stage, no)
                seq, _ = layers.dynamic_lstm(
                    input=projected,
                    size=gate_size,
                    is_reverse=is_reverse,
                    param_attr=fluid.ParamAttr(name=out_name + "_w"),
                    bias_attr=fluid.ParamAttr(name=out_name + "_b"),
                    use_peepholes=False)
            lstm_list.append(seq)
        return lstm_list
|
||||
|
||||
|
||||
class SequenceEncoder(object):
    """
    Dispatch to the configured sequence encoder.

    args:
        params(dict): reads params['encoder_type'] ("reshape" or "rnn");
            the whole dict is forwarded to the concrete encoders. The RNN
            encoder is only constructed when encoder_type == "rnn".
    """

    def __init__(self, params):
        super(SequenceEncoder, self).__init__()
        self.encoder_type = params['encoder_type']
        self.encoder_reshape = EncoderWithReshape(params)
        if self.encoder_type == "rnn":
            self.encoder_rnn = EncoderWithRNN(params)

    def __call__(self, inputs):
        """
        Encode `inputs`.

        returns:
            the reshape encoder's output for "reshape"; for "rnn", the RNN
            encoder's output computed on the reshape encoder's output.
        raises:
            AssertionError: if encoder_type is neither "reshape" nor "rnn".
        """
        if self.encoder_type == "reshape":
            return self.encoder_reshape(inputs)
        if self.encoder_type == "rnn":
            return self.encoder_rnn(self.encoder_reshape(inputs))
        # Raised explicitly instead of `assert False`, which is stripped
        # under `python -O` and would surface as an UnboundLocalError.
        raise AssertionError(
            "Unsupport encoder_type:%s" % self.encoder_type)
|
|
@ -1,230 +0,0 @@
|
|||
#copyright (c) 2019 PaddlePaddle Authors. All Rights Reserve.
|
||||
#
|
||||
#Licensed under the Apache License, Version 2.0 (the "License");
|
||||
#you may not use this file except in compliance with the License.
|
||||
#You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
#Unless required by applicable law or agreed to in writing, software
|
||||
#distributed under the License is distributed on an "AS IS" BASIS,
|
||||
#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
#See the License for the specific language governing permissions and
|
||||
#limitations under the License.
|
||||
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import math
|
||||
|
||||
import paddle
|
||||
import paddle.fluid as fluid
|
||||
from paddle.fluid.param_attr import ParamAttr
|
||||
import numpy as np
|
||||
from .self_attention.model import wrap_encoder
|
||||
from .self_attention.model import wrap_encoder_forFeature
|
||||
gradient_clip = 10
|
||||
|
||||
|
||||
class SRNPredict(object):
    """Prediction head assembled from three stages: PVAM, GSRM and VSFD.

    ``params`` must provide the keys 'char_num', 'max_text_length',
    'num_heads', 'num_encoder_TUs', 'num_decoder_TUs' and 'hidden_dims'.
    """

    def __init__(self, params):
        super(SRNPredict, self).__init__()
        self.char_num = params['char_num']
        self.max_length = params['max_text_length']
        self.num_heads = params['num_heads']
        self.num_encoder_TUs = params['num_encoder_TUs']
        self.num_decoder_TUs = params['num_decoder_TUs']
        self.hidden_dims = params['hidden_dims']

    def _transformer_kwargs(self, n_layer):
        """Return the keyword arguments shared by every transformer stack."""
        per_head_dim = int(self.hidden_dims / self.num_heads)
        return dict(
            n_layer=n_layer,
            n_head=self.num_heads,
            d_key=per_head_dim,
            d_value=per_head_dim,
            d_model=self.hidden_dims,
            d_inner_hid=self.hidden_dims,
            prepostprocess_dropout=0.1,
            attention_dropout=0.1,
            relu_dropout=0.1,
            preprocess_cmd="n",
            postprocess_cmd="da",
            weight_sharing=True)

    def pvam(self, inputs, others):
        """Parallel Visual Attention Module.

        Args:
            inputs: feature map whose ``shape`` unpacks into four entries
                (read here as batch, channels, height, width).
            others: mapping providing "encoder_word_pos" and "gsrm_word_pos".

        Returns:
            The attention-weighted features, ``[b, max_length, c]``.
        """
        nn_ops = fluid.layers

        _, channels, height, width = inputs.shape
        flattened = nn_ops.reshape(
            x=inputs, shape=[-1, channels, height * width])
        conv_features = nn_ops.transpose(x=flattened, perm=[0, 2, 1])

        # ===== Transformer encoder =====
        _, seq_len, _ = conv_features.shape
        encoder_word_pos = others["encoder_word_pos"]
        gsrm_word_pos = others["gsrm_word_pos"]

        word_features = wrap_encoder_forFeature(
            src_vocab_size=-1,
            max_length=seq_len,
            enc_inputs=[conv_features, encoder_word_pos, None],
            **self._transformer_kwargs(self.num_encoder_TUs))
        fluid.clip.set_gradient_clip(
            fluid.clip.GradientClipByValue(gradient_clip))

        # ===== Parallel Visual Attention Module =====
        _, seq_len, feat_dim = word_features.shape

        word_features = nn_ops.fc(word_features, feat_dim, num_flatten_dims=2)
        # Broadcast the visual features over the max_length reading steps.
        tiled_features = nn_ops.reshape(word_features,
                                        [-1, 1, seq_len, feat_dim])
        tiled_features = nn_ops.expand(tiled_features,
                                       [1, self.max_length, 1, 1])
        # Broadcast the reading-order embedding over the seq_len positions.
        pos_embedding = nn_ops.embedding(gsrm_word_pos,
                                         [self.max_length, feat_dim])
        tiled_pos = nn_ops.reshape(pos_embedding,
                                   [-1, self.max_length, 1, feat_dim])
        tiled_pos = nn_ops.expand(tiled_pos, [1, 1, seq_len, 1])
        fused = nn_ops.elementwise_add(tiled_features, tiled_pos, act='tanh')

        scores = nn_ops.fc(input=fused,
                           size=1,
                           num_flatten_dims=3,
                           bias_attr=False)
        scores = nn_ops.reshape(x=scores, shape=[-1, self.max_length, seq_len])
        attention_weight = nn_ops.softmax(input=scores, axis=-1)

        # [b, max_length, c]
        return nn_ops.matmul(attention_weight, word_features)

    def gsrm(self, pvam_features, others):
        """Global Semantic Reasoning Module.

        Args:
            pvam_features: output of :meth:`pvam`; its ``shape`` unpacks into
                three entries.
            others: mapping providing "gsrm_word_pos", "gsrm_slf_attn_bias1"
                and "gsrm_slf_attn_bias2".

        Returns:
            Tuple ``(gsrm_features, word_out, gsrm_out)``.
        """
        nn_ops = fluid.layers

        # ===== GSRM Visual-to-semantic embedding block =====
        _, steps, feat_dim = pvam_features.shape
        word_out = nn_ops.fc(
            input=nn_ops.reshape(pvam_features, [-1, feat_dim]),
            size=self.char_num,
            act="softmax")
        word_ids = nn_ops.argmax(word_out, axis=1)
        word_ids.stop_gradient = True
        word_ids = nn_ops.reshape(x=word_ids, shape=[-1, steps, 1])

        # ===== GSRM Semantic reasoning block =====
        # This module is built from two transformers: gsrm_feature1 is the
        # forward one, gsrm_feature2 is the backward one.
        pad_idx = self.char_num
        gsrm_word_pos = others["gsrm_word_pos"]
        forward_bias = others["gsrm_slf_attn_bias1"]
        backward_bias = others["gsrm_slf_attn_bias2"]

        # Forward input: prepend one pad_idx entry along axis 1 and drop the
        # last entry. The pad op is applied to a float32 copy and the result
        # is cast back to int64.
        forward_ids = nn_ops.cast(word_ids, "float32")
        forward_ids = nn_ops.pad(forward_ids, [0, 0, 1, 0, 0, 0],
                                 pad_value=1.0 * pad_idx)
        forward_ids = nn_ops.cast(forward_ids, "int64")
        forward_ids = forward_ids[:, :-1, :]
        # Backward input: the predicted ids unchanged.
        backward_ids = word_ids

        forward_ids.stop_gradient = True
        backward_ids.stop_gradient = True

        decoder_kwargs = dict(
            src_vocab_size=self.char_num + 1, max_length=self.max_length)

        gsrm_feature1 = wrap_encoder(
            enc_inputs=[forward_ids, gsrm_word_pos, forward_bias],
            **dict(decoder_kwargs,
                   **self._transformer_kwargs(self.num_decoder_TUs)))
        gsrm_feature2 = wrap_encoder(
            enc_inputs=[backward_ids, gsrm_word_pos, backward_bias],
            **dict(decoder_kwargs,
                   **self._transformer_kwargs(self.num_decoder_TUs)))
        # Append one zero entry along axis 1, then drop the first entry.
        gsrm_feature2 = nn_ops.pad(gsrm_feature2, [0, 0, 0, 1, 0, 0],
                                   pad_value=0.)
        gsrm_feature2 = gsrm_feature2[:, 1:, ]
        gsrm_features = gsrm_feature1 + gsrm_feature2

        # Unpacking doubles as a check that the shape has three entries.
        _, _, _ = gsrm_features.shape

        # Project onto the embedding table registered under the name
        # "src_word_emb_table".
        embedding_table = fluid.default_main_program().global_block().var(
            "src_word_emb_table")
        gsrm_out = nn_ops.matmul(
            x=gsrm_features, y=embedding_table, transpose_y=True)
        _, _, out_dim = gsrm_out.shape
        gsrm_out = nn_ops.softmax(
            input=nn_ops.reshape(gsrm_out, [-1, out_dim]))

        return gsrm_features, word_out, gsrm_out

    def vsfd(self, pvam_features, gsrm_features):
        """Visual-Semantic Fusion Decoder.

        Mixes the two feature tensors with a learned sigmoid gate and
        classifies the mix over ``self.char_num`` classes.

        Returns:
            The softmax output of the final fully connected layer.
        """
        nn_ops = fluid.layers

        # ===== Visual-Semantic Fusion Decoder Module =====
        _, _, visual_dim = pvam_features.shape
        _, steps, semantic_dim = gsrm_features.shape
        stacked = nn_ops.concat([pvam_features, gsrm_features], axis=2)
        stacked_2d = nn_ops.reshape(
            x=stacked, shape=[-1, visual_dim + semantic_dim])
        gate = nn_ops.fc(input=stacked_2d, size=visual_dim, act="sigmoid")
        gate = nn_ops.reshape(x=gate, shape=[-1, steps, visual_dim])

        visual_part = gate * pvam_features
        semantic_part = (1.0 - gate) * gsrm_features
        combine_features = visual_part + semantic_part
        combined_2d = nn_ops.reshape(
            x=combine_features, shape=[-1, visual_dim])

        return nn_ops.fc(input=combined_2d,
                         size=self.char_num,
                         act="softmax")

    def __call__(self, inputs, others, mode=None):
        """Run PVAM, GSRM and VSFD in sequence and collect the outputs.

        ``mode`` is accepted but not used.

        Returns:
            dict with keys 'predict', 'decoded_out', 'word_out', 'gsrm_out'.
        """
        pvam_features = self.pvam(inputs, others)
        gsrm_features, word_out, gsrm_out = self.gsrm(pvam_features, others)
        final_out = self.vsfd(pvam_features, gsrm_features)

        _, decoded_out = fluid.layers.topk(input=final_out, k=1)
        return {
            'predict': final_out,
            'decoded_out': decoded_out,
            'word_out': word_out,
            'gsrm_out': gsrm_out
        }
|
|
@ -1,485 +0,0 @@
|
|||
from functools import partial
|
||||
import numpy as np
|
||||
|
||||
import paddle.fluid as fluid
|
||||
import paddle.fluid.layers as layers
|
||||
|
||||
# Names of the three encoder inputs, in order.
encoder_data_input_fields = ("src_word", "src_pos", "src_slf_attn_bias")
|
||||
|
||||
|
||||
def wrap_layer_with_block(layer, block_idx):
    """Return a version of ``layer`` that builds its ops in another block.

    While the wrapped layer runs, the main program's ``current_block_idx``
    is set to ``block_idx``; it is restored afterwards. This makes it
    possible to add layers to other blocks from within the current block,
    e.g. to define a cache used inside a while loop.
    """

    class BlockGuard(object):
        """Context manager that switches a program to a given block."""

        def __init__(self, block_idx=None, main_program=None):
            if main_program is None:
                main_program = fluid.default_main_program()
            self.main_program = main_program
            # The block to return to is recorded at construction time.
            self.old_block_idx = self.main_program.current_block().idx
            self.new_block_idx = block_idx

        def __enter__(self):
            self.main_program.current_block_idx = self.new_block_idx

        def __exit__(self, exc_type, exc_val, exc_tb):
            self.main_program.current_block_idx = self.old_block_idx
            # False when an exception is in flight, so it is re-raised.
            return exc_type is None

    def layer_wrapper(*args, **kwargs):
        with BlockGuard(block_idx):
            return layer(*args, **kwargs)

    return layer_wrapper
|
||||
|
||||
|
||||
def multi_head_attention(queries,
                         keys,
                         values,
                         attn_bias,
                         d_key,
                         d_value,
                         d_model,
                         n_head=1,
                         dropout_rate=0.,
                         cache=None,
                         gather_idx=None,
                         static_kv=False):
    """
    Multi-Head Attention.

    ``attn_bias`` is added to the logits before the softmax so that the
    selected positions are not considered in the attention weights.

    Args:
        queries: 3-D tensor.
        keys: 3-D tensor, or None to reuse ``queries``.
        values: 3-D tensor, or None to reuse ``keys``.
        attn_bias: bias added to the attention logits, or None for no bias.
        d_key: per-head size of the query/key projections.
        d_value: per-head size of the value projection.
        d_model: size of the output projection.
        n_head: number of attention heads.
        dropout_rate: dropout applied to the attention weights when truthy.
        cache: optional dict for inference, with keys "static_k"/"static_v"
            (when ``static_kv``) or "k"/"v".
        gather_idx: index passed to ``layers.gather`` on the cached tensors.
        static_kv: True for encoder-decoder attention in inference.

    Returns:
        The attention output projected back to ``d_model``.

    Raises:
        ValueError: if queries, keys and values are not all 3-D.
    """
    keys = queries if keys is None else keys
    values = keys if values is None else values

    if not (len(queries.shape) == len(keys.shape) == len(values.shape) == 3):
        raise ValueError(
            "Inputs: quries, keys and values should all be 3-D tensors.")

    def __compute_qkv(queries, keys, values, n_head, d_key, d_value):
        """
        Add linear projection to queries, keys, and values.
        """
        q = layers.fc(input=queries,
                      size=d_key * n_head,
                      bias_attr=False,
                      num_flatten_dims=2)
        # For encoder-decoder attention in inference, insert the ops and vars
        # into global block to use as cache among beam search.
        fc_layer = wrap_layer_with_block(
            layers.fc, fluid.default_main_program().current_block()
            .parent_idx) if cache is not None and static_kv else layers.fc
        k = fc_layer(
            input=keys,
            size=d_key * n_head,
            bias_attr=False,
            num_flatten_dims=2)
        v = fc_layer(
            input=values,
            size=d_value * n_head,
            bias_attr=False,
            num_flatten_dims=2)
        return q, k, v

    def __split_heads_qkv(queries, keys, values, n_head, d_key, d_value):
        """
        Reshape input tensors at the last dimension to split multi-heads
        and then transpose. Specifically, transform the input tensor with shape
        [bs, max_sequence_length, n_head * hidden_dim] to the output tensor
        with shape [bs, n_head, max_sequence_length, hidden_dim].
        """
        # The value 0 in shape attr means copying the corresponding dimension
        # size of the input as the output dimension size.
        reshaped_q = layers.reshape(
            x=queries, shape=[0, 0, n_head, d_key], inplace=True)
        # permute the dimensions into:
        # [batch_size, n_head, max_sequence_len, hidden_size_per_head]
        q = layers.transpose(x=reshaped_q, perm=[0, 2, 1, 3])
        # For encoder-decoder attention in inference, insert the ops and vars
        # into global block to use as cache among beam search.
        reshape_layer = wrap_layer_with_block(
            layers.reshape,
            fluid.default_main_program().current_block()
            .parent_idx) if cache is not None and static_kv else layers.reshape
        transpose_layer = wrap_layer_with_block(
            layers.transpose,
            fluid.default_main_program().current_block().
            parent_idx) if cache is not None and static_kv else layers.transpose
        reshaped_k = reshape_layer(
            x=keys, shape=[0, 0, n_head, d_key], inplace=True)
        k = transpose_layer(x=reshaped_k, perm=[0, 2, 1, 3])
        reshaped_v = reshape_layer(
            x=values, shape=[0, 0, n_head, d_value], inplace=True)
        v = transpose_layer(x=reshaped_v, perm=[0, 2, 1, 3])

        if cache is not None:  # only for faster inference
            if static_kv:  # For encoder-decoder attention in inference
                cache_k, cache_v = cache["static_k"], cache["static_v"]
                # To init the static_k and static_v in cache.
                # Maybe we can use condition_op(if_else) to do these at the first
                # step in while loop to replace these, however it might be less
                # efficient.
                static_cache_init = wrap_layer_with_block(
                    layers.assign,
                    fluid.default_main_program().current_block().parent_idx)
                static_cache_init(k, cache_k)
                static_cache_init(v, cache_v)
            else:  # For decoder self-attention in inference
                cache_k, cache_v = cache["k"], cache["v"]
            # gather cell states corresponding to selected parent
            # (applies to both branches above; select_k/select_v are returned
            # for either kind of cache)
            select_k = layers.gather(cache_k, index=gather_idx)
            select_v = layers.gather(cache_v, index=gather_idx)
            if not static_kv:
                # For self attention in inference, use cache and concat time steps.
                select_k = layers.concat([select_k, k], axis=2)
                select_v = layers.concat([select_v, v], axis=2)
                # update cell states(caches) cached in global block
                layers.assign(select_k, cache_k)
                layers.assign(select_v, cache_v)
            return q, select_k, select_v
        return q, k, v

    def __combine_heads(x):
        """
        Transpose and then reshape the last two dimensions of input tensor x
        so that it becomes one dimension, which is reverse to __split_heads.
        """
        if len(x.shape) != 4:
            raise ValueError("Input(x) should be a 4-D Tensor.")

        trans_x = layers.transpose(x, perm=[0, 2, 1, 3])
        # The value 0 in shape attr means copying the corresponding dimension
        # size of the input as the output dimension size.
        return layers.reshape(
            x=trans_x,
            shape=[0, 0, trans_x.shape[2] * trans_x.shape[3]],
            inplace=True)

    def scaled_dot_product_attention(q, k, v, attn_bias, d_key, dropout_rate):
        """
        Scaled Dot-Product Attention
        """
        product = layers.matmul(x=q, y=k, transpose_y=True, alpha=d_key**-0.5)
        # Compare against None explicitly: the bias is a tensor, and asking
        # for a tensor's truth value is not the same as asking whether a bias
        # was supplied.
        if attn_bias is not None:
            product += attn_bias
        weights = layers.softmax(product)
        if dropout_rate:
            weights = layers.dropout(
                weights, dropout_prob=dropout_rate, seed=None, is_test=False)
        out = layers.matmul(weights, v)
        return out

    q, k, v = __compute_qkv(queries, keys, values, n_head, d_key, d_value)
    q, k, v = __split_heads_qkv(q, k, v, n_head, d_key, d_value)

    # NOTE(review): d_model (not d_key) is passed as the scaling dimension,
    # so the logits are scaled by d_model**-0.5. Kept as-is because changing
    # it would alter the numerics of existing models; confirm intent.
    ctx_multiheads = scaled_dot_product_attention(q, k, v, attn_bias, d_model,
                                                  dropout_rate)

    out = __combine_heads(ctx_multiheads)

    # Project back to the model size.
    proj_out = layers.fc(input=out,
                         size=d_model,
                         bias_attr=False,
                         num_flatten_dims=2)
    return proj_out
|
||||
|
||||
|
||||
def positionwise_feed_forward(x, d_inner_hid, d_hid, dropout_rate):
    """
    Position-wise feed-forward network.

    Two fully connected layers with a ReLU in between: the first has size
    ``d_inner_hid``, the second ``d_hid``. Dropout is applied to the hidden
    activation only when ``dropout_rate`` is truthy.
    """
    inner = layers.fc(
        input=x, size=d_inner_hid, num_flatten_dims=2, act="relu")
    if dropout_rate:
        inner = layers.dropout(
            inner, dropout_prob=dropout_rate, seed=None, is_test=False)
    return layers.fc(input=inner, size=d_hid, num_flatten_dims=2)
|
||||
|
||||
|
||||
def pre_post_process_layer(prev_out, out, process_cmd, dropout_rate=0.):
    """
    Apply the steps named in ``process_cmd`` to ``out``, in order.

    Each character of ``process_cmd`` selects one step:
      "a": add the residual ``prev_out`` (skipped when ``prev_out`` is None)
      "n": layer normalization over the last axis
      "d": dropout (skipped when ``dropout_rate`` is falsy)
    Any other character is ignored.

    This is used before or after multi-head attention and position-wise
    feed-forward networks.

    Args:
        prev_out: residual input, or None for no residual connection.
        out: tensor to process.
        process_cmd: string of step characters.
        dropout_rate: dropout probability for the "d" step.

    Returns:
        The processed tensor (``out`` itself when no step applies).
    """
    for cmd in process_cmd:
        if cmd == "a":  # add residual connection
            # Test for None explicitly: truth-testing a tensor does not
            # answer "was a residual supplied?".
            if prev_out is not None:
                out = out + prev_out
        elif cmd == "n":  # add layer normalization
            out = layers.layer_norm(
                out,
                begin_norm_axis=len(out.shape) - 1,
                param_attr=fluid.initializer.Constant(1.),
                bias_attr=fluid.initializer.Constant(0.))
        elif cmd == "d":  # add dropout
            if dropout_rate:
                out = layers.dropout(
                    out, dropout_prob=dropout_rate, seed=None, is_test=False)
    return out


# Pre-processing has no residual input, so prev_out is fixed to None.
pre_process_layer = partial(pre_post_process_layer, None)
post_process_layer = pre_post_process_layer
|
||||
|
||||
|
||||
def prepare_encoder(
        src_word,  # [b,t,c]
        src_pos,
        src_vocab_size,
        src_emb_dim,
        src_max_len,
        dropout_rate=0.,
        bos_idx=0,
        word_emb_param_name=None,
        pos_enc_param_name=None):
    """Scale the input features and add position encodings.

    ``src_word`` is used directly as the feature tensor (no embedding lookup
    is done here): it is cast to float32 and scaled by ``src_emb_dim**0.5``.
    A non-trainable position embedding of size ``[src_max_len, src_emb_dim]``
    is looked up with ``src_pos`` and added; dropout follows when
    ``dropout_rate`` is truthy.

    ``src_vocab_size``, ``bos_idx`` and ``word_emb_param_name`` are accepted
    but not used by this function.

    This module is used at the bottom of the encoder stacks.
    """
    features = layers.cast(src_word, 'float32')
    features = layers.scale(x=features, scale=src_emb_dim**0.5)

    position_attr = fluid.ParamAttr(name=pos_enc_param_name, trainable=False)
    position_enc = layers.embedding(
        src_pos, size=[src_max_len, src_emb_dim], param_attr=position_attr)
    position_enc.stop_gradient = True

    enc_input = features + position_enc
    if not dropout_rate:
        return enc_input
    return layers.dropout(
        enc_input, dropout_prob=dropout_rate, seed=None, is_test=False)
|
||||
|
||||
|
||||
def prepare_decoder(src_word,
                    src_pos,
                    src_vocab_size,
                    src_emb_dim,
                    src_max_len,
                    dropout_rate=0.,
                    bos_idx=0,
                    word_emb_param_name=None,
                    pos_enc_param_name=None):
    """Add word embeddings and position encodings.

    ``src_word`` is looked up in a ``[src_vocab_size, src_emb_dim]``
    embedding table (parameter name ``word_emb_param_name``) and scaled by
    ``src_emb_dim**0.5``. A non-trainable position embedding of size
    ``[src_max_len, src_emb_dim]`` is looked up with ``src_pos`` and added;
    dropout follows when ``dropout_rate`` is truthy.

    This module is used at the bottom of the encoder stacks.
    """
    word_attr = fluid.ParamAttr(
        name=word_emb_param_name,
        initializer=fluid.initializer.Normal(0., src_emb_dim**-0.5))
    word_emb = layers.embedding(
        src_word,
        size=[src_vocab_size, src_emb_dim],
        padding_idx=bos_idx,  # set embedding of bos to 0
        param_attr=word_attr)
    word_emb = layers.scale(x=word_emb, scale=src_emb_dim**0.5)

    position_attr = fluid.ParamAttr(name=pos_enc_param_name, trainable=False)
    position_enc = layers.embedding(
        src_pos, size=[src_max_len, src_emb_dim], param_attr=position_attr)
    position_enc.stop_gradient = True

    enc_input = word_emb + position_enc
    if not dropout_rate:
        return enc_input
    return layers.dropout(
        enc_input, dropout_prob=dropout_rate, seed=None, is_test=False)
|
||||
|
||||
|
||||
def encoder_layer(enc_input,
                  attn_bias,
                  n_head,
                  d_key,
                  d_value,
                  d_model,
                  d_inner_hid,
                  prepostprocess_dropout,
                  attention_dropout,
                  relu_dropout,
                  preprocess_cmd="n",
                  postprocess_cmd="da"):
    """One encoder layer; layers can be stacked to form a deep encoder.

    Consists of multi-head self-attention followed by a position-wise
    feed-forward network. Each sub-layer's input goes through
    ``pre_process_layer`` (``preprocess_cmd``) and its output through
    ``post_process_layer`` (``postprocess_cmd``), with the sub-layer input
    supplied as the residual.
    """
    # Self-attention sub-layer: keys and values are None, so they default
    # to the queries.
    attn_in = pre_process_layer(enc_input, preprocess_cmd,
                                prepostprocess_dropout)
    attn_raw = multi_head_attention(attn_in, None, None, attn_bias, d_key,
                                    d_value, d_model, n_head,
                                    attention_dropout)
    attn_output = post_process_layer(enc_input, attn_raw, postprocess_cmd,
                                     prepostprocess_dropout)

    # Feed-forward sub-layer.
    ffd_in = pre_process_layer(attn_output, preprocess_cmd,
                               prepostprocess_dropout)
    ffd_output = positionwise_feed_forward(ffd_in, d_inner_hid, d_model,
                                           relu_dropout)
    return post_process_layer(attn_output, ffd_output, postprocess_cmd,
                              prepostprocess_dropout)
|
||||
|
||||
|
||||
def encoder(enc_input,
            attn_bias,
            n_layer,
            n_head,
            d_key,
            d_value,
            d_model,
            d_inner_hid,
            prepostprocess_dropout,
            attention_dropout,
            relu_dropout,
            preprocess_cmd="n",
            postprocess_cmd="da"):
    """
    The encoder is composed of a stack of ``n_layer`` identical layers
    returned by calling encoder_layer; each layer's output is the next
    layer's input. The output of the last layer is passed through
    ``pre_process_layer`` once more before being returned.

    With ``n_layer == 0`` no layer is applied and only that final
    ``pre_process_layer`` step runs on ``enc_input``.
    """
    # Start from the input so that an empty stack is well defined instead of
    # failing on an unbound local.
    enc_output = enc_input
    for _ in range(n_layer):
        enc_output = encoder_layer(
            enc_output,
            attn_bias,
            n_head,
            d_key,
            d_value,
            d_model,
            d_inner_hid,
            prepostprocess_dropout,
            attention_dropout,
            relu_dropout,
            preprocess_cmd,
            postprocess_cmd, )
    enc_output = pre_process_layer(enc_output, preprocess_cmd,
                                   prepostprocess_dropout)
    return enc_output
|
||||
|
||||
|
||||
def wrap_encoder_forFeature(src_vocab_size,
                            max_length,
                            n_layer,
                            n_head,
                            d_key,
                            d_value,
                            d_model,
                            d_inner_hid,
                            prepostprocess_dropout,
                            attention_dropout,
                            relu_dropout,
                            preprocess_cmd,
                            postprocess_cmd,
                            weight_sharing,
                            enc_inputs=None,
                            bos_idx=0):
    """
    Assemble the encoder for a feature-tensor input.

    ``enc_inputs`` is unpacked as
    ``conv_features, src_pos, src_slf_attn_bias``. The features go through
    ``prepare_encoder`` and then ``encoder``.

    ``weight_sharing`` is accepted but not used by this function.
    """
    conv_features, src_pos, src_slf_attn_bias = enc_inputs
    # Unpacking fails unless the shape has exactly three entries.
    _, _, _ = conv_features.shape

    prepared = prepare_encoder(
        src_word=conv_features,
        src_pos=src_pos,
        src_vocab_size=src_vocab_size,
        src_emb_dim=d_model,
        src_max_len=max_length,
        dropout_rate=prepostprocess_dropout,
        bos_idx=bos_idx,
        word_emb_param_name="src_word_emb_table")

    return encoder(
        enc_input=prepared,
        attn_bias=src_slf_attn_bias,
        n_layer=n_layer,
        n_head=n_head,
        d_key=d_key,
        d_value=d_value,
        d_model=d_model,
        d_inner_hid=d_inner_hid,
        prepostprocess_dropout=prepostprocess_dropout,
        attention_dropout=attention_dropout,
        relu_dropout=relu_dropout,
        preprocess_cmd=preprocess_cmd,
        postprocess_cmd=postprocess_cmd)
|
||||
|
||||
|
||||
def wrap_encoder(src_vocab_size,
                 max_length,
                 n_layer,
                 n_head,
                 d_key,
                 d_value,
                 d_model,
                 d_inner_hid,
                 prepostprocess_dropout,
                 attention_dropout,
                 relu_dropout,
                 preprocess_cmd,
                 postprocess_cmd,
                 weight_sharing,
                 enc_inputs=None,
                 bos_idx=0):
    """
    Assemble the encoder for a word-id input.

    ``enc_inputs`` is unpacked as ``src_word, src_pos, src_slf_attn_bias``.
    The ids are embedded by ``prepare_decoder`` (embedding table named
    "src_word_emb_table") and the result is run through ``encoder``.

    ``weight_sharing`` is accepted but not used by this function.
    """
    src_word, src_pos, src_slf_attn_bias = enc_inputs

    embedded = prepare_decoder(
        src_word=src_word,
        src_pos=src_pos,
        src_vocab_size=src_vocab_size,
        src_emb_dim=d_model,
        src_max_len=max_length,
        dropout_rate=prepostprocess_dropout,
        bos_idx=bos_idx,
        word_emb_param_name="src_word_emb_table")

    return encoder(
        enc_input=embedded,
        attn_bias=src_slf_attn_bias,
        n_layer=n_layer,
        n_head=n_head,
        d_key=d_key,
        d_value=d_value,
        d_model=d_model,
        d_inner_hid=d_inner_hid,
        prepostprocess_dropout=prepostprocess_dropout,
        attention_dropout=attention_dropout,
        relu_dropout=relu_dropout,
        preprocess_cmd=preprocess_cmd,
        postprocess_cmd=postprocess_cmd)
|
|
@ -11,3 +11,22 @@
|
|||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import copy
|
||||
|
||||
|
||||
def build_loss(config):
    """Instantiate the loss selected by ``config['name']``.

    Args:
        config: dict with a 'name' entry ('DBLoss' or 'CTCLoss'); every
            other entry is passed to the loss constructor as a keyword
            argument. The dict is deep-copied, so the caller's object is
            not modified.

    Returns:
        An instance of the selected loss class.

    Raises:
        KeyError: if ``config`` has no 'name' entry.
        AssertionError: if the name is not a supported loss.
    """
    # det loss
    from .det_db_loss import DBLoss

    # rec loss
    from .rec_ctc_loss import CTCLoss

    support_dict = ['DBLoss', 'CTCLoss']
    # Explicit name -> class table; avoids eval() on a config-supplied string.
    loss_classes = {'DBLoss': DBLoss, 'CTCLoss': CTCLoss}

    config = copy.deepcopy(config)
    module_name = config.pop('name')
    # Raised explicitly (same exception type as the former assert) so the
    # check is not stripped under ``python -O``.
    if module_name not in support_dict:
        raise AssertionError('loss only support {}'.format(support_dict))
    module_class = loss_classes[module_name](**config)
    return module_class
|
||||
|
|
|
@ -1,16 +1,16 @@
|
|||
#copyright (c) 2019 PaddlePaddle Authors. All Rights Reserve.
|
||||
# copyright (c) 2019 PaddlePaddle Authors. All Rights Reserve.
|
||||
#
|
||||
#Licensed under the Apache License, Version 2.0 (the "License");
|
||||
#you may not use this file except in compliance with the License.
|
||||
#You may obtain a copy of the License at
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
#Unless required by applicable law or agreed to in writing, software
|
||||
#distributed under the License is distributed on an "AS IS" BASIS,
|
||||
#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
#See the License for the specific language governing permissions and
|
||||
#limitations under the License.
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
|
@ -18,99 +18,189 @@ from __future__ import print_function
|
|||
|
||||
import numpy as np
|
||||
|
||||
import paddle.fluid as fluid
|
||||
import paddle
|
||||
from paddle import nn
|
||||
import paddle.nn.functional as F
|
||||
|
||||
|
||||
def BalanceLoss(pred,
|
||||
gt,
|
||||
mask,
|
||||
balance_loss=True,
|
||||
main_loss_type="DiceLoss",
|
||||
negative_ratio=3,
|
||||
return_origin=False,
|
||||
eps=1e-6):
|
||||
"""
|
||||
The BalanceLoss for Differentiable Binarization text detection
|
||||
args:
|
||||
pred (variable): predicted feature maps.
|
||||
gt (variable): ground truth feature maps.
|
||||
mask (variable): masked maps.
|
||||
balance_loss (bool): whether balance loss or not, default is True
|
||||
main_loss_type (str): can only be one of ['CrossEntropy','DiceLoss',
|
||||
'Euclidean','BCELoss', 'MaskL1Loss'], default is 'DiceLoss'.
|
||||
negative_ratio (int|float): float, default is 3.
|
||||
return_origin (bool): whether return unbalanced loss or not, default is False.
|
||||
eps (float): default is 1e-6.
|
||||
return: (variable) balanced loss
|
||||
"""
|
||||
positive = gt * mask
|
||||
negative = (1 - gt) * mask
|
||||
class BalanceLoss(nn.Layer):
|
||||
def __init__(self,
|
||||
balance_loss=True,
|
||||
main_loss_type='DiceLoss',
|
||||
negative_ratio=3,
|
||||
return_origin=False,
|
||||
eps=1e-6,
|
||||
**kwargs):
|
||||
"""
|
||||
The BalanceLoss for Differentiable Binarization text detection
|
||||
args:
|
||||
balance_loss (bool): whether balance loss or not, default is True
|
||||
main_loss_type (str): can only be one of ['CrossEntropy','DiceLoss',
|
||||
'Euclidean','BCELoss', 'MaskL1Loss'], default is 'DiceLoss'.
|
||||
negative_ratio (int|float): float, default is 3.
|
||||
return_origin (bool): whether return unbalanced loss or not, default is False.
|
||||
eps (float): default is 1e-6.
|
||||
"""
|
||||
super(BalanceLoss, self).__init__()
|
||||
self.balance_loss = balance_loss
|
||||
self.main_loss_type = main_loss_type
|
||||
self.negative_ratio = negative_ratio
|
||||
self.main_loss_type = main_loss_type
|
||||
self.return_origin = return_origin
|
||||
self.eps = eps
|
||||
|
||||
positive_count = fluid.layers.reduce_sum(positive)
|
||||
positive_count_int = fluid.layers.cast(positive_count, dtype=np.int32)
|
||||
negative_count = min(
|
||||
fluid.layers.reduce_sum(negative), positive_count * negative_ratio)
|
||||
negative_count_int = fluid.layers.cast(negative_count, dtype=np.int32)
|
||||
if self.main_loss_type == "CrossEntropy":
|
||||
self.loss = nn.CrossEntropyLoss()
|
||||
elif self.main_loss_type == "Euclidean":
|
||||
self.loss = nn.MSELoss()
|
||||
elif self.main_loss_type == "DiceLoss":
|
||||
self.loss = DiceLoss(self.eps)
|
||||
elif self.main_loss_type == "BCELoss":
|
||||
self.loss = BCELoss(reduction='none')
|
||||
elif self.main_loss_type == "MaskL1Loss":
|
||||
self.loss = MaskL1Loss(self.eps)
|
||||
else:
|
||||
loss_type = [
|
||||
'CrossEntropy', 'DiceLoss', 'Euclidean', 'BCELoss', 'MaskL1Loss'
|
||||
]
|
||||
raise Exception(
|
||||
"main_loss_type in BalanceLoss() can only be one of {}".format(
|
||||
loss_type))
|
||||
|
||||
if main_loss_type == "CrossEntropy":
|
||||
loss = fluid.layers.cross_entropy(input=pred, label=gt, soft_label=True)
|
||||
loss = fluid.layers.reduce_mean(loss)
|
||||
elif main_loss_type == "Euclidean":
|
||||
loss = fluid.layers.square(pred - gt)
|
||||
loss = fluid.layers.reduce_mean(loss)
|
||||
elif main_loss_type == "DiceLoss":
|
||||
loss = DiceLoss(pred, gt, mask)
|
||||
elif main_loss_type == "BCELoss":
|
||||
loss = fluid.layers.sigmoid_cross_entropy_with_logits(pred, label=gt)
|
||||
elif main_loss_type == "MaskL1Loss":
|
||||
loss = MaskL1Loss(pred, gt, mask)
|
||||
else:
|
||||
loss_type = [
|
||||
'CrossEntropy', 'DiceLoss', 'Euclidean', 'BCELoss', 'MaskL1Loss'
|
||||
]
|
||||
raise Exception("main_loss_type in BalanceLoss() can only be one of {}".
|
||||
format(loss_type))
|
||||
def forward(self, pred, gt, mask=None):
|
||||
"""
|
||||
The BalanceLoss for Differentiable Binarization text detection
|
||||
args:
|
||||
pred (variable): predicted feature maps.
|
||||
gt (variable): ground truth feature maps.
|
||||
mask (variable): masked maps.
|
||||
return: (variable) balanced loss
|
||||
"""
|
||||
# if self.main_loss_type in ['DiceLoss']:
|
||||
# # For the loss that returns to scalar value, perform ohem on the mask
|
||||
# mask = ohem_batch(pred, gt, mask, self.negative_ratio)
|
||||
# loss = self.loss(pred, gt, mask)
|
||||
# return loss
|
||||
|
||||
if not balance_loss:
|
||||
positive = gt * mask
|
||||
negative = (1 - gt) * mask
|
||||
|
||||
positive_count = int(positive.sum())
|
||||
negative_count = int(
|
||||
min(negative.sum(), positive_count * self.negative_ratio))
|
||||
loss = self.loss(pred, gt, mask=mask)
|
||||
|
||||
if not self.balance_loss:
|
||||
return loss
|
||||
|
||||
positive_loss = positive * loss
|
||||
negative_loss = negative * loss
|
||||
negative_loss = paddle.reshape(negative_loss, shape=[-1])
|
||||
if negative_count > 0:
|
||||
sort_loss = negative_loss.sort(descending=True)
|
||||
negative_loss = sort_loss[:negative_count]
|
||||
# negative_loss, _ = paddle.topk(negative_loss, k=negative_count_int)
|
||||
balance_loss = (positive_loss.sum() + negative_loss.sum()) / (
|
||||
positive_count + negative_count + self.eps)
|
||||
else:
|
||||
balance_loss = positive_loss.sum() / (positive_count + self.eps)
|
||||
if self.return_origin:
|
||||
return balance_loss, loss
|
||||
|
||||
return balance_loss
|
||||
|
||||
|
||||
class DiceLoss(nn.Layer):
    """Dice loss computed over the masked region.

    ``eps`` is added to the denominator.
    """

    def __init__(self, eps=1e-6):
        super(DiceLoss, self).__init__()
        self.eps = eps

    def forward(self, pred, gt, mask, weights=None):
        """
        Return ``1 - 2 * intersection / union`` for ``pred`` against ``gt``.

        ``pred``, ``gt`` and ``mask`` must share one shape. When ``weights``
        is given it must match ``mask`` in shape and is multiplied into it.
        """
        assert pred.shape == gt.shape
        assert pred.shape == mask.shape
        if weights is not None:
            assert weights.shape == mask.shape
            mask = weights * mask

        overlap = paddle.sum(pred * gt * mask)
        pred_area = paddle.sum(pred * mask)
        gt_area = paddle.sum(gt * mask)
        denominator = pred_area + gt_area + self.eps

        dice = 1 - 2.0 * overlap / denominator
        assert dice <= 1
        return dice
|
||||
|
||||
positive_loss = positive * loss
|
||||
negative_loss = negative * loss
|
||||
negative_loss = fluid.layers.reshape(negative_loss, shape=[-1])
|
||||
negative_loss, _ = fluid.layers.topk(negative_loss, k=negative_count_int)
|
||||
balance_loss = (fluid.layers.reduce_sum(positive_loss) +
|
||||
fluid.layers.reduce_sum(negative_loss)) / (
|
||||
positive_count + negative_count + eps)
|
||||
|
||||
if return_origin:
|
||||
return balance_loss, loss
|
||||
return balance_loss
|
||||
class MaskL1Loss(nn.Layer):
    """
    Mask L1 Loss

    Sum of ``|pred - gt| * mask`` divided by ``sum(mask) + eps``.

    args:
        eps (float): constant added to the denominator.
    """

    def __init__(self, eps=1e-6):
        super(MaskL1Loss, self).__init__()
        self.eps = eps

    def forward(self, pred, gt, mask):
        """
        Compute the masked L1 loss between ``pred`` and ``gt``.

        args:
            pred: predicted map.
            gt: ground-truth map.
            mask: map multiplied into the absolute error and summed for
                the normaliser.
        return:
            the mean of the normalised masked error.
        """
        masked_error = paddle.abs(pred - gt) * mask
        normaliser = mask.sum() + self.eps
        ratio = masked_error.sum() / normaliser
        return paddle.mean(ratio)
|
||||
|
||||
|
||||
def DiceLoss(pred, gt, mask, weights=None, eps=1e-6):
|
||||
"""
|
||||
DiceLoss function.
|
||||
"""
|
||||
class BCELoss(nn.Layer):
|
||||
def __init__(self, reduction='mean'):
|
||||
super(BCELoss, self).__init__()
|
||||
self.reduction = reduction
|
||||
|
||||
assert pred.shape == gt.shape
|
||||
assert pred.shape == mask.shape
|
||||
if weights is not None:
|
||||
assert weights.shape == mask.shape
|
||||
mask = weights * mask
|
||||
intersection = fluid.layers.reduce_sum(pred * gt * mask)
|
||||
|
||||
union = fluid.layers.reduce_sum(pred * mask) + fluid.layers.reduce_sum(
|
||||
gt * mask) + eps
|
||||
loss = 1 - 2.0 * intersection / union
|
||||
assert loss <= 1
|
||||
return loss
|
||||
def forward(self, input, label, mask=None, weight=None, name=None):
|
||||
loss = F.binary_cross_entropy(input, label, reduction=self.reduction)
|
||||
return loss
|
||||
|
||||
|
||||
def MaskL1Loss(pred, gt, mask, eps=1e-6):
    """
    Mask L1 Loss

    Sum of ``|pred - gt| * mask`` divided by ``sum(mask) + eps``, then
    passed through ``reduce_mean``.

    args:
        pred: predicted map.
        gt: ground-truth map.
        mask: map multiplied into the absolute error and summed for the
            normaliser.
        eps (float): constant added to the denominator.
    """
    masked_error = fluid.layers.abs(pred - gt) * mask
    error_sum = fluid.layers.reduce_sum(masked_error)
    normaliser = fluid.layers.reduce_sum(mask) + eps
    return fluid.layers.reduce_mean(error_sum / normaliser)
|
||||
def ohem_single(score, gt_text, training_mask, ohem_ratio):
    """
    Build the online-hard-example-mining mask for one sample.

    Pixels with ``gt_text > 0.5`` are positives; pixels with
    ``gt_text <= 0.5`` are negatives. At most ``pos_num * ohem_ratio``
    negatives are kept, chosen by highest ``score``.

    args:
        score: 2-D numpy array of predicted scores.
        gt_text: 2-D numpy array, ground-truth text map.
        training_mask: 2-D numpy array; pixels with value ``<= 0.5`` are
            excluded from the result and from the positive count.
        ohem_ratio: number of negatives kept per counted positive.
    return:
        float32 numpy array of shape ``(1, H, W)``. When there are no
        counted positives, or no negatives to keep, it is
        ``training_mask`` itself (reshaped and cast).
    """

    def _as_single_batch(mask):
        # Add a leading axis of length 1 and cast to float32.
        rows, cols = mask.shape[0], mask.shape[1]
        return mask.reshape(1, rows, cols).astype('float32')

    is_text = gt_text > 0.5
    is_background = gt_text <= 0.5

    # Positives that fall inside the training mask.
    text_total = int(np.sum(is_text))
    text_ignored = int(np.sum(is_text & (training_mask <= 0.5)))
    pos_num = text_total - text_ignored
    if pos_num == 0:
        return _as_single_batch(training_mask)

    # The negative count does not take training_mask into account.
    neg_num = int(np.sum(is_background))
    neg_num = int(min(pos_num * ohem_ratio, neg_num))
    if neg_num == 0:
        return _as_single_batch(training_mask)

    # Sort the negative scores from high to low; the neg_num-th highest
    # becomes the selection threshold.
    neg_score_desc = -np.sort(-score[is_background])
    threshold = neg_score_desc[neg_num - 1]

    # Keep high-scoring pixels and positives, restricted to the mask.
    keep = ((score >= threshold) | is_text) & (training_mask > 0.5)
    return _as_single_batch(keep)
|
||||
|
||||
|
||||
def ohem_batch(scores, gt_texts, training_masks, ohem_ratio):
    """
    Apply ``ohem_single`` to every sample of a batch.

    args:
        scores: object exposing ``.numpy()``; the resulting array is
            indexed as ``[i, :, :]``, i.e. treated as 3-D.
        gt_texts: same convention as ``scores``.
        training_masks: same convention as ``scores``.
        ohem_ratio: forwarded unchanged to ``ohem_single``.
    return:
        the per-sample masks concatenated along axis 0 and wrapped with
        ``paddle.to_variable``.
    """
    score_arr = scores.numpy()
    gt_arr = gt_texts.numpy()
    mask_arr = training_masks.numpy()

    per_sample = [
        ohem_single(score_arr[idx, :, :], gt_arr[idx, :, :],
                    mask_arr[idx, :, :], ohem_ratio)
        for idx in range(score_arr.shape[0])
    ]

    stacked = np.concatenate(per_sample, 0)
    return paddle.to_variable(stacked)
|
||||
|
|
|
@ -1,68 +1,71 @@
|
|||
#copyright (c) 2019 PaddlePaddle Authors. All Rights Reserve.
|
||||
# copyright (c) 2019 PaddlePaddle Authors. All Rights Reserve.
|
||||
#
|
||||
#Licensed under the Apache License, Version 2.0 (the "License");
|
||||
#you may not use this file except in compliance with the License.
|
||||
#You may obtain a copy of the License at
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
#Unless required by applicable law or agreed to in writing, software
|
||||
#distributed under the License is distributed on an "AS IS" BASIS,
|
||||
#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
#See the License for the specific language governing permissions and
|
||||
#limitations under the License.
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
from paddle import nn
|
||||
|
||||
from .det_basic_loss import BalanceLoss, MaskL1Loss, DiceLoss
|
||||
|
||||
|
||||
class DBLoss(object):
|
||||
class DBLoss(nn.Layer):
|
||||
"""
|
||||
Differentiable Binarization (DB) Loss Function
|
||||
args:
|
||||
param (dict): the super paramter for DB Loss
|
||||
"""
|
||||
|
||||
def __init__(self, params):
|
||||
def __init__(self,
|
||||
balance_loss=True,
|
||||
main_loss_type='DiceLoss',
|
||||
alpha=5,
|
||||
beta=10,
|
||||
ohem_ratio=3,
|
||||
eps=1e-6,
|
||||
**kwargs):
|
||||
super(DBLoss, self).__init__()
|
||||
self.balance_loss = params['balance_loss']
|
||||
self.main_loss_type = params['main_loss_type']
|
||||
self.alpha = alpha
|
||||
self.beta = beta
|
||||
self.dice_loss = DiceLoss(eps=eps)
|
||||
self.l1_loss = MaskL1Loss(eps=eps)
|
||||
self.bce_loss = BalanceLoss(
|
||||
balance_loss=balance_loss,
|
||||
main_loss_type=main_loss_type,
|
||||
negative_ratio=ohem_ratio)
|
||||
|
||||
self.alpha = params['alpha']
|
||||
self.beta = params['beta']
|
||||
self.ohem_ratio = params['ohem_ratio']
|
||||
def forward(self, predicts, labels):
|
||||
label_threshold_map, label_threshold_mask, label_shrink_map, label_shrink_mask = labels[
|
||||
1:]
|
||||
shrink_maps = predicts[:, 0, :, :]
|
||||
threshold_maps = predicts[:, 1, :, :]
|
||||
binary_maps = predicts[:, 2, :, :]
|
||||
|
||||
def __call__(self, predicts, labels):
|
||||
label_shrink_map = labels['shrink_map']
|
||||
label_shrink_mask = labels['shrink_mask']
|
||||
label_threshold_map = labels['threshold_map']
|
||||
label_threshold_mask = labels['threshold_mask']
|
||||
pred = predicts['maps']
|
||||
shrink_maps = pred[:, 0, :, :]
|
||||
threshold_maps = pred[:, 1, :, :]
|
||||
binary_maps = pred[:, 2, :, :]
|
||||
|
||||
loss_shrink_maps = BalanceLoss(
|
||||
shrink_maps,
|
||||
label_shrink_map,
|
||||
label_shrink_mask,
|
||||
balance_loss=self.balance_loss,
|
||||
main_loss_type=self.main_loss_type,
|
||||
negative_ratio=self.ohem_ratio)
|
||||
loss_threshold_maps = MaskL1Loss(threshold_maps, label_threshold_map,
|
||||
label_threshold_mask)
|
||||
loss_binary_maps = DiceLoss(binary_maps, label_shrink_map,
|
||||
label_shrink_mask)
|
||||
loss_shrink_maps = self.bce_loss(shrink_maps, label_shrink_map,
|
||||
label_shrink_mask)
|
||||
loss_threshold_maps = self.l1_loss(threshold_maps, label_threshold_map,
|
||||
label_threshold_mask)
|
||||
loss_binary_maps = self.dice_loss(binary_maps, label_shrink_map,
|
||||
label_shrink_mask)
|
||||
loss_shrink_maps = self.alpha * loss_shrink_maps
|
||||
loss_threshold_maps = self.beta * loss_threshold_maps
|
||||
|
||||
loss_all = loss_shrink_maps + loss_threshold_maps\
|
||||
+ loss_binary_maps
|
||||
losses = {'total_loss':loss_all,\
|
||||
"loss_shrink_maps":loss_shrink_maps,\
|
||||
"loss_threshold_maps":loss_threshold_maps,\
|
||||
"loss_binary_maps":loss_binary_maps}
|
||||
loss_all = loss_shrink_maps + loss_threshold_maps \
|
||||
+ loss_binary_maps
|
||||
losses = {'loss': loss_all, \
|
||||
"loss_shrink_maps": loss_shrink_maps, \
|
||||
"loss_threshold_maps": loss_threshold_maps, \
|
||||
"loss_binary_maps": loss_binary_maps}
|
||||
return losses
|
||||
|
|
|
@ -1,61 +0,0 @@
|
|||
#copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
|
||||
#
|
||||
#Licensed under the Apache License, Version 2.0 (the "License");
|
||||
#you may not use this file except in compliance with the License.
|
||||
#You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
#Unless required by applicable law or agreed to in writing, software
|
||||
#distributed under the License is distributed on an "AS IS" BASIS,
|
||||
#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
#See the License for the specific language governing permissions and
|
||||
#limitations under the License.
|
||||
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import paddle.fluid as fluid
|
||||
|
||||
|
||||
class EASTLoss(object):
    """
    EAST Loss function

    Combines a Dice loss on the score map (scaled by 0.01) with a
    weighted smooth-L1 style loss over 8 geometry channels.
    """

    def __init__(self, params=None):
        # ``params`` is accepted but not used.
        super(EASTLoss, self).__init__()

    def __call__(self, predicts, labels):
        """
        args:
            predicts (dict): must contain 'f_score' and 'f_geo'.
            labels (dict): must contain 'score', 'geo' and 'mask'.
        return:
            dict with keys 'total_loss', 'dice_loss' and 'smooth_l1_loss'.
        """
        pred_score = predicts['f_score']
        pred_geo = predicts['f_geo']
        gt_score = labels['score']
        gt_geo = labels['geo']
        gt_mask = labels['mask']

        # dice loss
        overlap = fluid.layers.reduce_sum(pred_score * gt_score * gt_mask)
        pred_area = fluid.layers.reduce_sum(pred_score * gt_mask)
        gt_area = fluid.layers.reduce_sum(gt_score * gt_mask)
        dice_loss = 1 - 2 * overlap / (pred_area + gt_area + 1e-5)

        # smooth l1 loss
        num_geo = 8
        # The label carries one extra channel (the last one), used below
        # as a per-pixel weight.
        gt_geo_parts = fluid.layers.split(
            gt_geo, num_or_sections=num_geo + 1, dim=1)
        pred_geo_parts = fluid.layers.split(
            pred_geo, num_or_sections=num_geo, dim=1)
        geo_weight = gt_geo_parts[-1]

        smooth_l1 = 0
        # zip stops after the prediction channels, so the trailing weight
        # channel of the label is never paired.
        for gt_part, pred_part in zip(gt_geo_parts, pred_geo_parts):
            abs_diff = fluid.layers.abs(gt_part - pred_part)
            # Quadratic branch where |diff| < score label, linear otherwise.
            is_small = fluid.layers.less_than(abs_diff, gt_score)
            is_small = fluid.layers.cast(is_small, dtype='float32')
            in_loss = abs_diff * abs_diff * is_small + \
                (abs_diff - 0.5) * (1.0 - is_small)
            smooth_l1 += geo_weight / num_geo * in_loss * gt_score

        smooth_l1_loss = fluid.layers.reduce_mean(smooth_l1 * gt_score)
        dice_loss = dice_loss * 0.01
        total_loss = dice_loss + smooth_l1_loss

        return {
            'total_loss': total_loss,
            "dice_loss": dice_loss,
            "smooth_l1_loss": smooth_l1_loss,
        }
|
|
@ -1,115 +0,0 @@
|
|||
#copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
|
||||
#
|
||||
#Licensed under the Apache License, Version 2.0 (the "License");
|
||||
#you may not use this file except in compliance with the License.
|
||||
#You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
#Unless required by applicable law or agreed to in writing, software
|
||||
#distributed under the License is distributed on an "AS IS" BASIS,
|
||||
#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
#See the License for the specific language governing permissions and
|
||||
#limitations under the License.
|
||||
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import paddle.fluid as fluid
|
||||
|
||||
|
||||
class SASTLoss(object):
    """
    SAST Loss function

    Total loss = score Dice loss + border loss
    + 1.5 * tvo loss + 1.5 * tco loss, where the border / tvo / tco terms
    share one normalised smooth-L1 formulation.
    """

    def __init__(self, params=None):
        # ``params`` is accepted but not used.
        super(SASTLoss, self).__init__()

    @staticmethod
    def _masked_smooth_l1(label, pred, num_channels, l_score, l_mask):
        """
        Normalised smooth-L1 loss shared by the border / tvo / tco branches.

        ``label`` is split along dim 1 into ``num_channels`` regression
        targets plus one trailing channel used as a per-pixel weight.
        ``l_score`` and ``l_mask`` are expanded ``num_channels`` times
        along dim 1 and restrict the loss to the selected pixels.
        """
        repeat = [1, num_channels, 1, 1]
        target, norm = fluid.layers.split(
            label, num_or_sections=[num_channels, 1], dim=1)
        norm = fluid.layers.expand(x=norm, expand_times=repeat)
        score = fluid.layers.expand(x=l_score, expand_times=repeat)
        mask = fluid.layers.expand(x=l_mask, expand_times=repeat)

        abs_diff = fluid.layers.abs(target - pred)
        # 1.0 where the quadratic branch applies, 0.0 for the linear one.
        sign = abs_diff < 1.0
        sign = fluid.layers.cast(sign, dtype='float32')
        sign.stop_gradient = True
        in_loss = 0.5 * abs_diff * abs_diff * sign + \
            (abs_diff - 0.5) * (1.0 - sign)
        out_loss = norm * in_loss
        return fluid.layers.reduce_sum(out_loss * score * mask) / \
            (fluid.layers.reduce_sum(score * mask) + 1e-5)

    def __call__(self, predicts, labels):
        """
        args:
            predicts (dict): must contain 'f_score', 'f_border', 'f_tvo'
                and 'f_tco'.
            labels (dict): must contain 'input_score', 'input_border',
                'input_mask', 'input_tvo' and 'input_tco'. The border,
                tvo and tco labels are split along dim 1 into 4 + 1,
                8 + 1 and 2 + 1 channels respectively.
        return:
            dict with keys 'total_loss', 'score_loss', 'border_loss',
            'tvo_loss' and 'tco_loss'.
        """
        f_score = predicts['f_score']
        f_border = predicts['f_border']
        f_tvo = predicts['f_tvo']
        f_tco = predicts['f_tco']

        l_score = labels['input_score']
        l_border = labels['input_border']
        l_mask = labels['input_mask']
        l_tvo = labels['input_tvo']
        l_tco = labels['input_tco']

        # score loss (Dice)
        intersection = fluid.layers.reduce_sum(f_score * l_score * l_mask)
        union = fluid.layers.reduce_sum(f_score * l_mask) + \
            fluid.layers.reduce_sum(l_score * l_mask)
        score_loss = 1.0 - 2 * intersection / (union + 1e-5)

        # border / tvo / tco losses
        border_loss = self._masked_smooth_l1(
            l_border, f_border, 4, l_score, l_mask)
        tvo_loss = self._masked_smooth_l1(l_tvo, f_tvo, 8, l_score, l_mask)
        tco_loss = self._masked_smooth_l1(l_tco, f_tco, 2, l_score, l_mask)

        # total loss
        tvo_lw, tco_lw = 1.5, 1.5
        score_lw, border_lw = 1.0, 1.0
        total_loss = score_loss * score_lw + border_loss * border_lw + \
            tvo_loss * tvo_lw + tco_loss * tco_lw

        losses = {
            'total_loss': total_loss,
            "score_loss": score_loss,
            "border_loss": border_loss,
            'tvo_loss': tvo_loss,
            'tco_loss': tco_loss,
        }
        return losses
|
|
@ -1,38 +0,0 @@
|
|||
#copyright (c) 2019 PaddlePaddle Authors. All Rights Reserve.
|
||||
#
|
||||
#Licensed under the Apache License, Version 2.0 (the "License");
|
||||
#you may not use this file except in compliance with the License.
|
||||
#You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
#Unless required by applicable law or agreed to in writing, software
|
||||
#distributed under the License is distributed on an "AS IS" BASIS,
|
||||
#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
#See the License for the specific language governing permissions and
|
||||
#limitations under the License.
|
||||
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import math
|
||||
|
||||
import paddle
|
||||
import paddle.fluid as fluid
|
||||
from paddle.fluid.param_attr import ParamAttr
|
||||
import numpy as np
|
||||
|
||||
|
||||
class AttentionLoss(object):
    """
    Summed cross-entropy loss for the attention recognition head.

    args:
        params (dict): must contain 'char_num'; the value is stored on
            the instance but not read by ``__call__``.
    """

    def __init__(self, params):
        super(AttentionLoss, self).__init__()
        self.char_num = params['char_num']

    def __call__(self, predicts, labels):
        """
        args:
            predicts (dict): must contain 'predict'.
            labels (dict): must contain 'label_out'; cast to int64 before
                the cross entropy is taken.
        return:
            the sum of the cross-entropy costs.
        """
        logits = predicts['predict']
        raw_target = labels['label_out']
        target = fluid.layers.cast(x=raw_target, dtype='int64')
        per_item_cost = fluid.layers.cross_entropy(input=logits, label=target)
        return fluid.layers.reduce_sum(per_item_cost)
|
|
@ -1,36 +1,36 @@
|
|||
#copyright (c) 2019 PaddlePaddle Authors. All Rights Reserve.
|
||||
# copyright (c) 2019 PaddlePaddle Authors. All Rights Reserve.
|
||||
#
|
||||
#Licensed under the Apache License, Version 2.0 (the "License");
|
||||
#you may not use this file except in compliance with the License.
|
||||
#You may obtain a copy of the License at
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
#Unless required by applicable law or agreed to in writing, software
|
||||
#distributed under the License is distributed on an "AS IS" BASIS,
|
||||
#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
#See the License for the specific language governing permissions and
|
||||
#limitations under the License.
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import math
|
||||
|
||||
import paddle
|
||||
import paddle.fluid as fluid
|
||||
from paddle import nn
|
||||
|
||||
|
||||
class CTCLoss(object):
|
||||
def __init__(self, params):
|
||||
class CTCLoss(nn.Layer):
|
||||
def __init__(self, **kwargs):
|
||||
super(CTCLoss, self).__init__()
|
||||
self.char_num = params['char_num']
|
||||
self.loss_func = nn.CTCLoss(blank=0, reduction='none')
|
||||
|
||||
def __call__(self, predicts, labels):
|
||||
predict = predicts['predict']
|
||||
label = labels['label']
|
||||
cost = fluid.layers.warpctc(
|
||||
input=predict, label=label, blank=self.char_num, norm_by_times=True)
|
||||
sum_cost = fluid.layers.reduce_sum(cost)
|
||||
return sum_cost
|
||||
def __call__(self, predicts, batch):
|
||||
predicts = predicts.transpose((1, 0, 2))
|
||||
N, B, _ = predicts.shape
|
||||
preds_lengths = paddle.to_tensor([N] * B, dtype='int64')
|
||||
labels = batch[1].astype("int32")
|
||||
label_lengths = batch[2].astype('int64')
|
||||
loss = self.loss_func(predicts, labels, preds_lengths, label_lengths)
|
||||
loss = loss.mean()
|
||||
return {'loss': loss}
|
||||
|
|
|
@ -1,55 +0,0 @@
|
|||
#copyright (c) 2019 PaddlePaddle Authors. All Rights Reserve.
|
||||
#
|
||||
#Licensed under the Apache License, Version 2.0 (the "License");
|
||||
#you may not use this file except in compliance with the License.
|
||||
#You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
#Unless required by applicable law or agreed to in writing, software
|
||||
#distributed under the License is distributed on an "AS IS" BASIS,
|
||||
#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
#See the License for the specific language governing permissions and
|
||||
#limitations under the License.
|
||||
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import math
|
||||
|
||||
import paddle
|
||||
import paddle.fluid as fluid
|
||||
|
||||
|
||||
class SRNLoss(object):
    """
    Loss for the SRN recognition model.

    Three summed cross-entropy costs are combined as
    ``word + 2.0 * vsfd + 0.15 * gsrm``.

    args:
        params (dict): must contain 'char_num'; the value is stored on
            the instance but not read by ``__call__``.
    """

    def __init__(self, params):
        super(SRNLoss, self).__init__()
        self.char_num = params['char_num']

    def __call__(self, predicts, others):
        """
        args:
            predicts (dict): must contain 'predict', 'word_out' and
                'gsrm_out'.
            others (dict): must contain 'label' and 'lbl_weight'.
        return:
            list ``[sum_cost, cost_vsfd, cost_word]``.
        """
        vsfd_logits = predicts['predict']
        word_logits = predicts['word_out']
        gsrm_logits = predicts['gsrm_out']
        label = others['label']
        # Looked up (a missing key raises KeyError) but not used below.
        _unused_lbl_weight = others['lbl_weight']

        target = fluid.layers.cast(x=label, dtype='int64')

        def _cross_entropy(logits):
            return fluid.layers.cross_entropy(input=logits, label=target)

        def _summed(cost):
            # Reduce to a single value and give it an explicit shape of [1].
            return fluid.layers.reshape(
                x=fluid.layers.reduce_sum(cost), shape=[1])

        ce_word = _cross_entropy(word_logits)
        ce_gsrm = _cross_entropy(gsrm_logits)
        ce_vsfd = _cross_entropy(vsfd_logits)

        cost_word = _summed(ce_word)
        cost_gsrm = _summed(ce_gsrm)
        cost_vsfd = _summed(ce_vsfd)

        sum_cost = fluid.layers.sum(
            [cost_word, cost_vsfd * 2.0, cost_gsrm * 0.15])

        return [sum_cost, cost_vsfd, cost_word]
|
|
@ -11,3 +11,17 @@
|
|||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
__all__ = ['build_neck']
|
||||
|
||||
|
||||
def build_neck(config):
    """
    Instantiate the neck module described by ``config``.

    args:
        config (dict): must contain 'name' ('FPN' or 'SequenceEncoder').
            NOTE: 'name' is popped, so the caller's dict is modified; the
            remaining items are passed to the class as keyword arguments.
    return:
        the constructed neck instance.
    """
    from .fpn import FPN
    from .rnn import SequenceEncoder

    support_dict = ['FPN', 'SequenceEncoder']
    # Explicit name -> class table instead of eval(): the assert below is
    # stripped under ``python -O``, and eval() on an unchecked config
    # string would then evaluate arbitrary expressions.
    neck_classes = {'FPN': FPN, 'SequenceEncoder': SequenceEncoder}

    module_name = config.pop('name')
    assert module_name in support_dict, Exception('neck only support {}'.format(
        support_dict))
    module_class = neck_classes[module_name](**config)
    return module_class
|
|
@ -0,0 +1,113 @@
|
|||
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import paddle
|
||||
from paddle import nn
|
||||
import paddle.nn.functional as F
|
||||
from paddle import ParamAttr
|
||||
|
||||
|
||||
class FPN(nn.Layer):
    """
    Feature pyramid neck that fuses four backbone feature maps.

    Each input goes through a 1x1 lateral conv, coarser levels are
    upsampled and added top-down, every level goes through a 3x3 conv
    producing ``out_channels // 4`` channels, and the four results are
    concatenated along axis 1.

    args:
        in_channels (sequence): channel counts of the four inputs;
            indices 0..3 are read.
        out_channels (int): channel count of the lateral convs.
        **kwargs: accepted and ignored.
    """

    def __init__(self, in_channels, out_channels, **kwargs):
        super(FPN, self).__init__()
        self.out_channels = out_channels
        # One initializer instance shared by all conv weights.
        msra_init = paddle.nn.initializer.MSRA(uniform=False)

        def lateral_conv(level, param_name):
            # 1x1 conv bringing one backbone level to out_channels.
            return nn.Conv2d(
                in_channels=in_channels[level],
                out_channels=self.out_channels,
                kernel_size=1,
                weight_attr=ParamAttr(
                    name=param_name, initializer=msra_init),
                bias_attr=False)

        def output_conv(param_name):
            # 3x3 conv reducing a merged level to out_channels // 4.
            return nn.Conv2d(
                in_channels=self.out_channels,
                out_channels=self.out_channels // 4,
                kernel_size=3,
                padding=1,
                weight_attr=ParamAttr(
                    name=param_name, initializer=msra_init),
                bias_attr=False)

        # Creation order and the explicit parameter names are kept
        # exactly as in the original definition.
        self.in2_conv = lateral_conv(0, 'conv2d_51.w_0')
        self.in3_conv = lateral_conv(1, 'conv2d_50.w_0')
        self.in4_conv = lateral_conv(2, 'conv2d_49.w_0')
        self.in5_conv = lateral_conv(3, 'conv2d_48.w_0')
        self.p5_conv = output_conv('conv2d_52.w_0')
        self.p4_conv = output_conv('conv2d_53.w_0')
        self.p3_conv = output_conv('conv2d_54.w_0')
        self.p2_conv = output_conv('conv2d_55.w_0')

    def forward(self, x):
        """
        args:
            x: sequence of exactly four feature maps ``(c2, c3, c4, c5)``.
                The x2 upsampling steps imply each map is half the spatial
                size of the previous one -- TODO confirm against backbone.
        return:
            the four processed maps concatenated along axis 1.
        """
        c2, c3, c4, c5 = x

        lat5 = self.in5_conv(c5)
        lat4 = self.in4_conv(c4)
        lat3 = self.in3_conv(c3)
        lat2 = self.in2_conv(c2)

        # Top-down pathway; scale notes are from the original comments.
        merged4 = lat4 + F.resize_nearest(lat5, scale=2)  # 1/16
        merged3 = lat3 + F.resize_nearest(merged4, scale=2)  # 1/8
        merged2 = lat2 + F.resize_nearest(merged3, scale=2)  # 1/4

        out5 = self.p5_conv(lat5)
        out4 = self.p4_conv(merged4)
        out3 = self.p3_conv(merged3)
        out2 = self.p2_conv(merged2)

        # Upsample the three coarser outputs before concatenation.
        out5 = F.resize_nearest(out5, scale=8)
        out4 = F.resize_nearest(out4, scale=4)
        out3 = F.resize_nearest(out3, scale=2)

        return paddle.concat([out5, out4, out3, out2], axis=1)
|
|
@ -0,0 +1,143 @@
|
|||
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
from paddle import nn
|
||||
|
||||
from ppocr.modeling.heads.rec_ctc_head import get_para_bias_attr
|
||||
|
||||
|
||||
class EncoderWithReshape(nn.Layer):
    """
    Flatten the two trailing axes of a 4-D input and move channels last.

    args:
        in_channels (int): stored unchanged as ``out_channels``.
        **kwargs: accepted and ignored.
    """

    def __init__(self, in_channels, **kwargs):
        super().__init__()
        self.out_channels = in_channels

    def forward(self, x):
        """Turn ``(B, C, H, W)`` into ``(B, H * W, C)``."""
        batch, channels, _, _ = x.shape
        flat = x.reshape((batch, channels, -1))
        # Original note: (NTC)(batch, width, channels)
        return flat.transpose([0, 2, 1])
|
||||
|
||||
|
||||
class Im2Seq(nn.Layer):
    """
    Flatten a 4-D feature map of height 1 into rows of ``C`` channels.

    args:
        in_channels (int): stored unchanged as ``out_channels``.
        **kwargs: accepted and ignored.
    """

    def __init__(self, in_channels, **kwargs):
        super().__init__()
        self.out_channels = in_channels

    def forward(self, x):
        """Turn ``(B, C, 1, W)`` into a 2-D ``(B * W, C)`` result."""
        _, channels, height, _ = x.shape
        assert height == 1
        channels_last = x.transpose((0, 2, 3, 1))
        return channels_last.reshape((-1, channels))
|
||||
|
||||
|
||||
class EncoderWithRNN(nn.Layer):
    """
    Two-layer bidirectional LSTM sequence encoder.

    args:
        in_channels (int): input feature size passed to ``nn.LSTM``.
        hidden_size (int): hidden size passed to ``nn.LSTM``;
            ``out_channels`` is set to ``hidden_size * 2``.
    """

    def __init__(self, in_channels, hidden_size):
        super(EncoderWithRNN, self).__init__()
        self.out_channels = hidden_size * 2
        # NOTE: an earlier, disabled variant built four nn.LSTMCell
        # layers (lstm1_fw / lstm1_bw / lstm2_fw / lstm2_bw) with
        # explicitly named parameters
        # 'lstm_st{1,2}_fc{1,2}_{w,b}' and 'lstm_st{1,2}_out{1,2}_{w,b}'
        # and concatenated the forward and backward outputs on axis 2.
        # It is replaced by the single nn.LSTM below.
        self.lstm = nn.LSTM(
            in_channels, hidden_size, direction='bidirectional', num_layers=2)

    def forward(self, x):
        """Return the LSTM output sequence; the final states are dropped."""
        outputs, _ = self.lstm(x)
        return outputs
|
||||
|
||||
|
||||
class EncoderWithFC(nn.Layer):
    """
    Single fully connected layer used as a sequence encoder.

    args:
        in_channels (int): input feature size of the linear layer.
        hidden_size (int): output feature size; also stored as
            ``out_channels``.
    """

    def __init__(self, in_channels, hidden_size):
        super(EncoderWithFC, self).__init__()
        self.out_channels = hidden_size
        layer_name = 'reduce_encoder_fea'
        # Weight / bias attributes come from the CTC head helper.
        fc_weight_attr, fc_bias_attr = get_para_bias_attr(
            l2_decay=0.00001, k=in_channels, name=layer_name)
        self.fc = nn.Linear(
            in_channels,
            hidden_size,
            weight_attr=fc_weight_attr,
            bias_attr=fc_bias_attr,
            name=layer_name)

    def forward(self, x):
        """Apply the linear layer to ``x``."""
        return self.fc(x)
|
||||
|
||||
|
||||
class SequenceEncoder(nn.Layer):
    """
    Neck that reshapes a feature map into a sequence and optionally
    encodes it.

    args:
        in_channels (int): channel count of the incoming feature map.
        encoder_type (str): 'reshape' (reshape only), 'fc' or 'rnn'.
        hidden_size (int): passed to the 'fc' / 'rnn' encoder; not used
            when ``encoder_type`` is 'reshape'.
        **kwargs: accepted and ignored.
    """

    def __init__(self, in_channels, encoder_type, hidden_size, **kwargs):
        super(SequenceEncoder, self).__init__()
        self.encoder_reshape = EncoderWithReshape(in_channels)
        self.out_channels = self.encoder_reshape.out_channels

        if encoder_type == 'reshape':
            self.only_reshape = True
            return

        # 'reshape' is listed here but cannot reach this point; it is
        # handled by the early return above.
        support_encoder_dict = {
            'reshape': EncoderWithReshape,
            'fc': EncoderWithFC,
            'rnn': EncoderWithRNN
        }
        assert encoder_type in support_encoder_dict, '{} must in {}'.format(
            encoder_type, support_encoder_dict.keys())

        encoder_cls = support_encoder_dict[encoder_type]
        self.encoder = encoder_cls(self.encoder_reshape.out_channels,
                                   hidden_size)
        self.out_channels = self.encoder.out_channels
        self.only_reshape = False

    def forward(self, x):
        """Reshape ``x`` into a sequence, then encode it unless disabled."""
        sequence = self.encoder_reshape(x)
        if self.only_reshape:
            return sequence
        return self.encoder(sequence)
|
|
@ -1,261 +0,0 @@
|
|||
#copyright (c) 2019 PaddlePaddle Authors. All Rights Reserve.
|
||||
#
|
||||
#Licensed under the Apache License, Version 2.0 (the "License");
|
||||
#you may not use this file except in compliance with the License.
|
||||
#You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
#Unless required by applicable law or agreed to in writing, software
|
||||
#distributed under the License is distributed on an "AS IS" BASIS,
|
||||
#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
#See the License for the specific language governing permissions and
|
||||
#limitations under the License.
|
||||
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import math
|
||||
|
||||
import paddle.fluid as fluid
|
||||
import paddle.fluid.layers as layers
|
||||
from paddle.fluid.param_attr import ParamAttr
|
||||
import numpy as np
|
||||
|
||||
|
||||
class LocalizationNetwork(object):
    """Localization network that predicts the TPS fiducial points.

    ``params`` must provide ``num_fiducial`` (number of fiducial points),
    ``loc_lr`` (learning rate applied to the two fc layers) and
    ``model_name`` (``"large"`` selects the wider configuration).
    """

    def __init__(self, params):
        super(LocalizationNetwork, self).__init__()
        self.F = params['num_fiducial']
        self.loc_lr = params['loc_lr']
        self.model_name = params['model_name']

    def conv_bn_layer(self,
                      input,
                      num_filters,
                      filter_size,
                      stride=1,
                      groups=1,
                      act=None,
                      name=None):
        """Bias-free conv2d followed by batch norm; ``act`` is applied by the
        batch-norm layer. ``name`` prefixes every parameter name."""
        conv_out = layers.conv2d(
            input=input,
            num_filters=num_filters,
            filter_size=filter_size,
            stride=stride,
            padding=(filter_size - 1) // 2,
            groups=groups,
            act=None,
            param_attr=ParamAttr(name=name + "_weights"),
            bias_attr=False)
        bn_prefix = "bn_" + name
        scale_attr = ParamAttr(name=bn_prefix + '_scale')
        offset_attr = ParamAttr(bn_prefix + '_offset')
        return layers.batch_norm(
            input=conv_out,
            act=act,
            param_attr=scale_attr,
            bias_attr=offset_attr,
            moving_mean_name=bn_prefix + '_mean',
            moving_variance_name=bn_prefix + '_variance')

    def get_initial_fiducials(self):
        """ see RARE paper Fig. 6 (a) """
        half = int(self.F / 2)
        xs = np.linspace(-1.0, 1.0, half)
        top_row = np.stack([xs, np.linspace(0.0, -1.0, num=half)], axis=1)
        bottom_row = np.stack([xs, np.linspace(1.0, 0.0, num=half)], axis=1)
        return np.concatenate([top_row, bottom_row], axis=0)

    def __call__(self, image):
        """Build the network on ``image`` and return the predicted fiducial
        points reshaped to ``[-1, F, 2]``."""
        F = self.F
        loc_lr = self.loc_lr
        if self.model_name == "large":
            num_filters_list, fc_dim = [64, 128, 256, 512], 256
        else:
            num_filters_list, fc_dim = [16, 32, 64, 128], 64

        # conv+bn stages; every stage but the last is followed by a 2x2 max
        # pool, the last one by a global (1x1 adaptive) average pool.
        last_stage = len(num_filters_list) - 1
        pool = image
        for fno, num_filters in enumerate(num_filters_list):
            conv = self.conv_bn_layer(
                pool, num_filters, 3, act='relu', name="loc_conv%d" % fno)
            if fno == last_stage:
                pool = layers.adaptive_pool2d(
                    input=conv, pool_size=[1, 1], pool_type='avg')
            else:
                pool = layers.pool2d(
                    input=conv,
                    pool_size=2,
                    pool_stride=2,
                    pool_padding=0,
                    pool_type='max')

        fc1_name = "loc_fc1"
        stdv = 1.0 / math.sqrt(pool.shape[1] * 1.0)
        fc1_weight_attr = fluid.param_attr.ParamAttr(
            learning_rate=loc_lr,
            initializer=fluid.initializer.Uniform(-stdv, stdv),
            name=fc1_name + "_w")
        fc1 = layers.fc(input=pool,
                        size=fc_dim,
                        param_attr=fc1_weight_attr,
                        act='relu',
                        name=fc1_name)

        # fc2 starts with zero weights and the initial fiducials as bias, so
        # its initial output equals the initial fiducial points.
        fc2_name = "loc_fc2"
        initial_bias = self.get_initial_fiducials().reshape(-1)
        fc2_weight_attr = fluid.param_attr.ParamAttr(
            learning_rate=loc_lr,
            initializer=fluid.initializer.NumpyArrayInitializer(
                np.zeros([fc_dim, F * 2])),
            name=fc2_name + "_w")
        fc2_bias_attr = fluid.param_attr.ParamAttr(
            learning_rate=loc_lr,
            initializer=fluid.initializer.NumpyArrayInitializer(initial_bias),
            name=fc2_name + "_b")
        fc2 = layers.fc(input=fc1,
                        size=F * 2,
                        param_attr=fc2_weight_attr,
                        bias_attr=fc2_bias_attr,
                        name=fc2_name)
        return layers.reshape(x=fc2, shape=[-1, F, 2], inplace=False)
|
||||
|
||||
|
||||
class GridGenerator(object):
    """Turns predicted fiducial points into a TPS sampling grid.

    ``params`` must provide ``num_fiducial``.
    """

    def __init__(self, params):
        super(GridGenerator, self).__init__()
        self.eps = 1e-6
        self.F = params['num_fiducial']

    def build_C(self):
        """ Return coordinates of fiducial points in I_r; C """
        half = int(self.F / 2)
        xs = np.linspace(-1.0, 1.0, half)
        top_row = np.stack([xs, -1 * np.ones(half)], axis=1)
        bottom_row = np.stack([xs, np.ones(half)], axis=1)
        return np.concatenate([top_row, bottom_row], axis=0)  # F x 2

    def build_P(self, I_r_size):
        """Return the grid coordinates for ``I_r_size`` (width, height),
        flattened to n x 2."""
        width, height = I_r_size
        grid_x = (np.arange(-width, width, 2) + 1.0) / width
        grid_y = (np.arange(-height, height, 2) + 1.0) / height
        mesh = np.stack(np.meshgrid(grid_x, grid_y), axis=2)
        # n (= width x height) x 2
        return mesh.reshape([-1, 2])

    def build_inv_delta_C(self, C):
        """ Return inv_delta_C which is needed to calculate T """
        F = self.F
        hat_C = np.zeros((F, F), dtype=float)  # F x F
        # Fill the symmetric matrix of pairwise distances.
        for row in range(0, F):
            for col in range(row, F):
                dist = np.linalg.norm(C[row] - C[col])
                hat_C[row, col] = dist
                hat_C[col, row] = dist
        # A diagonal of 1 makes the log term below evaluate to 0 there.
        np.fill_diagonal(hat_C, 1)
        hat_C = (hat_C**2) * np.log(hat_C)
        upper = np.concatenate([np.ones((F, 1)), C, hat_C], axis=1)  # F x F+3
        middle = np.concatenate(
            [np.zeros((2, 3)), np.transpose(C)], axis=1)  # 2 x F+3
        lower = np.concatenate(
            [np.zeros((1, 3)), np.ones((1, F))], axis=1)  # 1 x F+3
        delta_C = np.concatenate([upper, middle, lower], axis=0)  # F+3 x F+3
        return np.linalg.inv(delta_C)  # F+3 x F+3

    def build_P_hat(self, C, P):
        """Return the n x F+3 matrix [1, P, rbf(P, C)]."""
        F = self.F
        n = P.shape[0]  # n (= width x height)
        # n x 2 -> n x 1 x 2 -> n x F x 2
        tiled_P = np.tile(np.expand_dims(P, axis=1), (1, F, 1))
        diff = tiled_P - np.expand_dims(C, axis=0)  # n x F x 2
        dist = np.linalg.norm(diff, ord=2, axis=2, keepdims=False)  # n x F
        # eps keeps the log finite when a grid point sits on a fiducial.
        rbf = np.multiply(np.square(dist), np.log(dist + self.eps))  # n x F
        return np.concatenate([np.ones((n, 1)), P, rbf], axis=1)  # n x F+3

    def get_expand_tensor(self, batch_C_prime):
        """Return a ``[-1, 3, 2]`` tensor produced by an fc layer whose
        weights and bias are initialised to 0 with learning rate 0."""
        name = "ex_fc"
        zero_init = fluid.initializer.ConstantInitializer(value=0.0)
        weight_attr = fluid.param_attr.ParamAttr(
            learning_rate=0.0, initializer=zero_init, name=name + "_w")
        bias_attr = fluid.param_attr.ParamAttr(
            learning_rate=0.0, initializer=zero_init, name=name + "_b")
        expanded = fluid.layers.fc(input=batch_C_prime,
                                   size=6,
                                   param_attr=weight_attr,
                                   bias_attr=bias_attr,
                                   name=name)
        return fluid.layers.reshape(x=expanded, shape=[-1, 3, 2])

    def __call__(self, batch_C_prime, I_r_size):
        """Return ``P_hat @ (inv_delta_C @ [batch_C_prime; extra rows])``."""
        C = self.build_C()
        P = self.build_P(I_r_size)
        inv_delta_C = self.build_inv_delta_C(C).astype('float32')
        P_hat = self.build_P_hat(C, P).astype('float32')

        # The two constant matrices are fed as non-trainable tensors.
        inv_delta_C_tensor = layers.create_tensor(dtype='float32')
        layers.assign(inv_delta_C, inv_delta_C_tensor)
        inv_delta_C_tensor.stop_gradient = True
        P_hat_tensor = layers.create_tensor(dtype='float32')
        layers.assign(P_hat, P_hat_tensor)
        P_hat_tensor.stop_gradient = True

        # Three extra rows bring the point matrix from F to F+3 rows.
        extra_rows = self.get_expand_tensor(batch_C_prime)
        extra_rows.stop_gradient = True
        padded_C_prime = layers.concat([batch_C_prime, extra_rows], axis=1)

        batch_T = layers.matmul(inv_delta_C_tensor, padded_C_prime)
        return layers.matmul(P_hat_tensor, batch_T)
|
||||
|
||||
|
||||
class TPS(object):
    """Thin-plate-spline rectification: localization network + grid
    generator + grid sampling."""

    def __init__(self, params):
        super(TPS, self).__init__()
        self.loc_net = LocalizationNetwork(params)
        self.grid_generator = GridGenerator(params)

    def __call__(self, image):
        """Return ``image`` resampled on the predicted TPS grid."""
        fiducials = self.loc_net(image)
        height, width = image.shape[2], image.shape[3]
        # The grid generator expects the size as (width, height).
        grid = self.grid_generator(fiducials, [width, height])
        grid = layers.reshape(x=grid, shape=[-1, height, width, 2])
        rectified = layers.grid_sampler(x=image, grid=grid)
        image.stop_gradient = False
        return rectified
|
|
@ -11,3 +11,15 @@
|
|||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
__all__ = ['build_transform']
|
||||
|
||||
|
||||
def build_transform(config):
    """Instantiate the transform module described by ``config``.

    Args:
        config (dict): must contain ``'name'``; every other entry is passed
            to the module constructor as a keyword argument. The dict is
            not modified.

    Returns:
        The constructed module.

    Raises:
        AssertionError: if ``config['name']`` is not in the supported list.
    """
    import copy

    # No transform is registered yet: only the empty name passes the check.
    support_dict = ['']

    # Work on a private copy so the caller's config keeps its 'name' entry
    # (same convention as build_post_process).
    config = copy.deepcopy(config)
    module_name = config.pop('name')
    assert module_name in support_dict, Exception(
        'transform only support {}'.format(support_dict))
    # NOTE(review): eval() is only reached for names in support_dict; keep
    # that list a closed set of class names, never user-controlled text.
    module_class = eval(module_name)(**config)
    return module_class
|
|
@ -1,155 +0,0 @@
|
|||
#copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
|
||||
#
|
||||
#Licensed under the Apache License, Version 2.0 (the "License");
|
||||
#you may not use this file except in compliance with the License.
|
||||
#You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
#Unless required by applicable law or agreed to in writing, software
|
||||
#distributed under the License is distributed on an "AS IS" BASIS,
|
||||
#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
#See the License for the specific language governing permissions and
|
||||
#limitations under the License.
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
import math
|
||||
import paddle.fluid as fluid
|
||||
from paddle.fluid.regularizer import L2Decay
|
||||
from paddle.fluid.layers.learning_rate_scheduler import _decay_step_counter
|
||||
import paddle.fluid.layers.ops as ops
|
||||
|
||||
from ppocr.utils.utility import initial_logger
|
||||
|
||||
logger = initial_logger()
|
||||
|
||||
|
||||
def cosine_decay_with_warmup(learning_rate,
                             step_each_epoch,
                             epochs=500,
                             warmup_minibatch=1000):
    """Learning rate with linear warmup followed by cosine decay.

    While the global step is below ``warmup_minibatch``:
        lr = learning_rate * global_step / warmup_minibatch
    afterwards:
        lr = learning_rate * (cos((global_step - warmup_minibatch)
             * pi / (epochs * step_each_epoch)) + 1) / 2

    Returns:
        The persistable global variable named "learning_rate" that holds
        the current value.
    """
    step_counter = _decay_step_counter()
    lr_var = fluid.layers.tensor.create_global_var(
        shape=[1],
        value=0.0,
        dtype='float32',
        persistable=True,
        name="learning_rate")

    warmup_steps = fluid.layers.fill_constant(
        shape=[1],
        dtype='float32',
        value=float(warmup_minibatch),
        force_cpu=True)

    with fluid.layers.control_flow.Switch() as switch:
        with switch.case(step_counter < warmup_steps):
            warmup_lr = learning_rate * (1.0 * step_counter / warmup_steps)
            fluid.layers.tensor.assign(input=warmup_lr, output=lr_var)
        with switch.default():
            phase = (step_counter - warmup_steps) * (
                math.pi / (epochs * step_each_epoch))
            cosine_lr = learning_rate * (ops.cos(phase) + 1) / 2
            fluid.layers.tensor.assign(input=cosine_lr, output=lr_var)
    return lr_var
|
||||
|
||||
|
||||
def AdamDecay(params, parameter_list=None):
    """
    Build an Adam optimizer, optionally with a learning-rate decay schedule.
    args:
        params(dict): the super parameters; reads 'base_lr', 'beta1',
            'beta2', optional 'l2_decay' (default 0.0) and an optional
            'decay' sub-dict whose 'function' selects the schedule
        parameter_list (list): list of Variable names to update to minimize loss
    return:
        the configured fluid Adam optimizer
    """
    base_lr = params['base_lr']
    beta1 = params['beta1']
    beta2 = params['beta2']
    l2_decay = params.get("l2_decay", 0.0)

    if 'decay' in params:
        decay_cfg = params['decay']
        decay_mode = decay_cfg['function']
        supported_decay_mode = [
            "cosine_decay", "cosine_decay_warmup", "piecewise_decay"
        ]
        assert decay_mode in supported_decay_mode, "Supported decay mode is {}, but got {}".format(
            supported_decay_mode, decay_mode)

        if decay_mode == "cosine_decay":
            step_each_epoch = decay_cfg['step_each_epoch']
            total_epoch = decay_cfg['total_epoch']
            base_lr = fluid.layers.cosine_decay(
                learning_rate=base_lr,
                step_each_epoch=step_each_epoch,
                epochs=total_epoch)
        elif decay_mode == "cosine_decay_warmup":
            step_each_epoch = decay_cfg['step_each_epoch']
            total_epoch = decay_cfg['total_epoch']
            warmup_minibatch = decay_cfg.get("warmup_minibatch", 1000)
            base_lr = cosine_decay_with_warmup(
                learning_rate=base_lr,
                step_each_epoch=step_each_epoch,
                epochs=total_epoch,
                warmup_minibatch=warmup_minibatch)
        elif decay_mode == "piecewise_decay":
            boundaries = decay_cfg["boundaries"]
            decay_rate = decay_cfg["decay_rate"]
            # One value per interval: base_lr scaled by decay_rate**k.
            values = []
            for idx in range(len(boundaries) + 1):
                values.append(base_lr * decay_rate**idx)
            base_lr = fluid.layers.piecewise_decay(boundaries, values)

    l2_regularizer = L2Decay(regularization_coeff=l2_decay)
    return fluid.optimizer.Adam(
        learning_rate=base_lr,
        beta1=beta1,
        beta2=beta2,
        regularization=l2_regularizer,
        parameter_list=parameter_list)
|
||||
|
||||
|
||||
def RMSProp(params, parameter_list=None):
    """
    Build an RMSProp optimizer, optionally with a learning-rate decay schedule.
    args:
        params(dict): the super parameters; reads optional 'base_lr'
            (default 0.001), optional 'l2_decay' (default 0.00005) and an
            optional 'decay' sub-dict whose 'function' selects the schedule
        parameter_list (list): list of Variable names to update to minimize loss
    return:
        the configured fluid RMSProp optimizer
    """
    base_lr = params.get("base_lr", 0.001)
    l2_decay = params.get("l2_decay", 0.00005)

    if 'decay' in params:
        supported_decay_mode = ["cosine_decay", "piecewise_decay"]
        params = params['decay']
        decay_mode = params['function']
        assert decay_mode in supported_decay_mode, "Supported decay mode is {}, but got {}".format(
            supported_decay_mode, decay_mode)

        if decay_mode == "cosine_decay":
            step_each_epoch = params['step_each_epoch']
            total_epoch = params['total_epoch']
            base_lr = fluid.layers.cosine_decay(
                learning_rate=base_lr,
                step_each_epoch=step_each_epoch,
                epochs=total_epoch)
        elif decay_mode == "piecewise_decay":
            boundaries = params["boundaries"]
            decay_rate = params["decay_rate"]
            # One value per interval: base_lr scaled by decay_rate**k.
            values = [
                base_lr * decay_rate**idx
                for idx in range(len(boundaries) + 1)
            ]
            base_lr = fluid.layers.piecewise_decay(boundaries, values)

    # parameter_list used to be accepted but silently dropped; forward it
    # the same way AdamDecay does (None keeps the previous behaviour).
    optimizer = fluid.optimizer.RMSProp(
        learning_rate=base_lr,
        regularization=fluid.regularizer.L2Decay(regularization_coeff=l2_decay),
        parameter_list=parameter_list)

    return optimizer
|
|
@ -0,0 +1,56 @@
|
|||
# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
from __future__ import unicode_literals
|
||||
|
||||
import copy
|
||||
|
||||
__all__ = ['build_optimizer']
|
||||
|
||||
|
||||
def build_lr_scheduler(lr_config, epochs, step_each_epoch):
    """Return the learning rate described by ``lr_config``.

    ``epochs`` and ``step_each_epoch`` are written into ``lr_config`` (the
    dict is modified in place). When it has a ``'name'`` entry, that class
    from the ``learning_rate`` module is built with the remaining entries
    and called; otherwise the plain ``lr_config['lr']`` value is returned.
    """
    from . import learning_rate

    lr_config['epochs'] = epochs
    lr_config['step_each_epoch'] = step_each_epoch
    if 'name' not in lr_config:
        return lr_config['lr']
    scheduler_cls = getattr(learning_rate, lr_config.pop('name'))
    return scheduler_cls(**lr_config)()
|
||||
|
||||
|
||||
def build_optimizer(config, epochs, step_each_epoch, parameters):
    """Build the optimizer and its learning rate from ``config``.

    Args:
        config (dict): optimizer section; needs ``'name'`` and
            ``'learning_rate'``, may contain ``'regularizer'``. Remaining
            entries are passed to the optimizer wrapper. Not modified.
        epochs (int): total number of training epochs.
        step_each_epoch (int): number of optimizer steps per epoch.
        parameters: parameters handed to the optimizer.

    Returns:
        tuple: (optimizer, learning rate or scheduler).
    """
    from . import regularizer, optimizer
    config = copy.deepcopy(config)
    # step1 build lr
    lr = build_lr_scheduler(
        config.pop('learning_rate'), epochs, step_each_epoch)

    # step2 build regularization
    reg_config = config.pop('regularizer', None)
    if reg_config is not None:
        reg_name = reg_config.pop('name') + 'Decay'
        reg = getattr(regularizer, reg_name)(**reg_config)()
        # The optimizer wrappers take the regularizer as `weight_decay`;
        # the former `regularization=` keyword was swallowed by their
        # **kwargs, so the configured regularizer was never applied.
        config['weight_decay'] = reg

    # step3 build optimizer
    optim_name = config.pop('name')
    optim = getattr(optimizer, optim_name)(learning_rate=lr, **config)
    return optim(parameters), lr
|
|
@ -0,0 +1,183 @@
|
|||
# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
from __future__ import unicode_literals
|
||||
|
||||
from paddle.optimizer import lr_scheduler
|
||||
|
||||
|
||||
class Linear(object):
    """
    Polynomial (linear by default) learning rate decay with optional warmup
    Args:
        lr (float): The initial learning rate. It is a python float number.
        epochs(int): Number of epochs over which the rate decays.
        step_each_epoch(int): steps each epoch
        end_lr(float, optional): The minimum final learning rate. Default: 0.0.
        power(float, optional): Power of polynomial. Default: 1.0.
        warmup_epoch(int, optional): Number of warmup epochs. Default: 0.
        last_epoch (int, optional): The index of last epoch. Can be set to restart training. Default: -1, means initial learning rate.
    """

    def __init__(self,
                 lr,
                 epochs,
                 step_each_epoch,
                 end_lr=0.0,
                 power=1.0,
                 warmup_epoch=0,
                 last_epoch=-1,
                 **kwargs):
        super(Linear, self).__init__()
        self.lr, self.end_lr, self.power = lr, end_lr, power
        self.last_epoch = last_epoch
        # Both horizons are stored in steps, not epochs.
        self.epochs = epochs * step_each_epoch
        self.warmup_epoch = warmup_epoch * step_each_epoch

    def __call__(self):
        """Build and return the scheduler."""
        scheduler = lr_scheduler.PolynomialLR(
            learning_rate=self.lr,
            decay_steps=self.epochs,
            end_lr=self.end_lr,
            power=self.power,
            last_epoch=self.last_epoch)
        if self.warmup_epoch <= 0:
            return scheduler
        return lr_scheduler.LinearLrWarmup(
            learning_rate=scheduler,
            warmup_steps=self.warmup_epoch,
            start_lr=0.0,
            end_lr=self.lr,
            last_epoch=self.last_epoch)
|
||||
|
||||
|
||||
class Cosine(object):
    """
    Cosine annealing learning rate decay with optional warmup
    Args:
        lr(float): initial learning rate
        step_each_epoch(int): steps each epoch
        epochs(int): total training epochs
        warmup_epoch(int, optional): Number of warmup epochs. Default: 0.
        last_epoch (int, optional): The index of last epoch. Can be set to restart training. Default: -1, means initial learning rate.
    """

    def __init__(self,
                 lr,
                 step_each_epoch,
                 epochs,
                 warmup_epoch=0,
                 last_epoch=-1,
                 **kwargs):
        super(Cosine, self).__init__()
        self.lr, self.last_epoch = lr, last_epoch
        # Annealing period and warmup length are both stored in steps.
        self.T_max = step_each_epoch * epochs
        self.warmup_epoch = warmup_epoch * step_each_epoch

    def __call__(self):
        """Build and return the scheduler."""
        scheduler = lr_scheduler.CosineAnnealingLR(
            learning_rate=self.lr, T_max=self.T_max, last_epoch=self.last_epoch)
        if self.warmup_epoch <= 0:
            return scheduler
        return lr_scheduler.LinearLrWarmup(
            learning_rate=scheduler,
            warmup_steps=self.warmup_epoch,
            start_lr=0.0,
            end_lr=self.lr,
            last_epoch=self.last_epoch)
|
||||
|
||||
|
||||
class Step(object):
    """
    Step learning rate decay with optional warmup
    Args:
        lr (float): The initial learning rate. It is a python float number.
        step_size (int): the interval to update, in epochs.
        step_each_epoch(int): steps each epoch
        gamma (float): The Ratio that the learning rate will be reduced. ``new_lr = origin_lr * gamma`` .
        warmup_epoch(int, optional): Number of warmup epochs. Default: 0.
        last_epoch (int, optional): The index of last epoch. Can be set to restart training. Default: -1, means initial learning rate.
    """

    def __init__(self,
                 lr,
                 step_size,
                 step_each_epoch,
                 gamma,
                 warmup_epoch=0,
                 last_epoch=-1,
                 **kwargs):
        super(Step, self).__init__()
        self.lr, self.gamma, self.last_epoch = lr, gamma, last_epoch
        # Interval and warmup length are both stored in steps.
        self.step_size = step_each_epoch * step_size
        self.warmup_epoch = warmup_epoch * step_each_epoch

    def __call__(self):
        """Build and return the scheduler."""
        scheduler = lr_scheduler.StepLR(
            learning_rate=self.lr,
            step_size=self.step_size,
            gamma=self.gamma,
            last_epoch=self.last_epoch)
        if self.warmup_epoch <= 0:
            return scheduler
        return lr_scheduler.LinearLrWarmup(
            learning_rate=scheduler,
            warmup_steps=self.warmup_epoch,
            start_lr=0.0,
            end_lr=self.lr,
            last_epoch=self.last_epoch)
|
||||
|
||||
|
||||
class Piecewise(object):
    """
    Piecewise learning rate decay with optional warmup
    Args:
        step_each_epoch(int): steps each epoch
        decay_epochs(list): A list of epoch numbers at which the rate changes.
        values(list): A list of learning rate values that will be picked during different epoch boundaries.
            The type of element in the list is python float.
        warmup_epoch(int, optional): Number of warmup epochs. Default: 0.
        last_epoch (int, optional): The index of last epoch. Can be set to restart training. Default: -1, means initial learning rate.
    """

    def __init__(self,
                 step_each_epoch,
                 decay_epochs,
                 values,
                 warmup_epoch=0,
                 last_epoch=-1,
                 **kwargs):
        super(Piecewise, self).__init__()
        self.values, self.last_epoch = values, last_epoch
        # Epoch boundaries are converted to step boundaries.
        boundaries = []
        for epoch in decay_epochs:
            boundaries.append(step_each_epoch * epoch)
        self.boundaries = boundaries
        self.warmup_epoch = warmup_epoch * step_each_epoch

    def __call__(self):
        """Build and return the scheduler."""
        scheduler = lr_scheduler.PiecewiseLR(
            boundaries=self.boundaries,
            values=self.values,
            last_epoch=self.last_epoch)
        if self.warmup_epoch <= 0:
            return scheduler
        # Warmup ramps up to the first piecewise value.
        return lr_scheduler.LinearLrWarmup(
            learning_rate=scheduler,
            warmup_steps=self.warmup_epoch,
            start_lr=0.0,
            end_lr=self.values[0],
            last_epoch=self.last_epoch)
|
|
@ -0,0 +1,119 @@
|
|||
# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
from __future__ import unicode_literals
|
||||
|
||||
from paddle import optimizer as optim
|
||||
|
||||
|
||||
class Momentum(object):
    """
    Simple Momentum optimizer with velocity state.
    Args:
        learning_rate (float|Variable) - The learning rate used to update parameters.
            Can be a float value or a Variable with one float value as data element.
        momentum (float) - Momentum factor.
        weight_decay (optional) - The regularization strategy handed to the
            underlying optimizer. Default: None.
    """

    def __init__(self, learning_rate, momentum, weight_decay=None, **args):
        super(Momentum, self).__init__()
        self.learning_rate = learning_rate
        self.momentum = momentum
        self.weight_decay = weight_decay

    def __call__(self, parameters):
        """Return a ``paddle.optimizer.Momentum`` bound to ``parameters``."""
        # `parameters` and `weight_decay` used to be passed swapped, so the
        # optimizer received the regularizer as its parameter list.
        opt = optim.Momentum(
            learning_rate=self.learning_rate,
            momentum=self.momentum,
            parameters=parameters,
            weight_decay=self.weight_decay)
        return opt
|
||||
|
||||
|
||||
class Adam(object):
    """
    Adam optimizer wrapper; stores the settings and builds the optimizer on call.
    Args:
        learning_rate (float|Variable) - The learning rate used to update parameters. Default: 0.001.
        beta1 (float) - forwarded as ``beta1``. Default: 0.9.
        beta2 (float) - forwarded as ``beta2``. Default: 0.999.
        epsilon (float) - forwarded as ``epsilon``. Default: 1e-08.
        parameter_list - stored on the instance only; not forwarded.
        weight_decay, grad_clip, name, lazy_mode - forwarded unchanged.
    """

    def __init__(self,
                 learning_rate=0.001,
                 beta1=0.9,
                 beta2=0.999,
                 epsilon=1e-08,
                 parameter_list=None,
                 weight_decay=None,
                 grad_clip=None,
                 name=None,
                 lazy_mode=False,
                 **kwargs):
        self.learning_rate = learning_rate
        self.beta1, self.beta2 = beta1, beta2
        self.epsilon = epsilon
        self.parameter_list = parameter_list
        self.weight_decay = weight_decay
        self.grad_clip = grad_clip
        self.name = name
        self.lazy_mode = lazy_mode

    def __call__(self, parameters):
        """Return a ``paddle.optimizer.Adam`` bound to ``parameters``."""
        return optim.Adam(
            learning_rate=self.learning_rate,
            beta1=self.beta1,
            beta2=self.beta2,
            epsilon=self.epsilon,
            weight_decay=self.weight_decay,
            grad_clip=self.grad_clip,
            name=self.name,
            lazy_mode=self.lazy_mode,
            parameters=parameters)
|
||||
|
||||
|
||||
class RMSProp(object):
    """
    Root Mean Squared Propagation (RMSProp) is an unpublished, adaptive learning rate method.
    Args:
        learning_rate (float|Variable) - The learning rate used to update parameters.
            Can be a float value or a Variable with one float value as data element.
        momentum (float) - Momentum factor.
        rho (float) - rho value in equation. Default: 0.95.
        epsilon (float) - avoid division by zero, default is 1e-6.
        weight_decay (optional) - The regularization strategy handed to the
            underlying optimizer. Default: None.
    """

    def __init__(self,
                 learning_rate,
                 momentum,
                 rho=0.95,
                 epsilon=1e-6,
                 weight_decay=None,
                 **args):
        super(RMSProp, self).__init__()
        self.learning_rate, self.momentum = learning_rate, momentum
        self.rho, self.epsilon = rho, epsilon
        self.weight_decay = weight_decay

    def __call__(self, parameters):
        """Return a ``paddle.optimizer.RMSProp`` bound to ``parameters``."""
        return optim.RMSProp(
            learning_rate=self.learning_rate,
            momentum=self.momentum,
            rho=self.rho,
            epsilon=self.epsilon,
            weight_decay=self.weight_decay,
            parameters=parameters)
|
|
@ -0,0 +1,54 @@
|
|||
# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
from __future__ import unicode_literals
|
||||
|
||||
from paddle import fluid
|
||||
|
||||
|
||||
class L1Decay(object):
    """
    L1 Weight Decay Regularization, which encourages the weights to be sparse.
    Args:
        factor(float): regularization coeff. Default:0.0.
    """

    def __init__(self, factor=0.0):
        super(L1Decay, self).__init__()
        self.regularization_coeff = factor

    def __call__(self):
        """Return the fluid L1 regularizer built with the stored coefficient."""
        return fluid.regularizer.L1Decay(
            regularization_coeff=self.regularization_coeff)
|
||||
|
||||
|
||||
class L2Decay(object):
    """
    L2 Weight Decay Regularization, which penalizes large weight values.
    Args:
        factor(float): regularization coeff. Default:0.0.
    """

    def __init__(self, factor=0.0):
        super(L2Decay, self).__init__()
        self.regularization_coeff = factor

    def __call__(self):
        """Return the fluid L2 regularizer built with the stored coefficient."""
        return fluid.regularizer.L2Decay(
            regularization_coeff=self.regularization_coeff)
|
|
@ -0,0 +1,38 @@
|
|||
# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
from __future__ import unicode_literals
|
||||
|
||||
import copy
|
||||
|
||||
__all__ = ['build_post_process']
|
||||
|
||||
|
||||
def build_post_process(config, global_config=None):
    """Instantiate the post-process class named by ``config['name']``.

    ``config`` is deep-copied before use; entries of ``global_config`` (when
    given) are merged on top of it, and the result is passed to the class
    constructor as keyword arguments.
    """
    from .db_postprocess import DBPostProcess
    from .rec_postprocess import CTCLabelDecode, AttnLabelDecode

    # Dispatch table of the supported classes, keyed by their config name.
    supported = {
        'DBPostProcess': DBPostProcess,
        'CTCLabelDecode': CTCLabelDecode,
        'AttnLabelDecode': AttnLabelDecode,
    }
    support_dict = list(supported)

    kwargs = copy.deepcopy(config)
    module_name = kwargs.pop('name')
    if global_config is not None:
        kwargs.update(global_config)
    assert module_name in support_dict, Exception(
        'post process only support {}'.format(support_dict))
    return supported[module_name](**kwargs)
|
|
@ -16,11 +16,7 @@ from __future__ import absolute_import
|
|||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import paddle
|
||||
import paddle.fluid as fluid
|
||||
|
||||
import numpy as np
|
||||
import string
|
||||
import cv2
|
||||
from shapely.geometry import Polygon
|
||||
import pyclipper
|
||||
|
@ -31,11 +27,16 @@ class DBPostProcess(object):
|
|||
The post process for Differentiable Binarization (DB).
|
||||
"""
|
||||
|
||||
def __init__(self, params):
|
||||
self.thresh = params['thresh']
|
||||
self.box_thresh = params['box_thresh']
|
||||
self.max_candidates = params['max_candidates']
|
||||
self.unclip_ratio = params['unclip_ratio']
|
||||
def __init__(self,
|
||||
thresh=0.3,
|
||||
box_thresh=0.7,
|
||||
max_candidates=1000,
|
||||
unclip_ratio=2.0,
|
||||
**kwargs):
|
||||
self.thresh = thresh
|
||||
self.box_thresh = box_thresh
|
||||
self.max_candidates = max_candidates
|
||||
self.unclip_ratio = unclip_ratio
|
||||
self.min_size = 3
|
||||
|
||||
def boxes_from_bitmap(self, pred, _bitmap, dest_width, dest_height):
|
||||
|
@ -55,9 +56,9 @@ class DBPostProcess(object):
|
|||
contours, _ = outs[0], outs[1]
|
||||
|
||||
num_contours = min(len(contours), self.max_candidates)
|
||||
boxes = np.zeros((num_contours, 4, 2), dtype=np.int16)
|
||||
scores = np.zeros((num_contours, ), dtype=np.float32)
|
||||
|
||||
boxes = []
|
||||
scores = []
|
||||
for index in range(num_contours):
|
||||
contour = contours[index]
|
||||
points, sside = self.get_mini_boxes(contour)
|
||||
|
@ -73,17 +74,14 @@ class DBPostProcess(object):
|
|||
if sside < self.min_size + 2:
|
||||
continue
|
||||
box = np.array(box)
|
||||
if not isinstance(dest_width, int):
|
||||
dest_width = dest_width.item()
|
||||
dest_height = dest_height.item()
|
||||
|
||||
box[:, 0] = np.clip(
|
||||
np.round(box[:, 0] / width * dest_width), 0, dest_width)
|
||||
box[:, 1] = np.clip(
|
||||
np.round(box[:, 1] / height * dest_height), 0, dest_height)
|
||||
boxes[index, :, :] = box.astype(np.int16)
|
||||
scores[index] = score
|
||||
return boxes, scores
|
||||
boxes.append(box.astype(np.int16))
|
||||
scores.append(score)
|
||||
return np.array(boxes, dtype=np.int16), scores
|
||||
|
||||
def unclip(self, box):
|
||||
unclip_ratio = self.unclip_ratio
|
||||
|
@ -131,28 +129,15 @@ class DBPostProcess(object):
|
|||
cv2.fillPoly(mask, box.reshape(1, -1, 2).astype(np.int32), 1)
|
||||
return cv2.mean(bitmap[ymin:ymax + 1, xmin:xmax + 1], mask)[0]
|
||||
|
||||
def __call__(self, outs_dict, ratio_list):
|
||||
pred = outs_dict['maps']
|
||||
|
||||
pred = pred[:, 0, :, :]
|
||||
def __call__(self, pred, shape_list):
|
||||
pred = pred.numpy()[:, 0, :, :]
|
||||
segmentation = pred > self.thresh
|
||||
|
||||
boxes_batch = []
|
||||
for batch_index in range(pred.shape[0]):
|
||||
height, width = pred.shape[-2:]
|
||||
tmp_boxes, tmp_scores = self.boxes_from_bitmap(
|
||||
height, width = shape_list[batch_index]
|
||||
boxes, scores = self.boxes_from_bitmap(
|
||||
pred[batch_index], segmentation[batch_index], width, height)
|
||||
|
||||
boxes = []
|
||||
for k in range(len(tmp_boxes)):
|
||||
if tmp_scores[k] > self.box_thresh:
|
||||
boxes.append(tmp_boxes[k])
|
||||
if len(boxes) > 0:
|
||||
boxes = np.array(boxes)
|
||||
|
||||
ratio_h, ratio_w = ratio_list[batch_index]
|
||||
boxes[:, :, 0] = boxes[:, :, 0] / ratio_w
|
||||
boxes[:, :, 1] = boxes[:, :, 1] / ratio_h
|
||||
|
||||
boxes_batch.append(boxes)
|
||||
boxes_batch.append({'points': boxes})
|
||||
return boxes_batch
|
||||
|
|
|
@ -0,0 +1,133 @@
|
|||
import cv2
|
||||
import numpy as np
|
||||
import pyclipper
|
||||
from shapely.geometry import Polygon
|
||||
|
||||
|
||||
class DBPostProcess():
    """Turn DB (Differentiable Binarization) probability maps into text boxes.

    Args:
        thresh(float): probabilities strictly above this value count as text
            when the map is binarized. Default: 0.3.
        box_thresh(float): candidate regions whose mean probability is below
            this value are discarded. Default: 0.7.
        max_candidates(int): upper bound on the number of contours examined
            per image. Default: 1000.
        unclip_ratio(float): expansion ratio passed to ``unclip``.
            Default: 1.5.
    """

    def __init__(self,
                 thresh=0.3,
                 box_thresh=0.7,
                 max_candidates=1000,
                 unclip_ratio=1.5):
        # Regions whose shorter side is below ``min_size + 2`` are dropped.
        self.min_size = 3
        self.thresh = thresh
        self.box_thresh = box_thresh
        self.max_candidates = max_candidates
        self.unclip_ratio = unclip_ratio

    def __call__(self, pred, shape_list, is_output_polygon=False):
        """Post-process a batch of predictions.

        Args:
            pred: text region segmentation map indexed as (N, 1, H, W); only
                channel 0 is read. May be any object exposing ``.numpy()``
                or anything ``np.asarray`` accepts.
            shape_list: array of ``[h, w]`` pairs, one per image, giving the
                size the detected boxes are rescaled to.
            is_output_polygon(bool): when True keep the expanded polygon,
                otherwise reduce every region to its 4-point box.

        Returns:
            list[dict]: one ``{"points": boxes}`` entry per image.
        """
        # Accept framework tensors as well as plain arrays.
        pred = pred.numpy() if hasattr(pred, 'numpy') else np.asarray(pred)
        pred = pred[:, 0, :, :]
        segmentation = self.binarize(pred)
        batch_out = []
        for batch_index in range(pred.shape[0]):
            height, width = shape_list[batch_index]
            boxes, scores = self.post_p(
                pred[batch_index],
                segmentation[batch_index],
                width,
                height,
                is_output_polygon=is_output_polygon)
            batch_out.append({"points": boxes})
        return batch_out

    def binarize(self, pred):
        """Return the mask of entries strictly greater than ``self.thresh``."""
        return pred > self.thresh

    def post_p(self,
               pred,
               bitmap,
               dest_width,
               dest_height,
               is_output_polygon=True):
        """Extract boxes (or polygons) and their scores from one image.

        Args:
            pred: probability map with shape (H, W).
            bitmap: single map with shape (H, W), whose values are binarized
                as {0, 1}.
            dest_width, dest_height: size the coordinates are rescaled to.
            is_output_polygon(bool): keep the expanded polygon instead of the
                4-point box.

        Returns:
            tuple(list, list): the kept boxes and their scores.
        """
        height, width = pred.shape
        boxes = []
        new_scores = []
        # OpenCV 3.x returns (image, contours, hierarchy) while 2.x / 4.x
        # return (contours, hierarchy); the contour list is always the
        # second-to-last element.
        contours = cv2.findContours((bitmap * 255).astype(np.uint8),
                                    cv2.RETR_LIST,
                                    cv2.CHAIN_APPROX_SIMPLE)[-2]
        for contour in contours[:self.max_candidates]:
            epsilon = 0.005 * cv2.arcLength(contour, True)
            approx = cv2.approxPolyDP(contour, epsilon, True)
            points = approx.reshape((-1, 2))
            if points.shape[0] < 4:
                continue
            score = self.box_score_fast(pred, points)
            if self.box_thresh > score:
                continue

            box = self.unclip(points, unclip_ratio=self.unclip_ratio)
            # Keep the region only when the offset yields exactly one path.
            if len(box) != 1:
                continue
            four_point_box, sside = self.get_mini_boxes(
                box.reshape((-1, 1, 2)))
            if sside < self.min_size + 2:
                continue

            if not is_output_polygon:
                box = np.array(four_point_box)
            else:
                box = box.reshape(-1, 2)
            # Rescale from map coordinates to the destination size.
            box[:, 0] = np.clip(
                np.round(box[:, 0] / width * dest_width), 0, dest_width)
            box[:, 1] = np.clip(
                np.round(box[:, 1] / height * dest_height), 0, dest_height)
            boxes.append(box)
            new_scores.append(score)
        return boxes, new_scores

    def unclip(self, box, unclip_ratio=1.5):
        """Grow ``box`` outward by ``area * unclip_ratio / perimeter``.

        Returns:
            numpy array holding every path produced by the offset operation;
            its length is the number of paths.
        """
        poly = Polygon(box)
        distance = poly.area * unclip_ratio / poly.length
        offset = pyclipper.PyclipperOffset()
        offset.AddPath(box, pyclipper.JT_ROUND, pyclipper.ET_CLOSEDPOLYGON)
        paths = offset.Execute(distance)
        try:
            return np.array(paths)
        except ValueError:
            # Recent NumPy refuses to build an array from paths of differing
            # lengths; keep one object entry per path so callers can still
            # count them with len().
            expanded = np.empty(len(paths), dtype=object)
            for i, path in enumerate(paths):
                expanded[i] = path
            return expanded

    def get_mini_boxes(self, contour):
        """Return the minimum-area rectangle of ``contour`` and its short side.

        The four corners are sorted by x; within the two smallest-x corners
        and within the two largest-x corners the one with the smaller y is
        taken first, giving the order: left/small-y, right/small-y,
        right/large-y, left/large-y.
        """
        bounding_box = cv2.minAreaRect(contour)
        points = sorted(list(cv2.boxPoints(bounding_box)), key=lambda x: x[0])

        if points[1][1] > points[0][1]:
            index_1, index_4 = 0, 1
        else:
            index_1, index_4 = 1, 0
        if points[3][1] > points[2][1]:
            index_2, index_3 = 2, 3
        else:
            index_2, index_3 = 3, 2

        box = [
            points[index_1], points[index_2], points[index_3], points[index_4]
        ]
        return box, min(bounding_box[1])

    def box_score_fast(self, bitmap, _box):
        """Mean of ``bitmap`` inside polygon ``_box``.

        Only the polygon's bounding rectangle (clamped to the map) is
        rasterised, which keeps the mask small.
        """
        h, w = bitmap.shape[:2]
        box = _box.copy()
        # np.int32 instead of the ``np.int`` alias removed in NumPy 1.24.
        xmin = np.clip(np.floor(box[:, 0].min()).astype(np.int32), 0, w - 1)
        xmax = np.clip(np.ceil(box[:, 0].max()).astype(np.int32), 0, w - 1)
        ymin = np.clip(np.floor(box[:, 1].min()).astype(np.int32), 0, h - 1)
        ymax = np.clip(np.ceil(box[:, 1].max()).astype(np.int32), 0, h - 1)

        mask = np.zeros((ymax - ymin + 1, xmax - xmin + 1), dtype=np.uint8)
        # Shift the polygon into the cropped window's coordinate frame.
        box[:, 0] = box[:, 0] - xmin
        box[:, 1] = box[:, 1] - ymin
        cv2.fillPoly(mask, box.reshape(1, -1, 2).astype(np.int32), 1)
        return cv2.mean(bitmap[ymin:ymax + 1, xmin:xmax + 1], mask)[0]
|
|
@ -1,136 +0,0 @@
|
|||
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import numpy as np
|
||||
from .locality_aware_nms import nms_locality
|
||||
import cv2
|
||||
|
||||
import os
|
||||
import sys
|
||||
__dir__ = os.path.dirname(os.path.abspath(__file__))
|
||||
sys.path.append(__dir__)
|
||||
sys.path.append(os.path.abspath(os.path.join(__dir__, '..')))
|
||||
|
||||
|
||||
class EASTPostPocess(object):
    """
    The post process for EAST.
    """

    def __init__(self, params):
        """
        Args:
            params(dict): must provide the keys ``score_thresh``,
                ``cover_thresh`` and ``nms_thresh``.
        """
        self.score_thresh = params['score_thresh']
        self.cover_thresh = params['cover_thresh']
        self.nms_thresh = params['nms_thresh']

        # c++ la-nms is faster, but only support python 3.5
        version = sys.version_info
        self.is_python35 = version.major == 3 and version.minor == 5

    def restore_rectangle_quad(self, origin, geometry):
        """
        Restore rectangle from quadrangle.

        ``origin`` is (n, 2) and ``geometry`` is (n, 8); each origin is
        repeated four times and the geometry offsets are subtracted from it,
        giving an (n, 4, 2) array of corners.
        """
        # quad
        origin_concat = np.concatenate(
            (origin, origin, origin, origin), axis=1)  # (n, 8)
        pred_quads = origin_concat - geometry
        pred_quads = pred_quads.reshape((-1, 4, 2))  # (n, 4, 2)
        return pred_quads

    def _merge_boxes(self, boxes, nms_thresh):
        """Run locality-aware NMS, preferring the C++ ``lanms`` extension."""
        if self.is_python35:
            try:
                import lanms
            except ImportError:
                # Optional extension not installed: use the Python version
                # instead of crashing.
                lanms = None
            if lanms is not None:
                return lanms.merge_quadrangle_n9(boxes, nms_thresh)
        return nms_locality(boxes.astype(np.float64), nms_thresh)

    def detect(self,
               score_map,
               geo_map,
               score_thresh=0.8,
               cover_thresh=0.1,
               nms_thresh=0.2):
        """
        restore text boxes from score map and geo map

        Returns an array with one row per box (8 corner coordinates followed
        by the score), or an empty list when nothing passes the thresholds.
        """
        score_map = score_map[0]
        # Move the first axis of geo_map to the end.
        geo_map = np.swapaxes(geo_map, 1, 0)
        geo_map = np.swapaxes(geo_map, 1, 2)
        # filter the score map
        xy_text = np.argwhere(score_map > score_thresh)
        if len(xy_text) == 0:
            return []
        # sort the text boxes via the y axis
        xy_text = xy_text[np.argsort(xy_text[:, 0])]
        # restore quad proposals; map indices are scaled by 4 here
        text_box_restored = self.restore_rectangle_quad(
            xy_text[:, ::-1] * 4, geo_map[xy_text[:, 0], xy_text[:, 1], :])
        boxes = np.zeros((text_box_restored.shape[0], 9), dtype=np.float32)
        boxes[:, :8] = text_box_restored.reshape((-1, 8))
        boxes[:, 8] = score_map[xy_text[:, 0], xy_text[:, 1]]
        boxes = self._merge_boxes(boxes, nms_thresh)
        if boxes.shape[0] == 0:
            return []
        # Here we filter some low score boxes by the average score map,
        # this is different from the original paper.
        for i, box in enumerate(boxes):
            mask = np.zeros_like(score_map, dtype=np.uint8)
            cv2.fillPoly(mask, box[:8].reshape(
                (-1, 4, 2)).astype(np.int32) // 4, 1)
            boxes[i, 8] = cv2.mean(score_map, mask)[0]
        boxes = boxes[boxes[:, 8] > cover_thresh]
        return boxes

    def sort_poly(self, p):
        """
        Sort polygons.

        The corner with the smallest x + y is rotated to the front; if the
        first edge is then not wider than it is tall, the remaining corners
        are reversed.
        """
        min_axis = np.argmin(np.sum(p, axis=1))
        p = p[[min_axis, (min_axis + 1) % 4,
               (min_axis + 2) % 4, (min_axis + 3) % 4]]
        if abs(p[0, 0] - p[1, 0]) > abs(p[0, 1] - p[1, 1]):
            return p
        else:
            return p[[0, 3, 2, 1]]

    def __call__(self, outs_dict, ratio_list):
        """
        Args:
            outs_dict(dict): read through its ``f_score`` and ``f_geo``
                entries, each indexed per image.
            ratio_list: one ``(ratio_h, ratio_w)`` pair per image; box
                coordinates are divided by these ratios.

        Returns:
            list: one array of kept boxes per image.
        """
        score_list = outs_dict['f_score']
        geo_list = outs_dict['f_geo']
        img_num = len(ratio_list)
        dt_boxes_list = []
        for ino in range(img_num):
            score = score_list[ino]
            geo = geo_list[ino]
            boxes = self.detect(
                score_map=score,
                geo_map=geo,
                score_thresh=self.score_thresh,
                cover_thresh=self.cover_thresh,
                nms_thresh=self.nms_thresh)
            boxes_norm = []
            if len(boxes) > 0:
                ratio_h, ratio_w = ratio_list[ino]
                boxes = boxes[:, :8].reshape((-1, 4, 2))
                boxes[:, :, 0] /= ratio_w
                boxes[:, :, 1] /= ratio_h
                for box in boxes:
                    box = self.sort_poly(box.astype(np.int32))
                    # Drop boxes with an edge shorter than 5.
                    if np.linalg.norm(box[0] - box[1]) < 5 \
                        or np.linalg.norm(box[3] - box[0]) < 5:
                        continue
                    boxes_norm.append(box)
            dt_boxes_list.append(np.array(boxes_norm))
        return dt_boxes_list
|
|
@ -1 +0,0 @@
|
|||
adaptor.so
|
|
@ -1,140 +0,0 @@
|
|||
#!/usr/bin/env python
|
||||
#
|
||||
# Copyright (C) 2014 Google Inc.
|
||||
#
|
||||
# This file is part of YouCompleteMe.
|
||||
#
|
||||
# YouCompleteMe is free software: you can redistribute it and/or modify
|
||||
# it under the terms of the GNU General Public License as published by
|
||||
# the Free Software Foundation, either version 3 of the License, or
|
||||
# (at your option) any later version.
|
||||
#
|
||||
# YouCompleteMe is distributed in the hope that it will be useful,
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
# GNU General Public License for more details.
|
||||
#
|
||||
# You should have received a copy of the GNU General Public License
|
||||
# along with YouCompleteMe. If not, see <http://www.gnu.org/licenses/>.
|
||||
|
||||
import os
|
||||
import sys
|
||||
import glob
|
||||
import ycm_core
|
||||
|
||||
# These are the compilation flags that will be used in case there's no
|
||||
# compilation database set (by default, one is not set).
|
||||
# CHANGE THIS LIST OF FLAGS. YES, THIS IS THE DROID YOU HAVE BEEN LOOKING FOR.
|
||||
sys.path.append(os.path.dirname(os.path.abspath(__file__)))
|
||||
|
||||
|
||||
BASE_DIR = os.path.dirname(os.path.realpath(__file__))
|
||||
|
||||
from plumbum.cmd import python_config
|
||||
|
||||
|
||||
flags = [
|
||||
'-Wall',
|
||||
'-Wextra',
|
||||
'-Wnon-virtual-dtor',
|
||||
'-Winvalid-pch',
|
||||
'-Wno-unused-local-typedefs',
|
||||
'-std=c++11',
|
||||
'-x', 'c++',
|
||||
'-Iinclude',
|
||||
] + python_config('--cflags').split()
|
||||
|
||||
|
||||
# Set this to the absolute path to the folder (NOT the file!) containing the
|
||||
# compile_commands.json file to use that instead of 'flags'. See here for
|
||||
# more details: http://clang.llvm.org/docs/JSONCompilationDatabase.html
|
||||
#
|
||||
# Most projects will NOT need to set this to anything; you can just change the
|
||||
# 'flags' list of compilation flags.
|
||||
compilation_database_folder = ''
|
||||
|
||||
if os.path.exists( compilation_database_folder ):
|
||||
database = ycm_core.CompilationDatabase( compilation_database_folder )
|
||||
else:
|
||||
database = None
|
||||
|
||||
SOURCE_EXTENSIONS = [ '.cpp', '.cxx', '.cc', '.c', '.m', '.mm' ]
|
||||
|
||||
def DirectoryOfThisScript():
  """Return the absolute path of the directory that holds this script."""
  script_path = os.path.abspath( __file__ )
  return os.path.dirname( script_path )
|
||||
|
||||
|
||||
def MakeRelativePathsInFlagsAbsolute( flags, working_directory ):
  """Return a copy of ``flags`` with relative include paths made absolute.

  Paths following (or glued onto) ``-isystem``, ``-I``, ``-iquote`` and
  ``--sysroot=`` are joined onto ``working_directory``. With an empty
  ``working_directory`` the flags are returned unchanged, as a new list.
  Empty flags are dropped from the result.
  """
  if not working_directory:
    return list( flags )

  prefixes = ( '-isystem', '-I', '-iquote', '--sysroot=' )
  result = []
  pending_path = False  # True when the previous flag announced a path

  for flag in flags:
    rewritten = flag

    if pending_path:
      pending_path = False
      if not flag.startswith( '/' ):
        rewritten = os.path.join( working_directory, flag )

    # The first prefix the flag starts with decides how it is handled;
    # an exact match is just the special case of an empty remainder.
    matched = next( ( p for p in prefixes if flag.startswith( p ) ), None )
    if matched is not None:
      if flag == matched:
        pending_path = True
      else:
        remainder = flag[ len( matched ): ]
        rewritten = matched + os.path.join( working_directory, remainder )

    if rewritten:
      result.append( rewritten )

  return result
|
||||
|
||||
|
||||
def IsHeaderFile( filename ):
  """Tell whether ``filename`` ends in a C/C++ header extension."""
  _, extension = os.path.splitext( filename )
  return extension in ( '.h', '.hxx', '.hpp', '.hh' )
|
||||
|
||||
|
||||
def GetCompilationInfoForFile( filename ):
  """Look up compilation info for ``filename`` in the module-level database.

  For a header file the database is queried with each existing sibling file
  that shares the header's base name and has one of SOURCE_EXTENSIONS; the
  first result that carries compiler flags is returned, or None when there
  is none. Any other file is looked up directly.
  """
  if not IsHeaderFile( filename ):
    return database.GetCompilationInfoForFile( filename )

  # Headers: borrow the flags of a matching source file, if one exists.
  stem = os.path.splitext( filename )[ 0 ]
  for source_extension in SOURCE_EXTENSIONS:
    candidate = stem + source_extension
    if not os.path.exists( candidate ):
      continue
    info = database.GetCompilationInfoForFile( candidate )
    if info.compiler_flags_:
      return info
  return None
|
||||
|
||||
|
||||
# This is the entry point; this function is called by ycmd to produce flags for
|
||||
# a file.
|
||||
def FlagsForFile( filename, **kwargs ):
  """Produce the flags dictionary for ``filename``.

  When a compilation database is loaded its flags are used (None is returned
  if it has no entry for the file); otherwise the module-level ``flags`` list
  is used, resolved relative to this script's directory.
  """
  if not database:
    raw_flags = flags
    base_directory = DirectoryOfThisScript()
  else:
    # compiler_flags_ is a list-like StringVec object, not a python list.
    info = GetCompilationInfoForFile( filename )
    if not info:
      return None
    raw_flags = info.compiler_flags_
    base_directory = info.compiler_working_dir_

  return {
    'flags': MakeRelativePathsInFlagsAbsolute( raw_flags, base_directory ),
    'do_cache': True
  }
|
||||
|
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in New Issue