def parse_args():
    """Parse the command-line options of the log analysis script.

    Returns:
        argparse.Namespace: the parsed options. When ``--separator`` is given
        as the literal text ``"None"`` it is converted to the ``None`` object.
    """
    # (flag, keyword arguments for add_argument), registered in this order so
    # the generated ``--help`` output keeps the same layout.
    option_table = [
        ("--filename", dict(type=str, help="The name of log which need to analysis.")),
        ("--log_with_profiler", dict(type=str, help="The path of train log with profiler")),
        ("--profiler_path", dict(type=str, help="The path of profiler timeline log.")),
        ("--keyword", dict(type=str, help="Keyword to specify analysis data")),
        ("--separator", dict(type=str, default=None, help="Separator of different field in log")),
        ('--position', dict(type=int, default=None, help='The position of data field')),
        ('--range', dict(type=str, default="", help='The range of data field to intercept')),
        ('--base_batch_size', dict(type=int, help='base_batch size on gpu')),
        ('--skip_steps', dict(type=int, default=0, help='The number of steps to be skipped')),
        ('--model_mode', dict(type=int, default=-1, help='Analysis mode, default value is -1')),
        ('--ips_unit', dict(type=str, default=None, help='IPS unit')),
        # NOTE: the integer defaults below are not passed through ``type`` by
        # argparse, so an omitted flag yields the int 0, not the string "0".
        ('--model_name', dict(type=str, default=0, help='training model_name, transformer_base')),
        ('--mission_name', dict(type=str, default=0, help='training mission name')),
        ('--direction_id', dict(type=int, default=0, help='training direction_id')),
        ('--run_mode', dict(type=str, default="sp", help='multi process or single process')),
        ('--index', dict(type=int, default=1, help='{1: speed, 2:mem, 3:profiler, 6:max_batch_size}')),
        ('--gpu_num', dict(type=int, default=1, help='nums of training gpus')),
    ]

    parser = argparse.ArgumentParser(description=__doc__)
    for flag, spec in option_table:
        parser.add_argument(flag, **spec)

    parsed = parser.parse_args()
    if parsed.separator == "None":
        parsed.separator = None
    return parsed
# An optionally signed decimal literal: "5", "-1", "3.", "2.5" or ".5".
# Both alternatives are anchored, so trailing junk such as "1.5x" is rejected.
_NUMBER_PATTERN = re.compile(r'^[-+]?(?:\d+\.?\d*|\.\d+)$')


def _is_number(num):
    """Return True when the whole string ``num`` is a signed decimal number."""
    return _NUMBER_PATTERN.match(num) is not None


class TimeAnalyzer(object):
    """Collect one numeric value per matching log line and summarise it.

    Args:
        filename: path of the log file to read. Required.
        keyword: only lines containing this text are considered. Required.
        separator: field separator; falsy means "split on whitespace".
        position: index of the field to read. When ``None`` the field that
            directly follows a token equal to ``keyword`` is used instead.
        range: slice applied to the picked field before conversion to float.
            Empty means "whole field", a number ``n`` means ``field[0:n]``
            and ``"a:b"`` means ``field[a:b]``.

    Raises:
        Exception: if ``filename`` or ``keyword`` is ``None``.
    """

    def __init__(self, filename, keyword=None, separator=None, position=None, range="-1"):
        if filename is None:
            raise Exception("Please specify the filename!")

        if keyword is None:
            raise Exception("Please specify the keyword!")

        self.filename = filename
        self.keyword = keyword
        self.separator = separator
        self.position = position
        self.range = range
        self.records = None
        self._distil()

    def _pick_field(self, line_words):
        """Return the raw field for one split line, or None if it is absent."""
        # Use the instance's own position (not a module-level ``args``), and
        # treat 0 as a valid index rather than as "unset".
        if self.position is not None:
            return line_words[self.position]
        # Otherwise take the token that directly follows the keyword.
        for word, following in zip(line_words, line_words[1:]):
            if word == self.keyword:
                return following
        return None

    def _apply_range(self, field):
        """Cut ``field`` down according to ``self.range``."""
        if not self.range:
            return field
        if _is_number(self.range):
            return field[0:int(self.range)]
        bounds = self.range.split(":")
        return field[int(bounds[0]):int(bounds[1])]

    def _distil(self):
        """Fill ``self.records`` with the float parsed from every matching line.

        Lines whose field is missing or not numeric are reported on stdout
        and skipped; they never abort the scan.
        """
        self.records = []
        with open(self.filename, "r") as f_object:
            for line in f_object:
                if self.keyword not in line:
                    continue
                line = line.strip()
                try:
                    line_words = line.split(self.separator) if self.separator else line.split()
                    field = self._pick_field(line_words)
                    # A missing field (None) fails here with TypeError and is
                    # reported like any other unparsable line.
                    self.records.append(float(self._apply_range(field)))
                except (IndexError, TypeError, ValueError):
                    print("line is: {}; separator={}; position={}".format(line, self.separator, self.position))

        print("Extract {} records: separator={}; position={}".format(len(self.records), self.separator, self.position))

    def _get_fps(self, mode, batch_size, gpu_num, avg_of_records, run_mode, unit=None):
        """Convert an averaged record into a throughput value and its unit.

        Returns:
            tuple: ``(fps, unit)``.

        Raises:
            AssertionError: if ``mode`` is -1 and no ``unit`` is given.
            ValueError: if the mode / run_mode combination is not supported.
        """
        if mode == -1 and run_mode == 'sp':
            assert unit, "Please set the unit when mode is -1."
            fps = gpu_num * avg_of_records
        elif mode == -1 and run_mode == 'mp':
            assert unit, "Please set the unit when mode is -1."
            fps = gpu_num * avg_of_records  # temporarily, not used now
            print("------------this is mp")
        elif mode == 0:
            # s/step -> samples/s
            fps = (batch_size * gpu_num) / avg_of_records
            unit = "samples/s"
        elif mode == 1:
            # steps/s -> steps/s
            fps = avg_of_records
            unit = "steps/s"
        elif mode == 2:
            # s/step -> steps/s
            fps = 1 / avg_of_records
            unit = "steps/s"
        elif mode == 3:
            # steps/s -> samples/s
            fps = batch_size * gpu_num * avg_of_records
            unit = "samples/s"
        elif mode == 4:
            # s/epoch -> s/epoch
            fps = avg_of_records
            unit = "s/epoch"
        else:
            # Raise explicitly; otherwise ``fps`` would be unbound below.
            raise ValueError("Unsupported analysis mode.")

        return fps, unit

    def analysis(self, batch_size, gpu_num=1, skip_steps=0, mode=-1, run_mode='sp', unit=None):
        """Print a summary of the records and return the throughput.

        Args:
            batch_size: per-device batch size; must be a positive number.
            gpu_num: number of devices used for the run.
            skip_steps: number of leading records excluded from the result
                (negative values are treated as 0).
            mode: conversion mode, see ``_get_fps``.
            run_mode: ``'sp'`` or ``'mp'``; only consulted when mode is -1.
            unit: unit label, required when mode is -1.

        Returns:
            tuple: ``(throughput rounded to 3 decimals, unit)`` computed over
            the records after ``skip_steps``, or ``(0, '')`` when the batch
            size is invalid or there are not enough records.
        """
        # ``None`` shows up when --base_batch_size is omitted on the CLI.
        if batch_size is None or batch_size <= 0:
            print("base_batch_size should larger than 0.")
            return 0, ''

        skip_steps = max(skip_steps, 0)
        count = len(self.records)
        if count <= skip_steps:  # also covers a log with exactly skip_steps items
            print("no records")
            return 0, ''

        kept = self.records[skip_steps:]
        skip_min = min(kept)
        skip_max = max(kept)
        avg_of_records = sum(self.records) / float(count)
        avg_of_records_skipped = sum(kept) / float(count - skip_steps)

        fps, fps_unit = self._get_fps(mode, batch_size, gpu_num, avg_of_records, run_mode, unit)
        fps_skipped, _ = self._get_fps(mode, batch_size, gpu_num, avg_of_records_skipped, run_mode, unit)
        if mode == -1:
            print("average ips of %d steps, skip 0 step:" % count)
            print("\tAvg: %.3f %s" % (avg_of_records, fps_unit))
            print("\tFPS: %.3f %s" % (fps, fps_unit))
            if skip_steps > 0:
                print("average ips of %d steps, skip %d steps:" % (count, skip_steps))
                print("\tAvg: %.3f %s" % (avg_of_records_skipped, fps_unit))
                print("\tMin: %.3f %s" % (skip_min, fps_unit))
                print("\tMax: %.3f %s" % (skip_max, fps_unit))
                print("\tFPS: %.3f %s" % (fps_skipped, fps_unit))
        elif mode == 1 or mode == 3:
            print("average latency of %d steps, skip 0 step:" % count)
            print("\tAvg: %.3f steps/s" % avg_of_records)
            print("\tFPS: %.3f %s" % (fps, fps_unit))
            if skip_steps > 0:
                print("average latency of %d steps, skip %d steps:" % (count, skip_steps))
                print("\tAvg: %.3f steps/s" % avg_of_records_skipped)
                print("\tMin: %.3f steps/s" % skip_min)
                print("\tMax: %.3f steps/s" % skip_max)
                print("\tFPS: %.3f %s" % (fps_skipped, fps_unit))
        elif mode == 0 or mode == 2:
            print("average latency of %d steps, skip 0 step:" % count)
            print("\tAvg: %.3f s/step" % avg_of_records)
            print("\tFPS: %.3f %s" % (fps, fps_unit))
            if skip_steps > 0:
                print("average latency of %d steps, skip %d steps:" % (count, skip_steps))
                print("\tAvg: %.3f s/step" % avg_of_records_skipped)
                print("\tMin: %.3f s/step" % skip_min)
                print("\tMax: %.3f s/step" % skip_max)
                print("\tFPS: %.3f %s" % (fps_skipped, fps_unit))

        return round(fps_skipped, 3), fps_unit
if __name__ == "__main__":
    args = parse_args()
    # Summary of the run; printed as a single JSON line at the very end.
    run_info = dict()
    run_info["log_file"] = args.filename
    run_info["model_name"] = args.model_name
    run_info["mission_name"] = args.mission_name
    run_info["direction_id"] = args.direction_id
    run_info["run_mode"] = args.run_mode
    run_info["index"] = args.index
    run_info["gpu_num"] = args.gpu_num
    run_info["FINAL_RESULT"] = 0
    run_info["JOB_FAIL_FLAG"] = 0

    try:
        if args.index == 1:
            # Speed analysis.
            if args.gpu_num == 1:
                run_info["log_with_profiler"] = args.log_with_profiler
                run_info["profiler_path"] = args.profiler_path
            analyzer = TimeAnalyzer(args.filename, args.keyword, args.separator, args.position, args.range)
            run_info["FINAL_RESULT"], run_info["UNIT"] = analyzer.analysis(
                batch_size=args.base_batch_size,
                gpu_num=args.gpu_num,
                skip_steps=args.skip_steps,
                mode=args.model_mode,
                run_mode=args.run_mode,
                unit=args.ips_unit)
            # Read the environment flag on its own so that an unset or
            # malformed ``job_fail_flag`` cannot hide a zero result.
            try:
                job_failed = int(os.getenv('job_fail_flag', '0')) == 1
            except ValueError:
                job_failed = False
            if job_failed or int(run_info["FINAL_RESULT"]) == 0:
                run_info["JOB_FAIL_FLAG"] = 1
        elif args.index == 3:
            # Profiler analysis: take the first matching value of each metric.
            run_info["FINAL_RESULT"] = {}
            records_fo_total = TimeAnalyzer(args.filename, 'Framework overhead', None, 3, '').records
            records_fo_ratio = TimeAnalyzer(args.filename, 'Framework overhead', None, 5).records
            records_ct_total = TimeAnalyzer(args.filename, 'Computation time', None, 3, '').records
            records_gm_total = TimeAnalyzer(args.filename, 'GpuMemcpy Calls', None, 4, '').records
            records_gm_ratio = TimeAnalyzer(args.filename, 'GpuMemcpy Calls', None, 6).records
            records_gmas_total = TimeAnalyzer(args.filename, 'GpuMemcpyAsync Calls', None, 4, '').records
            records_gms_total = TimeAnalyzer(args.filename, 'GpuMemcpySync Calls', None, 4, '').records
            run_info["FINAL_RESULT"]["Framework_Total"] = records_fo_total[0] if records_fo_total else 0
            run_info["FINAL_RESULT"]["Framework_Ratio"] = records_fo_ratio[0] if records_fo_ratio else 0
            run_info["FINAL_RESULT"]["ComputationTime_Total"] = records_ct_total[0] if records_ct_total else 0
            run_info["FINAL_RESULT"]["GpuMemcpy_Total"] = records_gm_total[0] if records_gm_total else 0
            run_info["FINAL_RESULT"]["GpuMemcpy_Ratio"] = records_gm_ratio[0] if records_gm_ratio else 0
            run_info["FINAL_RESULT"]["GpuMemcpyAsync_Total"] = records_gmas_total[0] if records_gmas_total else 0
            run_info["FINAL_RESULT"]["GpuMemcpySync_Total"] = records_gms_total[0] if records_gms_total else 0
        else:
            print("Not support!")
    except Exception:
        # Keep going so the JSON summary below is always emitted.
        traceback.print_exc()
    print("{}".format(json.dumps(run_info)))  # it's required, for the log file path insert to the database
train_cmd="python3.7 tools/train.py "${train_cmd}"" @@ -47,6 +45,10 @@ function _train(){ rm ${log_file} cp mylog/workerlog.0 ${log_file} fi + + # run log analysis + analysis_cmd="python3.7 benchmark/analysis.py --filename ${log_file} --mission_name ${model_name} --run_mode ${mode} --direction_id 0 --keyword 'ips:' --base_batch_size ${batch_szie} --skip_steps 1 --gpu_num ${num_gpu_devices} --index 1 --model_mode=-1 --ips_unit=samples/sec" + eval $analysis_cmd } _set_params $@ diff --git a/benchmark/run_det.sh b/benchmark/run_det.sh index c94af85c..c507510c 100644 --- a/benchmark/run_det.sh +++ b/benchmark/run_det.sh @@ -1,26 +1,25 @@ # 提供可稳定复现性能的脚本,默认在标准docker环境内py37执行: paddlepaddle/paddle:latest-gpu-cuda10.1-cudnn7 paddle=2.1.2 py=37 -# 执行目录:需说明 -cd PaddleOCR +# 执行目录: ./PaddleOCR # 1 安装该模型需要的依赖 (如需开启优化策略请注明) python3.7 -m pip install -r requirements.txt # 2 拷贝该模型需要数据、预训练模型 -wget -p ./tain_data/ https://paddleocr.bj.bcebos.com/dygraph_v2.0/test/icdar2015.tar && cd train_data && tar xf icdar2015.tar && cd ../ -wget -p ./pretrain_models/ https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/ResNet50_vd_pretrained.pdparams +wget -c -p ./tain_data/ https://paddleocr.bj.bcebos.com/dygraph_v2.0/test/icdar2015.tar && cd train_data && tar xf icdar2015.tar && cd ../ +wget -c -p ./pretrain_models/ https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/ResNet50_vd_pretrained.pdparams # 3 批量运行(如不方便批量,1,2需放到单个模型中) -model_mode_list=(det_mv3_db det_r50_vd_east) +model_mode_list=(det_res18_db_v2.0 det_r50_vd_east) fp_item_list=(fp32) -bs_list=(256 128) +bs_list=(8 16) for model_mode in ${model_mode_list[@]}; do for fp_item in ${fp_item_list[@]}; do for bs_item in ${bs_list[@]}; do echo "index is speed, 1gpus, begin, ${model_name}" run_mode=sp - CUDA_VISIBLE_DEVICES=0 bash benchmark/run_benchmark.sh ${run_mode} ${bs_item} ${fp_item} 10 ${model_mode} # (5min) + CUDA_VISIBLE_DEVICES=0 bash benchmark/run_benchmark_det.sh ${run_mode} ${bs_item} ${fp_item} 10 ${model_mode} # 
(5min) sleep 60 echo "index is speed, 8gpus, run_mode is multi_process, begin, ${model_name}" run_mode=mp - CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 bash benchmark/run_benchmark.sh ${run_mode} ${bs_item} ${fp_item} 10 ${model_mode} + CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 bash benchmark/run_benchmark_det.sh ${run_mode} ${bs_item} ${fp_item} 10 ${model_mode} sleep 60 done done diff --git a/configs/det/det_res18_db_v2.0.yml b/configs/det/det_res18_db_v2.0.yml new file mode 100644 index 00000000..7b07ef99 --- /dev/null +++ b/configs/det/det_res18_db_v2.0.yml @@ -0,0 +1,131 @@ +Global: + use_gpu: true + epoch_num: 1200 + log_smooth_window: 20 + print_batch_step: 2 + save_model_dir: ./output/ch_db_res18/ + save_epoch_step: 1200 + # evaluation is run every 2000 iterations after the 3000th iteration + eval_batch_step: [3000, 2000] + cal_metric_during_train: False + pretrained_model: ./pretrain_models/ResNet18_vd_pretrained + checkpoints: + save_inference_dir: + use_visualdl: False + infer_img: doc/imgs_en/img_10.jpg + save_res_path: ./output/det_db/predicts_db.txt + +Architecture: + model_type: det + algorithm: DB + Transform: + Backbone: + name: ResNet + layers: 18 + disable_se: True + Neck: + name: DBFPN + out_channels: 256 + Head: + name: DBHead + k: 50 + +Loss: + name: DBLoss + balance_loss: true + main_loss_type: DiceLoss + alpha: 5 + beta: 10 + ohem_ratio: 3 + +Optimizer: + name: Adam + beta1: 0.9 + beta2: 0.999 + lr: + name: Cosine + learning_rate: 0.001 + warmup_epoch: 2 + regularizer: + name: 'L2' + factor: 0 + +PostProcess: + name: DBPostProcess + thresh: 0.3 + box_thresh: 0.6 + max_candidates: 1000 + unclip_ratio: 1.5 + +Metric: + name: DetMetric + main_indicator: hmean + +Train: + dataset: + name: SimpleDataSet + data_dir: ./train_data/icdar2015/text_localization/ + label_file_list: + - ./train_data/icdar2015/text_localization/train_icdar2015_label.txt + ratio_list: [1.0] + transforms: + - DecodeImage: # load image + img_mode: BGR + channel_first: False + - 
DetLabelEncode: # Class handling label + - IaaAugment: + augmenter_args: + - { 'type': Fliplr, 'args': { 'p': 0.5 } } + - { 'type': Affine, 'args': { 'rotate': [-10, 10] } } + - { 'type': Resize, 'args': { 'size': [0.5, 3] } } + - EastRandomCropData: + size: [960, 960] + max_tries: 50 + keep_ratio: true + - MakeBorderMap: + shrink_ratio: 0.4 + thresh_min: 0.3 + thresh_max: 0.7 + - MakeShrinkMap: + shrink_ratio: 0.4 + min_text_size: 8 + - NormalizeImage: + scale: 1./255. + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: 'hwc' + - ToCHWImage: + - KeepKeys: + keep_keys: ['image', 'threshold_map', 'threshold_mask', 'shrink_map', 'shrink_mask'] # the order of the dataloader list + loader: + shuffle: True + drop_last: False + batch_size_per_card: 8 + num_workers: 4 + +Eval: + dataset: + name: SimpleDataSet + data_dir: ./train_data/icdar2015/text_localization/ + label_file_list: + - ./train_data/icdar2015/text_localization/test_icdar2015_label.txt + transforms: + - DecodeImage: # load image + img_mode: BGR + channel_first: False + - DetLabelEncode: # Class handling label + - DetResizeForTest: +# image_shape: [736, 1280] + - NormalizeImage: + scale: 1./255. + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: 'hwc' + - ToCHWImage: + - KeepKeys: + keep_keys: ['image', 'shape', 'polys', 'ignore_tags'] + loader: + shuffle: False + drop_last: False + batch_size_per_card: 1 # must be 1 + num_workers: 2 diff --git a/ppocr/utils/profiler.py b/ppocr/utils/profiler.py new file mode 100644 index 00000000..c4e28bc6 --- /dev/null +++ b/ppocr/utils/profiler.py @@ -0,0 +1,110 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
class ProfilerOptions(object):
    '''
    Use a string to initialize a ProfilerOptions.
    The string should be in the format: "key1=value1;key2=value2;key3=value3".
    Spaces are removed before parsing and empty segments (for example the one
    produced by a trailing ';') are ignored.
    For example:
      "profile_path=model.profile"
      "batch_range=[50, 60]; profile_path=model.profile"
      "batch_range=[50, 60]; tracer_option=OpDetail; profile_path=model.profile"
    ProfilerOptions supports following key-value pair:
      batch_range      - a integer list, e.g. [100, 110]. Ignored unless it
                         holds at least two values with 0 <= first < second.
      state            - a string, the optional values are 'CPU', 'GPU' or 'All'.
      sorted_key       - a string, the optional values are 'calls', 'total',
                         'max', 'min' or 'ave'.
      tracer_option    - a string, the optional values are 'Default', 'OpDetail',
                         'AllOpDetail'.
      profile_path     - a string, the path to save the serialized profile data,
                         which can be used to generate a timeline.
      exit_on_finished - a boolean; "yes", "true", "t" and "1" (any case)
                         mean True, everything else means False.
    Keys that are not listed above are silently ignored.
    '''

    def __init__(self, options_str):
        assert isinstance(options_str, str)

        # Defaults; overridden by whatever the option string provides.
        self._options = {
            'batch_range': [10, 20],
            'state': 'All',
            'sorted_key': 'total',
            'tracer_option': 'Default',
            'profile_path': '/tmp/profile',
            'exit_on_finished': True
        }
        self._parse_from_string(options_str)

    def _parse_from_string(self, options_str):
        '''
        Update the stored options from a "key=value;key=value" string.

        Raises:
            ValueError: if a non-empty segment has no '=' or a batch_range
                entry is not an integer.
        '''
        for kv in options_str.replace(' ', '').split(';'):
            if not kv:
                # Empty string or trailing/duplicated ';' - nothing to parse.
                continue
            # Split on the first '=' only, so values may contain '='.
            key, sep, value = kv.partition('=')
            if not sep:
                raise ValueError(
                    "Invalid profiler option '%s', expected 'key=value'." % kv)
            if key == 'batch_range':
                value_list = value.replace('[', '').replace(']', '').split(',')
                value_list = list(map(int, value_list))
                if len(value_list) >= 2 and value_list[0] >= 0 and value_list[
                        1] > value_list[0]:
                    self._options[key] = value_list
            elif key == 'exit_on_finished':
                self._options[key] = value.lower() in ("yes", "true", "t", "1")
            elif key in [
                    'state', 'sorted_key', 'tracer_option', 'profile_path'
            ]:
                self._options[key] = value

    def __getitem__(self, name):
        '''
        Return the value stored for ``name``.

        Raises:
            ValueError: if ``name`` is not a known option.
        '''
        if self._options.get(name, None) is None:
            raise ValueError(
                "ProfilerOptions does not have an option named %s." % name)
        return self._options[name]
def add_profiler_step(options_str=None):
    '''
    Advance the module-level profiler step counter by one call.

    The options string is parsed into a ProfilerOptions only on the first
    call that supplies one; the parsed object is cached in a module global.
    The profiler is started when the counter equals batch_range[0] and
    stopped when it equals batch_range[1]. After stopping, the process exits
    with status 0 if 'exit_on_finished' is set.

    Args:
        options_str - a string to initialize the ProfilerOptions.
                      Default is None, in which case the call does nothing
                      (the step counter is not advanced either).
    '''
    global _profiler_step_id
    global _profiler_options

    if options_str is None:
        return

    if _profiler_options is None:
        _profiler_options = ProfilerOptions(options_str)

    opts = _profiler_options
    current_step = _profiler_step_id
    start_step = opts['batch_range'][0]
    stop_step = opts['batch_range'][1]

    if current_step == start_step:
        paddle.utils.profiler.start_profiler(opts['state'],
                                             opts['tracer_option'])
    elif current_step == stop_step:
        paddle.utils.profiler.stop_profiler(opts['sorted_key'],
                                            opts['profile_path'])
        if opts['exit_on_finished']:
            sys.exit(0)

    _profiler_step_id = current_step + 1
+ ) def parse_args(self, argv=None): args = super(ArgsParser, self).parse_args(argv) @@ -158,6 +166,7 @@ def train(config, epoch_num = config['Global']['epoch_num'] print_batch_step = config['Global']['print_batch_step'] eval_batch_step = config['Global']['eval_batch_step'] + profiler_options = config['profiler_options'] global_step = 0 if 'global_step' in pre_best_model_dict: @@ -209,6 +218,7 @@ def train(config, max_iter = len(train_dataloader) - 1 if platform.system( ) == "Windows" else len(train_dataloader) for idx, batch in enumerate(train_dataloader): + profiler.add_profiler_step(profiler_options) train_reader_cost += time.time() - batch_start if idx >= max_iter: break @@ -391,8 +401,11 @@ def eval(model, def preprocess(is_train=False): FLAGS = ArgsParser().parse_args() + profiler_options = FLAGS.profiler_options config = load_config(FLAGS.config) merge_config(FLAGS.opt) + profile_dic = {"profiler_options": FLAGS.profiler_options} + merge_config(profile_dic) if is_train: # save_config