add grad clip
parent 913e11cbb8
commit ccd7c40be6
@@ -11,7 +11,7 @@

 ## Introduction to configuration file parameters

 Take `rec_chinese_lite_train_v1.1.yml` as an example

 ### Global

 | Field | Purpose | Default | Note |
 | :----------------------: | :---------------------: | :--------------: | :--------------------: |
@@ -42,6 +42,7 @@
 | name | Optimizer class name | Adam | Currently supports `Momentum`, `Adam`, `RMSProp`; see [ppocr/optimizer/optimizer.py](../../ppocr/optimizer/optimizer.py) |
 | beta1 | Exponential decay rate for the first-moment estimates | 0.9 | \ |
 | beta2 | Exponential decay rate for the second-moment estimates | 0.999 | \ |
+| clip_norm | Maximum allowed L2 norm of the gradient |  | \ |
 | **lr** | Learning rate decay method | - | \ |
 | name | Learning rate decay class name | Cosine | Currently supports `Linear`, `Cosine`, `Step`, `Piecewise`; see [ppocr/optimizer/learning_rate.py](../../ppocr/optimizer/learning_rate.py) |
 | learning_rate | Base learning rate | 0.001 | \ |
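For orientation, here is a minimal sketch of how the `Optimizer` section documented above looks once the YAML file is parsed into a Python dict (the values are illustrative; `clip_norm` is optional and only enables gradient clipping when it is present):

```python
# Illustrative only: the parsed Optimizer section of a config such as
# rec_chinese_lite_train_v1.1.yml is a nested dict with the keys from the
# table above. The clip_norm value here is made up for illustration.
optimizer_config = {
    'name': 'Adam',                # optimizer class name
    'beta1': 0.9,                  # first-moment decay rate
    'beta2': 0.999,                # second-moment decay rate
    'clip_norm': 10.0,             # optional gradient-clipping threshold
    'lr': {
        'name': 'Cosine',          # learning-rate decay class name
        'learning_rate': 0.001,    # base learning rate
    },
}
```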
@@ -119,4 +120,4 @@
 | shuffle | Whether to shuffle the dataset order each epoch | True | \ |
 | batch_size_per_card | Batch size per card during training | 256 | \ |
 | drop_last | Whether to drop the last incomplete mini-batch left when the number of samples is not divisible by batch_size | True | \ |
 | num_workers | Number of subprocesses used to load data; if 0, no subprocess is started and data is loaded in the main process | 8 | \ |
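Likewise, a minimal sketch of the loader settings above as a parsed dict, using the documented defaults (the variable name is only for illustration):

```python
# Illustrative only: data-loader settings with their documented defaults.
loader_config = {
    'shuffle': True,              # reshuffle the dataset every epoch
    'batch_size_per_card': 256,   # per-card (per-GPU) batch size
    'drop_last': True,            # drop the final incomplete mini-batch
    'num_workers': 8,             # 0 means load data in the main process
}
```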
@@ -10,7 +10,7 @@ The following list can be viewed through `--help`

 ## INTRODUCTION TO GLOBAL PARAMETERS OF CONFIGURATION FILE

 Take rec_chinese_lite_train_v1.1.yml as an example

 ### Global

 | Parameter | Use | Defaults | Note |
 | :----------------------: | :---------------------: | :--------------: | :--------------------: |
@@ -41,6 +41,7 @@ Take rec_chinese_lite_train_v1.1.yml as an example
 | name | Optimizer class name | Adam | Currently supports `Momentum`, `Adam`, `RMSProp`, see [ppocr/optimizer/optimizer.py](../../ppocr/optimizer/optimizer.py) |
 | beta1 | Set the exponential decay rate for the 1st moment estimates | 0.9 | \ |
 | beta2 | Set the exponential decay rate for the 2nd moment estimates | 0.999 | \ |
+| clip_norm | The maximum allowed L2 norm of the gradient | - | \ |
 | **lr** | Set the learning rate decay method | - | \ |
 | name | Learning rate decay class name | Cosine | Currently supports `Linear`, `Cosine`, `Step`, `Piecewise`, see [ppocr/optimizer/learning_rate.py](../../ppocr/optimizer/learning_rate.py) |
 | learning_rate | Set the base learning rate | 0.001 | \ |
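As a rough illustration of how these entries map onto Paddle's optimizer API (a sketch only; the parameter values are the defaults from the table, the clip threshold is made up, and the model is a stand-in):

```python
import paddle

# Sketch: beta1/beta2 and the base learning rate correspond directly to
# paddle.optimizer.Adam keyword arguments; clip_norm becomes a
# paddle.nn.ClipGradByNorm object passed through grad_clip.
model = paddle.nn.Linear(8, 8)  # stand-in for a real OCR model
adam = paddle.optimizer.Adam(
    learning_rate=0.001,
    beta1=0.9,
    beta2=0.999,
    grad_clip=paddle.nn.ClipGradByNorm(clip_norm=10.0),  # illustrative value
    parameters=model.parameters())
```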
@@ -118,4 +119,4 @@ In ppocr, the network is divided into four stages: Transform, Backbone, Neck and Head
 | shuffle | Whether to shuffle the dataset order each epoch | True | \ |
 | batch_size_per_card | Single-card batch size during training | 256 | \ |
 | drop_last | Whether to discard the last incomplete mini-batch when the number of samples is not divisible by batch_size | True | \ |
 | num_workers | Number of sub-processes used to load data; if 0, no sub-process is started and data is loaded in the main process | 8 | \ |
@@ -16,8 +16,8 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 from __future__ import unicode_literals

 import copy
 import paddle

 __all__ = ['build_optimizer']
@@ -49,7 +49,13 @@ def build_optimizer(config, epochs, step_each_epoch, parameters):

     # step3 build optimizer
     optim_name = config.pop('name')
+    if 'clip_norm' in config:
+        clip_norm = config.pop('clip_norm')
+        grad_clip = paddle.nn.ClipGradByNorm(clip_norm=clip_norm)
+    else:
+        grad_clip = None
     optim = getattr(optimizer, optim_name)(learning_rate=lr,
                                            weight_decay=reg,
+                                           grad_clip=grad_clip,
                                            **config)
     return optim(parameters), lr
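A rough usage sketch of the new branch, assuming this hunk lives in `ppocr/optimizer/__init__.py` and that the earlier lr/regularizer steps of `build_optimizer` (not shown in this diff) accept the keys below; the model and values are illustrative:

```python
import paddle
from ppocr.optimizer import build_optimizer  # assumed import path

model = paddle.nn.Linear(10, 10)  # stand-in for a real OCR model
config = {
    'name': 'Momentum',
    'momentum': 0.9,
    'clip_norm': 10.0,  # presence of this key enables ClipGradByNorm
    'lr': {'name': 'Cosine', 'learning_rate': 0.001},
}
# 'clip_norm' is popped, wrapped in paddle.nn.ClipGradByNorm and forwarded as
# grad_clip to the selected optimizer wrapper; without it grad_clip stays None.
optim, lr_scheduler = build_optimizer(
    config, epochs=100, step_each_epoch=100, parameters=model.parameters())
```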
@@ -30,18 +30,25 @@ class Momentum(object):
         regularization (WeightDecayRegularizer, optional) - The strategy of regularization.
     """

-    def __init__(self, learning_rate, momentum, weight_decay=None, **args):
+    def __init__(self,
+                 learning_rate,
+                 momentum,
+                 weight_decay=None,
+                 grad_clip=None,
+                 **args):
         super(Momentum, self).__init__()
         self.learning_rate = learning_rate
         self.momentum = momentum
         self.weight_decay = weight_decay
+        self.grad_clip = grad_clip

     def __call__(self, parameters):
         opt = optim.Momentum(
             learning_rate=self.learning_rate,
             momentum=self.momentum,
-            parameters=parameters,
-            weight_decay=self.weight_decay)
+            weight_decay=self.weight_decay,
+            grad_clip=self.grad_clip,
+            parameters=parameters)
         return opt

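A short usage sketch of the updated wrapper (the clip object and values are illustrative; leaving `grad_clip` at its default of `None` preserves the previous behaviour):

```python
import paddle

# Sketch: the Momentum wrapper above is a small factory; calling it with the
# model parameters returns a paddle.optimizer.Momentum with clipping attached.
clip = paddle.nn.ClipGradByNorm(clip_norm=10.0)
momentum_factory = Momentum(learning_rate=0.001, momentum=0.9, grad_clip=clip)
layer = paddle.nn.Linear(16, 16)            # stand-in model
opt = momentum_factory(layer.parameters())  # -> paddle.optimizer.Momentum
```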
@@ -96,10 +103,11 @@ class RMSProp(object):

     def __init__(self,
                  learning_rate,
-                 momentum,
+                 momentum=0.0,
                  rho=0.95,
                  epsilon=1e-6,
                  weight_decay=None,
+                 grad_clip=None,
                  **args):
         super(RMSProp, self).__init__()
         self.learning_rate = learning_rate
@@ -107,6 +115,7 @@ class RMSProp(object):
         self.rho = rho
         self.epsilon = epsilon
         self.weight_decay = weight_decay
+        self.grad_clip = grad_clip

     def __call__(self, parameters):
         opt = optim.RMSProp(
@@ -115,5 +124,6 @@ class RMSProp(object):
             rho=self.rho,
             epsilon=self.epsilon,
             weight_decay=self.weight_decay,
+            grad_clip=self.grad_clip,
             parameters=parameters)
         return opt
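Finally, a small sketch of what the clipping itself does, using the RMSProp case: `paddle.nn.ClipGradByNorm` rescales any gradient tensor whose own L2 norm exceeds `clip_norm` before the optimizer update (values are illustrative):

```python
import paddle

# Sketch: gradients whose L2 norm exceeds clip_norm=1.0 are rescaled to norm
# 1.0 before RMSProp applies the parameter update.
clip = paddle.nn.ClipGradByNorm(clip_norm=1.0)
layer = paddle.nn.Linear(3, 1)
opt = paddle.optimizer.RMSProp(
    learning_rate=0.01, parameters=layer.parameters(), grad_clip=clip)

x = paddle.ones([4, 3])
loss = (layer(x) ** 2).mean()
loss.backward()
opt.step()        # clipping is applied here, then the parameters are updated
opt.clear_grad()
```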