from __future__ import print_function

import copy
import six
import warnings
import functools

from paddle.fluid import layers
from paddle.fluid import framework
from paddle.fluid import core
from paddle.fluid import name_scope
from paddle.fluid.dygraph import base as imperative_base
from paddle.fluid.clip import GradientClipBase, _correct_clip_op_role_var


class DoubleClip(GradientClipBase):
    """Clip gradients by value first, then by global norm.

    Each gradient is clipped element-wise into [-clip_value, clip_value];
    afterwards all gradients are rescaled together so that their global L2
    norm does not exceed clip_norm.
    """

    def __init__(self, clip_value, clip_norm, group_name="default_group", need_clip=None):
        super(DoubleClip, self).__init__(need_clip)
        self.clip_value = float(clip_value)
        self.clip_norm = float(clip_norm)
        self.group_name = group_name

    def __str__(self):
        return "Gradient Clip By Value and GlobalNorm, value={}, global_norm={}".format(
            self.clip_value, self.clip_norm)

    @imperative_base.no_grad
    def _dygraph_clip(self, params_grads):
        # Apply element-wise clipping first, then global-norm clipping.
        params_grads = self._dygraph_clip_by_value(params_grads)
        params_grads = self._dygraph_clip_by_global_norm(params_grads)
        return params_grads

    @imperative_base.no_grad
    def _dygraph_clip_by_value(self, params_grads):
        params_and_grads = []
        for p, g in params_grads:
            if g is None:
                continue
            # Parameters excluded by the user-provided need_clip predicate pass through unchanged.
            if self._need_clip_func is not None and not self._need_clip_func(p):
                params_and_grads.append((p, g))
                continue
            new_grad = layers.clip(x=g, min=-self.clip_value, max=self.clip_value)
            params_and_grads.append((p, new_grad))
        return params_and_grads

    @imperative_base.no_grad
    def _dygraph_clip_by_global_norm(self, params_grads):
        params_and_grads = []
        sum_square_list = []
        for p, g in params_grads:
            if g is None:
                continue
            if self._need_clip_func is not None and not self._need_clip_func(p):
                continue
            merge_grad = g
            # Sparse gradients are merged into a dense tensor before computing the norm.
            if g.type == core.VarDesc.VarType.SELECTED_ROWS:
                merge_grad = layers.merge_selected_rows(g)
                merge_grad = layers.get_tensor_from_selected_rows(merge_grad)
            square = layers.square(merge_grad)
            sum_square = layers.reduce_sum(square)
            sum_square_list.append(sum_square)

        # all parameters have been filtered out
        if len(sum_square_list) == 0:
            return params_grads

        # global_norm = sqrt(sum of squared elements over all clippable gradients)
        global_norm_var = layers.concat(sum_square_list)
        global_norm_var = layers.reduce_sum(global_norm_var)
        global_norm_var = layers.sqrt(global_norm_var)
        max_global_norm = layers.fill_constant(
            shape=[1], dtype='float32', value=self.clip_norm)
        # Scale factor: clip_norm / max(global_norm, clip_norm), i.e. 1.0 when
        # the global norm is already within the limit.
        clip_var = layers.elementwise_div(
            x=max_global_norm,
            y=layers.elementwise_max(
                x=global_norm_var, y=max_global_norm))
        for p, g in params_grads:
            if g is None:
                continue
            if self._need_clip_func is not None and not self._need_clip_func(p):
                params_and_grads.append((p, g))
                continue
            new_grad = layers.elementwise_mul(x=g, y=clip_var)
            params_and_grads.append((p, new_grad))

        return params_and_grads
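

# Minimal usage sketch: this assumes a paddle.fluid release (1.7+) whose
# dygraph-mode optimizers accept a `grad_clip` argument. The Linear layer,
# random input, and hyperparameters below are purely illustrative.
if __name__ == "__main__":
    import numpy as np
    import paddle.fluid as fluid

    with fluid.dygraph.guard():
        linear = fluid.dygraph.Linear(10, 10)
        inputs = fluid.dygraph.to_variable(
            np.random.uniform(-1, 1, [10, 10]).astype('float32'))
        loss = fluid.layers.reduce_mean(linear(inputs))
        loss.backward()

        # Clip each gradient into [-1, 1], then rescale so the global norm <= 1.
        clip = DoubleClip(clip_value=1.0, clip_norm=1.0)
        sgd = fluid.optimizer.SGD(
            learning_rate=0.1,
            parameter_list=linear.parameters(),
            grad_clip=clip)
        sgd.minimize(loss)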