Source code for jittor.optim

# ***************************************************************
# Copyright (c) 2023 Jittor. All Rights Reserved. 
# Maintainers:
#     Guowei Yang <471184555@qq.com>
#     Guoye Yang <498731903@qq.com>
#     Wenyang Zhou <576825820@qq.com>
#     Meng-Hao Guo <guomenghao1997@gmail.com>
#     Dun Liang <randonlang@gmail.com>.
#
# 
# This file is subject to the terms and conditions defined in
# file 'LICENSE.txt', which is part of this source code package.
# ***************************************************************
import jittor as jt
import numpy as np

class Optimizer(object):
    ''' Base class of optimizers. This base class can be used to implement
    optimization algorithms such as stochastic gradient descent (SGD).

    Args:
        - params (``list``): model parameters.
        - lr (``float``): learning rate.
        - param_sync_iter (``int, optional``): interval (in iterations) at which parameters are synchronized. Default: ``10000``

    Example:

        >>> import jittor as jt
        >>> from jittor.optim import Optimizer
        >>> class MyOptimizer(Optimizer):
        ...     def __init__(self, params, lr):
        ...         super(MyOptimizer, self).__init__(params, lr)
        ...     def step(self, loss):
        ...         self.zero_grad()
        ...         self.backward(loss)
        ...         for group in self.param_groups:
        ...             for param, grad in zip(group['params'], group['grads']):
        ...                 if not param.is_stop_grad():
        ...                     param.update(param - self.lr * grad)
        ...
        >>> x = jt.randn([2,3])
        >>> optimizer = MyOptimizer([x], lr=0.1)
        >>> loss = x.sum()
        >>> optimizer.step(loss)
        >>> print(x)
        jt.Var([[-0.481019    0.01914055 -0.74143946]
                [ 0.33761212 -1.7029546  -0.8524694 ]], dtype=float32)
    '''
    def __init__(self, params, lr, param_sync_iter=10000):
        self.param_groups = []
        self.lr = lr
        self.param_sync_iter = param_sync_iter

        assert len(params) > 0, "Length of parameters should not be zero"
        if not isinstance(params[0], dict):
            params = [{'params': params}]
        for pg in params:
            assert isinstance(pg, dict)
            self.param_groups.append(pg)
        self.n_step = 0
        # __zero_grad is a flag for quickly determining whether the grad is zero,
        # so we can omit 0+x
        self.__zero_grad = True
        self._grad_map = {}

    def add_param_group(self, group):
        self.param_groups.append(group)
    def clip_grad_norm(self, max_norm:float, norm_type:int=2):
        """Clips the gradient norm of this optimizer. The norm is computed over all gradients together.

        Args:
            max_norm (``float`` or ``int``): max norm of the gradients.
            norm_type (``int``): the 1-norm or the 2-norm.

        Example:

            >>> a = jt.ones(2)
            ... opt = jt.optim.SGD([a], 0.1)
            ... loss = a*a
            ... opt.zero_grad()
            ... opt.backward(loss)
            ... print(opt.param_groups[0]['grads'][0].norm())
            2.83
            >>> opt.clip_grad_norm(0.01, 2)
            ... print(opt.param_groups[0]['grads'][0].norm())
            0.01
            >>> opt.step()
        """
        if self.__zero_grad: return
        grads = []
        for pg in self.param_groups:
            for p, g in zip(pg["params"], pg["grads"]):
                if p.is_stop_grad(): continue
                grads.append(g.flatten())
        if len(grads) == 0: return
        total_norm = jt.norm(jt.concat(grads), norm_type)
        clip_coef = jt.minimum(max_norm / (total_norm + 1e-6), 1.0)
        for pg in self.param_groups:
            for p, g in zip(pg["params"], pg["grads"]):
                if p.is_stop_grad(): continue
                g.update(g*clip_coef)
    @property
    def defaults(self):
        exclude = set(("defaults", "pre_step", "step"))
        return { k:v for k, v in self.__dict__.items()
            if k[0] != '_' and k not in exclude and not callable(v) }

    def state_dict(self):
        state = {"defaults": self.defaults}
        return state
    def load_state_dict(self, state):
        def dfs(x):
            if isinstance(x, list):
                for i in range(len(x)):
                    x[i] = dfs(x[i])
            elif isinstance(x, dict):
                for k in x:
                    x[k] = dfs(x[k])
            elif isinstance(x, np.ndarray):
                return jt.array(x).stop_grad()
            elif isinstance(x, jt.Var):
                return x.stop_grad()
            return x
        exclude = set(("param_groups", "params"))
        for k, v in state["defaults"].items():
            if k not in exclude:
                setattr(self, k, dfs(v))
        param_groups = dfs(state["defaults"].get('param_groups', None))
        if param_groups is not None:
            exclude = set(("params",))
            for i in range(len(param_groups)):
                for k, v in param_groups[i].items():
                    if k not in exclude:
                        self.param_groups[i][k] = v
    def zero_grad(self):
        self.__zero_grad = True
    def backward(self, loss, retain_graph=False):
        '''
        optimizer.backward(loss) is used to accumulate gradients over multiple
        steps. It can be used as follows:

        Original version:

        >>> n_iter = 10000
        ... batch_size = 100
        ... ...
        >>> for i in range(n_iter):
        ...     ...
        ...     loss = calc_loss()
        ...     optimizer.step(loss)

        Accumulation version:

        >>> n_iter = 10000
        ... batch_size = 100
        ... accumulation_steps = 10
        ... n_iter *= accumulation_steps
        ... batch_size //= accumulation_steps
        ... ...
        >>> for i in range(n_iter):
        ...     ...
        ...     loss = calc_loss()
        ...     # if the loss is a mean over the batch, we need to divide by accumulation_steps
        ...     optimizer.backward(loss / accumulation_steps)
        ...     if (i+1) % accumulation_steps == 0:
        ...         optimizer.step()
        '''
        # clean prev grads
        params = []
        params_has_grad = []
        for pg in self.param_groups:
            for p in pg['params']:
                params.append(p)
                if not p.is_stop_grad():
                    params_has_grad.append(p)

        # sync prev params
        jt.sync(params_has_grad)

        # get gradient
        grads = jt.grad(loss, params_has_grad, retain_graph)

        # sync grads and model if in mpi
        if jt.in_mpi:
            dep = []
            def add_dep(v):
                nonlocal dep
                v._add_dependency(dep)
                dep = [v]

            for g in grads:
                g.assign(g.mpi_all_reduce("mean"))
                add_dep(g._input(0))
            if self.n_step % self.param_sync_iter == 0:
                for p in params:
                    p.assign(p.mpi_broadcast())
                    add_dep(p)
        self.n_step += 1

        # set up grads in param_groups
        pid = 0
        for pg in self.param_groups:
            if "grads" not in pg:
                pg["grads"] = [ jt.zeros_like(p).stop_grad().stop_fuse()
                    for p in pg['params'] ]
            pg_grads = pg["grads"]
            for i, p in enumerate(pg['params']):
                if not p.is_stop_grad():
                    # accumulate grad and stop grad of grad
                    g = grads[pid].stop_grad()
                    if not self.__zero_grad:
                        g = g + pg_grads[i]
                    pg_grads[i].update(g)
                    pid += 1
        self.__zero_grad = False
    def pre_step(self, loss, retain_graph=False):
        """ Operations performed before the parameter update in ``step``, such as
        computing the gradients (optionally retaining the computation graph).

        Example:

        >>> class MyOptimizer(Optimizer):
        ...     def step(self, loss):
        ...         self.pre_step(loss)
        ...         ...
        ...         self.post_step()
        """
        if loss is not None:
            self.backward(loss, retain_graph)
        jt.flags.node_order = 1
    def post_step(self):
        """ Operations performed after the parameter update in ``step``, such as
        clearing the accumulated gradients.

        Example:

        >>> class MyOptimizer(Optimizer):
        ...     def step(self, loss):
        ...         self.pre_step(loss)
        ...         ...
        ...         self.post_step()
        """
        jt.flags.node_order = 0
        self.zero_grad()
    def step(self, loss=None, retain_graph=False):
        self.pre_step(loss, retain_graph)
        for pg in self.param_groups:
            lr = pg.get("lr", self.lr)
            for p, g in zip(pg["params"], pg["grads"]):
                if p.is_stop_grad(): continue
                p.update(p - g * lr)
        self.post_step()

    def _build_grad_map(self):
        _grad_map = {}
        for pg in self.param_groups:
            for p, g in zip(pg["params"], pg["grads"]):
                _grad_map[id(p)] = g
        self._grad_map = _grad_map

    def find_grad(self, v:jt.Var) -> jt.Var:
        if id(v) not in self._grad_map:
            self._build_grad_map()
            if id(v) not in self._grad_map:
                raise RuntimeError("This variable is not managed by this optimizer")
        return self._grad_map[id(v)]
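# ---------------------------------------------------------------------------
# Illustrative sketch (documentation aid, not part of jittor/optim.py): how
# backward(), clip_grad_norm() and step() of the base Optimizer compose for
# gradient accumulation. The model, data and hyper-parameters below are
# hypothetical placeholders chosen only for this example.

import jittor as jt

net = jt.nn.Linear(8, 1)
opt = jt.optim.SGD(net.parameters(), lr=0.1)
accumulation_steps = 4
for i in range(accumulation_steps):
    x = jt.randn([16, 8])
    loss = (net(x) ** 2).mean()
    opt.backward(loss / accumulation_steps)    # accumulate grads, no update yet
opt.clip_grad_norm(max_norm=1.0, norm_type=2)  # clip the accumulated global norm
opt.step()                                     # loss=None: update from stored grads, then zero_grad()
# ---------------------------------------------------------------------------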
def opt_grad(v:jt.Var, opt:Optimizer):
    ''' Get the gradient of a variable inside an optimizer.

    Args:
        - v (``Var``): a variable managed by the optimizer.
        - opt (``Optimizer``): the optimizer.

    Example:

        >>> model = Model()
        >>> optimizer = SGD(model.parameters())
        >>> ...
        >>> optimizer.backward(loss)
        >>> for p in model.parameters():
        >>>     grad = p.opt_grad(optimizer)

    Returns:
        the gradient of the variable inside the optimizer (``Var``).
    '''
    return opt.find_grad(v)
jt.Var.opt_grad = opt_grad
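# ---------------------------------------------------------------------------
# Illustrative sketch (documentation aid, not part of jittor/optim.py): using
# the Var.opt_grad shortcut registered above to inspect the gradient that an
# optimizer holds for each parameter. The small model below is a hypothetical
# placeholder.

import jittor as jt

model = jt.nn.Linear(6, 3)
opt = jt.optim.SGD(model.parameters(), lr=0.1)
x = jt.randn([2, 6])
loss = (model(x) ** 2).mean()
opt.backward(loss)
for p in model.parameters():
    g = p.opt_grad(opt)        # equivalent to opt.find_grad(p)
    print(p.shape, g.norm())
opt.step()                     # update from the gradients inspected above
# ---------------------------------------------------------------------------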
class SGD(Optimizer):
    ''' Stochastic gradient descent optimizer, with support for momentum,
    weight decay, dampening and optional Nesterov acceleration. In ``step``,
    parameters are updated from their gradients and the current momentum
    buffer; new parameter groups can also be added via ``add_param_group``.

    With momentum enabled, the update takes the gradient history into account:

    .. math::

        dp &= p \\times \\text{weight_decay} + g \\\\
        v &= \\text{momentum} \\times v + dp \\times (1 - \\text{dampening})

    With Nesterov momentum, the update looks one step ahead along the momentum
    direction, which can train better on some problems:

    .. math::

        p = p - (dp + \\text{momentum} \\times v) \\times lr

    Otherwise, the update is:

    .. math::

        p = p - v \\times lr

    where :math:`p, g, v` denote the parameter, its gradient, and the momentum buffer (update value).

    Args:
        - params (``list``): parameters to optimize, or pre-defined parameter groups.
        - lr (``float``): learning rate, controlling the step size of the update.
        - momentum (``float``, optional): momentum factor. Default: ``0``
        - weight_decay (``float``, optional): weight decay coefficient. Default: ``0``
        - dampening (``float``, optional): dampening for momentum. Default: ``0``
        - nesterov (``bool``, optional): whether to use Nesterov momentum. Default: ``False``

    Example:

        >>> import jittor as jt
        >>> model = jt.nn.Linear(10, 2)
        >>> loss_fn = jt.nn.CrossEntropyLoss()
        >>> optimizer = jt.optim.SGD(model.parameters(), lr=0.1, momentum=0.9, weight_decay=0.001)
        >>> x = jt.randn([5, 10])
        >>> y_true = jt.array([0, 1, 0, 1, 1])
        >>> y_pred = model(x)
        >>> loss = loss_fn(y_pred, y_true)
        >>> optimizer.step(loss)
    '''
    def __init__(self, params, lr, momentum=0, weight_decay=0, dampening=0, nesterov=False):
        super().__init__(params, lr)
        self.momentum = momentum
        self.weight_decay = weight_decay
        self.dampening = dampening
        self.nesterov = nesterov

        # initialize required arguments
        for pg in self.param_groups:
            values = pg["values"] = []
            for p in pg["params"]:
                values.append(jt.zeros(p.shape, p.dtype).stop_grad())

    def add_param_group(self, group):
        values = group["values"] = []
        for p in group["params"]:
            values.append(jt.zeros(p.shape, p.dtype).stop_grad())
        self.param_groups.append(group)

    def step(self, loss=None, retain_graph=False):
        self.pre_step(loss, retain_graph=False)
        jt.flags.node_order = 1
        for pg in self.param_groups:
            # get arguments from each param_groups
            lr = pg.get("lr", self.lr)
            momentum = pg.get("momentum", self.momentum)
            weight_decay = pg.get("weight_decay", self.weight_decay)
            dampening = pg.get("dampening", self.dampening)
            nesterov = pg.get("nesterov", self.nesterov)

            # optimize main body
            for p, g, v in zip(pg["params"], pg["grads"], pg["values"]):
                if p.is_stop_grad(): continue
                dp = p * weight_decay + g
                v.update(momentum * v + dp * (1 - dampening))
                if nesterov:
                    p.update(p - (dp + momentum * v) * lr)
                else:
                    p.update(p - v * lr)
        self.post_step()
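# ---------------------------------------------------------------------------
# Illustrative sketch (documentation aid, not part of jittor/optim.py): one
# SGD momentum update traced by hand on a single scalar parameter, so the
# momentum buffer stored in param_groups[...]['values'] can be checked against
# the formulas in the docstring. Numbers are approximate (float32).

import jittor as jt

p = jt.ones(1)                                   # p = 1.0
opt = jt.optim.SGD([p], lr=0.1, momentum=0.9)
loss = (p * p).sum()                             # gradient g = 2p = 2.0
opt.step(loss)
# dp = weight_decay*p + g = 2.0
# v  = 0.9*0 + dp*(1-0)  = 2.0   -> stored in opt.param_groups[0]['values'][0]
# p  = 1.0 - 0.1*2.0     = 0.8
print(opt.param_groups[0]['values'][0], p)       # ~2.0, ~0.8
# ---------------------------------------------------------------------------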
class RMSprop(Optimizer):
    ''' RMSprop optimizer, used to update model parameters during training to
    minimize the loss function. Inherits from ``Optimizer``.

    RMSprop is an adaptive learning rate method that keeps a separate effective
    learning rate per parameter, based on a running average of the squared
    gradients:

    .. math::

        v(w, t) &= \\alpha \\cdot v(w, t-1) + (1 - \\alpha) g(w, t)^2 \\\\
        w &= w - \\frac{lr \\cdot g(w, t)}{\\sqrt{v(w, t)} + \\varepsilon}

    where :math:`g(w, t)` is the gradient of parameter :math:`w` at step :math:`t`,
    :math:`v` is the accumulated squared gradient, :math:`\\alpha` is the smoothing
    constant and :math:`\\varepsilon` is a term that avoids division by zero.

    Args:
        - params (``list``): parameters of the model to optimize.
        - lr (``float``): learning rate. Default: ``1e-2``
        - eps (``float``): term added to the denominator to avoid division by zero. Default: ``1e-8``
        - alpha (``float``): smoothing constant. Default: ``0.99``

    Example:

        >>> import jittor as jt
        ... model = jt.nn.Linear(10, 2)
        ... loss_fn = jt.nn.CrossEntropyLoss()
        ... optimizer = jt.optim.RMSprop(model.parameters(), lr=0.1)
        ... x = jt.randn([5, 10])
        ... y_true = jt.array([0, 1, 0, 1, 1])
        ... y_pred = model(x)
        ... loss = loss_fn(y_pred, y_true)
        ... optimizer.step(loss)
    '''
    def __init__(self, params, lr=1e-2, eps=1e-8, alpha=0.99):
        super().__init__(params, lr)
        self.eps = eps
        self.alpha = alpha

        # initialize required arguments for each param_groups
        for pg in self.param_groups:
            values = pg["values"] = []
            for p in pg["params"]:
                values.append(jt.zeros(p.shape, p.dtype).stop_grad())

    def add_param_group(self, group):
        values = group["values"] = []
        for p in group["params"]:
            values.append(jt.zeros(p.shape, p.dtype).stop_grad())
        self.param_groups.append(group)

    def step(self, loss=None, retain_graph=False):
        self.pre_step(loss, retain_graph)
        for pg in self.param_groups:
            # get arguments from each param_groups
            lr = pg.get("lr", self.lr)
            eps = pg.get("eps", self.eps)
            alpha = pg.get("alpha", self.alpha)
            for p, g, v in zip(pg["params"], pg["grads"], pg["values"]):
                if p.is_stop_grad(): continue
                v.update(alpha * v + (1-alpha) * g * g)
                p.update(p - lr * g / (jt.sqrt(v) + eps))
        self.post_step()
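# ---------------------------------------------------------------------------
# Illustrative sketch (documentation aid, not part of jittor/optim.py): on the
# very first RMSprop step, v = (1-alpha)*g^2, so the update is roughly
# lr*g / sqrt((1-alpha)*g^2) = lr/sqrt(1-alpha) * sign(g), i.e. its size is
# almost independent of the gradient scale. Hypothetical toy parameter below.

import jittor as jt

p = jt.ones(1)
opt = jt.optim.RMSprop([p], lr=0.01, alpha=0.99)
loss = (p * p).sum()                 # g = 2.0
opt.step(loss)
# v = 0.01*4 = 0.04, update = 0.01*2/(0.2+eps) ~ 0.1 = lr/sqrt(1-alpha)
print(p)                             # ~0.9
# ---------------------------------------------------------------------------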
class Adam(Optimizer):
    ''' Adam optimizer, which combines the main ideas of Momentum and RMSprop. The update is:

    .. math::

        m_t &= b1 \\times m_{t-1} + (1 - b1) \\times g_t \\\\
        v_t &= b2 \\times v_{t-1} + (1 - b2) \\times g_t^2 \\\\
        p_t &= p_{t-1} - \\frac{lr \\times m_t}{\\sqrt{v_t} + eps}

    where :math:`m_t` and :math:`v_t` are the first- and second-moment estimates of
    the gradient, :math:`b1` and :math:`b2` are the moment coefficients, :math:`lr`
    is the learning rate, :math:`eps` is a term for numerical stability, :math:`g_t`
    is the gradient and :math:`p_t` is the parameter.

    Constructor args:
        - params (``iterable``): parameters to optimize, or dicts defining parameter groups. For an existing model, ``model.parameters()`` returns its parameters.
        - lr (``float``): learning rate.
        - eps (``float``, optional): term added for numerical stability. Default: ``1e-08``
        - betas (``Tuple[float, float]``, optional): coefficients for the estimates of the gradient and its square. Default: ``(0.9, 0.999)``
        - weight_decay (``float``, optional): weight decay (L2 penalty). Default: ``0``

    Arguments of ``step``:
        - loss (``Var``, optional): the computed loss of the network. Default: ``None``
        - retain_graph (``bool``, optional): whether to retain the computation graph. Default: ``False``

    Example:

        >>> optimizer = jt.optim.Adam(model.parameters(), lr, eps=1e-8, betas=(0.9, 0.999))
        >>> optimizer.step(loss)
    '''
    def __init__(self, params, lr, eps=1e-8, betas=(0.9, 0.999), weight_decay=0):
        super().__init__(params, lr)
        self.eps = eps
        self.betas = betas
        self.weight_decay = weight_decay
        # assert weight_decay==0, "weight_decay is not supported yet"

        # initialize required arguments for each param_groups
        for pg in self.param_groups:
            values = pg["values"] = []
            m = pg["m"] = []
            for p in pg["params"]:
                values.append(jt.zeros(p.shape, p.dtype).stop_grad())
                m.append(jt.zeros(p.shape, p.dtype).stop_grad())

    def add_param_group(self, group):
        values = group["values"] = []
        m = group["m"] = []
        for p in group["params"]:
            values.append(jt.zeros(p.shape, p.dtype).stop_grad())
            m.append(jt.zeros(p.shape, p.dtype).stop_grad())
        self.param_groups.append(group)

    def step(self, loss=None, retain_graph=False):
        self.pre_step(loss, retain_graph)
        n = float(self.n_step)
        jt.flags.node_order = 1
        for pg in self.param_groups:
            # get arguments from each param_groups
            lr = pg.get("lr", self.lr)
            eps = pg.get("eps", self.eps)
            weight_decay = pg.get("weight_decay", self.weight_decay)
            b0, b1 = pg.get("betas", self.betas)
            for p, g, v, m in zip(pg["params"], pg["grads"], pg["values"], pg["m"]):
                if p.is_stop_grad(): continue
                g = p * weight_decay + g
                m.update(b0 * m + (1-b0) * g)
                v.update(b1 * v + (1-b1) * g * g)
                step_size = lr * jt.sqrt(1-b1**n) / (1-b0 ** n)
                p.update(p - m * step_size / (jt.sqrt(v) + eps))
        self.post_step()
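# ---------------------------------------------------------------------------
# Illustrative sketch (documentation aid, not part of jittor/optim.py): the
# bias-corrected step size used in Adam.step() above, evaluated in plain
# Python for a few step counts. Values are approximate.

lr, b0, b1 = 0.001, 0.9, 0.999
for n in (1, 10, 1000):
    step_size = lr * (1 - b1 ** n) ** 0.5 / (1 - b0 ** n)
    print(n, step_size)   # n=1: ~0.32*lr, n=10: ~0.15*lr, n=1000: ~0.80*lr
# ---------------------------------------------------------------------------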
class AdamW(Optimizer):
    ''' AdamW optimizer, a variant of Adam with decoupled (corrected) weight decay.
    The ``step`` update is:

    .. math::

        \\begin{aligned}
            t &\\leftarrow t + 1 \\\\
            g_t &\\leftarrow \\nabla_{\\theta} L_t(\\theta) \\\\
            \\theta &\\leftarrow \\theta (1 - \\eta \\lambda) \\\\
            m_t &\\leftarrow \\beta_1 m_{t-1} + (1 - \\beta_1) g_t \\\\
            v_t &\\leftarrow \\beta_2 v_{t-1} + (1 - \\beta_2) g_t^2 \\\\
            \\widetilde{m_t} &\\leftarrow \\frac{m_t}{1 - \\beta_1^t} \\\\
            \\widetilde{v_t} &\\leftarrow \\frac{v_t}{1 - \\beta_2^t} \\\\
            \\theta &\\leftarrow \\theta - \\eta \\frac{\\widetilde{m_t}}{\\sqrt{\\widetilde{v_t}} + \\epsilon} \\\\
        \\end{aligned}

    where :math:`\\theta` is the parameter, :math:`g_t` the gradient, :math:`m_t` and
    :math:`v_t` the exponential moving averages of the gradient and the squared gradient,
    :math:`\\widetilde{m_t}` and :math:`\\widetilde{v_t}` their bias-corrected versions,
    :math:`\\eta` the learning rate, :math:`\\beta_1` and :math:`\\beta_2` the coefficients
    of the first- and second-moment terms, and :math:`\\lambda` the weight decay coefficient.

    Args:
        - params (``iterable``): parameters to optimize, or dicts defining parameter groups.
        - lr (``float``): learning rate.
        - eps (``float``): term added to the denominator for numerical stability. Default: ``1e-8``
        - betas (``Tuple[float, float]``): exponential decay rates for the first and second moments. Default: ``(0.9, 0.999)``
        - weight_decay (``float``, optional): decoupled weight decay coefficient. Default: ``0``

    Example:

        >>> import jittor as jt
        >>> model = jt.nn.Linear(10, 2)
        >>> loss_fn = jt.nn.CrossEntropyLoss()
        >>> optimizer = jt.optim.AdamW(params=model.parameters(), lr=0.1, eps=1e-8, betas=(0.9, 0.999))
        >>> x = jt.randn([5, 10])
        >>> y_true = jt.array([0, 1, 0, 1, 1])
        >>> y_pred = model(x)
        >>> loss = loss_fn(y_pred, y_true)
        >>> optimizer.step(loss)
    '''
    def __init__(self, params, lr, eps=1e-8, betas=(0.9, 0.999), weight_decay=0):
        super().__init__(params, lr)
        self.eps = eps
        self.betas = betas
        self.weight_decay = weight_decay
        # assert weight_decay==0, "weight_decay is not supported yet"

        # initialize required arguments for each param_groups
        for pg in self.param_groups:
            values = pg["values"] = []
            m = pg["m"] = []
            for p in pg["params"]:
                values.append(jt.zeros(p.shape, p.dtype).stop_grad())
                m.append(jt.zeros(p.shape, p.dtype).stop_grad())

    def add_param_group(self, group):
        values = group["values"] = []
        m = group["m"] = []
        for p in group["params"]:
            values.append(jt.zeros(p.shape, p.dtype).stop_grad())
            m.append(jt.zeros(p.shape, p.dtype).stop_grad())
        self.param_groups.append(group)

    def step(self, loss=None, retain_graph=False):
        self.pre_step(loss, retain_graph)
        n = float(self.n_step)
        for pg in self.param_groups:
            # get arguments from each param_groups
            lr = pg.get("lr", self.lr)
            eps = pg.get("eps", self.eps)
            weight_decay = pg.get("weight_decay", self.weight_decay)
            b0, b1 = pg.get("betas", self.betas)
            for p, g, v, m in zip(pg["params"], pg["grads"], pg["values"], pg["m"]):
                if p.is_stop_grad(): continue
                p.update(p * (1 - lr * weight_decay))
                bias_correction1 = 1 - b0 ** n
                bias_correction2 = 1 - b1 ** n
                m.update(b0 * m + (1-b0) * g) #exp_avg
                v.update(b1 * v + (1-b1) * g * g) #exp_avg_sq
                denom = jt.sqrt(v) / jt.sqrt(bias_correction2) + eps
                step_size = lr / bias_correction1
                p.update(p - step_size * m / denom)
        self.post_step()
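# ---------------------------------------------------------------------------
# Illustrative sketch (documentation aid, not part of jittor/optim.py): the
# difference between Adam's L2 term and AdamW's decoupled decay, written out
# for a single (hypothetical) weight `w` with gradient `g`.

import jittor as jt

w, g = jt.ones(1), jt.zeros(1)
lr, wd = 0.1, 0.01
g_adam = w * wd + g            # Adam.step(): decay folded into the gradient, so it
                               # also passes through the moment estimates m and v
w_adamw = w * (1 - lr * wd)    # AdamW.step(): weight shrunk directly, independent
                               # of the adaptive moments
# ---------------------------------------------------------------------------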
class Adan(Optimizer):
    ''' Implements the Adan optimizer, proposed in `Adan: Adaptive Nesterov Momentum
    Algorithm for Faster Optimizing Deep Models [J]. arXiv preprint arXiv:2208.06677, 2022
    <https://arxiv.org/abs/2208.06677>`__.

    Adan is an efficient optimizer for most DNN frameworks: its extra computational
    cost is about 2x lower than other recent methods, it is robust to training
    settings and batch sizes, and it is easy to plug in.

    Constructor args:
        - params (``iterable``): parameters to optimize, or dicts defining parameter groups.
        - lr (``float``, optional): learning rate. Default: ``1e-3``
        - betas (``Tuple[float, float, float]``, optional): coefficients used for the first- and second-order moments. Default: ``(0.98, 0.92, 0.99)``
        - eps (``float``, optional): term added to the denominator for numerical stability. Default: ``1e-8``
        - weight_decay (``float``, optional): decoupled weight decay (L2 penalty). Default: ``0``
        - max_grad_norm (``float``, optional): value used to clip the global gradient norm. Default: ``0.0`` (no clipping)

    Arguments of ``step``:
        - loss (``Var``, optional): the computed loss of the network. Default: ``None``
        - retain_graph (``bool``, optional): whether to retain the computation graph. Default: ``False``

    Example:

        >>> optimizer = jt.optim.Adan(model.parameters(), lr=0.001)
        >>> optimizer.step(loss)
    '''
    def __init__(self, params, lr=1e-3, betas=(0.98, 0.92, 0.99), eps=1e-8, weight_decay=0.0, max_grad_norm=0.0):
        super().__init__(params, lr)
        self.betas = betas
        self.eps = eps
        self.weight_decay = weight_decay
        self.max_grad_norm = max_grad_norm

        for pg in self.param_groups:
            pg["m"] = []
            pg["v"] = []
            pg["d"] = []
            pg["pre_grad"] = []
            for p in pg["params"]:
                pg["m"].append(jt.zeros(p.shape, p.dtype).stop_grad())
                pg["v"].append(jt.zeros(p.shape, p.dtype).stop_grad())
                pg["d"].append(jt.zeros(p.shape, p.dtype).stop_grad())
                pg["pre_grad"].append(jt.zeros(p.shape, p.dtype).stop_grad())

    def add_param_group(self, group):
        group["m"] = []
        group["v"] = []
        group["d"] = []
        group["pre_grad"] = []
        for p in group["params"]:
            group["m"].append(jt.zeros(p.shape, p.dtype).stop_grad())
            group["v"].append(jt.zeros(p.shape, p.dtype).stop_grad())
            group["d"].append(jt.zeros(p.shape, p.dtype).stop_grad())
            group["pre_grad"].append(jt.zeros(p.shape, p.dtype).stop_grad())
        self.param_groups.append(group)

    def step(self, loss=None, retain_graph=False):
        self.pre_step(loss, retain_graph)
        n = float(self.n_step)
        for pg in self.param_groups:
            lr = pg.get("lr", self.lr)
            betas = pg.get("betas", self.betas)
            eps = pg.get("eps", self.eps)
            weight_decay = pg.get("weight_decay", self.weight_decay)
            max_grad_norm = pg.get("max_grad_norm", self.max_grad_norm)
            if max_grad_norm > 0:
                self.clip_grad_norm(max_grad_norm)
            beta1, beta2, beta3 = betas
            bias_correction1 = 1 - beta1 ** n
            bias_correction2 = 1 - beta2 ** n
            bias_correction3_sqrt = jt.sqrt(1 - beta3 ** n)
            step_size_diff = lr * beta2 * bias_correction3_sqrt / bias_correction2
            step_size = lr * bias_correction3_sqrt / bias_correction1
            eps_bias_sqrt = eps * bias_correction3_sqrt
            for p, g, m, v, d, pre_g in zip(pg["params"], pg["grads"], pg["m"], pg["v"], pg["d"], pg["pre_grad"]):
                if p.is_stop_grad(): continue
                if self.n_step > 0:
                    pre_g.update(g - pre_g)  # Update pre_g as grad_diff
                m.update(beta1 * m + (1 - beta1) * g)
                d.update(beta2 * d + (1 - beta2) * pre_g)  # Use pre_g as grad_diff
                pre_g.update(jt.multiply(pre_g, beta2) + g)  # Update pre_g as update (g + beta2 * grad_diff)
                v.update(beta3 * v + (1 - beta3) * pre_g * pre_g)  # Use pre_g as update
                p.update(p - (step_size * m + step_size_diff * d) / (jt.sqrt(v) + eps_bias_sqrt))
                p.update(p / (1 + lr * weight_decay))
                pre_g.update(g)  # Update pre_g for the next iteration
        self.post_step()
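# ---------------------------------------------------------------------------
# Illustrative sketch (documentation aid, not part of jittor/optim.py): Adan
# with decoupled weight decay and built-in global gradient-norm clipping
# (max_grad_norm > 0 triggers clip_grad_norm inside step()). The model and
# hyper-parameters below are hypothetical placeholders.

import jittor as jt

model = jt.nn.Linear(10, 2)
opt = jt.optim.Adan(model.parameters(), lr=1e-3, weight_decay=0.02, max_grad_norm=5.0)
x = jt.randn([4, 10])
loss = (model(x) ** 2).mean()
opt.step(loss)
# ---------------------------------------------------------------------------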
class LRScheduler:
    ''' Base class of learning rate schedulers, which adjust the learning rate
    according to the training epoch.

    Args:
        - optimizer (``Optimizer``): the wrapped optimizer used to update the model parameters.
        - last_epoch (``int``, optional): the index of the last epoch. Defaults to ``-1``, which initializes the schedule from the optimizer's learning rate; otherwise every parameter group must already contain an ``initial_lr`` entry.

    Note:
        This class should be used through its subclasses, since ``get_lr()`` must be
        overridden to define how the learning rate changes with the epoch.
    '''
    def __init__(self, optimizer, last_epoch=-1):
        assert isinstance(optimizer, Optimizer)
        self.optimizer = optimizer

        if last_epoch == -1:
            for gp in optimizer.param_groups:
                gp.setdefault('initial_lr', gp.get('lr', optimizer.lr))
        else:
            for gp in optimizer.param_groups:
                assert 'initial_lr' in gp

        self.base_lrs = list(map(lambda group: group['initial_lr'], optimizer.param_groups))
        self.last_epoch = last_epoch
        self.optimizer._step_count = 0
        self._step_count = 0
        self.step()

    def get_lr(self):
        raise NotImplementedError

    def get_last_lr(self):
        return self._last_lr

    def step(self, epoch=None):
        self._step_count += 1
        if epoch is None:
            self.last_epoch += 1
            values = self.get_lr()
        else:
            self.last_epoch = epoch
            values = self.get_lr()

        for i, data in enumerate(zip(self.optimizer.param_groups, values)):
            param_group, lr = data
            param_group['lr'] = lr

        self._last_lr = [group['lr'] for group in self.optimizer.param_groups]
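# ---------------------------------------------------------------------------
# Illustrative sketch (documentation aid, not part of jittor/optim.py): a
# hypothetical LRScheduler subclass that overrides get_lr(), halving the
# learning rate every `period` epochs. `StepDecay` and `period` are made-up
# names used only for this example.

import jittor as jt
from jittor.optim import SGD, LRScheduler

class StepDecay(LRScheduler):
    def __init__(self, optimizer, period=10, last_epoch=-1):
        self.period = period
        super().__init__(optimizer, last_epoch)
    def get_lr(self):
        return [base_lr * (0.5 ** (self.last_epoch // self.period))
                for base_lr in self.base_lrs]

model = jt.nn.Linear(4, 1)
opt = SGD(model.parameters(), lr=0.1)
sched = StepDecay(opt, period=10)
for epoch in range(30):
    # ... one epoch of training would go here ...
    sched.step()
print(opt.param_groups[0]['lr'])   # 0.1 * 0.5**3 = 0.0125 after 30 epochs
# ---------------------------------------------------------------------------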
class LambdaLR(LRScheduler):
    ''' Learning rate scheduler that adjusts the base learning rate by a user-defined
    factor (which can be given as a lambda expression).

    Args:
        - optimizer (``Optimizer``): the wrapped, already initialized optimizer.
        - lr_lambda (``callable or list``): a function, or a list of functions (one per parameter group), defining the learning rate policy. The corresponding formula is: :math:`lr = \\text{init_lr} \\times \\text{lr_lambda(last_epoch)}`
        - last_epoch (``int``, optional): the index of the last epoch. Default: ``-1``

    Example:

        >>> from jittor.optim import SGD, LambdaLR
        >>> optimizer = SGD(model.parameters(), lr=0.1)
        >>> scheduler = LambdaLR(optimizer, lr_lambda=lambda epoch: 0.95 ** epoch)
        >>> for epoch in range(100):
        ...     train(...)
        ...     validate(...)
        ...     scheduler.step()
    '''
    def __init__(self, optimizer, lr_lambda, last_epoch=-1):
        if not isinstance(lr_lambda, list) and not isinstance(lr_lambda, tuple):
            self.lr_lambdas = [lr_lambda] * len(optimizer.param_groups)
        else:
            if len(lr_lambda) != len(optimizer.param_groups):
                raise ValueError("Expected {} lr_lambdas, but got {}".format(len(optimizer.param_groups), len(lr_lambda)))
            self.lr_lambdas = list(lr_lambda)
        super(LambdaLR, self).__init__(optimizer, last_epoch)

    def get_lr(self):
        return [base_lr * lmbda(self.last_epoch)
                for lmbda, base_lr in zip(self.lr_lambdas, self.base_lrs)]
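# ---------------------------------------------------------------------------
# Illustrative sketch (documentation aid, not part of jittor/optim.py):
# LambdaLR with one lambda per parameter group, decaying the first group and
# keeping the second fixed. The toy variables w1 and w2 are hypothetical.

import jittor as jt
from jittor.optim import SGD, LambdaLR

w1, w2 = jt.ones(2), jt.ones(3)
opt = SGD([{'params': [w1]}, {'params': [w2], 'lr': 0.01}], lr=0.1)
sched = LambdaLR(opt, lr_lambda=[lambda e: 0.9 ** e, lambda e: 1.0])
for epoch in range(5):
    sched.step()
print(sched.get_last_lr())   # ~[0.1 * 0.9**5, 0.01] = [0.059, 0.01]
# ---------------------------------------------------------------------------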