Source code for catalyst.contrib.nn.optimizers.qhadamw

from typing import Callable, Optional

import torch
from torch.optim.optimizer import Optimizer


class QHAdamW(Optimizer):
    """Implements the QHAdamW algorithm.

    Combines the QHAdam algorithm, proposed in
    `Quasi-hyperbolic momentum and Adam for deep learning`_, with the weight
    decay decoupling from the `Decoupled Weight Decay Regularization`_ paper.

    Example:
        >>> optimizer = QHAdamW(
        ...     model.parameters(),
        ...     lr=3e-4, nus=(0.8, 1.0), betas=(0.99, 0.999))
        >>> optimizer.zero_grad()
        >>> loss_fn(model(input), target).backward()
        >>> optimizer.step()

    Main origin of inspiration:
    https://github.com/iprally/qhadamw-pytorch/blob/master/qhadamw.py
    (MIT License)

    .. _Decoupled Weight Decay Regularization:
        https://arxiv.org/abs/1711.05101
    .. _Quasi-hyperbolic momentum and Adam for deep learning:
        https://arxiv.org/abs/1810.06801
    """
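    # Informal sketch (an assumption drawn from the papers linked above, not
    # text from the original module) of the update that ``step`` implements
    # for each parameter ``theta``:
    #
    #   g_t    = gradient at step t
    #   s_t    = bias-corrected EMA of g_t      (decay rate beta1)
    #   v_t    = bias-corrected EMA of g_t ** 2 (decay rate beta2)
    #   numer  = (1 - nu1) * g_t + nu1 * s_t
    #   denom  = sqrt((1 - nu2) * g_t ** 2 + nu2 * v_t) + eps
    #   theta  = (1 - weight_decay) * theta - lr * numer / denom
    #
    # The ``beta1_weight`` / ``beta2_weight`` counters in ``step`` fold the
    # Adam bias correction directly into the EMA decay rates.
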
    def __init__(
        self,
        params,
        lr=1e-3,
        betas=(0.995, 0.999),
        nus=(0.7, 1.0),
        weight_decay=0.0,
        eps=1e-8,
    ):
        r"""
        Args:
            params (iterable): iterable of parameters to optimize
                or dicts defining parameter groups
            lr (float, optional): learning rate
                (:math:`\alpha` from the paper) (default: 1e-3)
            betas (Tuple[float, float], optional): coefficients used for
                computing running averages of the gradient and its square
                (default: (0.995, 0.999))
            nus (Tuple[float, float], optional): immediate discount factors
                used to estimate the gradient and its square
                (default: (0.7, 1.0))
            eps (float, optional): term added to the denominator to improve
                numerical stability (default: 1e-8)
            weight_decay (float, optional): weight decay
                (L2 regularization coefficient, times two) (default: 0.0)
        """
        if not 0.0 <= lr:
            raise ValueError(f"Invalid learning rate: {lr}")
        if not 0.0 <= eps:
            raise ValueError(f"Invalid epsilon value: {eps}")
        if not 0.0 <= betas[0] < 1.0:
            raise ValueError(f"Invalid beta parameter at index 0: {betas[0]}")
        if not 0.0 <= betas[1] < 1.0:
            raise ValueError(f"Invalid beta parameter at index 1: {betas[1]}")
        if weight_decay < 0.0:
            raise ValueError(f"Invalid weight_decay value: {weight_decay}")

        defaults = {
            "lr": lr,
            "betas": betas,
            "nus": nus,
            "weight_decay": weight_decay,
            "eps": eps,
        }
        super(QHAdamW, self).__init__(params, defaults)
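    # Note (illustrative, not from the original docstring): with
    # nus=(1.0, 1.0) the quasi-hyperbolic terms vanish and QHAdam reduces to
    # plain Adam (as noted in the QHAdam paper), so only the decoupled weight
    # decay applied in ``step`` distinguishes this optimizer from Adam, e.g.:
    #
    #   optimizer = QHAdamW(
    #       model.parameters(),
    #       lr=1e-3,
    #       betas=(0.9, 0.999),
    #       nus=(1.0, 1.0),
    #       weight_decay=1e-2,
    #   )
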
    def step(self, closure: Optional[Callable] = None):
        """Performs a single optimization step.

        Args:
            closure (callable, optional): A closure that reevaluates
                the model and returns the loss.
        """
        loss = None
        if closure is not None:
            loss = closure()

        for group in self.param_groups:
            lr = group["lr"]
            beta1, beta2 = group["betas"]
            nu1, nu2 = group["nus"]
            weight_decay = group["weight_decay"]
            eps = group["eps"]

            for p in group["params"]:
                if p.grad is None:
                    continue

                d_p = p.grad.data
                if d_p.is_sparse:
                    raise RuntimeError(
                        "QHAdamW does not support sparse gradients"
                    )

                param_state = self.state[p]

                # Original QHAdam implementation for weight decay:
                # if weight_decay != 0:
                #     d_p.add_(weight_decay, p.data)

                d_p_sq = d_p.mul(d_p)

                if len(param_state) == 0:
                    param_state["beta1_weight"] = 0.0
                    param_state["beta2_weight"] = 0.0
                    param_state["exp_avg"] = torch.zeros_like(p.data)
                    param_state["exp_avg_sq"] = torch.zeros_like(p.data)

                param_state["beta1_weight"] = (
                    1.0 + beta1 * param_state["beta1_weight"]
                )
                param_state["beta2_weight"] = (
                    1.0 + beta2 * param_state["beta2_weight"]
                )

                beta1_weight = param_state["beta1_weight"]
                beta2_weight = param_state["beta2_weight"]
                exp_avg = param_state["exp_avg"]
                exp_avg_sq = param_state["exp_avg_sq"]

                beta1_adj = 1.0 - (1.0 / beta1_weight)
                beta2_adj = 1.0 - (1.0 / beta2_weight)
                exp_avg.mul_(beta1_adj).add_(1.0 - beta1_adj, d_p)
                exp_avg_sq.mul_(beta2_adj).add_(1.0 - beta2_adj, d_p_sq)

                avg_grad = exp_avg.mul(nu1)
                if nu1 != 1.0:
                    avg_grad.add_(1.0 - nu1, d_p)

                avg_grad_rms = exp_avg_sq.mul(nu2)
                if nu2 != 1.0:
                    avg_grad_rms.add_(1.0 - nu2, d_p_sq)
                avg_grad_rms.sqrt_()
                if eps != 0.0:
                    avg_grad_rms.add_(eps)

                # Original QHAdam implementation:
                # p.data.addcdiv_(-lr, avg_grad, avg_grad_rms)

                # Implementation following the AdamW paper:
                p.data.add_(-weight_decay, p.data).addcdiv_(
                    -lr, avg_grad, avg_grad_rms
                )

        return loss
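
A minimal end-to-end sketch of the optimizer in a standard PyTorch training loop
(not part of the module; the linear model, loss, and synthetic data below are
illustrative placeholders, and QHAdamW is assumed to be the class defined above):

import torch
import torch.nn as nn

# Toy model and loss, for illustration only.
model = nn.Linear(10, 1)
criterion = nn.MSELoss()
optimizer = QHAdamW(
    model.parameters(),
    lr=3e-4,
    betas=(0.99, 0.999),
    nus=(0.8, 1.0),
    weight_decay=1e-2,
)

for _ in range(10):
    # Synthetic batch of 16 samples.
    inputs = torch.randn(16, 10)
    targets = torch.randn(16, 1)

    optimizer.zero_grad()
    loss = criterion(model(inputs), targets)
    loss.backward()
    optimizer.step()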