From SGD to NadaMax: Principles and Implementations of Ten Optimization Algorithms
Source | Zhihu    Author | 永远在你身后
Link | https://zhuanlan.zhihu.com/p/81020717
Editor | 极市平台
This article is shared for academic purposes only; in case of infringement, please contact the editors to have it removed.
SGD
Momentum
Nesterov Momentum
AdaGrad
RMSProp
AdaDelta
Adam
AdaMax
Nadam
NadaMax
SGD
Related earlier posts by the author (in Chinese):
Implementing a Neural Network Framework in NumPy (3): Derivation and Implementation of Linear-Layer Backpropagation
Derivation and Implementation of Convolution-Kernel Gradient Computation
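For completeness, here is a minimal sketch of plain SGD in the same class style used for the optimizers below. It is an illustration based on the standard update w ← w − lr·g, not code taken from the original post:

import numpy as np

class SGD(object):
    def __init__(self, lr=1e-3):
        self.lr = lr  # learning rate

    def update(self, g: np.ndarray):
        # plain gradient descent: the returned value is the parameter increment -lr * g
        return -self.lr * g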
Momentum
import numpy as np

class Momentum(object):
    def __init__(self, alpha=0.9, lr=1e-3):
        self.alpha = alpha  # momentum coefficient
        self.lr = lr        # learning rate
        self.v = 0          # initial velocity is 0

    def update(self, g: np.ndarray):  # g = J'(w), the gradient of the parameters for this training step
        self.v = self.alpha * self.v - self.lr * g  # velocity update
        return self.v  # returns the parameter increment (same convention for every class below)
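Every class here uses the same convention: update() takes the current gradient and returns the increment to add to the parameters. As a quick illustration, a toy quadratic objective chosen for this sketch (not from the original post):

# minimal usage sketch: minimize J(w) = w^2 with the Momentum optimizer above
w = np.array([5.0])
opt = Momentum(alpha=0.9, lr=1e-2)
for step in range(100):
    g = 2 * w           # gradient of J(w) = w^2
    w += opt.update(g)  # add the returned parameter increment to w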
Nesterov Momentum
Whereas standard momentum evaluates the gradient at the current parameters, Nesterov Momentum uses the gradient taken at the look-ahead point (the parameters after applying the current velocity) ahead of time for the update:
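A minimal sketch in the same style as the Momentum class, assuming the common formulation v ← α·v − lr·∇J(w + α·v); this is an illustration rather than the original post's code:

import numpy as np

class NesterovMomentum(object):
    def __init__(self, alpha=0.9, lr=1e-3):
        self.alpha = alpha  # momentum coefficient
        self.lr = lr        # learning rate
        self.v = 0          # initial velocity is 0

    def update(self, g: np.ndarray):
        # g is assumed to be the gradient evaluated at the look-ahead
        # point w + alpha * v, computed by the caller
        self.v = self.alpha * self.v - self.lr * g
        return self.v  # parameter increment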
AdaGrad
class AdaGrad(object):
    def __init__(self, eps=1e-8, lr=1e-3):
        self.r = eps  # accumulated squared gradient, r_0 = epsilon
        self.lr = lr

    def update(self, g: np.ndarray):
        self.r = self.r + np.square(g)         # accumulate squared gradients
        return -self.lr * g / np.sqrt(self.r)  # per-parameter scaled step
RMSProp
class RMSProp(object):
    def __init__(self, lr=1e-3, beta=0.999, eps=1e-8):
        self.r = eps  # exponential moving average of squared gradients
        self.lr = lr
        self.beta = beta

    def update(self, g: np.ndarray):
        self.r = self.r * self.beta + (1 - self.beta) * np.square(g)
        return -self.lr * g / np.sqrt(self.r)
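The only change from AdaGrad is that the sum of squared gradients is replaced by the exponential moving average r ← β·r + (1−β)·g², so the influence of old gradients decays instead of accumulating indefinitely.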
AdaDelta
class AdaDelta(object):
    def __init__(self, beta=0.999, eps=1e-8):
        self.r = eps  # moving average of squared gradients
        self.s = eps  # moving average of squared parameter updates
        self.beta = beta

    def update(self, g: np.ndarray):
        g_square = (1 - self.beta) * np.square(g)      # (1 - beta) * g^2
        self.r = self.r * self.beta + g_square
        frac = self.s / self.r
        res = -np.sqrt(frac) * g                       # parameter increment
        self.s = self.s * self.beta + frac * g_square  # reuses frac to save one multiplication
        return res
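The "save one multiplication" remark follows from a small piece of algebra: the increment is Δw = −√(s/r)·g, so (1−β)·Δw² = (s/r)·(1−β)·g² = frac · g_square, both of which have already been computed, and s can be updated without ever forming Δw² explicitly.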
Adam
class Adam(object):
    def __init__(self, lr=1e-3, alpha=0.9, beta=0.999, eps=1e-8):
        self.s = 0        # first moment (momentum) accumulator
        self.r = eps      # second moment accumulator
        self.lr = lr
        self.alpha = alpha
        self.beta = beta
        self.alpha_i = 1  # running alpha^t, used for bias correction
        self.beta_i = 1   # running beta^t, used for bias correction

    def update(self, g: np.ndarray):
        self.s = self.s * self.alpha + (1 - self.alpha) * g
        self.r = self.r * self.beta + (1 - self.beta) * np.square(g)
        self.alpha_i *= self.alpha
        self.beta_i *= self.beta
        lr = -self.lr * (1 - self.beta_i) ** 0.5 / (1 - self.alpha_i)
        return lr * self.s / np.sqrt(self.r)
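The bias corrections are folded into the step size: with s_hat = s / (1 − alpha^t) and r_hat = r / (1 − beta^t), the update −lr · s_hat / √r_hat rearranges to −lr · √(1 − beta^t) / (1 − alpha^t) · s / √r, which is exactly the lr computed in the last two lines above.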
AdaMax
class AdaMax(object):
    def __init__(self, lr=1e-3, alpha=0.9, beta=0.999):
        self.s = 0        # first moment accumulator
        self.r = 0        # running infinity norm of the gradients
        self.lr = lr
        self.alpha = alpha
        self.alpha_i = 1  # running alpha^t for bias correction
        self.beta = beta

    def update(self, g: np.ndarray):
        self.s = self.s * self.alpha + (1 - self.alpha) * g
        self.r = np.maximum(self.r * self.beta, np.abs(g))  # infinity-norm update
        self.alpha_i *= self.alpha
        lr = -self.lr / (1 - self.alpha_i)
        return lr * self.s / self.r
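Compared with Adam, the second moment is replaced by the running infinity norm r ← max(β·r, |g|). That quantity is not biased towards zero, so only the first moment needs a correction, which is why there is no beta_i term and no square root here.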
Nadam
class Nadam(object):
    def __init__(self, lr=1e-3, alpha=0.9, beta=0.999, eps=1e-8):
        self.s = 0        # first moment accumulator
        self.r = eps      # second moment accumulator
        self.lr = lr
        self.alpha = alpha
        self.beta = beta
        self.alpha_i = 1  # running alpha^t for bias correction
        self.beta_i = 1   # running beta^t for bias correction

    def update(self, g: np.ndarray):
        self.s = self.s * self.alpha + (1 - self.alpha) * g
        self.r = self.r * self.beta + (1 - self.beta) * np.square(g)
        self.alpha_i *= self.alpha
        self.beta_i *= self.beta
        lr = -self.lr * (1 - self.beta_i) ** 0.5 / (1 - self.alpha_i)
        return lr * (self.s * self.alpha + (1 - self.alpha) * g) / np.sqrt(self.r)
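The only difference from Adam is the numerator of the final step: instead of s alone it uses α·s + (1−α)·g, i.e. a Nesterov-style look-ahead applied to the first moment, while the bias-corrected step size stays the same.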
NadaMax
class NadaMax(object):
    def __init__(self, lr=1e-3, alpha=0.9, beta=0.999):
        self.s = 0        # first moment accumulator
        self.r = 0        # running infinity norm of the gradients
        self.lr = lr
        self.alpha = alpha
        self.alpha_i = 1  # running alpha^t for bias correction
        self.beta = beta

    def update(self, g: np.ndarray):
        self.s = self.s * self.alpha + (1 - self.alpha) * g
        self.r = np.maximum(self.r * self.beta, np.abs(g))  # infinity-norm update
        self.alpha_i *= self.alpha
        lr = -self.lr / (1 - self.alpha_i)
        return lr * (self.s * self.alpha + (1 - self.alpha) * g) / self.r
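NadaMax simply combines the two previous ideas: the Nadam numerator α·s + (1−α)·g over the AdaMax infinity-norm denominator r ← max(β·r, |g|), with only the first-moment bias correction 1/(1−αᵗ).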