AIGC基础:从VAE到DDPM原理、代码详解
前言
AIGC 是目前非常火热的方向,DALL·E 2、Imagen、Stable Diffusion 生成的图像在以假乱真的同时,又有着脑洞大开的艺术性。以下是用开源的 Stable Diffusion 生成的一些图片。
这些模型背后都使用了 Diffusion Model 技术。如果缺乏相关背景知识就直接学习 Diffusion Model,门槛会比较高;而沿着 AE、VAE、CVAE、DDPM 这一系列生成模型的路线循序渐进地学习,会更容易理解和掌握。本文将从原理、数学推导和代码三个方面详细讲述这些模型。
AE (AutoEncoder)
AE 模型的作用是提取数据的核心特征(Latent Attributes)。如果通过提取出的低维特征可以完美复原原始数据,那么就说明这些特征可以作为原始数据非常优秀的表征。
AE 模型的结构如下图:
训练数据通过 Encoder 得到 Latent,Latent 再通过 Decoder 得到重建数据,通过重建数据和训练的数据差异来构造训练 Loss,代码如下(本文所有的场景都是 mnist,编码器和解码器都用了最基本的卷积网络):
class DownConvLayer(tf.keras.layers.Layer):
    """Downsampling stage: a 3x3 ReLU convolution followed by 2x2 max pooling."""

    def __init__(self, dim):
        """Create the stage.

        Args:
            dim: Number of output filters of the convolution.
        """
        super().__init__()
        self.conv = tf.keras.layers.Conv2D(
            filters=dim,
            kernel_size=3,
            padding='same',
            use_bias=False,
            activation=tf.keras.layers.ReLU(),
        )
        self.pool = tf.keras.layers.MaxPool2D(pool_size=2)

    def call(self, x, training=False, **kwargs):
        """Convolve ``x`` and pool the result.

        ``training`` is accepted for interface compatibility but not used.
        """
        return self.pool(self.conv(x))
class UpConvLayer(tf.keras.layers.Layer):
    """Upsampling stage: a 3x3 ReLU convolution followed by 2x upsampling."""

    def __init__(self, dim):
        """Create the stage.

        Args:
            dim: Number of output filters of the convolution.
        """
        super().__init__()
        self.conv = tf.keras.layers.Conv2D(
            filters=dim,
            kernel_size=3,
            padding='same',
            use_bias=False,
            activation=tf.keras.layers.ReLU(),
        )
        # Upsampling is done with UpSampling2D; the attribute keeps the name
        # ``pool`` to mirror DownConvLayer.
        self.pool = tf.keras.layers.UpSampling2D(size=2)

    def call(self, x, training=False, **kwargs):
        """Convolve ``x`` and upsample the result.

        ``training`` is accepted for interface compatibility but not used.
        """
        return self.pool(self.conv(x))
# The example code builds both the encoder and the decoder from very simple
# convolution operations.
class Encoder(tf.keras.layers.Layer):
    """Stack of ``DownConvLayer`` stages applied one after another."""

    def __init__(self, dim, layer_num=3):
        """Create the encoder.

        Args:
            dim: Number of filters used by every stage.
            layer_num: Number of downsampling stages.
        """
        super().__init__()
        self.convs = [DownConvLayer(dim) for _ in range(layer_num)]

    def call(self, x, training=False, **kwargs):
        """Run ``x`` through every downsampling stage and return the result."""
        features = x
        for stage in self.convs:
            features = stage(features, training)
        return features
class Decoder(tf.keras.layers.Layer):
    """Stack of ``UpConvLayer`` stages followed by a 1-channel output convolution."""

    def __init__(self, dim, layer_num=3):
        """Create the decoder.

        Args:
            dim: Number of filters used by every upsampling stage.
            layer_num: Number of upsampling stages.
        """
        super().__init__()
        self.convs = [UpConvLayer(dim) for _ in range(layer_num)]
        # padding='same' keeps the spatial size unchanged. With the default
        # 'valid' padding the 3x3 kernel would trim one pixel from every
        # border, so the reconstruction could not be subtracted from the
        # input image when computing the loss.
        self.final_conv = tf.keras.layers.Conv2D(1, 3, strides=1, padding='same')

    def call(self, x, training=False, **kwargs):
        """Upsample ``x`` stage by stage and project it to a single channel."""
        for conv in self.convs:
            x = conv(x, training)
        # Map the features to an image with the same shape as the input image.
        reconstruct = self.final_conv(x)
        return reconstruct
class AutoEncoderModel(tf.keras.Model):
    """Plain auto-encoder: encode an image to a latent map and decode it back."""

    def __init__(self):
        super().__init__()
        self.encoder = Encoder(64, layer_num=3)
        self.decoder = Decoder(64, layer_num=3)

    def call(self, inputs, training=None, mask=None):
        """Reconstruct the image stored in ``inputs[0]``.

        Args:
            inputs: Sequence whose first element is the image batch.
            training: Forwarded to the encoder and the decoder.
            mask: Unused.

        Returns:
            The reconstructed image batch.
        """
        source_image = inputs[0]
        # Feature representation of the image.
        latent = self.encoder(source_image, training)
        # Rebuild the image from that representation.
        return self.decoder(latent, training)

    @tf.function
    def train_step(self, data):
        """Run one optimisation step on ``data["image"]``.

        Returns:
            A dict with the scalar reconstruction loss under ``"l2_loss"``.
        """
        img = data["image"]
        with tf.GradientTape() as tape:
            reconstruct_img = self((img,), True)
            # L2 loss measures how closely the reconstruction matches the
            # original: squared error summed per sample, averaged over the batch.
            per_sample_error = tf.reduce_sum(
                (reconstruct_img - img) ** 2, axis=(1, 2, 3)
            )
            l2_loss = tf.reduce_mean(per_sample_error)
        variables = self.trainable_variables
        grads = tape.gradient(l2_loss, variables)
        self.optimizer.apply_gradients(zip(grads, variables))
        return {"l2_loss": l2_loss}
通过 AE 模型可以看到,只要有了数据的有效 Latent Attribute 表示,就可以通过 Decoder 来生成新数据。但是在 AE 模型中,Latent 是由已有数据编码得到的,所以没法生成已有数据之外的新数据。
所以我们设想,是不是可以假设 Latent 符合一定分布规律,只要通过有限参数能够描述这个分布,那么就可以通过这个分布得到不在训练数据中的新 Latent,利用这个新 Latent 就能生成全新数据,基于这个思路,有了 VAE(Variational AutoEncoder 变分自编码器)。
VAE
VAE 中假设 Latent Attributes(公式中用 $z$ 表示)符合正态分布,也就是通过训练数据得到的 $z$ 满足以下条件:
$$z \sim \mathcal{N}(\mu, \sigma^2)$$
因为 $z$ 是向量,所以 $\mu$、$\sigma^2$ 都是向量,分别为正态分布的均值和方差。有了学习得到的正态分布参数 $\mu$、$\sigma^2$,就可以从这个正态分布中采样新的 $z$,新的 $z$ 通过解码器得到新的数据。
class VAEModel(tf.keras.Model):
    """Variational auto-encoder with a diagonal Gaussian latent ``z``."""

    def __init__(self, inference=False):
        """Create the model.

        Args:
            inference: When true, ``z`` is drawn from a standard normal
                distribution and the encoder is skipped.
        """
        super().__init__()
        self.inference = inference
        self.encoder = Encoder(64, layer_num=3)
        self.decoder = Decoder(64, layer_num=3)
        # MNIST images are 28x28; they are resized to 32 to keep sizes aligned.
        self.img_size = 32
        # Dimension of z.
        self.latent_dim = 64
        # Fully connected head that learns the mean of z.
        self.z_mean_mlp = tf.keras.Sequential(
            [
                tf.keras.layers.Dense(self.latent_dim * 2, activation="relu"),
                tf.keras.layers.Dense(self.latent_dim, use_bias=False),
            ]
        )
        # Fully connected head that learns the log-variance log(sigma^2) of z.
        self.z_log_var_mlp = tf.keras.Sequential(
            [
                tf.keras.layers.Dense(self.latent_dim * 2, activation="relu"),
                tf.keras.layers.Dense(self.latent_dim, use_bias=False),
            ]
        )
        # [spatial side, channels] of the feature map fed to the decoder; a
        # dense layer rescales z to side * side * channels values.
        self.decoder_input_size = [int(self.img_size / (2 ** 3)), 64]
        flat_units = (
            self.decoder_input_size[0]
            * self.decoder_input_size[0]
            * self.decoder_input_size[1]
        )
        self.decoder_dense = tf.keras.layers.Dense(flat_units, activation="relu")

    def sample_latent(self, bs, image):
        """Sample ``z`` for a batch.

        Args:
            bs: Batch size.
            image: Image batch fed to the encoder; ignored in inference mode.

        Returns:
            ``(z, z_mean, z_log_var)``; the last two are ``None`` in
            inference mode.
        """
        noise_shape = (bs, self.latent_dim)
        if self.inference:
            # At inference time z is sampled directly from the standard normal
            # distribution, which the trained decoder can already turn into
            # new images.
            return tf.keras.backend.random_normal(shape=noise_shape), None, None

        features = tf.keras.layers.Flatten()(self.encoder(image))
        z_mean = self.z_mean_mlp(features)
        z_log_var = self.z_log_var_mlp(features)
        epsilon = tf.keras.backend.random_normal(shape=noise_shape)
        # Reparameterised sampling (formula 17 in the article):
        #   mu + exp(0.5 * log(sigma^2)) * eps
        # = mu + exp(log(sigma)) * eps
        # = mu + sigma * eps
        z = z_mean + tf.exp(0.5 * z_log_var) * epsilon
        return z, z_mean, z_log_var

    def call(self, inputs, training=None, mask=None):
        """Decode a sampled latent into an image.

        Args:
            inputs: ``(batch_size, image)``; ``image`` is ``None`` when
                generating pictures at inference time.
            training: Forwarded to the decoder.
            mask: Unused.

        Returns:
            ``(reconstruct_img, z_mean, z_log_var)``.
        """
        bs, image = inputs[0], inputs[1]
        z, z_mean, z_log_var = self.sample_latent(bs, image)
        side, channels = self.decoder_input_size
        latent = tf.reshape(self.decoder_dense(z), [-1, side, side, channels])
        # Rebuild the image from z.
        reconstruct_img = self.decoder(latent, training)
        return reconstruct_img, z_mean, z_log_var

    def compute_loss(self, reconstruct_img, z_mean, z_log_var, img):
        """Return the reconstruction, KL and total losses as a dict."""
        # L2 loss measures how closely the reconstruction matches the original.
        squared_error = (reconstruct_img - img) ** 2
        l2_loss = tf.reduce_mean(tf.reduce_sum(squared_error, axis=(1, 2, 3)))
        # KL term (formula 48 in the article).
        kl_per_dim = -0.5 * (
            1 + z_log_var - tf.square(z_mean) - tf.exp(z_log_var)
        )
        kl_loss = tf.reduce_mean(tf.reduce_sum(kl_per_dim, axis=1))
        total_loss = kl_loss + l2_loss
        return {"l2_loss": l2_loss, "total_loss": total_loss, "kl_loss": kl_loss}

    @tf.function
    def forward(self, data, training):
        """Run the model on ``data["img_data"]`` and compute the losses."""
        img = data["img_data"]
        bs = tf.shape(img)[0]
        reconstruct_img, z_mean, z_log_var = self((bs, img), training)
        return self.compute_loss(reconstruct_img, z_mean, z_log_var, img)

    def train_step(self, data):
        """Run one optimisation step on ``total_loss`` and return all losses."""
        with tf.GradientTape() as tape:
            result = self.forward(data, True)
        variables = self.trainable_variables
        grads = tape.gradient(result["total_loss"], variables)
        self.optimizer.apply_gradients(zip(grads, variables))
        return result
CVAE
class CVAEModel(VAEModel):
    """Conditional VAE: the encoder and the decoder both see a label embedding.

    ``train_step`` is inherited from ``VAEModel``; it calls ``self.forward``,
    which is overridden here to pass the label through.
    """

    def __init__(self, inference=False):
        """Create the model.

        Args:
            inference: When true, ``z`` is drawn from a standard normal
                distribution and the encoder is skipped.
        """
        super().__init__(inference=inference)
        # Trainable label embedding table with 10 rows (one per label).
        self.label_dim = 128
        self.label_embedding = tf.Variable(
            initial_value=tf.keras.initializers.HeNormal()(shape=[10, self.label_dim]),
            trainable=True,
        )
        # Projects the label embedding to one image-sized plane for the encoder.
        self.encoder_y_dense = tf.keras.layers.Dense(
            self.img_size * self.img_size, activation="relu"
        )
        # Projects the label embedding to a decoder-sized feature map.
        self.decoder_y_dense = tf.keras.layers.Dense(
            self.decoder_input_size[0] * self.decoder_input_size[0] * self.decoder_input_size[1],
            activation="relu",
        )

    def call(self, inputs, training=None, mask=None):
        """Decode a label-conditioned latent into an image.

        Args:
            inputs: ``(batch_size, image, label)``; ``image`` is ``None`` when
                generating pictures at inference time.
            training: Forwarded to the decoder.
            mask: Unused.

        Returns:
            ``(reconstruct_img, z_mean, z_log_var)``.
        """
        bs, image, label = inputs[0], inputs[1], inputs[2]
        label_emb = tf.nn.embedding_lookup(self.label_embedding, label)
        label_emb = tf.reshape(label_emb, [-1, self.label_dim])
        if not self.inference:
            # During training the label embedding is attached to the image as
            # an extra channel of the encoder input.
            encoder_y = self.encoder_y_dense(label_emb)
            encoder_y = tf.reshape(encoder_y, [-1, self.img_size, self.img_size, 1])
            image = tf.concat([encoder_y, image], axis=-1)
        z, z_mean, z_log_var = self.sample_latent(bs, image)

        side = self.decoder_input_size[0]
        channels = self.decoder_input_size[1]
        # Reshape both tensors to feature maps BEFORE concatenating so the
        # label features become extra channels at every spatial position.
        # Concatenating the flat vectors first and reshaping afterwards would
        # fill the first rows of the map with z only and the remaining rows
        # with the label only, because tf.reshape is row-major.
        z_map = tf.reshape(self.decoder_dense(z), [-1, side, side, channels])
        y_map = tf.reshape(self.decoder_y_dense(label_emb), [-1, side, side, channels])
        latent = tf.concat([z_map, y_map], axis=-1)
        # Rebuild the image from the conditioned features.
        reconstruct_img = self.decoder(latent, training)
        return reconstruct_img, z_mean, z_log_var

    @tf.function
    def forward(self, data, training):
        """Run the model on ``data["img_data"]`` / ``data["label"]`` and compute the losses."""
        img = data["img_data"]
        label = data["label"]
        bs = tf.shape(img)[0]
        reconstruct_img, z_mean, z_log_var = self((bs, img, label), training)
        return self.compute_loss(reconstruct_img, z_mean, z_log_var, img)
生成 0~9 的图片效果如下:
所以训练过程如下:
class ConvResidualLayer(tf.keras.layers.Layer):
    """Residual block: 1x1 projection, then group-norm / swish / 3x3 conv / group-norm / swish."""

    def __init__(self, filter_num):
        """Create the block.

        Args:
            filter_num: Number of output filters of both convolutions.
        """
        super().__init__()
        self.conv1 = tf.keras.layers.Conv2D(filter_num, kernel_size=1, padding='same')
        # Requires: import tensorflow_addons as tfa
        self.gn1 = tfa.layers.GroupNormalization(8)
        self.conv2 = tf.keras.layers.Conv2D(filter_num, kernel_size=3, padding='same')
        self.gn2 = tfa.layers.GroupNormalization(8)
        # NOTE(review): stored but never used by call(), which uses tf.nn.swish.
        self.act2 = tf.keras.activations.swish

    def call(self, inputs, training=False, *args, **kwargs):
        """Apply the block to ``inputs``; ``training`` is accepted but unused."""
        shortcut = self.conv1(inputs)
        hidden = tf.nn.swish(self.gn1(shortcut))
        hidden = tf.nn.swish(self.gn2(self.conv2(hidden)))
        # NOTE(review): 1.44 is presumably a rescaling factor for the sum of
        # the two branches - confirm with the author.
        return (hidden + shortcut) / 1.44
class SimpleDDPMModel(tf.keras.Model):
    """Minimal DDPM: a small U-Net trained to predict the noise added to an image."""

    def __init__(self, max_time_step=100):
        """Create the noise schedule and the U-Net layers.

        Args:
            max_time_step: Number of diffusion steps.
        """
        super().__init__()
        # Parameters of the DDPM forward process.
        self.max_time_step = max_time_step
        # Build the schedule in numpy float64 so the cumulative product does
        # not lose precision.
        betas = np.linspace(1e-4, 0.02, max_time_step, dtype=np.float64)
        alphas = 1.0 - betas
        alphas_bar = np.cumprod(alphas, axis=0)
        betas_bar = 1.0 - alphas_bar
        self.betas, self.alphas, self.alphas_bar, self.betas_bar = tuple(
            tf.constant(values, tf.float32)
            for values in (betas, alphas, alphas_bar, betas_bar)
        )
        # Stored on the instance because subclasses size their own per-stage
        # layers from it.
        self.filter_nums = (64, 128, 256)
        filter_nums = self.filter_nums
        self.encoders = [tf.keras.Sequential([
            ConvResidualLayer(num),
            tf.keras.layers.MaxPool2D(2)
        ]) for num in filter_nums]
        self.mid_conv = ConvResidualLayer(filter_nums[-1])
        self.decoders = [tf.keras.Sequential([
            tf.keras.layers.Conv2DTranspose(num, 3, strides=2, padding="same"),
            ConvResidualLayer(num),
            ConvResidualLayer(num),
        ]) for num in reversed(filter_nums)]
        self.final_conv = tf.keras.Sequential(
            [
                ConvResidualLayer(64),
                tf.keras.layers.Conv2D(1, 3, padding="same")
            ]
        )
        self.img_size = 32
        # One small MLP per encoder stage that embeds the time step.
        self.time_embeddings = [
            tf.keras.Sequential(
                [
                    tf.keras.layers.Dense(num, activation=tf.keras.layers.LeakyReLU()),
                    tf.keras.layers.Dense(num)
                ]
            )
            for num in filter_nums]

    def q_noisy_sample(self, x_0, t, noisy):
        """Produce the noisy image at step ``t`` from clean data (formula 64 in the article).

        Returns ``sqrt(alpha_bar_t) * x_0 + sqrt(beta_bar_t) * noisy``.
        """
        alpha_bar, beta_bar = self.extract([self.alphas_bar, self.betas_bar], t)
        sqrt_alpha_bar, sqrt_beta_bar = tf.sqrt(alpha_bar), tf.sqrt(beta_bar)
        return sqrt_alpha_bar * x_0 + sqrt_beta_bar * noisy

    def extract(self, sources, t):
        """Gather the entries of each schedule in ``sources`` at indices ``t``.

        Returns:
            A tuple with one tensor per source, each reshaped to
            ``[batch, 1, 1, 1]`` so it broadcasts against an image batch.
        """
        bs = tf.shape(t)[0]
        return tuple(
            tf.reshape(tf.gather(source, t), [bs, 1, 1, 1]) for source in sources
        )

    def p_real_sample(self, x_t, t, pred_noisy):
        """Recover the previous-step sample from ``x_t`` (formula 131 in the article).

        Args:
            x_t: Current noisy image batch.
            t: Time-step indices of the batch.
            pred_noisy: Noise predicted by the network for ``x_t``.
        """
        alpha, beta, beta_bar = self.extract([self.alphas, self.betas, self.betas_bar], t)
        noisy = tf.random.normal(shape=tf.shape(x_t))
        # The noise weight follows beta, so the injected noise also shrinks as
        # t approaches 0.
        noisy_weight = tf.sqrt(beta)
        # No random noise is added when t == 0.
        bs = tf.shape(x_t)[0]
        noisy_mask = tf.reshape(
            1 - tf.cast(tf.equal(t, 0), tf.float32), [bs, 1, 1, 1]
        )
        noisy_weight *= noisy_mask
        x_t_1 = (x_t - beta * pred_noisy / tf.sqrt(beta_bar)) / tf.sqrt(alpha) + noisy * noisy_weight
        return x_t_1

    def encoder(self, noisy_img, t, data, training):
        """U-Net downsampling path.

        Args:
            noisy_img: Noisy image batch.
            t: Time-step indices; cast to float before being embedded.
            data: Unused here; kept so subclasses can read extra inputs.
            training: Unused here.

        Returns:
            List with the feature map produced by every stage.
        """
        xs = []
        # The cast does not depend on the stage, so do it once.
        t = tf.cast(t, tf.float32)
        for conv, embed_time in zip(self.encoders, self.time_embeddings):
            noisy_img = conv(noisy_img)
            time_embedding = embed_time(t)
            time_embedding = tf.reshape(time_embedding, [-1, 1, 1, tf.shape(time_embedding)[-1]])
            # The time embedding is simply added to the features.
            noisy_img += time_embedding
            xs.append(noisy_img)
        return xs

    def decoder(self, noisy_img, xs, training):
        """U-Net upsampling path.

        ``xs`` holds the encoder feature maps in encoder order. It is walked
        back to front without being modified, so the caller's list stays
        intact.
        """
        for conv, skip in zip(self.decoders, reversed(xs)):
            noisy_img = conv(tf.concat([skip, noisy_img], axis=-1))
        return noisy_img

    @tf.function
    def pred_noisy(self, data, training):
        """Predict the noise contained in an image batch.

        Args:
            data: Dict with ``"img_data"`` and optionally ``"t"``. Without
                ``"t"`` a random step is drawn per sample and ``"img_data"``
                is noised accordingly. With ``"t"``, ``"img_data"`` is treated
                as already noisy.
            training: Forwarded to the encoder and decoder paths.

        Returns:
            Dict with ``"pred_noisy"``, the sampled ``"noisy"`` and the scalar
            ``"loss"`` between the two.
        """
        img = data["img_data"]
        bs = tf.shape(img)[0]
        noisy = tf.random.normal(shape=tf.shape(img))
        t = data.get("t")
        # During training t is absent, so draw it at random.
        if t is None:
            t = tf.random.uniform(shape=[bs, 1], minval=0, maxval=self.max_time_step, dtype=tf.int32)
            noisy_img = self.q_noisy_sample(img, t, noisy)
        else:
            noisy_img = img
        xs = self.encoder(noisy_img, t, data, training)
        x = self.mid_conv(xs[-1])
        x = self.decoder(x, xs, training)
        pred_noisy = self.final_conv(x)
        return {
            "pred_noisy": pred_noisy, "noisy": noisy,
            "loss": tf.reduce_mean(tf.reduce_sum((pred_noisy - noisy) ** 2, axis=(1, 2, 3)), axis=-1)
        }

    def call(self, inputs, training=None, mask=None):
        """Generate ``inputs[0]`` images by running the reverse process from pure noise."""
        bs = inputs[0]
        x_t = tf.random.normal(shape=[bs, self.img_size, self.img_size, 1])
        for i in reversed(range(0, self.max_time_step)):
            t = tf.reshape(tf.repeat(i, bs), [bs, 1])
            p = self.pred_noisy({"img_data": x_t, "t": t}, False)
            x_t = self.p_real_sample(x_t, t, p["pred_noisy"])
        return x_t

    def train_step(self, data):
        """Run one optimisation step on the noise-prediction loss."""
        with tf.GradientTape() as tape:
            result = self.pred_noisy(data, True)
        trainable_vars = self.trainable_variables
        gradients = tape.gradient(result["loss"], trainable_vars)
        self.optimizer.apply_gradients(zip(gradients, trainable_vars))
        return {"loss": result["loss"]}

    def test_step(self, data):
        """Evaluate the noise-prediction loss without updating any weights."""
        result = self.pred_noisy(data, False)
        return {"loss": result["loss"]}
class SimpleCDDPMModel(SimpleDDPMModel):
    """Label-conditioned DDPM: a label embedding is added to the features of every encoder stage."""

    def __init__(self, max_time_step=100, label_num=10):
        """Create the model.

        Args:
            max_time_step: Number of diffusion steps.
            label_num: Number of distinct labels.
        """
        super().__init__(max_time_step=max_time_step)
        # The condition embeddings use the same widths as the time-step
        # embeddings. Fall back to the base-class stage widths when the base
        # class does not expose ``filter_nums`` as an attribute, instead of
        # failing with AttributeError.
        filter_nums = getattr(self, "filter_nums", (64, 128, 256))
        self.condition_embedding = [
            tf.keras.Sequential(
                [
                    tf.keras.layers.Embedding(label_num, num),
                    tf.keras.layers.Dense(num)
                ]
            )
            for num in filter_nums]

    def encoder(self, noisy_img, t, data, training):
        """U-Net downsampling path with time and label conditioning.

        Args:
            noisy_img: Noisy image batch.
            t: Time-step indices; cast to float before being embedded.
            data: Dict that must contain ``"label"``.
            training: When true, the label condition is zeroed for the whole
                batch with probability 0.15; otherwise it is always applied.

        Returns:
            List with the feature map produced by every stage.
        """
        xs = []
        # One draw per call: the same decision applies to every stage.
        mask = tf.random.uniform(shape=(), minval=0.0, maxval=1.0, dtype=tf.float32)
        drop_condition = mask < 0.15
        # The cast does not depend on the stage, so do it once.
        t = tf.cast(t, tf.float32)
        for idx, conv in enumerate(self.encoders):
            noisy_img = conv(noisy_img)
            time_embedding = self.time_embeddings[idx](t)
            time_embedding = tf.reshape(time_embedding, [-1, 1, 1, tf.shape(time_embedding)[-1]])
            # The time embedding is simply added to the features.
            noisy_img += time_embedding
            # Embedding of the condition.
            condition_embedding = self.condition_embedding[idx](data["label"])
            condition_embedding = tf.reshape(condition_embedding, [-1, 1, 1, tf.shape(condition_embedding)[-1]])
            # During training the condition is dropped with some probability;
            # at inference it is always added. tf.where keeps the choice a
            # tensor op, so it does not rely on autograph converting a Python
            # ``if`` on a tensor.
            if training:
                condition_embedding = tf.where(
                    drop_condition,
                    tf.zeros_like(condition_embedding),
                    condition_embedding,
                )
            noisy_img += condition_embedding
            xs.append(noisy_img)
        return xs

    def call(self, inputs, training=None, mask=None):
        """Generate ``inputs[0]`` images of label ``inputs[1]`` from pure noise."""
        bs = inputs[0]
        label = tf.reshape(tf.repeat(inputs[1], bs), [-1, 1])
        x_t = tf.random.normal(shape=[bs, self.img_size, self.img_size, 1])
        for i in reversed(range(0, self.max_time_step)):
            t = tf.reshape(tf.repeat(i, bs), [bs, 1])
            p = self.pred_noisy({"img_data": x_t, "t": t, "label": label}, False)
            x_t = self.p_real_sample(x_t, t, p["pred_noisy"])
        return x_t
参考文献
[1] https://www.jarvis73.com/2022/08/08/Diffusion-Model-1/
[2] https://blog.csdn.net/qihangran5467/article/details/118337892
[3] https://jaketae.github.io/study/vae/
[4] https://pyro.ai/examples/cvae.html
[5] https://lilianweng.github.io/posts/2021-07-11-diffusion-models/
[6] https://spaces.ac.cn/archives/9164
[7] https://zhuanlan.zhihu.com/p/575984592
[8] https://kxz18.github.io/2022/06/19/Diffusion/
[9] https://zhuanlan.zhihu.com/p/502668154
[10] https://xyfjason.top/2022/09/29/%E4%BB%8EVAE%E5%88%B0DDPM/
[11] https://arxiv.org/pdf/2208.11970.pdf
更多阅读
#投 稿 通 道#
让你的文字被更多人看到
如何才能让更多的优质内容以更短路径到达读者群体,缩短读者寻找优质内容的成本呢?答案就是:你不认识的人。
总有一些你不认识的人,知道你想知道的东西。PaperWeekly 或许可以成为一座桥梁,促使不同背景、不同方向的学者和学术灵感相互碰撞,迸发出更多的可能性。
PaperWeekly 鼓励高校实验室或个人,在我们的平台上分享各类优质内容,可以是最新论文解读,也可以是学术热点剖析、科研心得或竞赛经验讲解等。我们的目的只有一个,让知识真正流动起来。
📝 稿件基本要求:
• 文章确系个人原创作品,未曾在公开渠道发表,如为其他平台已发表或待发表的文章,请明确标注
• 稿件建议以 markdown 格式撰写,文中配图以附件形式发送,要求图片清晰,无版权问题
• PaperWeekly 尊重原作者署名权,并将为每篇被采纳的原创首发稿件,提供业内具有竞争力稿酬,具体依据文章阅读量和文章质量阶梯制结算
📬 投稿通道:
• 投稿邮箱:hr@paperweekly.site
• 来稿请备注即时联系方式(微信),以便我们在稿件选用的第一时间联系作者
• 您也可以直接添加小编微信(pwbot02)快速投稿,备注:姓名-投稿
△长按添加PaperWeekly小编
🔍
现在,在「知乎」也能找到我们了
进入知乎首页搜索「PaperWeekly」
点击「关注」订阅我们的专栏吧