【他山之石】A Usage Guide to Ray and PyTorch Lightning
"Stones from other hills may serve to polish jade": only by standing on the shoulders of giants can we see higher and go further, and on the road of research we need every favorable wind to move ahead faster. To that end, we have collected practical code links, datasets, software, programming tips, and more in this "他山之石" column, to help you ride the waves and press forward. Stay tuned.
Source: https://www.zhihu.com/people/chea-sim
01
A typical deep learning workflow has three stages:

1. Data preprocessing: cleaning the data, data augmentation, and so on. For NLP tasks this means tokenizing and turning the text input into a vector format the computer can understand.
2. Model selection: choosing a suitable model. With the PyTorch framework, this means subclassing nn.Module.
3. Model training: applying all the fancy training machinery, such as distributed training, half precision, and cycling through optimizers. Papers rarely cover this part, and it is what we usually call the dirty work: training models and searching hyperparameters bring little novelty, yet demand a great deal of effort writing repetitive code.
02
2.1 Set up parameters, train, and report results
The TuneReportCallback is the bridge between PyTorch Lightning and Tune: the dict keys are the metric names reported to Tune, the values are the names of metrics logged inside the LightningModule, and on="validation_end" reports them after each validation epoch.

from ray.tune.integration.pytorch_lightning import TuneReportCallback

callback = TuneReportCallback(
    {
        "loss": "val_loss",
        "mean_accuracy": "val_accuracy"
    },
    on="validation_end")
2.2 Modify argparse and the training function
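A minimal sketch of this step: Tune calls the training function with a single config dict of sampled hyperparameters, so any values previously read from argparse move into config. The function name train_with_tune is illustrative, and it assumes the LightningMNISTClassifier and the callback from 2.1 (both defined around here in full).

def train_with_tune(config, num_epochs=10):
    # `config` carries the hyperparameters sampled by Tune; the
    # TuneReportCallback from 2.1 reports validation metrics back.
    model = LightningMNISTClassifier(config)
    trainer = pl.Trainer(
        max_epochs=num_epochs,
        progress_bar_refresh_rate=0,
        callbacks=[callback])
    trainer.fit(model)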
2.3 Pass everything into tune.run
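A minimal sketch of this step, reusing the illustrative train_with_tune above: tune.with_parameters binds the non-tunable arguments, and tune.run drives the search over a config search space (the full version appears in the example below).

from ray import tune

analysis = tune.run(
    tune.with_parameters(train_with_tune, num_epochs=10),
    config=config,      # the search space, defined in the full example
    num_samples=10)     # number of sampled configurations to try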
A full example
import os

import torch
import torch.nn.functional as F
import pytorch_lightning as pl
from pytorch_lightning.loggers import TensorBoardLogger
from torch.utils.data import DataLoader, random_split
from torchvision import transforms
from torchvision.datasets import MNIST
from ray import tune
from ray.tune import CLIReporter
from ray.tune.schedulers import ASHAScheduler
from ray.tune.integration.pytorch_lightning import TuneReportCallback


class LightningMNISTClassifier(pl.LightningModule):
    def __init__(self, config, data_dir=None):
        super(LightningMNISTClassifier, self).__init__()
        self.data_dir = data_dir or os.getcwd()
        self.layer_1_size = config["layer_1_size"]
        self.layer_2_size = config["layer_2_size"]
        self.lr = config["lr"]
        self.batch_size = config["batch_size"]
        # mnist images are (1, 28, 28) (channels, width, height)
        self.layer_1 = torch.nn.Linear(28 * 28, self.layer_1_size)
        self.layer_2 = torch.nn.Linear(self.layer_1_size, self.layer_2_size)
        self.layer_3 = torch.nn.Linear(self.layer_2_size, 10)

    def forward(self, x):
        batch_size, channels, width, height = x.size()
        x = x.view(batch_size, -1)
        x = self.layer_1(x)
        x = torch.relu(x)
        x = self.layer_2(x)
        x = torch.relu(x)
        x = self.layer_3(x)
        x = torch.log_softmax(x, dim=1)
        return x

    def cross_entropy_loss(self, logits, labels):
        # forward() returns log-probabilities, so NLL loss here is
        # equivalent to cross entropy.
        return F.nll_loss(logits, labels)

    def accuracy(self, logits, labels):
        _, predicted = torch.max(logits.data, 1)
        correct = (predicted == labels).sum().item()
        accuracy = correct / len(labels)
        return torch.tensor(accuracy)

    def training_step(self, train_batch, batch_idx):
        x, y = train_batch
        logits = self.forward(x)
        loss = self.cross_entropy_loss(logits, y)
        accuracy = self.accuracy(logits, y)
        self.log("ptl/train_loss", loss)
        self.log("ptl/train_accuracy", accuracy)
        return loss

    def validation_step(self, val_batch, batch_idx):
        x, y = val_batch
        logits = self.forward(x)
        loss = self.cross_entropy_loss(logits, y)
        accuracy = self.accuracy(logits, y)
        return {"val_loss": loss, "val_accuracy": accuracy}

    def validation_epoch_end(self, outputs):
        # Average per-batch metrics and log them under the names that
        # the TuneReportCallback below watches.
        avg_loss = torch.stack([x["val_loss"] for x in outputs]).mean()
        avg_acc = torch.stack([x["val_accuracy"] for x in outputs]).mean()
        self.log("ptl/val_loss", avg_loss)
        self.log("ptl/val_accuracy", avg_acc)

    @staticmethod
    def download_data(data_dir):
        transform = transforms.Compose([
            transforms.ToTensor(),
            transforms.Normalize((0.1307, ), (0.3081, ))
        ])
        return MNIST(data_dir, train=True, download=True, transform=transform)

    def prepare_data(self):
        mnist_train = self.download_data(self.data_dir)
        self.mnist_train, self.mnist_val = random_split(
            mnist_train, [55000, 5000])

    def train_dataloader(self):
        return DataLoader(self.mnist_train, batch_size=int(self.batch_size))

    def val_dataloader(self):
        return DataLoader(self.mnist_val, batch_size=int(self.batch_size))

    def configure_optimizers(self):
        optimizer = torch.optim.Adam(self.parameters(), lr=self.lr)
        return optimizer
def train_mnist(config):
    # Plain training without Tune, for comparison.
    model = LightningMNISTClassifier(config)
    trainer = pl.Trainer(max_epochs=10, progress_bar_refresh_rate=0)
    trainer.fit(model)
def train_mnist_tune(config, data_dir=None, num_epochs=10, num_gpus=0):
    model = LightningMNISTClassifier(config, data_dir)
    trainer = pl.Trainer(
        max_epochs=num_epochs,
        gpus=num_gpus,
        # Log each trial into its own Tune trial directory.
        logger=TensorBoardLogger(
            save_dir=tune.get_trial_dir(), name="", version="."),
        progress_bar_refresh_rate=0,
        callbacks=[
            TuneReportCallback(
                {
                    "loss": "ptl/val_loss",
                    "mean_accuracy": "ptl/val_accuracy"
                },
                on="validation_end")
        ])
    trainer.fit(model)
The metric names passed to the callback must match whatever the LightningModule logs; if it logged the averaged metrics as avg_val_loss and avg_val_accuracy, the mapping would instead be:

from ray.tune.integration.pytorch_lightning import TuneReportCallback

callback = TuneReportCallback({
    "loss": "avg_val_loss",
    "mean_accuracy": "avg_val_accuracy"
}, on="validation_end")
Finally, define the search space and launch the search:

# Example values (assumed) so the snippet runs standalone.
num_samples = 10        # number of hyperparameter configurations to try
num_epochs = 10
gpus_per_trial = 0
data_dir = os.path.join(os.getcwd(), "data")

# Search space: Tune samples one value per trial from each distribution.
config = {
    "layer_1_size": tune.choice([32, 64, 128]),
    "layer_2_size": tune.choice([64, 128, 256]),
    "lr": tune.loguniform(1e-4, 1e-1),
    "batch_size": tune.choice([32, 64, 128]),
}

# ASHA stops underperforming trials early.
scheduler = ASHAScheduler(
    max_t=num_epochs,
    grace_period=1,
    reduction_factor=2)

reporter = CLIReporter(
    metric_columns=["loss", "mean_accuracy", "training_iteration"])

analysis = tune.run(
    tune.with_parameters(
        train_mnist_tune,
        data_dir=data_dir,
        num_epochs=num_epochs,
        num_gpus=gpus_per_trial),
    resources_per_trial={
        "cpu": 1,
        "gpu": gpus_per_trial
    },
    metric="loss",
    mode="min",
    config=config,
    num_samples=num_samples,
    scheduler=scheduler,
    progress_reporter=reporter,
    name="tune_mnist_asha")
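Because metric="loss" and mode="min" were passed to tune.run, the returned ExperimentAnalysis object can report the best trial directly (Ray 1.x API, matching the code above):

print("Best hyperparameters found were:", analysis.best_config)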
03
3.1 argparse setup
default_worker.py: error: unrecognized arguments
This is likely because argparse is not called as args, _ = parser.parse_known_args() (note that parse_known_args returns a tuple of the parsed namespace and the leftover arguments), or because the parse call sits inside a function; it has to be placed outside every training function.
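A minimal sketch (the flag name is illustrative): parse at module level and use parse_known_args, which silently ignores the extra command-line arguments that Ray's worker processes inject, instead of raising the error above.

import argparse

# Parse at module level, outside every training function.
parser = argparse.ArgumentParser()
parser.add_argument("--num_epochs", type=int, default=10)  # illustrative flag
args, unknown = parser.parse_known_args()  # ignore Ray workers' extra args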
3.2 Too few trials launched
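A likely cause, sketched under the assumption that this refers to concurrent trials: Ray starts as many trials in parallel as fit into the available resources, so the resources_per_trial request caps how many launch at once.

# With 8 CPUs available and 2 CPUs requested per trial, at most
# 8 / 2 = 4 trials run at the same time; lower the request (or add
# machines to the Ray cluster) to launch more trials concurrently.
resources_per_trial = {"cpu": 2, "gpu": 0}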