Autograd Explained | OneFlow Study Notes
Start from a minimal eager-mode example: backpropagating z = (x + 100).sum() leaves a gradient of all ones on x.

import oneflow as of

x = of.randn(2, 2, requires_grad=True)
y = x + 100
z = y.sum()
z.backward()
print(x.grad)
# tensor([[1., 1.],
#         [1., 1.]], dtype=oneflow.float32)
On the Python side, Tensor.backward() is implemented by _backward: in eager mode it simply forwards to flow.autograd.backward, while lazy (nn.Graph) mode takes a separate path.

def _backward(self, gradient=None, retain_graph=False, create_graph=False):
    if not lazy_mode.is_enabled():
        flow.autograd.backward(self, gradient, retain_graph, create_graph)
    else:
        ...
flow.autograd.backward and flow.autograd.grad are pybind11 bindings onto the C++ functions Backward and Grad:

ONEFLOW_API_PYBIND11_MODULE("autograd", m) {
  m.def("backward", &Backward);
  m.def("grad", &Grad);
}
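For orientation, both bindings are reachable from Python; a minimal sketch, assuming the usual oneflow.autograd.backward / oneflow.autograd.grad signatures (which mirror PyTorch's):

import oneflow as flow

x = flow.randn(2, 2, requires_grad=True)
z = (x + 100).sum()

# backward(): accumulate dz/dx into x.grad (what z.backward() does under the hood).
flow.autograd.backward(z, flow.ones_like(z))
print(x.grad)                        # all ones

# grad(): return the gradients instead of accumulating them into .grad.
w = flow.randn(2, 2, requires_grad=True)
loss = (w * 3).sum()
dw = flow.autograd.grad(loss, [w])[0]
print(dw)                            # all threes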
Backward forces retain_graph when create_graph is set, checks and initializes the output gradients, then hands them to the thread-local autograd engine:

Maybe<one::TensorTuple> Backward(const one::TensorTuple& outputs, const one::TensorTuple& out_grads,
                                 bool retain_graph, bool create_graph) {
  if (create_graph) { retain_graph = true; }
  std::shared_ptr<one::TensorTuple> gradients = JUST(CheckAndInitOutGrads(outputs, out_grads));
  JUST(one::GetThreadLocalAutogradEngine()->RunBackwardAndSaveGrads4LeafTensorIf(
      outputs, *gradients, retain_graph, create_graph));
  return std::make_shared<one::TensorTuple>(0);
}
The engine is a thread-local GraphAutogradEngine:

AutogradEngine* GetThreadLocalAutogradEngine() {
  thread_local static GraphAutogradEngine autograd_engine;
  return &autograd_engine;
}
RunBackwardAndSaveGrads4LeafTensor pushes each output gradient into the corresponding output tensor's current_grad, then builds a GraphTask, computes node dependencies, and runs the backward pass:

Maybe<void> GraphAutogradEngine::RunBackwardAndSaveGrads4LeafTensor(const TensorTuple& outputs,
                                                                    const TensorTuple& out_grads,
                                                                    bool retain_graph,
                                                                    bool create_graph) {
  for (int i = 0; i < outputs.size(); ++i) {
    JUST(JUST(outputs.at(i)->current_grad())->PushPartialTensor(out_grads.at(i)));
  }
  GraphTask graph_task(outputs, retain_graph, create_graph);
  JUST(graph_task.ComputeDependencies());
  JUST(graph_task.Apply(/*save_grad_for_leaf=*/true));
  return Maybe<void>::Ok();
}
Each tensor carries an AutogradMeta; the fields that matter here are:

is_leaf_: whether the tensor is a leaf node
requires_grad_: whether a gradient needs to be computed for this tensor
retain_grad_: for non-leaf tensors, whether to keep the gradient after backward
acc_grad_: the accumulated gradient, e.g. summed over several mini-batches under gradient accumulation
current_grad_: the gradient of the current batch
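These flags map directly onto the public tensor API; a minimal sketch using is_leaf, retain_grad() and grad:

import oneflow as flow

x = flow.randn(2, 2, requires_grad=True)   # leaf tensor created by the user
y = x * 2                                  # non-leaf tensor produced by an op
y.retain_grad()                            # ask autograd to keep y's gradient (retain_grad_)
y.sum().backward()

print(x.is_leaf, y.is_leaf)   # True False
print(x.grad)                 # the leaf's gradient, stored in acc_grad_
print(y.grad)                 # kept only because retain_grad() was called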
Each op application creates a GraphFunctionNode. Its constructor wires the backward graph: inputs that require grad contribute their autograd meta, and their grad_fn_node becomes an edge in next_functions_; every output gets fresh autograd meta plus a TensorInfo snapshot:

GraphFunctionNode::GraphFunctionNode(const std::string& name,
                                     const std::shared_ptr<BackwardFunction>& backward_fn,
                                     const TensorTuple& inputs, const TensorTuple& outputs)
    : FunctionNode(name, backward_fn) {
  input_meta_data_.resize(inputs.size());
  next_functions_.reserve(inputs.size());
  for (int i = 0; i < inputs.size(); ++i) {
    if (inputs.at(i)->requires_grad()) {
      // Inputs that require grad become backward edges of this node.
      input_meta_data_.at(i) = inputs.at(i)->mut_autograd_meta();
      next_functions_.emplace_back(inputs.at(i)->mut_grad_fn_node());
    }
  }
  output_meta_data_.resize(outputs.size());
  output_tensor_infos_.reserve(outputs.size());
  for (int i = 0; i < outputs.size(); ++i) {
    // Every output gets fresh autograd meta and a TensorInfo snapshot for backward.
    const auto& autograd_meta =
        NewAutogradMeta(outputs.at(i)->requires_grad(), outputs.at(i)->is_leaf());
    outputs.at(i)->set_autograd_meta(autograd_meta);
    output_meta_data_.at(i) = outputs.at(i)->mut_autograd_meta();
    output_tensor_infos_.emplace_back(TensorInfo(*outputs.at(i)));
  }
  backward_fn_ = backward_fn;
}
Nodes are added while the forward op runs. In AutogradInterpreter::Apply, the op itself executes with autograd disabled; if grad is required and we are in eager mode, the op's grad closure is wrapped into a BackwardFunction and registered with the engine via AddNode:

Maybe<void> AutogradInterpreter::Apply(const OpExpr& op_expr, const TensorTuple& inputs,
                                       TensorTuple* outputs, const OpExprInterpContext& ctx) const {
  ...
  autograd::AutoGradMode mode(false);  // run the forward op with autograd off
  JUST(internal_->Apply(op_expr, inputs, outputs, ctx));
  std::shared_ptr<OpExprGradClosure> grad_closure(nullptr);
  if (requires_grad && !LazyMode::is_enabled()) {
    grad_closure = JUST(op_expr.GetOrCreateOpGradClosure());
    auto backward_fn = std::make_shared<BackwardFunction>();
    backward_fn->body = [=](const TensorTuple& out_grads, TensorTuple* in_grads,
                            bool create_graph) -> Maybe<void> {
      autograd::AutoGradMode mode(create_graph);
      JUST(grad_closure->Apply(out_grads, in_grads));
      return Maybe<void>::Ok();
    };
    backward_fn->status = [=]() { return grad_closure->state()->SavedTensors().size() > 0; };
    JUST(GetThreadLocalAutogradEngine()->AddNode(op_expr.op_type_name() + "_backward", backward_fn,
                                                 inputs, outputs));
  }
  ...
  return Maybe<void>::Ok();
}
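The requires_grad && !LazyMode::is_enabled() guard is what flow.no_grad() and graph mode switch off. A minimal sketch of the visible effect, assuming the standard flow.no_grad context manager:

import oneflow as flow

a = flow.randn(2, 2, requires_grad=True)

with flow.no_grad():       # autograd off: no backward node is registered
    b = a * 2
print(b.requires_grad)     # False: b cannot be backpropagated through

c = a * 2                  # grad mode on: a FunctionNode is added for this op
print(c.requires_grad)     # True
c.sum().backward()
print(a.grad)              # all twos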
AddNode first makes sure every leaf input that requires grad has an accumulate node (so its gradient has somewhere to go when backward reaches it), then creates the GraphFunctionNode and installs it as grad_fn_node on every output:

Maybe<FunctionNode> GraphAutogradEngine::AddNode(
    const std::string& name, const std::shared_ptr<BackwardFunction>& backward_fn,
    const TensorTuple& inputs, TensorTuple* outputs) {
  // Firstly push function_node of tensor in stack which is leaf and requires_grad
  for (const std::shared_ptr<Tensor>& in_tensor : inputs) {
    if (in_tensor->is_leaf() && in_tensor->requires_grad()) {
      if (!in_tensor->grad_fn_node()) { JUST(AddAccumulateFunctionNode(in_tensor)); }
    }
  }
  std::shared_ptr<FunctionNode> func_node =
      std::make_shared<GraphFunctionNode>(name, backward_fn, inputs, *outputs);
  for (const std::shared_ptr<Tensor>& out_tensor : *outputs) {
    out_tensor->set_grad_fn_node(func_node);
  }
  return func_node;
}
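The effect of set_grad_fn_node can be observed from Python; a minimal sketch, assuming Tensor.grad_fn is exposed as in PyTorch:

import oneflow as flow

x = flow.randn(2, 2, requires_grad=True)
y = x + 100        # AddNode installs the add op's GraphFunctionNode as y's grad_fn
z = y.sum()

print(y.grad_fn)   # backward node created for the add op
print(z.grad_fn)   # backward node created for the sum op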
Back to the backward pass itself: the two GraphTask calls seen earlier do the heavy lifting.

...
JUST(graph_task.ComputeDependencies());
JUST(graph_task.Apply(/*save_grad_for_leaf=*/true));
...
GraphTask holds the state of a single backward run:

class GraphTask final {
  bool retain_graph_;
  bool create_graph_;
  std::vector<FunctionNode*> roots_;          // grad_fn nodes of the output tensors
  HashMap<FunctionNode*, int> dependencies_;  // how many upstream nodes must run before this one
  HashSet<FunctionNode*> need_execute_;       // nodes that must run; empty means run everything
};
Apply is a queue-based topological traversal: start from the roots with zero pending dependencies, run each node's backward, accumulate gradients for leaf / retain_grad tensors, release per-node buffers, and enqueue successors whose dependency count drops to zero:

Maybe<void> GraphTask::Apply(bool save_grad_for_leaf) {
  std::queue<FunctionNode*> queue;
  for (FunctionNode* node : roots_) {
    if (dependencies_[node] == 0) { queue.push(node); }
  }
  while (!queue.empty()) {
    FunctionNode* node = queue.front();
    queue.pop();
    // Skip nodes that are not needed for the requested gradients.
    if (!need_execute_.empty() && need_execute_.find(node) == need_execute_.end()) {
      node->ReleaseOutTensorArgs();
      continue;
    }
    if (/*bool not_ready_to_apply=*/!(JUST(node->Apply(create_graph_)))) { continue; }
    if (save_grad_for_leaf) { JUST(node->AccGrad4LeafTensor(create_graph_)); }
    JUST(node->AccGrad4RetainGradTensor());
    node->ReleaseOutTensorArgs();
    // Without retain_graph the saved tensors are freed, so the graph cannot be reused.
    if (!retain_graph_) { node->ReleaseData(); }
    for (const auto& next_grad_fn : node->next_functions()) {
      FunctionNode* next_node = next_grad_fn.get();
      dependencies_[next_node] -= 1;
      if (dependencies_[next_node] == 0) { queue.push(next_node); }
    }
  }
  return Maybe<void>::Ok();
}
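The retain_graph_ / ReleaseData() interplay is exactly what the retain_graph argument controls from Python; a minimal sketch:

import oneflow as flow

x = flow.randn(2, 2, requires_grad=True)
z = (x * x).sum()

z.backward(retain_graph=True)  # keep each node's saved tensors so the graph can run again
z.backward()                   # second pass works because the first one retained the graph
print(x.grad)                  # the two passes accumulate: 2x + 2x = 4x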
Two calls inside the loop do the real work: node->Apply runs the op's backward function, and node->AccGrad4LeafTensor writes the result into the leaf tensors (eventually via CopyOrAccGrad). Let's look at each in turn.
Maybe<bool> FunctionNode::Apply(bool create_graph) {
  ...
  // Run the captured backward closure: output grads in, input grads out.
  JUST(backward_fn_->body(output_grads, &input_grads, create_graph));
  for (int i = 0; i < input_meta_data_.size(); ++i) {
    if (input_grads.at(i)) {
      ...
      // Push each produced gradient into the corresponding input's current_grad.
      JUST(input_meta_data_.at(i)->current_grad()->PushPartialTensor(input_grads.at(i)));
    }
  }
  return true;
}
AccGrad4LeafTensor ends up in CopyOrAccGrad: the gradient accumulated for this batch (current_grad) is either copied into acc_grad or added onto the existing one, and any post-accumulation hooks run last:

Maybe<void> CopyOrAccGrad(AutogradMeta* autograd_meta, bool autograd_mode) {
  autograd::AutoGradMode mode(autograd_mode);
  auto current_grad = JUST(autograd_meta->current_grad()->GetAccTensor({}));
  if (!current_grad) { return Maybe<void>::Ok(); }
  if (autograd_meta->acc_grad()) {
    ...
    // acc_grad += current_grad (in place when allowed).
    DevVmDepObjectConsumeModeGuard guard(DevVmDepObjectConsumeMode::NONE);
    const auto& output = JUST(functional::Add(autograd_meta->acc_grad(), current_grad, /*alpha=*/1,
                                              /*inplace=*/autograd_meta->is_grad_acc_inplace()));
    JUST(autograd_meta->set_acc_grad(output));
  } else {
    JUST(autograd_meta->set_acc_grad(current_grad));
  }
  for (const auto& hook : autograd_meta->post_grad_accumulation_hooks()) {
    auto new_grad = hook(autograd_meta->acc_grad());
    if (new_grad) { JUST(autograd_meta->set_acc_grad(new_grad)); }
  }
  return Maybe<void>::Ok();
}
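This add-into-acc_grad path is what makes gradient accumulation over several micro-batches work without extra bookkeeping on the user side; a minimal sketch:

import oneflow as flow

w = flow.randn(2, 2, requires_grad=True)

for _ in range(2):        # two micro-batches; each backward adds into w.grad
    loss = (w * 3).sum()
    loss.backward()

print(w.grad)             # 3 + 3 = 6 everywhere; an optimizer step would typically zero it next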
(Special thanks to my colleague yinggang for answering all kinds of questions along the way. This article is mainly based on the code at https://github.com/Oneflow-Inc/oneflow/commit/a4144f9ecb7e85ad073a810c3359bce7bfeb05e1)