diff --git a/torchopt/nn/stateless.py b/torchopt/nn/stateless.py index c7f92b86..0f4f17b3 100644 --- a/torchopt/nn/stateless.py +++ b/torchopt/nn/stateless.py @@ -84,7 +84,7 @@ def reparametrize( module: nn.Module, named_tensors: dict[str, torch.Tensor] | Iterable[tuple[str, torch.Tensor]], allow_missing: bool = False, -) -> Generator[nn.Module, None, None]: +) -> Generator[nn.Module]: """Reparameterize the module parameters and/or buffers.""" if not isinstance(named_tensors, dict): named_tensors = dict(named_tensors) diff --git a/tutorials/1_Functional_Optimizer.ipynb b/tutorials/1_Functional_Optimizer.ipynb index afc55f38..3f21465b 100644 --- a/tutorials/1_Functional_Optimizer.ipynb +++ b/tutorials/1_Functional_Optimizer.ipynb @@ -1,588 +1,539 @@ { - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# TorchOpt as Functional Optimizer" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "[](https://colab.research.google.com/github/metaopt/torchopt/blob/main/tutorials/1_Functional_Optimizer.ipynb)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "In this tutorial, we will introduce how TorchOpt can be treated as functional optimizer to conduct normal optimization with functional programming style. We will also illustrate how to conduct differentiable optimization with functional programming in PyTorch." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 1. Basic API\n", - "\n", - "In this first part, we will illustrate how TorchOpt can be used as a functional optimizer. We compare it with different API in [JAX](https://github.com/google/jax) and [PyTorch](https://pytorch.org) to help understand the similarity and dissimilarity. We use simple network, Adam optimizer and MSE loss objective." - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "from collections import OrderedDict\n", - "\n", - "import functorch\n", - "import jax\n", - "import jax.numpy as jnp\n", - "import optax\n", - "import torch\n", - "import torch.autograd\n", - "import torch.nn as nn\n", - "\n", - "import torchopt\n", - "\n", - "\n", - "class Net(nn.Module):\n", - " def __init__(self, dim):\n", - " super().__init__()\n", - " self.fc = nn.Linear(dim, 1, bias=True)\n", - " nn.init.ones_(self.fc.weight)\n", - " nn.init.zeros_(self.fc.bias)\n", - "\n", - " def forward(self, x):\n", - " return self.fc(x)\n", - "\n", - "\n", - "def mse(inputs, targets):\n", - " return ((inputs - targets) ** 2).mean()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### 1.1 Original JAX implementation\n", - "\n", - "The first example is JAX implementation coupled with [Optax](https://github.com/deepmind/optax), which belongs to functional programming style." 
- ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [], - "source": [ - "def origin_jax():\n", - " batch_size = 1\n", - " dim = 1\n", - " params = OrderedDict([('weight', jnp.ones((dim, 1))), ('bias', jnp.zeros((1,)))])\n", - "\n", - " def model(params, x):\n", - " return jnp.matmul(x, params['weight']) + params['bias']\n", - "\n", - " # Obtain the `opt_state` that contains statistics for the optimizer\n", - " learning_rate = 1.0\n", - " optimizer = optax.adam(learning_rate)\n", - " opt_state = optimizer.init(params)\n", - "\n", - " def compute_loss(params, x, y):\n", - " pred = model(params, x)\n", - " return mse(pred, y)\n", - "\n", - " xs = 2 * jnp.ones((batch_size, dim))\n", - " ys = jnp.ones((batch_size, 1))\n", - "\n", - " grads = jax.grad(compute_loss)(params, xs, ys)\n", - " updates, opt_state = optimizer.update(grads, opt_state)\n", - "\n", - " print('Parameters before update:', params)\n", - " params = optax.apply_updates(params, updates)\n", - " print('Parameters after update:', params)" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Parameters before update:\n", - "OrderedDict([\n", - " ('weight', DeviceArray([[1.]], dtype=float32)),\n", - " ('bias', DeviceArray([0.], dtype=float32))\n", - "])\n", - "Parameters after update:\n", - "OrderedDict([\n", - " ('weight', DeviceArray([[6.735325e-06]], dtype=float32)),\n", - " ('bias', DeviceArray([-0.99999326], dtype=float32))\n", - "])\n" - ] - } - ], - "source": [ - "origin_jax()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### 1.2 `functorch` with TorchOpt\n", - "\n", - "The second example is [`functorch`](https://pytorch.org/functorch) coupled with TorchOpt. It basically follows the same structure with the JAX example." 
- ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [], - "source": [ - "def interact_with_functorch():\n", - " batch_size = 1\n", - " dim = 1\n", - " net = Net(dim)\n", - " model, params = functorch.make_functional(net) # get the functional version of the model\n", - "\n", - " # Obtain the `opt_state` that contains statistics for the optimizer\n", - " learning_rate = 1.0\n", - " optimizer = torchopt.adam(learning_rate)\n", - " opt_state = optimizer.init(params)\n", - "\n", - " xs = 2 * torch.ones((batch_size, dim))\n", - " ys = torch.ones((batch_size, 1))\n", - "\n", - " pred = model(params, xs)\n", - " loss = mse(pred, ys)\n", - "\n", - " grads = torch.autograd.grad(loss, params)\n", - " updates, opt_state = optimizer.update(grads, opt_state)\n", - "\n", - " print('Parameters before update:', params)\n", - " params = torchopt.apply_updates(params, updates)\n", - " print('Parameters after update:', params)" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Parameters before update:\n", - "(\n", - " Parameter containing: tensor([[1.]], requires_grad=True),\n", - " Parameter containing: tensor([0.], requires_grad=True)\n", - ")\n", - "Parameters after update:\n", - "(\n", - " Parameter containing: tensor([[6.6757e-06]], requires_grad=True),\n", - " Parameter containing: tensor([-1.0000], requires_grad=True)\n", - ")\n" - ] - } - ], - "source": [ - "interact_with_functorch()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "TorchOpt also offers a wrapper `torchopt.FuncOptimizer` to make it easier to maintain the optimizer states." - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [], - "source": [ - "def interact_with_functorch_with_wrapper():\n", - " batch_size = 1\n", - " dim = 1\n", - " net = Net(dim)\n", - " model, params = functorch.make_functional(net) # get the functional version of the model\n", - "\n", - " learning_rate = 1.0\n", - " optimizer = torchopt.FuncOptimizer(torchopt.adam(learning_rate))\n", - "\n", - " xs = 2 * torch.ones((batch_size, dim))\n", - " ys = torch.ones((batch_size, 1))\n", - "\n", - " pred = model(params, xs)\n", - " loss = mse(pred, ys)\n", - "\n", - " print('Parameters before update:', params)\n", - " params = optimizer.step(loss, params)\n", - " print('Parameters after update:', params)" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Parameters before update:\n", - "(\n", - " Parameter containing: tensor([[1.]], requires_grad=True),\n", - " Parameter containing: tensor([0.], requires_grad=True)\n", - ")\n", - "Parameters after update:\n", - "(\n", - " tensor([[6.6757e-06]], grad_fn=),\n", - " tensor([-1.0000], grad_fn=)\n", - ")\n" - ] - } - ], - "source": [ - "interact_with_functorch_with_wrapper()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### 1.3 Full TorchOpt\n", - "\n", - "`torchopt.Optimizer` is the base class for our PyTorch-like optimizer. Combined with the functional optimizer `torchopt.sgd` and `torchopt.adam`, we can define our high-level API `torchopt.SGD` and `torchopt.Adam`. The third example is to illustrate that TorchOpt can also directly replace `torch.optim` with exactly the same usage. Note the API difference happens between `torchopt.adam()` and `torchopt.Adam()`." 
- ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [], - "source": [ - "def full_torchopt():\n", - " batch_size = 1\n", - " dim = 1\n", - " net = Net(dim)\n", - "\n", - " learning_rate = 1.0\n", - " # High-level API\n", - " optim = torchopt.Adam(net.parameters(), lr=learning_rate)\n", - " # Low-level API\n", - " optim = torchopt.Optimizer(net.parameters(), torchopt.adam(lr=learning_rate))\n", - "\n", - " xs = 2 * torch.ones((batch_size, dim))\n", - " ys = torch.ones((batch_size, 1))\n", - "\n", - " pred = net(xs)\n", - " loss = mse(pred, ys)\n", - "\n", - " print('Parameters before update:', dict(net.named_parameters()))\n", - " optim.zero_grad()\n", - " loss.backward()\n", - " optim.step()\n", - " print('Parameters after update:', dict(net.named_parameters()))" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Parameters before update:\n", - "{\n", - " 'fc.weight': Parameter containing: tensor([[1.]], requires_grad=True),\n", - " 'fc.bias': Parameter containing: tensor([0.], requires_grad=True)\n", - "}\n", - "Parameters after update:\n", - "{\n", - " 'fc.weight': Parameter containing: tensor([[6.6757e-06]], requires_grad=True),\n", - " 'fc.bias': Parameter containing: tensor([-1.0000], requires_grad=True)\n", - "}\n" - ] - } - ], - "source": [ - "full_torchopt()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### 1.4 Original PyTorch\n", - "\n", - "The final example is to original PyTorch example with `torch.optim`." - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [], - "source": [ - "def origin_torch():\n", - " batch_size = 1\n", - " dim = 1\n", - " net = Net(dim)\n", - "\n", - " learning_rate = 1.0\n", - " optim = torch.optim.Adam(net.parameters(), lr=learning_rate)\n", - "\n", - " xs = 2 * torch.ones((batch_size, dim))\n", - " ys = torch.ones((batch_size, 1))\n", - "\n", - " pred = net(xs)\n", - " loss = mse(pred, ys)\n", - "\n", - " print('Parameters before update:', dict(net.named_parameters()))\n", - " optim.zero_grad()\n", - " loss.backward()\n", - " optim.step()\n", - " print('Parameters after update:', dict(net.named_parameters()))" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Parameters before update:\n", - "{\n", - " 'fc.weight': Parameter containing: tensor([[1.]], requires_grad=True),\n", - " 'fc.bias': Parameter containing: tensor([0.], requires_grad=True)\n", - "}\n", - "Parameters after update:\n", - "{\n", - " 'fc.weight': Parameter containing: tensor([[1.1921e-07]], requires_grad=True),\n", - " 'fc.bias': Parameter containing: tensor([-1.0000], requires_grad=True)\n", - "}\n" - ] - } - ], - "source": [ - "origin_torch()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 2. Differentiable Optimization with Functional Optimizer\n", - "\n", - "Coupled with functional optimizer, you can conduct differentiable optimization by setting the `inplace` flag as `False` in update and `apply_updates` function. (which might be helpful for meta-learning algorithm implementation with functional programming style). \n", - "\n", - "Note that `torchopt.SGD` and `torchopt.Adam` do not support differentiable optimization. Refer to the Meta-Optimizer notebook for PyTorch-like differentiable optimizers." 
- ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": {}, - "outputs": [], - "source": [ - "def differentiable():\n", - " batch_size = 1\n", - " dim = 1\n", - " net = Net(dim)\n", - " model, params = functorch.make_functional(net) # get the functional version of the model\n", - "\n", - " # Meta-parameter\n", - " meta_param = nn.Parameter(torch.ones(1))\n", - "\n", - " # SGD example\n", - " learning_rate = 1.0\n", - " optimizer = torchopt.sgd(learning_rate)\n", - " opt_state = optimizer.init(params)\n", - "\n", - " xs = torch.ones((batch_size, dim))\n", - " ys = torch.ones((batch_size, 1))\n", - "\n", - " pred = model(params, xs)\n", - " # Where meta_param is used\n", - " pred = pred + meta_param\n", - " loss = mse(pred, ys)\n", - "\n", - " grads = torch.autograd.grad(loss, params, create_graph=True)\n", - " updates, opt_state = optimizer.update(grads, opt_state, inplace=False)\n", - " # Update parameters with single step SGD update\n", - " params = torchopt.apply_updates(params, updates, inplace=False)\n", - "\n", - " pred = model(params, xs)\n", - " loss = mse(pred, ys)\n", - " loss.backward()\n", - "\n", - " print('Gradient for the meta-parameter:', meta_param.grad)" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Gradient for the meta-parameter: tensor([32.])\n" - ] - } - ], - "source": [ - "differentiable()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### 2.1 Track the Gradient of Momentum\n", - "\n", - "Note that most modern optimizers involve momentum term in the gradient update (basically only SGD with `momentum = 0` does not involve). We provide an option for user to choose whether to also track the meta-gradient through momentum term. The default option is `moment_requires_grad=True`." - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": {}, - "outputs": [], - "source": [ - "optim = torchopt.adam(lr=1.0, moment_requires_grad=False)" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "metadata": {}, - "outputs": [], - "source": [ - "optim = torchopt.adam(lr=1.0, moment_requires_grad=True)" - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "metadata": {}, - "outputs": [], - "source": [ - "optim = torchopt.sgd(lr=1.0, momentum=0.8, moment_requires_grad=True)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 3. Accelerated Optimizer\n", - "\n", - "Users can use accelerated optimizer by setting the `use_accelerated_op` as `True`. Currently we only support the Adam optimizer." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Check whether the `accelerated_op` is available:" - ] - }, - { - "cell_type": "code", - "execution_count": 17, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "True\n" - ] + "cells" : [ + {"cell_type" : "markdown", "metadata" : {}, "source" : ["# TorchOpt as Functional Optimizer"]}, + { + "cell_type" : "markdown", + "metadata" : {}, + "source" : ["[](https://" + "colab.research.google.com/github/metaopt/torchopt/blob/main/tutorials/" + "1_Functional_Optimizer.ipynb)"] + }, + { + "cell_type" : "markdown", + "metadata" : {}, + "source" : ["In this tutorial, we will introduce how TorchOpt can be treated as functional " + "optimizer to conduct normal optimization with functional programming style. 
We " + "will also illustrate how to conduct differentiable optimization with functional " + "programming in PyTorch."] + }, + { + "cell_type" : "markdown", + "metadata" : {}, + "source" : [ + "## 1. Basic API\n", + "\n", + "In this first part, we will illustrate how TorchOpt can be used as a functional " + "optimizer. We compare it with different API in [JAX](https://github.com/google/jax) and " + "[PyTorch](https://pytorch.org) to help understand the similarity and dissimilarity. We " + "use simple network, Adam optimizer and MSE loss objective." + ] + }, + { + "cell_type" : "code", + "execution_count" : 1, + "metadata" : {}, + "outputs" : [], + "source" : [ + "from collections import OrderedDict\n", + "\n", + "import functorch\n", + "import jax\n", + "import jax.numpy as jnp\n", + "import optax\n", + "import torch\n", + "import torch.autograd\n", + "import torch.nn as nn\n", + "\n", + "import torchopt\n", + "\n", + "\n", + "class Net(nn.Module):\n", + " def __init__(self, dim):\n", + " super().__init__()\n", + " self.fc = nn.Linear(dim, 1, bias=True)\n", + " nn.init.ones_(self.fc.weight)\n", + " nn.init.zeros_(self.fc.bias)\n", + "\n", + " def forward(self, x):\n", + " return self.fc(x)\n", + "\n", + "\n", + "def mse(inputs, targets):\n", + " return ((inputs - targets) ** 2).mean()" + ] + }, + { + "cell_type" : "markdown", + "metadata" : {}, + "source" : [ + "### 1.1 Original JAX implementation\n", + "\n", + "The first example is JAX implementation coupled with " + "[Optax](https://github.com/deepmind/optax), which belongs to functional programming style." + ] + }, + { + "cell_type" : "code", + "execution_count" : 2, + "metadata" : {}, + "outputs" : [], + "source" : [ + "def origin_jax():\n", + " batch_size = 1\n", + " dim = 1\n", + " params = OrderedDict([('weight', jnp.ones((dim, 1))), ('bias', jnp.zeros((1,)))])\n", + "\n", + " def model(params, x):\n", + " return jnp.matmul(x, params['weight']) + params['bias']\n", + "\n", + " # Obtain the `opt_state` that contains statistics for the optimizer\n", + " learning_rate = 1.0\n", + " optimizer = optax.adam(learning_rate)\n", + " opt_state = optimizer.init(params)\n", + "\n", + " def compute_loss(params, x, y):\n", + " pred = model(params, x)\n", + " return mse(pred, y)\n", + "\n", + " xs = 2 * jnp.ones((batch_size, dim))\n", + " ys = jnp.ones((batch_size, 1))\n", + "\n", + " grads = jax.grad(compute_loss)(params, xs, ys)\n", + " updates, opt_state = optimizer.update(grads, opt_state)\n", + "\n", + " print('Parameters before update:', params)\n", + " params = optax.apply_updates(params, updates)\n", + " print('Parameters after update:', params)" + ] + }, + { + "cell_type" : "code", + "execution_count" : 3, + "metadata" : {}, + "outputs" : [ { + "name" : "stdout", + "output_type" : "stream", + "text" : [ + "Parameters before update:\n", + "OrderedDict([\n", + " ('weight', DeviceArray([[1.]], dtype=float32)),\n", + " ('bias', DeviceArray([0.], dtype=float32))\n", + "])\n", + "Parameters after update:\n", + "OrderedDict([\n", + " ('weight', DeviceArray([[6.735325e-06]], dtype=float32)),\n", + " ('bias', DeviceArray([-0.99999326], dtype=float32))\n", + "])\n" + ] + } ], + "source" : ["origin_jax()"] + }, + { + "cell_type" : "markdown", + "metadata" : {}, + "source" : [ + "### 1.2 `functorch` with TorchOpt\n", + "\n", + "The second example is [`functorch`](https://pytorch.org/functorch) coupled with TorchOpt. " + "It basically follows the same structure with the JAX example." 
+ ] + }, + { + "cell_type" : "code", + "execution_count" : 4, + "metadata" : {}, + "outputs" : [], + "source" : [ + "def interact_with_functorch():\n", + " batch_size = 1\n", + " dim = 1\n", + " net = Net(dim)\n", + " model, params = functorch.make_functional(net) # get the functional version of the " + "model\n", + "\n", + " # Obtain the `opt_state` that contains statistics for the optimizer\n", + " learning_rate = 1.0\n", + " optimizer = torchopt.adam(learning_rate)\n", + " opt_state = optimizer.init(params)\n", + "\n", + " xs = 2 * torch.ones((batch_size, dim))\n", + " ys = torch.ones((batch_size, 1))\n", + "\n", + " pred = model(params, xs)\n", + " loss = mse(pred, ys)\n", + "\n", + " grads = torch.autograd.grad(loss, params)\n", + " updates, opt_state = optimizer.update(grads, opt_state)\n", + "\n", + " print('Parameters before update:', params)\n", + " params = torchopt.apply_updates(params, updates)\n", + " print('Parameters after update:', params)" + ] + }, + { + "cell_type" : "code", + "execution_count" : 5, + "metadata" : {}, + "outputs" : [ { + "name" : "stdout", + "output_type" : "stream", + "text" : [ + "Parameters before update:\n", + "(\n", + " Parameter containing: tensor([[1.]], requires_grad=True),\n", + " Parameter containing: tensor([0.], requires_grad=True)\n", + ")\n", + "Parameters after update:\n", + "(\n", + " Parameter containing: tensor([[6.6757e-06]], requires_grad=True),\n", + " Parameter containing: tensor([-1.0000], requires_grad=True)\n", + ")\n" + ] + } ], + "source" : ["interact_with_functorch()"] + }, + { + "cell_type" : "markdown", + "metadata" : {}, + "source" : ["TorchOpt also offers a wrapper `torchopt.FuncOptimizer` to make it easier to " + "maintain the optimizer states."] + }, + { + "cell_type" : "code", + "execution_count" : 6, + "metadata" : {}, + "outputs" : [], + "source" : [ + "def interact_with_functorch_with_wrapper():\n", + " batch_size = 1\n", + " dim = 1\n", + " net = Net(dim)\n", + " model, params = functorch.make_functional(net) # get the functional version of the " + "model\n", + "\n", + " learning_rate = 1.0\n", + " optimizer = torchopt.FuncOptimizer(torchopt.adam(learning_rate))\n", + "\n", + " xs = 2 * torch.ones((batch_size, dim))\n", + " ys = torch.ones((batch_size, 1))\n", + "\n", + " pred = model(params, xs)\n", + " loss = mse(pred, ys)\n", + "\n", + " print('Parameters before update:', params)\n", + " params = optimizer.step(loss, params)\n", + " print('Parameters after update:', params)" + ] + }, + { + "cell_type" : "code", + "execution_count" : 7, + "metadata" : {}, + "outputs" : [ { + "name" : "stdout", + "output_type" : "stream", + "text" : [ + "Parameters before update:\n", + "(\n", + " Parameter containing: tensor([[1.]], requires_grad=True),\n", + " Parameter containing: tensor([0.], requires_grad=True)\n", + ")\n", + "Parameters after update:\n", + "(\n", + " tensor([[6.6757e-06]], grad_fn=),\n", + " tensor([-1.0000], grad_fn=)\n", + ")\n" + ] + } ], + "source" : ["interact_with_functorch_with_wrapper()"] + }, + { + "cell_type" : "markdown", + "metadata" : {}, + "source" : [ + "### 1.3 Full TorchOpt\n", + "\n", + "`torchopt.Optimizer` is the base class for our PyTorch-like optimizer. Combined with the " + "functional optimizer `torchopt.sgd` and `torchopt.adam`, we can define our high-level API " + "`torchopt.SGD` and `torchopt.Adam`. The third example is to illustrate that TorchOpt can " + "also directly replace `torch.optim` with exactly the same usage. 
Note the API difference " + "happens between `torchopt.adam()` and `torchopt.Adam()`." + ] + }, + { + "cell_type" : "code", + "execution_count" : 8, + "metadata" : {}, + "outputs" : [], + "source" : [ + "def full_torchopt():\n", + " batch_size = 1\n", + " dim = 1\n", + " net = Net(dim)\n", + "\n", + " learning_rate = 1.0\n", + " # High-level API\n", + " optim = torchopt.Adam(net.parameters(), lr=learning_rate)\n", + " # Low-level API\n", + " optim = torchopt.Optimizer(net.parameters(), torchopt.adam(lr=learning_rate))\n", + "\n", + " xs = 2 * torch.ones((batch_size, dim))\n", + " ys = torch.ones((batch_size, 1))\n", + "\n", + " pred = net(xs)\n", + " loss = mse(pred, ys)\n", + "\n", + " print('Parameters before update:', dict(net.named_parameters()))\n", + " optim.zero_grad()\n", + " loss.backward()\n", + " optim.step()\n", + " print('Parameters after update:', dict(net.named_parameters()))" + ] + }, + { + "cell_type" : "code", + "execution_count" : 9, + "metadata" : {}, + "outputs" : [ { + "name" : "stdout", + "output_type" : "stream", + "text" : [ + "Parameters before update:\n", + "{\n", + " 'fc.weight': Parameter containing: tensor([[1.]], requires_grad=True),\n", + " 'fc.bias': Parameter containing: tensor([0.], requires_grad=True)\n", + "}\n", + "Parameters after update:\n", + "{\n", + " 'fc.weight': Parameter containing: tensor([[6.6757e-06]], requires_grad=True),\n", + " 'fc.bias': Parameter containing: tensor([-1.0000], requires_grad=True)\n", + "}\n" + ] + } ], + "source" : ["full_torchopt()"] + }, + { + "cell_type" : "markdown", + "metadata" : {}, + "source" : [ + "### 1.4 Original PyTorch\n", + "\n", + "The final example is to original PyTorch example with `torch.optim`." + ] + }, + { + "cell_type" : "code", + "execution_count" : 10, + "metadata" : {}, + "outputs" : [], + "source" : [ + "def origin_torch():\n", + " batch_size = 1\n", + " dim = 1\n", + " net = Net(dim)\n", + "\n", + " learning_rate = 1.0\n", + " optim = torch.optim.Adam(net.parameters(), lr=learning_rate)\n", + "\n", + " xs = 2 * torch.ones((batch_size, dim))\n", + " ys = torch.ones((batch_size, 1))\n", + "\n", + " pred = net(xs)\n", + " loss = mse(pred, ys)\n", + "\n", + " print('Parameters before update:', dict(net.named_parameters()))\n", + " optim.zero_grad()\n", + " loss.backward()\n", + " optim.step()\n", + " print('Parameters after update:', dict(net.named_parameters()))" + ] + }, + { + "cell_type" : "code", + "execution_count" : 11, + "metadata" : {}, + "outputs" : [ { + "name" : "stdout", + "output_type" : "stream", + "text" : [ + "Parameters before update:\n", + "{\n", + " 'fc.weight': Parameter containing: tensor([[1.]], requires_grad=True),\n", + " 'fc.bias': Parameter containing: tensor([0.], requires_grad=True)\n", + "}\n", + "Parameters after update:\n", + "{\n", + " 'fc.weight': Parameter containing: tensor([[1.1921e-07]], requires_grad=True),\n", + " 'fc.bias': Parameter containing: tensor([-1.0000], requires_grad=True)\n", + "}\n" + ] + } ], + "source" : ["origin_torch()"] + }, + { + "cell_type" : "markdown", + "metadata" : {}, + "source" : [ + "## 2. Differentiable Optimization with Functional Optimizer\n", + "\n", + "Coupled with functional optimizer, you can conduct differentiable optimization by setting " + "the `inplace` flag as `False` in update and `apply_updates` function. (which might be " + "helpful for meta-learning algorithm implementation with functional programming style). \n", + "\n", + "Note that `torchopt.SGD` and `torchopt.Adam` do not support differentiable optimization. 
" + "Refer to the Meta-Optimizer notebook for PyTorch-like differentiable optimizers." + ] + }, + { + "cell_type" : "code", + "execution_count" : 12, + "metadata" : {}, + "outputs" : [], + "source" : [ + "def differentiable():\n", + " batch_size = 1\n", + " dim = 1\n", + " net = Net(dim)\n", + " model, params = functorch.make_functional(net) # get the functional version of the " + "model\n", + "\n", + " # Meta-parameter\n", + " meta_param = nn.Parameter(torch.ones(1))\n", + "\n", + " # SGD example\n", + " learning_rate = 1.0\n", + " optimizer = torchopt.sgd(learning_rate)\n", + " opt_state = optimizer.init(params)\n", + "\n", + " xs = torch.ones((batch_size, dim))\n", + " ys = torch.ones((batch_size, 1))\n", + "\n", + " pred = model(params, xs)\n", + " # Where meta_param is used\n", + " pred = pred + meta_param\n", + " loss = mse(pred, ys)\n", + "\n", + " grads = torch.autograd.grad(loss, params, create_graph=True)\n", + " updates, opt_state = optimizer.update(grads, opt_state, inplace=False)\n", + " # Update parameters with single step SGD update\n", + " params = torchopt.apply_updates(params, updates, inplace=False)\n", + "\n", + " pred = model(params, xs)\n", + " loss = mse(pred, ys)\n", + " loss.backward()\n", + "\n", + " print('Gradient for the meta-parameter:', meta_param.grad)" + ] + }, + { + "cell_type" : "code", + "execution_count" : 13, + "metadata" : {}, + "outputs" : [ { + "name" : "stdout", + "output_type" : "stream", + "text" : ["Gradient for the meta-parameter: tensor([32.])\n"] + } ], + "source" : ["differentiable()"] + }, + { + "cell_type" : "markdown", + "metadata" : {}, + "source" : [ + "### 2.1 Track the Gradient of Momentum\n", + "\n", + "Note that most modern optimizers involve momentum term in the gradient update (basically " + "only SGD with `momentum = 0` does not involve). We provide an option for user to choose " + "whether to also track the meta-gradient through momentum term. The default option is " + "`moment_requires_grad=True`." + ] + }, + { + "cell_type" : "code", + "execution_count" : 14, + "metadata" : {}, + "outputs" : [], + "source" : ["optim = torchopt.adam(lr=1.0, moment_requires_grad=False)"] + }, + { + "cell_type" : "code", + "execution_count" : 15, + "metadata" : {}, + "outputs" : [], + "source" : ["optim = torchopt.adam(lr=1.0, moment_requires_grad=True)"] + }, + { + "cell_type" : "code", + "execution_count" : 16, + "metadata" : {}, + "outputs" : [], + "source" : ["optim = torchopt.sgd(lr=1.0, momentum=0.8, moment_requires_grad=True)"] + }, + { + "cell_type" : "markdown", + "metadata" : {}, + "source" : [ + "## 3. Accelerated Optimizer\n", + "\n", + "Users can use accelerated optimizer by setting the `use_accelerated_op` as `True`. " + "Currently we only support the Adam optimizer." 
+ ] + }, + { + "cell_type" : "markdown", + "metadata" : {}, + "source" : ["Check whether the `accelerated_op` is available:"] + }, + { + "cell_type" : "code", + "execution_count" : 17, + "metadata" : {}, + "outputs" : [ {"name" : "stdout", "output_type" : "stream", "text" : ["True\n"]} ], + "source" : ["torchopt.accelerated_op_available(torch.device('cpu'))"] + }, + { + "cell_type" : "code", + "execution_count" : 18, + "metadata" : {}, + "outputs" : [ {"name" : "stdout", "output_type" : "stream", "text" : ["True\n"]} ], + "source" : ["torchopt.accelerated_op_available(torch.device('cuda'))"] + }, + { + "cell_type" : "code", + "execution_count" : 19, + "metadata" : {}, + "outputs" : [], + "source" : [ + "net = Net(1).cuda()\n", + "optim = torchopt.Adam(net.parameters(), lr=1.0, use_accelerated_op=True)" + ] + }, + { + "cell_type" : "code", + "execution_count" : 20, + "metadata" : {}, + "outputs" : [], + "source" : ["optim = torchopt.adam(lr=1.0, use_accelerated_op=True)"] } - ], - "source": [ - "torchopt.accelerated_op_available(torch.device('cpu'))" - ] - }, - { - "cell_type": "code", - "execution_count": 18, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "True\n" - ] + ], + "metadata" : { + "kernelspec" : + {"display_name" : "Python 3 (ipykernel)", "language" : "python", "name" : "python3"}, + "language_info" : { + "codemirror_mode" : {"name" : "ipython", "version" : 3}, + "file_extension" : ".py", + "mimetype" : "text/x-python", + "name" : "python", + "nbconvert_exporter" : "python", + "pygments_lexer" : "ipython3", + "version" : "3.9.15" + }, + "vscode" : { + "interpreter" : + {"hash" : "2a8cc1ff2cbc47027bf9993941710d9ab9175f14080903d9c7c432ee63d681da"} } - ], - "source": [ - "torchopt.accelerated_op_available(torch.device('cuda'))" - ] - }, - { - "cell_type": "code", - "execution_count": 19, - "metadata": {}, - "outputs": [], - "source": [ - "net = Net(1).cuda()\n", - "optim = torchopt.Adam(net.parameters(), lr=1.0, use_accelerated_op=True)" - ] - }, - { - "cell_type": "code", - "execution_count": 20, - "metadata": {}, - "outputs": [], - "source": [ - "optim = torchopt.adam(lr=1.0, use_accelerated_op=True)" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.9.15" }, - "vscode": { - "interpreter": { - "hash": "2a8cc1ff2cbc47027bf9993941710d9ab9175f14080903d9c7c432ee63d681da" - } - } - }, - "nbformat": 4, - "nbformat_minor": 4 + "nbformat" : 4, + "nbformat_minor" : 4 } diff --git a/tutorials/2_Visualization.ipynb b/tutorials/2_Visualization.ipynb index dd58c48d..17c85a7b 100644 --- a/tutorials/2_Visualization.ipynb +++ b/tutorials/2_Visualization.ipynb @@ -1,216 +1,616 @@ { - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Visualization in TorchOpt" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "[](https://colab.research.google.com/github/metaopt/torchopt/blob/main/tutorials/2_Visualization.ipynb)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "In [PyTorch](https://pytorch.org), if the attribute `requires_grad` of a tensor is `True`, the computation graph will be created if we use the tensor to do any operations. 
The computation graph is implemented like link-list -- `Tensor`s are nodes and they are linked by their attribute `gran_fn`. [PyTorchViz](https://github.com/szagoruyko/pytorchviz) is a Python package that uses [Graphviz](https://graphviz.org) as a backend for plotting computation graphs. TorchOpt use PyTorchViz as the blueprint and provide more easy-to-use visualization functions on the premise of supporting all its functions." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Let's start with a simple multiplication computation graph. We declared the variable `x` with flag `requires_grad=True` and compute `y = 2 * x`. Then we visualize the computation graph of `y`." - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [ + "cells" : [ + {"cell_type" : "markdown", "metadata" : {}, "source" : ["# Visualization in TorchOpt"]}, { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n" - ] + "cell_type" : "markdown", + "metadata" : {}, + "source" : ["[](https://" + "colab.research.google.com/github/metaopt/torchopt/blob/main/tutorials/" + "2_Visualization.ipynb)"] }, { - "data": { - "image/svg+xml": "\n\n\n\n\n\n%3\n\n\n\n140534064715952\n\ny\n()\n\n\n\n140534064838304\n\nMulBackward0\n\n\n\n140534064838304->140534064715952\n\n\n\n\n\n140534064837776\n\nAccumulateGrad\n\n\n\n140534064837776->140534064838304\n\n\n\n\n\n140534064714832\n\nx\n()\n\n\n\n140534064714832->140534064837776\n\n\n\n\n\n" - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "from IPython.display import display\n", - "\n", - "import torch\n", - "import torch.nn as nn\n", - "import torch.nn.functional as F\n", - "\n", - "import torchopt\n", - "\n", - "\n", - "x = torch.tensor(1.0, requires_grad=True)\n", - "y = 2 * x\n", - "display(torchopt.visual.make_dot(y, params={'x': x, 'y': y}))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The figure shows `y` is connected by the multiplication edge. The gradient of `y` will flow through the multiplication backward function then accumulated on `x`. Note that we pass a dictionary for adding node labels." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Then let's plot a neural network. Note that we can pass the generator returned by method `named_parameters` for adding node labels." - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [ + "cell_type" : "markdown", + "metadata" : {}, + "source" : ["In [PyTorch](https://pytorch.org), if the attribute `requires_grad` of a tensor " + "is `True`, the computation graph will be created if we use the tensor to do any " + "operations. The computation graph is implemented like link-list -- `Tensor`s " + "are nodes and they are linked by their attribute `gran_fn`. " + "[PyTorchViz](https://github.com/szagoruyko/pytorchviz) is a Python package that " + "uses [Graphviz](https://graphviz.org) as a backend for plotting computation " + "graphs. TorchOpt use PyTorchViz as the blueprint and provide more easy-to-use " + "visualization functions on the premise of supporting all its functions."] + }, { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n" - ] + "cell_type" : "markdown", + "metadata" : {}, + "source" : ["Let's start with a simple multiplication computation graph. We declared the " + "variable `x` with flag `requires_grad=True` and compute `y = 2 * x`. 
Then we " + "visualize the computation graph of `y`."] }, { - "data": { - "image/svg+xml": "\n\n\n\n\n\n%3\n\n\n\n140534659780336\n\nloss\n()\n\n\n\n140531595570768\n\nMseLossBackward0\n\n\n\n140531595570768->140534659780336\n\n\n\n\n\n140531595570576\n\nAddmmBackward0\n\n\n\n140531595570576->140531595570768\n\n\n\n\n\n140531595570528\n\nAccumulateGrad\n\n\n\n140531595570528->140531595570576\n\n\n\n\n\n140531595583632\n\nfc.bias\n(1)\n\n\n\n140531595583632->140531595570528\n\n\n\n\n\n140531595571104\n\nTBackward0\n\n\n\n140531595571104->140531595570576\n\n\n\n\n\n140531595570432\n\nAccumulateGrad\n\n\n\n140531595570432->140531595571104\n\n\n\n\n\n140531595582816\n\nfc.weight\n(1, 5)\n\n\n\n140531595582816->140531595570432\n\n\n\n\n\n" - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "class Net(nn.Module):\n", - " def __init__(self, dim):\n", - " super().__init__()\n", - " self.fc = nn.Linear(dim, 1, bias=True)\n", - "\n", - " def forward(self, x):\n", - " return self.fc(x)\n", - "\n", - "\n", - "dim = 5\n", - "batch_size = 2\n", - "net = Net(dim)\n", - "xs = torch.ones((batch_size, dim))\n", - "ys = torch.ones((batch_size, 1))\n", - "pred = net(xs)\n", - "loss = F.mse_loss(pred, ys)\n", - "\n", - "display(torchopt.visual.make_dot(loss, params=(net.named_parameters(), {'loss': loss})))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The computation graph of meta-learning algorithms will be much more complex. Our visualization tool allows users take as input the extracted network state for better visualization." - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [ + "cell_type" : "code", + "execution_count" : 1, + "metadata" : {}, + "outputs" : [ + { + "name" : "stdout", + "output_type" : "stream", + "text" : ["\n"] + }, + { + "data" : { + "image/svg+xml" : + "\n\n\n\n\n\n%3\n\n\n\n140534064715952\n\ny\n()\n\n\n\n140534064838304\n\nMulBackward0\n\n\n\n140534064838304->140534064715952\n\n\n\n\n\n140534064837776\n\nAccumulateGrad\n\n\n\n140534064837776->140534064838304\n\n\n\n\n\n140534064714832\n\nx\n()\n\n\n\n140534064714832->140534064837776\n\n\n\n\n\n" + }, + "metadata" : {}, + "output_type" : "display_data" + } + ], + "source" : [ + "from IPython.display import display\n", + "\n", + "import torch\n", + "import torch.nn as nn\n", + "import torch.nn.functional as F\n", + "\n", + "import torchopt\n", + "\n", + "\n", + "x = torch.tensor(1.0, requires_grad=True)\n", + "y = 2 * x\n", + "display(torchopt.visual.make_dot(y, params={'x': x, 'y': y}))" + ] + }, + { + "cell_type" : "markdown", + "metadata" : {}, + "source" : ["The figure shows `y` is connected by the multiplication edge. The gradient of " + "`y` will flow through the multiplication backward function then accumulated on " + "`x`. Note that we pass a dictionary for adding node labels."] + }, + { + "cell_type" : "markdown", + "metadata" : {}, + "source" : ["Then let's plot a neural network. 
Note that we can pass the generator returned " + "by method `named_parameters` for adding node labels."] + }, { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n" - ] + "cell_type" : "code", + "execution_count" : 2, + "metadata" : {}, + "outputs" : [ + { + "name" : "stdout", + "output_type" : "stream", + "text" : ["\n"] + }, + { + "data" : { + "image/svg+xml" : + "\n\n\n\n\n\n%3\n\n\n\n140534659780336\n\nloss\n()\n\n\n\n140531595570768\n\nMseLossBackward0\n\n\n\n140531595570768->140534659780336\n\n\n\n\n\n140531595570576\n\nAddmmBackward0\n\n\n\n140531595570576->140531595570768\n\n\n\n\n\n140531595570528\n\nAccumulateGrad\n\n\n\n140531595570528->140531595570576\n\n\n\n\n\n140531595583632\n\nfc.bias\n(1)\n\n\n\n140531595583632->140531595570528\n\n\n\n\n\n140531595571104\n\nTBackward0\n\n\n\n140531595571104->140531595570576\n\n\n\n\n\n140531595570432\n\nAccumulateGrad\n\n\n\n140531595570432->140531595571104\n\n\n\n\n\n140531595582816\n\nfc.weight\n(1, 5)\n\n\n\n140531595582816->140531595570432\n\n\n\n\n\n" + }, + "metadata" : {}, + "output_type" : "display_data" + } + ], + "source" : [ + "class Net(nn.Module):\n", + " def __init__(self, dim):\n", + " super().__init__()\n", + " self.fc = nn.Linear(dim, 1, bias=True)\n", + "\n", + " def forward(self, x):\n", + " return self.fc(x)\n", + "\n", + "\n", + "dim = 5\n", + "batch_size = 2\n", + "net = Net(dim)\n", + "xs = torch.ones((batch_size, dim))\n", + "ys = torch.ones((batch_size, 1))\n", + "pred = net(xs)\n", + "loss = F.mse_loss(pred, ys)\n", + "\n", + "display(torchopt.visual.make_dot(loss, params=(net.named_parameters(), {'loss': loss})))" + ] }, { - "data": { - "image/svg+xml": "\n\n\n\n\n\n%3\n\n\n\n140531595614064\n\nloss\n()\n\n\n\n140531595567168\n\nMseLossBackward0\n\n\n\n140531595567168->140531595614064\n\n\n\n\n\n140531595569232\n\nAddBackward0\n\n\n\n140531595569232->140531595567168\n\n\n\n\n\n140531595568800\n\nAddmmBackward0\n\n\n\n140531595568800->140531595569232\n\n\n\n\n\n140534660247264\n\nAddBackward0\nstep1.fc.bias\n(1)\n\n\n\n140534660247264->140531595568800\n\n\n\n\n\n140534553595376\n\nAccumulateGrad\n\n\n\n140534553595376->140534660247264\n\n\n\n\n\n140534553592832\n\nAddmmBackward0\n\n\n\n140534553595376->140534553592832\n\n\n\n\n\n140534064448352\n\nstep0.fc.bias\n(1)\n\n\n\n140534064448352->140534553595376\n\n\n\n\n\n140534553595616\n\nMulBackward0\n\n\n\n140534553595616->140534660247264\n\n\n\n\n\n140534553594848\n\nViewBackward0\n\n\n\n140534553594848->140534553595616\n\n\n\n\n\n140534553594992\n\nSumBackward1\n\n\n\n140534553594992->140534553594848\n\n\n\n\n\n140534553594800\n\nMseLossBackwardBackward0\n\n\n\n140534553594800->140534553594992\n\n\n\n\n\n140531595617904\n\nTBackward0\n\n\n\n140534553594800->140531595617904\n\n\n\n\n\n140534553593072\n\nAddBackward0\n\n\n\n140534553593072->140534553594800\n\n\n\n\n\n140534553592832->140534553593072\n\n\n\n\n\n140534553593456\n\nTBackward0\n\n\n\n140534553593456->140534553592832\n\n\n\n\n\n140534553593888\n\nAccumulateGrad\n\n\n\n140534553593888->140534553593456\n\n\n\n\n\n140531595572368\n\nAddBackward0\nstep1.fc.weight\n(1, 5)\n\n\n\n140534553593888->140531595572368\n\n\n\n\n\n140531595612944\n\nstep0.fc.weight\n(1, 
5)\n\n\n\n140531595612944->140534553593888\n\n\n\n\n\n140531595567888\n\nAccumulateGrad\n\n\n\n140531595567888->140531595569232\n\n\n\n\n\n140531595567888->140534553593072\n\n\n\n\n\n140531595613184\n\nmeta_param\n()\n\n\n\n140531595613184->140531595567888\n\n\n\n\n\n140534553594272\n\nTBackward0\n\n\n\n140534553594272->140531595568800\n\n\n\n\n\n140531595572368->140534553594272\n\n\n\n\n\n140534553593504\n\nMulBackward0\n\n\n\n140534553593504->140531595572368\n\n\n\n\n\n140534553592976\n\nTBackward0\n\n\n\n140534553592976->140534553593504\n\n\n\n\n\n140534553593216\n\nTBackward0\n\n\n\n140534553593216->140534553592976\n\n\n\n\n\n140534553593552\n\nMmBackward0\n\n\n\n140534553593552->140534553593216\n\n\n\n\n\n140531595617904->140534553593552\n\n\n\n\n\n" - }, - "metadata": {}, - "output_type": "display_data" + "cell_type" : "markdown", + "metadata" : {}, + "source" : ["The computation graph of meta-learning algorithms will be much more complex. " + "Our visualization tool allows users take as input the extracted network state " + "for better visualization."] + }, + { + "cell_type" : "code", + "execution_count" : 3, + "metadata" : {}, + "outputs" : [ + { + "name" : "stdout", + "output_type" : "stream", + "text" : ["\n"] + }, + { + "data" : { + "image/svg+xml" : + "\n\n\n\n\n\n%3\n\n\n\n140531595614064\n\nloss\n()\n\n\n\n140531595567168\n\nMseLossBackward0\n\n\n\n140531595567168->140531595614064\n\n\n\n\n\n140531595569232\n\nAddBackward0\n\n\n\n140531595569232->140531595567168\n\n\n\n\n\n140531595568800\n\nAddmmBackward0\n\n\n\n140531595568800->140531595569232\n\n\n\n\n\n140534660247264\n\nAddBackward0\nstep1.fc.bias\n(1)\n\n\n\n140534660247264->140531595568800\n\n\n\n\n\n140534553595376\n\nAccumulateGrad\n\n\n\n140534553595376->140534660247264\n\n\n\n\n\n140534553592832\n\nAddmmBackward0\n\n\n\n140534553595376->140534553592832\n\n\n\n\n\n140534064448352\n\nstep0.fc.bias\n(1)\n\n\n\n140534064448352->140534553595376\n\n\n\n\n\n140534553595616\n\nMulBackward0\n\n\n\n140534553595616->140534660247264\n\n\n\n\n\n140534553594848\n\nViewBackward0\n\n\n\n140534553594848->140534553595616\n\n\n\n\n\n140534553594992\n\nSumBackward1\n\n\n\n140534553594992->140534553594848\n\n\n\n\n\n140534553594800\n\nMseLossBackwardBackward0\n\n\n\n140534553594800->140534553594992\n\n\n\n\n\n140531595617904\n\nTBackward0\n\n\n\n140534553594800->140531595617904\n\n\n\n\n\n140534553593072\n\nAddBackward0\n\n\n\n140534553593072->140534553594800\n\n\n\n\n\n140534553592832->140534553593072\n\n\n\n\n\n140534553593456\n\nTBackward0\n\n\n\n140534553593456->140534553592832\n\n\n\n\n\n140534553593888\n\nAccumulateGrad\n\n\n\n140534553593888->140534553593456\n\n\n\n\n\n140531595572368\n\nAddBackward0\nstep1.fc.weight\n(1, " + "5)\n\n\n\n140534553593888->140531595572368\n\n\n\n\n\n140531595612944\n\nstep0.fc.weight\n(1, " + 
"5)\n\n\n\n140531595612944->140534553593888\n\n\n\n\n\n140531595567888\n\nAccumulateGrad\n\n\n\n140531595567888->140531595569232\n\n\n\n\n\n140531595567888->140534553593072\n\n\n\n\n\n140531595613184\n\nmeta_param\n()\n\n\n\n140531595613184->140531595567888\n\n\n\n\n\n140534553594272\n\nTBackward0\n\n\n\n140534553594272->140531595568800\n\n\n\n\n\n140531595572368->140534553594272\n\n\n\n\n\n140534553593504\n\nMulBackward0\n\n\n\n140534553593504->140531595572368\n\n\n\n\n\n140534553592976\n\nTBackward0\n\n\n\n140534553592976->140534553593504\n\n\n\n\n\n140534553593216\n\nTBackward0\n\n\n\n140534553593216->140534553592976\n\n\n\n\n\n140534553593552\n\nMmBackward0\n\n\n\n140534553593552->140534553593216\n\n\n\n\n\n140531595617904->140534553593552\n\n\n\n\n\n" + }, + "metadata" : {}, + "output_type" : "display_data" + } + ], + "source" : [ + "class MetaNet(nn.Module):\n", + " def __init__(self, dim):\n", + " super().__init__()\n", + " self.fc = nn.Linear(dim, 1, bias=True)\n", + "\n", + " def forward(self, x, meta_param):\n", + " return self.fc(x) + meta_param\n", + "\n", + "\n", + "dim = 5\n", + "batch_size = 2\n", + "net = MetaNet(dim)\n", + "\n", + "xs = torch.ones((batch_size, dim))\n", + "ys = torch.ones((batch_size, 1))\n", + "\n", + "optimizer = torchopt.MetaSGD(net, lr=1e-3)\n", + "meta_param = torch.tensor(1.0, requires_grad=True)\n", + "\n", + "# Set enable_visual\n", + "net_state_0 = torchopt.extract_state_dict(net, enable_visual=True, " + "visual_prefix='step0.')\n", + "\n", + "pred = net(xs, meta_param)\n", + "loss = F.mse_loss(pred, ys)\n", + "optimizer.step(loss)\n", + "\n", + "# Set enable_visual\n", + "net_state_1 = torchopt.extract_state_dict(net, enable_visual=True, " + "visual_prefix='step1.')\n", + "\n", + "pred = net(xs, meta_param)\n", + "loss = F.mse_loss(pred, torch.ones_like(pred))\n", + "\n", + "# Draw computation graph\n", + "display(\n", + " torchopt.visual.make_dot(\n", + " loss, [net_state_0, net_state_1, {'meta_param': meta_param, 'loss': loss}]\n", + " )\n", + ")" + ] + } + ], + "metadata" : { + "kernelspec" : + {"display_name" : "Python 3 (ipykernel)", "language" : "python", "name" : "python3"}, + "language_info" : { + "codemirror_mode" : {"name" : "ipython", "version" : 3}, + "file_extension" : ".py", + "mimetype" : "text/x-python", + "name" : "python", + "nbconvert_exporter" : "python", + "pygments_lexer" : "ipython3", + "version" : "3.9.15" + }, + "vscode" : { + "interpreter" : + {"hash" : "2a8cc1ff2cbc47027bf9993941710d9ab9175f14080903d9c7c432ee63d681da"} } - ], - "source": [ - "class MetaNet(nn.Module):\n", - " def __init__(self, dim):\n", - " super().__init__()\n", - " self.fc = nn.Linear(dim, 1, bias=True)\n", - "\n", - " def forward(self, x, meta_param):\n", - " return self.fc(x) + meta_param\n", - "\n", - "\n", - "dim = 5\n", - "batch_size = 2\n", - "net = MetaNet(dim)\n", - "\n", - "xs = torch.ones((batch_size, dim))\n", - "ys = torch.ones((batch_size, 1))\n", - "\n", - "optimizer = torchopt.MetaSGD(net, lr=1e-3)\n", - "meta_param = torch.tensor(1.0, requires_grad=True)\n", - "\n", - "# Set enable_visual\n", - "net_state_0 = torchopt.extract_state_dict(net, enable_visual=True, visual_prefix='step0.')\n", - "\n", - "pred = net(xs, meta_param)\n", - "loss = F.mse_loss(pred, ys)\n", - "optimizer.step(loss)\n", - "\n", - "# Set enable_visual\n", - "net_state_1 = torchopt.extract_state_dict(net, enable_visual=True, visual_prefix='step1.')\n", - "\n", - "pred = net(xs, meta_param)\n", - "loss = F.mse_loss(pred, torch.ones_like(pred))\n", - "\n", - "# Draw 
computation graph\n", - "display(\n", - " torchopt.visual.make_dot(\n", - " loss, [net_state_0, net_state_1, {'meta_param': meta_param, 'loss': loss}]\n", - " )\n", - ")" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.9.15" }, - "vscode": { - "interpreter": { - "hash": "2a8cc1ff2cbc47027bf9993941710d9ab9175f14080903d9c7c432ee63d681da" - } - } - }, - "nbformat": 4, - "nbformat_minor": 4 + "nbformat" : 4, + "nbformat_minor" : 4 } diff --git a/tutorials/3_Meta_Optimizer.ipynb b/tutorials/3_Meta_Optimizer.ipynb index 69be77ed..23d7a575 100644 --- a/tutorials/3_Meta_Optimizer.ipynb +++ b/tutorials/3_Meta_Optimizer.ipynb @@ -1,713 +1,1584 @@ { - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# TorchOpt as Meta-Optimizer" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "[](https://colab.research.google.com/github/metaopt/torchopt/blob/main/tutorials/3_Meta_Optimizer.ipynb)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "In this tutorial, we will show how to treat TorchOpt as a differentiable optimizer with traditional PyTorch optimization API. In addition, we also provide many other API for easy meta-learning algorithm implementations." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 1. Basic API for Differentiable Optimizer\n", - "\n", - "`MetaOptimizer` is the main class for our differentiable optimizer. Combined with the functional optimizer `torchopt.sgd` and `torchopt.adam` mentioned in the tutorial 1, we can define our high-level API `torchopt.MetaSGD` and `torchopt.MetaAdam`. We will discuss how this combination happens with `torchopt.chain` in Section 3. Let us consider the problem below." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Assume a tensor $x$ is a meta-parameter and $a$ is a normal parameters (such as network parameters). We have inner loss $\\mathcal{L}^{\\textrm{in}} = a_0 \\cdot x^2$ and we update $a$ use the gradient $\\frac{\\partial \\mathcal{L}^{\\textrm{in}}}{\\partial a_0} = x^2$ and $a_1 = a_0 - \\eta \\, \\frac{\\partial \\mathcal{L}^{\\textrm{in}}}{\\partial a_0} = a_0 - \\eta \\, x^2$. Then we compute the outer loss $\\mathcal{L}^{\\textrm{out}} = a_1 \\cdot x^2$. So the gradient of outer loss to $x$ would be:\n", - "\n", - "$$\n", - "\\begin{split}\n", - " \\frac{\\partial \\mathcal{L}^{\\textrm{out}}}{\\partial x}\n", - " & = \\frac{\\partial (a_1 \\cdot x^2)}{\\partial x} \\\\\n", - " & = \\frac{\\partial a_1}{\\partial x} \\cdot x^2 + a_1 \\cdot \\frac{\\partial (x^2)}{\\partial x} \\\\\n", - " & = \\frac{\\partial (a_0 - \\eta \\, x^2)}{\\partial x} \\cdot x^2 + (a_0 - \\eta \\, x^2) \\cdot 2 x \\\\\n", - " & = (- \\eta \\cdot 2 x) \\cdot x^2 + (a_0 - \\eta \\, x^2) \\cdot 2 x \\\\\n", - " & = - 4 \\, \\eta \\, x^3 + 2 \\, a_0 \\, x\n", - "\\end{split}\n", - "$$" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Given the analytical solution above. Let's try to verify it with TorchOpt. Define the net work first." 
- ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "from IPython.display import display\n", - "\n", - "import torch\n", - "import torch.nn as nn\n", - "import torch.nn.functional as F\n", - "\n", - "import torchopt\n", - "\n", - "\n", - "class Net(nn.Module):\n", - " def __init__(self):\n", - " super().__init__()\n", - " self.a = nn.Parameter(torch.tensor(1.0), requires_grad=True)\n", - "\n", - " def forward(self, x):\n", - " return self.a * (x**2)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Then we declare the network (parameterized by `a`) and the meta-parameter `x`. Do not forget to set flag `requires_grad=True` for `x`." - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [], - "source": [ - "net = Net()\n", - "x = nn.Parameter(torch.tensor(2.0), requires_grad=True)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Next we declare the meta-optimizer. Here we show two equivalent ways of defining the meta-optimizer. " - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [], - "source": [ - "# Low-level API\n", - "optim = torchopt.MetaOptimizer(net, torchopt.sgd(lr=1.0))\n", - "\n", - "# High-level API\n", - "optim = torchopt.MetaSGD(net, lr=1.0)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The meta-optimizer takes the network as input and use method `step` to update the network (parameterized by `a`). Finally, we show how a bi-level process works." - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "x.grad = tensor(-28.)\n" - ] - } - ], - "source": [ - "inner_loss = net(x)\n", - "optim.step(inner_loss)\n", - "\n", - "outer_loss = net(x)\n", - "outer_loss.backward()\n", - "# x.grad = - 4 * lr * x^3 + 2 * a_0 * x\n", - "# = - 4 * 1 * 2^3 + 2 * 1 * 2\n", - "# = -32 + 4\n", - "# = -28\n", - "print(f'x.grad = {x.grad!r}')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### 1.1 Track the Gradient of Momentum\n", - "\n", - "Note that most modern optimizers involve moment term in the gradient update (basically only SGD with `momentum=0` does not involve). We provide an option for user to choose whether to also track the meta-gradient through moment term. The default option is `moment_requires_grad=True`." 
- ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "- When you do not track the meta-gradient through moment (`moment_requires_grad=False`)" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n" - ] - }, - { - "data": { - "image/svg+xml": "\n\n\n\n\n\n%3\n\n\n\n140447553047184\n\nouter_loss\n()\n\n\n\n140447553041216\n\nMseLossBackward0\n\n\n\n140447553041216->140447553047184\n\n\n\n\n\n140447553042896\n\nMulBackward0\n\n\n\n140447553042896->140447553041216\n\n\n\n\n\n140447553019088\n\nAddBackward0\nstep1.a\n()\n\n\n\n140447553019088->140447553042896\n\n\n\n\n\n140447553041072\n\nAccumulateGrad\n\n\n\n140447553041072->140447553019088\n\n\n\n\n\n140447553043664\n\nMulBackward0\n\n\n\n140447553041072->140447553043664\n\n\n\n\n\n140447553045344\n\nstep0.a\n()\n\n\n\n140447553045344->140447553041072\n\n\n\n\n\n140447553041120\n\nMulBackward0\n\n\n\n140447553041120->140447553019088\n\n\n\n\n\n140447553043040\n\nDivBackward0\n\n\n\n140447553043040->140447553041120\n\n\n\n\n\n140447553043184\n\nDivBackward0\n\n\n\n140447553043184->140447553043040\n\n\n\n\n\n140447553043328\n\nAddBackward0\n\n\n\n140447553043328->140447553043184\n\n\n\n\n\n140447553043424\n\nMulBackward0\n\n\n\n140447553043424->140447553043328\n\n\n\n\n\n140447553043856\n\nAddcmulBackward0\n\n\n\n140447553043424->140447553043856\n\n\n\n\n\n140447553043424->140447553043856\n\n\n\n\n\n140447553043520\n\nMseLossBackwardBackward0\n\n\n\n140447553043520->140447553043424\n\n\n\n\n\n140447553043664->140447553043520\n\n\n\n\n\n140447553043472\n\nPowBackward0\n\n\n\n140447553043472->140447553043424\n\n\n\n\n\n140447553043472->140447553043664\n\n\n\n\n\n140447553043808\n\nAccumulateGrad\n\n\n\n140447553043808->140447553043472\n\n\n\n\n\n140447553041264\n\nPowBackward0\n\n\n\n140447553043808->140447553041264\n\n\n\n\n\n140447553045584\n\nx\n()\n\n\n\n140447553045584->140447553043808\n\n\n\n\n\n140447553043136\n\nAddBackward0\n\n\n\n140447553043136->140447553043040\n\n\n\n\n\n140447553043232\n\nSqrtBackward0\n\n\n\n140447553043232->140447553043136\n\n\n\n\n\n140447553043760\n\nAddBackward0\n\n\n\n140447553043760->140447553043232\n\n\n\n\n\n140447553043904\n\nDivBackward0\n\n\n\n140447553043904->140447553043760\n\n\n\n\n\n140447553043856->140447553043904\n\n\n\n\n\n140447553041264->140447553042896\n\n\n\n\n\n" - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "net = Net()\n", - "x = nn.Parameter(torch.tensor(2.0), requires_grad=True)\n", - "y = torch.tensor(1.0)\n", - "\n", - "optim = torchopt.MetaAdam(net, lr=1.0, moment_requires_grad=False)\n", - "\n", - "net_state_0 = torchopt.extract_state_dict(net, enable_visual=True, visual_prefix='step0.')\n", - "inner_loss = F.mse_loss(net(x), y)\n", - "optim.step(inner_loss)\n", - "net_state_1 = torchopt.extract_state_dict(net, enable_visual=True, visual_prefix='step1.')\n", - "\n", - "outer_loss = F.mse_loss(net(x), y)\n", - "display(\n", - " torchopt.visual.make_dot(\n", - " outer_loss, params=[net_state_0, net_state_1, {'x': x, 'outer_loss': outer_loss}]\n", - " )\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "- When you track the meta-gradient through moment (`moment_requires_grad=True`, default for `torchopt.MetaAdam`)" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n" - ] - }, - { - 
"data": { - "image/svg+xml": "\n\n\n\n\n\n%3\n\n\n\n140447553148704\n\nouter_loss\n()\n\n\n\n140447553041024\n\nMseLossBackward0\n\n\n\n140447553041024->140447553148704\n\n\n\n\n\n140447553043424\n\nMulBackward0\n\n\n\n140447553043424->140447553041024\n\n\n\n\n\n140450536407152\n\nAddBackward0\nstep1.a\n()\n\n\n\n140450536407152->140447553043424\n\n\n\n\n\n140447553041264\n\nAccumulateGrad\n\n\n\n140447553041264->140450536407152\n\n\n\n\n\n140447553019232\n\nMulBackward0\n\n\n\n140447553041264->140447553019232\n\n\n\n\n\n140447553148064\n\nstep0.a\n()\n\n\n\n140447553148064->140447553041264\n\n\n\n\n\n140447553041216\n\nMulBackward0\n\n\n\n140447553041216->140450536407152\n\n\n\n\n\n140447553041312\n\nDivBackward0\n\n\n\n140447553041312->140447553041216\n\n\n\n\n\n140447553041408\n\nDivBackward0\n\n\n\n140447553041408->140447553041312\n\n\n\n\n\n140447553043376\n\nAddBackward0\n\n\n\n140447553043376->140447553041408\n\n\n\n\n\n140447553041168\n\nMulBackward0\n\n\n\n140447553041168->140447553043376\n\n\n\n\n\n140447553042272\n\nAccumulateGrad\n\n\n\n140447553042272->140447553041168\n\n\n\n\n\n140450290826352\n\n()\n\n\n\n140450290826352->140447553042272\n\n\n\n\n\n140447553044432\n\nMulBackward0\n\n\n\n140447553044432->140447553043376\n\n\n\n\n\n140447553018320\n\nAddcmulBackward0\n\n\n\n140447553044432->140447553018320\n\n\n\n\n\n140447553044432->140447553018320\n\n\n\n\n\n140447553042080\n\nMseLossBackwardBackward0\n\n\n\n140447553042080->140447553044432\n\n\n\n\n\n140447553019232->140447553042080\n\n\n\n\n\n140447553019088\n\nPowBackward0\n\n\n\n140447553019088->140447553044432\n\n\n\n\n\n140447553019088->140447553019232\n\n\n\n\n\n140447553018464\n\nAccumulateGrad\n\n\n\n140447553018464->140447553019088\n\n\n\n\n\n140447553043328\n\nPowBackward0\n\n\n\n140447553018464->140447553043328\n\n\n\n\n\n140447553148144\n\nx\n()\n\n\n\n140447553148144->140447553018464\n\n\n\n\n\n140447553041456\n\nAddBackward0\n\n\n\n140447553041456->140447553041312\n\n\n\n\n\n140447553041360\n\nSqrtBackward0\n\n\n\n140447553041360->140447553041456\n\n\n\n\n\n140447553015920\n\nAddBackward0\n\n\n\n140447553015920->140447553041360\n\n\n\n\n\n140447553018560\n\nDivBackward0\n\n\n\n140447553018560->140447553015920\n\n\n\n\n\n140447553018320->140447553018560\n\n\n\n\n\n140447553018272\n\nMulBackward0\n\n\n\n140447553018272->140447553018320\n\n\n\n\n\n140447553018944\n\nAccumulateGrad\n\n\n\n140447553018944->140447553018272\n\n\n\n\n\n140450290824272\n\n()\n\n\n\n140450290824272->140447553018944\n\n\n\n\n\n140447553043328->140447553043424\n\n\n\n\n\n" - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "net = Net()\n", - "x = nn.Parameter(torch.tensor(2.0), requires_grad=True)\n", - "y = torch.tensor(1.0)\n", - "\n", - "optim = torchopt.MetaAdam(net, lr=1.0, moment_requires_grad=True)\n", - "\n", - "net_state_0 = torchopt.extract_state_dict(net, enable_visual=True, visual_prefix='step0.')\n", - "inner_loss = F.mse_loss(net(x), y)\n", - "optim.step(inner_loss)\n", - "net_state_1 = torchopt.extract_state_dict(net, enable_visual=True, visual_prefix='step1.')\n", - "\n", - "outer_loss = F.mse_loss(net(x), y)\n", - "display(\n", - " torchopt.visual.make_dot(\n", - " outer_loss, params=[net_state_0, net_state_1, {'x': x, 'outer_loss': outer_loss}]\n", - " )\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We can see that the additional moment terms are added into the computational graph when we set `moment_requires_grad=True`." 
- ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 2. Extract and Recover" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### 2.1 Basic API\n", - "\n", - "We observe that how to reinitialize the inner-loop parameter in a new bi-level process vary in different meta-learning algorithms. For instance, in algorithm like Model-Agnostic Meta-Learning (MAML) ([arXiv:1703.03400](https://arxiv.org/abs/1703.03400)), every time a new task comes, we need to reset the parameters to the initial ones. In other cases such as Meta-Gradient Reinforcement Learning (MGRL) ([arXiv:1805.09801](https://arxiv.org/abs/1805.09801)), the inner-loop network parameter just inherit previous updated parameter to continue the new bi-level process.\n", - "\n", - "We provide the `torchopt.extract_state_dict` and `torchopt.recover_state_dict` functions to extract and restore the state of network and optimizer. By default, the extracted state dictionary is a reference (this design is for accumulating gradient of multi-task batch training, MAML for example). You can also set `by='copy'` to extract the copy of the state dictionary or set `by='deepcopy'` to have a detached copy." - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "a = tensor(-1.0000, grad_fn=)\n", - "a = tensor(-1.0000, grad_fn=)\n" - ] - } - ], - "source": [ - "net = Net()\n", - "x = nn.Parameter(torch.tensor(2.0), requires_grad=True)\n", - "\n", - "optim = torchopt.MetaAdam(net, lr=1.0)\n", - "\n", - "# Get the reference of state dictionary\n", - "init_net_state = torchopt.extract_state_dict(net, by='reference')\n", - "init_optim_state = torchopt.extract_state_dict(optim, by='reference')\n", - "# If set `detach_buffers=True`, the parameters are referenced as references while buffers are detached copies\n", - "init_net_state = torchopt.extract_state_dict(net, by='reference', detach_buffers=True)\n", - "\n", - "# Set `copy` to get the copy of the state dictionary\n", - "init_net_state_copy = torchopt.extract_state_dict(net, by='copy')\n", - "init_optim_state_copy = torchopt.extract_state_dict(optim, by='copy')\n", - "\n", - "# Set `deepcopy` to get the detached copy of state dictionary\n", - "init_net_state_deepcopy = torchopt.extract_state_dict(net, by='deepcopy')\n", - "init_optim_state_deepcopy = torchopt.extract_state_dict(optim, by='deepcopy')\n", - "\n", - "# Conduct 2 inner-loop optimization\n", - "for i in range(2):\n", - " inner_loss = net(x)\n", - " optim.step(inner_loss)\n", - "\n", - "print(f'a = {net.a!r}')\n", - "\n", - "# Recover and reconduct 2 inner-loop optimization\n", - "torchopt.recover_state_dict(net, init_net_state)\n", - "torchopt.recover_state_dict(optim, init_optim_state)\n", - "\n", - "for i in range(2):\n", - " inner_loss = net(x)\n", - " optim.step(inner_loss)\n", - "\n", - "print(f'a = {net.a!r}') # the same result" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### 2.2 Multi-task Example with `extract_state_dict` and `recover_state_dict`\n", - "\n", - "Let's move to another more complex setting. Meta-Learning algorithms always fix network on several different tasks and accumulate outer loss of each task to the meta-gradient.\n", - "\n", - "Assume $x$ is a meta-parameter and $a$ is a normal parameter. We firstly update $a$ use inner loss $\\mathcal{L}_1^{\\textrm{in}} = a_0 \\cdot x^2$ to $a_1$. 
Then we use $a_1$ to compute the outer loss $\\mathcal{L}_1^{\\textrm{out}} = a_1 \\cdot x^2$ and backpropagate it. Then we use $a_0$ to compute the inner loss $\\mathcal{L}_2^{\\textrm{in}} = a_0 \\cdot x$ and update $a_0$ to $a_2 = a_0 - \\eta \\, \\frac{\\partial \\mathcal{L}_2^{\\textrm{in}}}{\\partial a_0} = a_0 - \\eta \\, x$. Then we compute outer loss $\\mathcal{L}_2^{\\textrm{out}} = a_2 \\cdot x$ and backpropagate it. So the accumulated meta-gradient would be:\n", - "\n", - "$$\n", - "\\begin{split}\n", - " \\frac{\\partial \\mathcal{L}_1^{\\textrm{out}}}{\\partial x} + \\frac{\\partial \\mathcal{L}_2^{\\textrm{out}}}{\\partial x}\n", - " & = (- 4 \\, \\eta \\, x^3 + 2 \\, a_0 \\, x) + \\frac{\\partial (a_2 \\cdot x)}{\\partial x} \\\\\n", - " & = (- 4 \\, \\eta \\, x^3 + 2 \\, a_0 \\, x) + (\\frac{\\partial a_2}{\\partial x} \\cdot x + a_2) \\\\\n", - " & = (- 4 \\, \\eta \\, x^3 + 2 \\, a_0 \\, x) + [\\frac{\\partial (a_0 - \\eta \\, x)}{\\partial x} \\cdot x + (a_0 - \\eta \\, x)] \\\\\n", - " & = (- 4 \\, \\eta \\, x^3 + 2 \\, a_0 \\, x) + [(- \\eta) \\cdot x + (a_0 - \\eta \\, x)] \\\\\n", - " & = (- 4 \\, \\eta \\, x^3 + 2 \\, a_0 \\, x) + (- 2 \\, \\eta \\, x + a_0)\n", - "\\end{split}\n", - "$$" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Let's define the network and variables first." - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [], - "source": [ - "class Net2Tasks(nn.Module):\n", - " def __init__(self):\n", - " super().__init__()\n", - " self.a = nn.Parameter(torch.tensor(1.0), requires_grad=True)\n", - "\n", - " def task1(self, x):\n", - " return self.a * x**2\n", - "\n", - " def task2(self, x):\n", - " return self.a * x\n", - "\n", - "\n", - "net = Net2Tasks()\n", - "x = nn.Parameter(torch.tensor(2.0), requires_grad=True)\n", - "\n", - "optim = torchopt.MetaSGD(net, lr=1.0)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Once we call `step` method of `MetaOptimizer`, the parameters of the network would be changed. We should use `torchopt.extract_state_dict` to extract state and use `torchopt.recover_state_dict` to recover the state. Note that if we use optimizers that have momentum buffers, we should also extract and recover them, vanilla SGD does not have momentum buffers so code `init_optim_state = torchopt.extract_state_dict(optim)` and `torchopt.recover_state_dict(optim, init_optim_state)` have no effect." 
- ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "init_optim_state = ((EmptyState(),),)\n", - "Task 1: x.grad = tensor(-28.)\n", - "Accumulated: x.grad = tensor(-31.)\n" - ] - } - ], - "source": [ - "# Get the reference of state dictionary\n", - "init_net_state = torchopt.extract_state_dict(net, by='reference')\n", - "init_optim_state = torchopt.extract_state_dict(optim, by='reference')\n", - "# The `state_dict` is empty for vanilla SGD optimizer\n", - "print(f'init_optim_state = {init_optim_state!r}')\n", - "\n", - "inner_loss_1 = net.task1(x)\n", - "optim.step(inner_loss_1)\n", - "outer_loss_1 = net.task1(x)\n", - "outer_loss_1.backward()\n", - "print(f'Task 1: x.grad = {x.grad!r}')\n", - "\n", - "torchopt.recover_state_dict(net, init_net_state)\n", - "torchopt.recover_state_dict(optim, init_optim_state)\n", - "inner_loss_2 = net.task2(x)\n", - "optim.step(inner_loss_2)\n", - "outer_loss_2 = net.task2(x)\n", - "outer_loss_2.backward()\n", - "\n", - "# `extract_state_dict`` extracts the reference so gradient accumulates\n", - "# x.grad = (- 4 * lr * x^3 + 2 * a_0 * x) + (- 2 * lr * x + a_0)\n", - "# = (- 4 * 1 * 2^3 + 2 * 1 * 2) + (- 2 * 1 * 2 + 1)\n", - "# = -28 - 3\n", - "# = -31\n", - "print(f'Accumulated: x.grad = {x.grad!r}')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 3. Gradient Transformation in `MetaOptimizer`" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We can also use some gradient normalization tricks in our `MetaOptimizer`. In fact `MetaOptimizer` decedents like `MetaSGD` are specializations of `MetaOptimizer`. Specifically, `MetaSGD(net, lr=1.)` is `MetaOptimizer(net, alias.sgd(lr=1., moment_requires_grad=True))`, where flag `moment_requires_grad=True` means the momentums are created with flag `requires_grad=True` so the momentums will also be the part of the computation graph.\n", - "\n", - "In the designing of TorchOpt, we treat these functions as derivations of `combine.chain`. 
So we can build our own chain like `combine.chain(clip.clip_grad_norm(max_norm=1.), sgd(lr=1., requires_grad=True))` to clip the gradient and update parameters using `sgd`.\n", - "\n", - "$$\n", - "\\begin{aligned}\n", - " \\frac{\\partial \\mathcal{L}^{\\textrm{out}}}{\\partial x}\n", - " & = \\frac{\\partial (a_1 \\cdot x^2)}{\\partial x} \\\\\n", - " & = \\frac{\\partial a_1}{\\partial x} \\cdot x^2 + a_1 \\cdot \\frac{\\partial (x^2)}{\\partial x} \\\\\n", - " & = \\frac{\\partial (a_0 - \\eta \\, g)}{\\partial x} \\cdot x^2 + (a_0 - \\eta \\, g) \\cdot 2 x & \\qquad (g \\propto \\frac{\\partial \\mathcal{L}^{\\textrm{in}}}{\\partial a_0} = x^2, \\ {\\lVert g \\rVert}_2 \\le G_{\\max}) \\\\\n", - " & = \\frac{\\partial (a_0 - \\eta \\, \\beta^{-1} \\, x^2)}{\\partial x} \\cdot x^2 + (a_0 - \\eta \\, \\beta^{-1} \\, x^2) \\cdot 2 x & \\qquad (g = \\beta^{-1} \\, x^2, \\ \\beta > 0, \\ {\\lVert g \\rVert}_2 \\le G_{\\max}) \\\\\n", - " & = (- \\beta^{-1} \\, \\eta \\cdot 2 x) \\cdot x^2 + (a_0 - \\beta^{-1} \\, \\eta \\, x^2) \\cdot 2 x \\\\\n", - " & = - 4 \\, \\beta^{-1} \\, \\eta \\, x^3 + 2 \\, a_0 \\, x\n", - "\\end{aligned}\n", - "$$" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "x.grad = tensor(-12.0000)\n" - ] - } - ], - "source": [ - "net = Net()\n", - "x = nn.Parameter(torch.tensor(2.0), requires_grad=True)\n", - "\n", - "optim_impl = torchopt.combine.chain(\n", - " torchopt.clip.clip_grad_norm(max_norm=2.0),\n", - " torchopt.sgd(lr=1.0, moment_requires_grad=True),\n", - ")\n", - "optim = torchopt.MetaOptimizer(net, optim_impl)\n", - "\n", - "inner_loss = net(x)\n", - "optim.step(inner_loss)\n", - "\n", - "outer_loss = net(x)\n", - "outer_loss.backward()\n", - "# Since `max_norm` is 2 and the gradient is x^2, so the scale = x^2 / 2 = 2^2 / 2 = 2\n", - "# x.grad = - 4 * lr * x^3 / scale + 2 * a_0 * x\n", - "# = - 4 * 1 * 2^3 / 2 + 2 * 1 * 2\n", - "# = -16 + 4\n", - "# = -12\n", - "print(f'x.grad = {x.grad!r}')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 4. Learning Rate Scheduler\n", - "\n", - "TorchOpt also provides implementation of learning rate scheduler, which can be used as:" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [], - "source": [ - "functional_adam = torchopt.adam(\n", - " lr=torchopt.schedule.linear_schedule(\n", - " init_value=1e-3, end_value=1e-4, transition_steps=10000, transition_begin=2000\n", - " )\n", - ")\n", - "\n", - "adam = torchopt.Adam(\n", - " net.parameters(),\n", - " lr=torchopt.schedule.linear_schedule(\n", - " init_value=1e-3, end_value=1e-4, transition_steps=10000, transition_begin=2000\n", - " ),\n", - ")\n", - "\n", - "meta_adam = torchopt.MetaAdam(\n", - " net,\n", - " lr=torchopt.schedule.linear_schedule(\n", - " init_value=1e-3, end_value=1e-4, transition_steps=10000, transition_begin=2000\n", - " ),\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 5. Accelerated Optimizer\n", - "\n", - "Users can use accelerated optimizer by setting the `use_accelerated_op=True`. Currently we only support the Adam optimizer." 
- ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Check whether the `accelerated_op` is available:" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "True\n" - ] - } - ], - "source": [ - "torchopt.accelerated_op_available(torch.device('cpu'))" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "True\n" - ] + "cells" : [ + {"cell_type" : "markdown", "metadata" : {}, "source" : ["# TorchOpt as Meta-Optimizer"]}, + { + "cell_type" : "markdown", + "metadata" : {}, + "source" : ["[](https://" + "colab.research.google.com/github/metaopt/torchopt/blob/main/tutorials/" + "3_Meta_Optimizer.ipynb)"] + }, + { + "cell_type" : "markdown", + "metadata" : {}, + "source" : ["In this tutorial, we will show how to treat TorchOpt as a differentiable " + "optimizer with traditional PyTorch optimization API. In addition, we also " + "provide many other API for easy meta-learning algorithm implementations."] + }, + { + "cell_type" : "markdown", + "metadata" : {}, + "source" : [ + "## 1. Basic API for Differentiable Optimizer\n", + "\n", + "`MetaOptimizer` is the main class for our differentiable optimizer. Combined with the " + "functional optimizer `torchopt.sgd` and `torchopt.adam` mentioned in the tutorial 1, we " + "can define our high-level API `torchopt.MetaSGD` and `torchopt.MetaAdam`. We will discuss " + "how this combination happens with `torchopt.chain` in Section 3. Let us consider the " + "problem below." + ] + }, + { + "cell_type" : "markdown", + "metadata" : {}, + "source" : [ + "Assume a tensor $x$ is a meta-parameter and $a$ is a normal parameters (such as network " + "parameters). We have inner loss $\\mathcal{L}^{\\textrm{in}} = a_0 \\cdot x^2$ and we " + "update $a$ use the gradient $\\frac{\\partial \\mathcal{L}^{\\textrm{in}}}{\\partial a_0} " + "= x^2$ and $a_1 = a_0 - \\eta \\, \\frac{\\partial \\mathcal{L}^{\\textrm{in}}}{\\partial " + "a_0} = a_0 - \\eta \\, x^2$. Then we compute the outer loss $\\mathcal{L}^{\\textrm{out}} " + "= a_1 \\cdot x^2$. So the gradient of outer loss to $x$ would be:\n", + "\n", + "$$\n", + "\\begin{split}\n", + " \\frac{\\partial \\mathcal{L}^{\\textrm{out}}}{\\partial x}\n", + " & = \\frac{\\partial (a_1 \\cdot x^2)}{\\partial x} \\\\\n", + " & = \\frac{\\partial a_1}{\\partial x} \\cdot x^2 + a_1 \\cdot \\frac{\\partial " + "(x^2)}{\\partial x} \\\\\n", + " & = \\frac{\\partial (a_0 - \\eta \\, x^2)}{\\partial x} \\cdot x^2 + (a_0 - \\eta " + "\\, x^2) \\cdot 2 x \\\\\n", + " & = (- \\eta \\cdot 2 x) \\cdot x^2 + (a_0 - \\eta \\, x^2) \\cdot 2 x \\\\\n", + " & = - 4 \\, \\eta \\, x^3 + 2 \\, a_0 \\, x\n", + "\\end{split}\n", + "$$" + ] + }, + { + "cell_type" : "markdown", + "metadata" : {}, + "source" : ["Given the analytical solution above. Let's try to verify it with TorchOpt. 
" + "Define the net work first."] + }, + { + "cell_type" : "code", + "execution_count" : 1, + "metadata" : {}, + "outputs" : [], + "source" : [ + "from IPython.display import display\n", + "\n", + "import torch\n", + "import torch.nn as nn\n", + "import torch.nn.functional as F\n", + "\n", + "import torchopt\n", + "\n", + "\n", + "class Net(nn.Module):\n", + " def __init__(self):\n", + " super().__init__()\n", + " self.a = nn.Parameter(torch.tensor(1.0), requires_grad=True)\n", + "\n", + " def forward(self, x):\n", + " return self.a * (x**2)" + ] + }, + { + "cell_type" : "markdown", + "metadata" : {}, + "source" : ["Then we declare the network (parameterized by `a`) and the meta-parameter `x`. " + "Do not forget to set flag `requires_grad=True` for `x`."] + }, + { + "cell_type" : "code", + "execution_count" : 2, + "metadata" : {}, + "outputs" : [], + "source" : [ "net = Net()\n", "x = nn.Parameter(torch.tensor(2.0), requires_grad=True)" ] + }, + { + "cell_type" : "markdown", + "metadata" : {}, + "source" : ["Next we declare the meta-optimizer. Here we show two equivalent ways of " + "defining the meta-optimizer. "] + }, + { + "cell_type" : "code", + "execution_count" : 3, + "metadata" : {}, + "outputs" : [], + "source" : [ + "# Low-level API\n", + "optim = torchopt.MetaOptimizer(net, torchopt.sgd(lr=1.0))\n", + "\n", + "# High-level API\n", + "optim = torchopt.MetaSGD(net, lr=1.0)" + ] + }, + { + "cell_type" : "markdown", + "metadata" : {}, + "source" : + ["The meta-optimizer takes the network as input and use method `step` to update the " + "network (parameterized by `a`). Finally, we show how a bi-level process works."] + }, + { + "cell_type" : "code", + "execution_count" : 4, + "metadata" : {}, + "outputs" : + [ {"name" : "stdout", "output_type" : "stream", "text" : ["x.grad = tensor(-28.)\n"]} ], + "source" : [ + "inner_loss = net(x)\n", + "optim.step(inner_loss)\n", + "\n", + "outer_loss = net(x)\n", + "outer_loss.backward()\n", + "# x.grad = - 4 * lr * x^3 + 2 * a_0 * x\n", + "# = - 4 * 1 * 2^3 + 2 * 1 * 2\n", + "# = -32 + 4\n", + "# = -28\n", + "print(f'x.grad = {x.grad!r}')" + ] + }, + { + "cell_type" : "markdown", + "metadata" : {}, + "source" : [ + "### 1.1 Track the Gradient of Momentum\n", + "\n", + "Note that most modern optimizers involve moment term in the gradient update (basically " + "only SGD with `momentum=0` does not involve). We provide an option for user to choose " + "whether to also track the meta-gradient through moment term. The default option is " + "`moment_requires_grad=True`." 
+ ] + }, + { + "cell_type" : "markdown", + "metadata" : {}, + "source" : ["- When you do not track the meta-gradient through moment " + "(`moment_requires_grad=False`)"] + }, + { + "cell_type" : "code", + "execution_count" : 5, + "metadata" : {}, + "outputs" : [ + { + "name" : "stdout", + "output_type" : "stream", + "text" : ["\n"] + }, + { + "data" : { + "image/svg+xml" : + "\n\n\n\n\n\n%3\n\n\n\n140447553047184\n\nouter_loss\n()\n\n\n\n140447553041216\n\nMseLossBackward0\n\n\n\n140447553041216->140447553047184\n\n\n\n\n\n140447553042896\n\nMulBackward0\n\n\n\n140447553042896->140447553041216\n\n\n\n\n\n140447553019088\n\nAddBackward0\nstep1.a\n()\n\n\n\n140447553019088->140447553042896\n\n\n\n\n\n140447553041072\n\nAccumulateGrad\n\n\n\n140447553041072->140447553019088\n\n\n\n\n\n140447553043664\n\nMulBackward0\n\n\n\n140447553041072->140447553043664\n\n\n\n\n\n140447553045344\n\nstep0.a\n()\n\n\n\n140447553045344->140447553041072\n\n\n\n\n\n140447553041120\n\nMulBackward0\n\n\n\n140447553041120->140447553019088\n\n\n\n\n\n140447553043040\n\nDivBackward0\n\n\n\n140447553043040->140447553041120\n\n\n\n\n\n140447553043184\n\nDivBackward0\n\n\n\n140447553043184->140447553043040\n\n\n\n\n\n140447553043328\n\nAddBackward0\n\n\n\n140447553043328->140447553043184\n\n\n\n\n\n140447553043424\n\nMulBackward0\n\n\n\n140447553043424->140447553043328\n\n\n\n\n\n140447553043856\n\nAddcmulBackward0\n\n\n\n140447553043424->140447553043856\n\n\n\n\n\n140447553043424->140447553043856\n\n\n\n\n\n140447553043520\n\nMseLossBackwardBackward0\n\n\n\n140447553043520->140447553043424\n\n\n\n\n\n140447553043664->140447553043520\n\n\n\n\n\n140447553043472\n\nPowBackward0\n\n\n\n140447553043472->140447553043424\n\n\n\n\n\n140447553043472->140447553043664\n\n\n\n\n\n140447553043808\n\nAccumulateGrad\n\n\n\n140447553043808->140447553043472\n\n\n\n\n\n140447553041264\n\nPowBackward0\n\n\n\n140447553043808->140447553041264\n\n\n\n\n\n140447553045584\n\nx\n()\n\n\n\n140447553045584->140447553043808\n\n\n\n\n\n140447553043136\n\nAddBackward0\n\n\n\n140447553043136->140447553043040\n\n\n\n\n\n140447553043232\n\nSqrtBackward0\n\n\n\n140447553043232->140447553043136\n\n\n\n\n\n140447553043760\n\nAddBackward0\n\n\n\n140447553043760->140447553043232\n\n\n\n\n\n140447553043904\n\nDivBackward0\n\n\n\n140447553043904->140447553043760\n\n\n\n\n\n140447553043856->140447553043904\n\n\n\n\n\n140447553041264->140447553042896\n\n\n\n\n\n" + }, + "metadata" : {}, + "output_type" : "display_data" + } + ], + "source" : [ + "net = Net()\n", + "x = nn.Parameter(torch.tensor(2.0), requires_grad=True)\n", + "y = torch.tensor(1.0)\n", + "\n", + "optim = torchopt.MetaAdam(net, lr=1.0, moment_requires_grad=False)\n", + "\n", + "net_state_0 = torchopt.extract_state_dict(net, enable_visual=True, " + "visual_prefix='step0.')\n", + "inner_loss = F.mse_loss(net(x), y)\n", + "optim.step(inner_loss)\n", + "net_state_1 = torchopt.extract_state_dict(net, enable_visual=True, " + "visual_prefix='step1.')\n", + "\n", + "outer_loss = F.mse_loss(net(x), y)\n", + "display(\n", + " torchopt.visual.make_dot(\n", + " outer_loss, params=[net_state_0, net_state_1, {'x': x, 'outer_loss': " + "outer_loss}]\n", + " )\n", + ")" + ] + }, + { + "cell_type" : "markdown", + "metadata" : {}, + "source" : ["- When you track the meta-gradient through moment (`moment_requires_grad=True`, " + "default for `torchopt.MetaAdam`)"] + }, + { + "cell_type" : "code", + "execution_count" : 6, + "metadata" : {}, + "outputs" : [ + { + "name" : "stdout", + "output_type" : "stream", + 
"text" : ["\n"] + }, + { + "data" : { + "image/svg+xml" : + "\n\n\n\n\n\n%3\n\n\n\n140447553148704\n\nouter_loss\n()\n\n\n\n140447553041024\n\nMseLossBackward0\n\n\n\n140447553041024->140447553148704\n\n\n\n\n\n140447553043424\n\nMulBackward0\n\n\n\n140447553043424->140447553041024\n\n\n\n\n\n140450536407152\n\nAddBackward0\nstep1.a\n()\n\n\n\n140450536407152->140447553043424\n\n\n\n\n\n140447553041264\n\nAccumulateGrad\n\n\n\n140447553041264->140450536407152\n\n\n\n\n\n140447553019232\n\nMulBackward0\n\n\n\n140447553041264->140447553019232\n\n\n\n\n\n140447553148064\n\nstep0.a\n()\n\n\n\n140447553148064->140447553041264\n\n\n\n\n\n140447553041216\n\nMulBackward0\n\n\n\n140447553041216->140450536407152\n\n\n\n\n\n140447553041312\n\nDivBackward0\n\n\n\n140447553041312->140447553041216\n\n\n\n\n\n140447553041408\n\nDivBackward0\n\n\n\n140447553041408->140447553041312\n\n\n\n\n\n140447553043376\n\nAddBackward0\n\n\n\n140447553043376->140447553041408\n\n\n\n\n\n140447553041168\n\nMulBackward0\n\n\n\n140447553041168->140447553043376\n\n\n\n\n\n140447553042272\n\nAccumulateGrad\n\n\n\n140447553042272->140447553041168\n\n\n\n\n\n140450290826352\n\n()\n\n\n\n140450290826352->140447553042272\n\n\n\n\n\n140447553044432\n\nMulBackward0\n\n\n\n140447553044432->140447553043376\n\n\n\n\n\n140447553018320\n\nAddcmulBackward0\n\n\n\n140447553044432->140447553018320\n\n\n\n\n\n140447553044432->140447553018320\n\n\n\n\n\n140447553042080\n\nMseLossBackwardBackward0\n\n\n\n140447553042080->140447553044432\n\n\n\n\n\n140447553019232->140447553042080\n\n\n\n\n\n140447553019088\n\nPowBackward0\n\n\n\n140447553019088->140447553044432\n\n\n\n\n\n140447553019088->140447553019232\n\n\n\n\n\n140447553018464\n\nAccumulateGrad\n\n\n\n140447553018464->140447553019088\n\n\n\n\n\n140447553043328\n\nPowBackward0\n\n\n\n140447553018464->140447553043328\n\n\n\n\n\n140447553148144\n\nx\n()\n\n\n\n140447553148144->140447553018464\n\n\n\n\n\n140447553041456\n\nAddBackward0\n\n\n\n140447553041456->140447553041312\n\n\n\n\n\n140447553041360\n\nSqrtBackward0\n\n\n\n140447553041360->140447553041456\n\n\n\n\n\n140447553015920\n\nAddBackward0\n\n\n\n140447553015920->140447553041360\n\n\n\n\n\n140447553018560\n\nDivBackward0\n\n\n\n140447553018560->140447553015920\n\n\n\n\n\n140447553018320->140447553018560\n\n\n\n\n\n140447553018272\n\nMulBackward0\n\n\n\n140447553018272->140447553018320\n\n\n\n\n\n140447553018944\n\nAccumulateGrad\n\n\n\n140447553018944->140447553018272\n\n\n\n\n\n140450290824272\n\n()\n\n\n\n140450290824272->140447553018944\n\n\n\n\n\n140447553043328->140447553043424\n\n\n\n\n\n" + }, + "metadata" : {}, + "output_type" : "display_data" + } + ], + "source" : [ + "net = Net()\n", + "x = nn.Parameter(torch.tensor(2.0), requires_grad=True)\n", + "y = torch.tensor(1.0)\n", + "\n", + "optim = torchopt.MetaAdam(net, lr=1.0, moment_requires_grad=True)\n", + "\n", + "net_state_0 = torchopt.extract_state_dict(net, enable_visual=True, " + "visual_prefix='step0.')\n", + "inner_loss = F.mse_loss(net(x), y)\n", + "optim.step(inner_loss)\n", + "net_state_1 = torchopt.extract_state_dict(net, enable_visual=True, " + "visual_prefix='step1.')\n", + "\n", + "outer_loss = F.mse_loss(net(x), y)\n", + "display(\n", + " torchopt.visual.make_dot(\n", + " outer_loss, params=[net_state_0, net_state_1, {'x': x, 'outer_loss': " + "outer_loss}]\n", + " )\n", + ")" + ] + }, + { + "cell_type" : "markdown", + "metadata" : {}, + "source" : ["We can see that the additional moment terms are added into the computational " + "graph when we set 
`moment_requires_grad=True`."] + }, + {"cell_type" : "markdown", "metadata" : {}, "source" : ["## 2. Extract and Recover"]}, + { + "cell_type" : "markdown", + "metadata" : {}, + "source" : [ + "### 2.1 Basic API\n", + "\n", + "We observe that how to reinitialize the inner-loop parameter in a new bi-level process " + "vary in different meta-learning algorithms. For instance, in algorithm like " + "Model-Agnostic Meta-Learning (MAML) " + "([arXiv:1703.03400](https://arxiv.org/abs/1703.03400)), every time a new task comes, we " + "need to reset the parameters to the initial ones. In other cases such as Meta-Gradient " + "Reinforcement Learning (MGRL) ([arXiv:1805.09801](https://arxiv.org/abs/1805.09801)), the " + "inner-loop network parameter just inherit previous updated parameter to continue the new " + "bi-level process.\n", + "\n", + "We provide the `torchopt.extract_state_dict` and `torchopt.recover_state_dict` functions " + "to extract and restore the state of network and optimizer. By default, the extracted " + "state dictionary is a reference (this design is for accumulating gradient of multi-task " + "batch training, MAML for example). You can also set `by='copy'` to extract the copy of " + "the state dictionary or set `by='deepcopy'` to have a detached copy." + ] + }, + { + "cell_type" : "code", + "execution_count" : 7, + "metadata" : {}, + "outputs" : [ { + "name" : "stdout", + "output_type" : "stream", + "text" : [ + "a = tensor(-1.0000, grad_fn=)\n", + "a = tensor(-1.0000, grad_fn=)\n" + ] + } ], + "source" : [ + "net = Net()\n", + "x = nn.Parameter(torch.tensor(2.0), requires_grad=True)\n", + "\n", + "optim = torchopt.MetaAdam(net, lr=1.0)\n", + "\n", + "# Get the reference of state dictionary\n", + "init_net_state = torchopt.extract_state_dict(net, by='reference')\n", + "init_optim_state = torchopt.extract_state_dict(optim, by='reference')\n", + "# If set `detach_buffers=True`, the parameters are referenced as references while buffers " + "are detached copies\n", + "init_net_state = torchopt.extract_state_dict(net, by='reference', detach_buffers=True)\n", + "\n", + "# Set `copy` to get the copy of the state dictionary\n", + "init_net_state_copy = torchopt.extract_state_dict(net, by='copy')\n", + "init_optim_state_copy = torchopt.extract_state_dict(optim, by='copy')\n", + "\n", + "# Set `deepcopy` to get the detached copy of state dictionary\n", + "init_net_state_deepcopy = torchopt.extract_state_dict(net, by='deepcopy')\n", + "init_optim_state_deepcopy = torchopt.extract_state_dict(optim, by='deepcopy')\n", + "\n", + "# Conduct 2 inner-loop optimization\n", + "for i in range(2):\n", + " inner_loss = net(x)\n", + " optim.step(inner_loss)\n", + "\n", + "print(f'a = {net.a!r}')\n", + "\n", + "# Recover and reconduct 2 inner-loop optimization\n", + "torchopt.recover_state_dict(net, init_net_state)\n", + "torchopt.recover_state_dict(optim, init_optim_state)\n", + "\n", + "for i in range(2):\n", + " inner_loss = net(x)\n", + " optim.step(inner_loss)\n", + "\n", + "print(f'a = {net.a!r}') # the same result" + ] + }, + { + "cell_type" : "markdown", + "metadata" : {}, + "source" : [ + "### 2.2 Multi-task Example with `extract_state_dict` and `recover_state_dict`\n", + "\n", + "Let's move to another more complex setting. Meta-Learning algorithms always fix network " + "on several different tasks and accumulate outer loss of each task to the meta-gradient.\n", + "\n", + "Assume $x$ is a meta-parameter and $a$ is a normal parameter. 
We firstly update $a$ use " + "inner loss $\\mathcal{L}_1^{\\textrm{in}} = a_0 \\cdot x^2$ to $a_1$. Then we use $a_1$ " + "to compute the outer loss $\\mathcal{L}_1^{\\textrm{out}} = a_1 \\cdot x^2$ and " + "backpropagate it. Then we use $a_0$ to compute the inner loss " + "$\\mathcal{L}_2^{\\textrm{in}} = a_0 \\cdot x$ and update $a_0$ to $a_2 = a_0 - \\eta \\, " + "\\frac{\\partial \\mathcal{L}_2^{\\textrm{in}}}{\\partial a_0} = a_0 - \\eta \\, x$. Then " + "we compute outer loss $\\mathcal{L}_2^{\\textrm{out}} = a_2 \\cdot x$ and backpropagate " + "it. So the accumulated meta-gradient would be:\n", + "\n", + "$$\n", + "\\begin{split}\n", + " \\frac{\\partial \\mathcal{L}_1^{\\textrm{out}}}{\\partial x} + \\frac{\\partial " + "\\mathcal{L}_2^{\\textrm{out}}}{\\partial x}\n", + " & = (- 4 \\, \\eta \\, x^3 + 2 \\, a_0 \\, x) + \\frac{\\partial (a_2 \\cdot " + "x)}{\\partial x} \\\\\n", + " & = (- 4 \\, \\eta \\, x^3 + 2 \\, a_0 \\, x) + (\\frac{\\partial a_2}{\\partial x} " + "\\cdot x + a_2) \\\\\n", + " & = (- 4 \\, \\eta \\, x^3 + 2 \\, a_0 \\, x) + [\\frac{\\partial (a_0 - \\eta \\, " + "x)}{\\partial x} \\cdot x + (a_0 - \\eta \\, x)] \\\\\n", + " & = (- 4 \\, \\eta \\, x^3 + 2 \\, a_0 \\, x) + [(- \\eta) \\cdot x + (a_0 - \\eta " + "\\, x)] \\\\\n", + " & = (- 4 \\, \\eta \\, x^3 + 2 \\, a_0 \\, x) + (- 2 \\, \\eta \\, x + a_0)\n", + "\\end{split}\n", + "$$" + ] + }, + { + "cell_type" : "markdown", + "metadata" : {}, + "source" : ["Let's define the network and variables first."] + }, + { + "cell_type" : "code", + "execution_count" : 8, + "metadata" : {}, + "outputs" : [], + "source" : [ + "class Net2Tasks(nn.Module):\n", + " def __init__(self):\n", + " super().__init__()\n", + " self.a = nn.Parameter(torch.tensor(1.0), requires_grad=True)\n", + "\n", + " def task1(self, x):\n", + " return self.a * x**2\n", + "\n", + " def task2(self, x):\n", + " return self.a * x\n", + "\n", + "\n", + "net = Net2Tasks()\n", + "x = nn.Parameter(torch.tensor(2.0), requires_grad=True)\n", + "\n", + "optim = torchopt.MetaSGD(net, lr=1.0)" + ] + }, + { + "cell_type" : "markdown", + "metadata" : {}, + "source" : ["Once we call `step` method of `MetaOptimizer`, the parameters of the network " + "would be changed. We should use `torchopt.extract_state_dict` to extract state " + "and use `torchopt.recover_state_dict` to recover the state. 
Note that if we use " + "optimizers that keep momentum buffers, we should extract and recover those buffers as " + "well. Vanilla SGD has no momentum buffers, so the calls `init_optim_state = " + "torchopt.extract_state_dict(optim)` and `torchopt.recover_state_dict(optim, " + "init_optim_state)` below have no effect."] + }, + { + "cell_type" : "code", + "execution_count" : 9, + "metadata" : {}, + "outputs" : [ { + "name" : "stdout", + "output_type" : "stream", + "text" : [ + "init_optim_state = ((EmptyState(),),)\n", + "Task 1: x.grad = tensor(-28.)\n", + "Accumulated: x.grad = tensor(-31.)\n" + ] + } ], + "source" : [ + "# Get a reference to the state dictionary\n", + "init_net_state = torchopt.extract_state_dict(net, by='reference')\n", + "init_optim_state = torchopt.extract_state_dict(optim, by='reference')\n", + "# The state is empty for the vanilla SGD optimizer\n", + "print(f'init_optim_state = {init_optim_state!r}')\n", + "\n", + "inner_loss_1 = net.task1(x)\n", + "optim.step(inner_loss_1)\n", + "outer_loss_1 = net.task1(x)\n", + "outer_loss_1.backward()\n", + "print(f'Task 1: x.grad = {x.grad!r}')\n", + "\n", + "torchopt.recover_state_dict(net, init_net_state)\n", + "torchopt.recover_state_dict(optim, init_optim_state)\n", + "inner_loss_2 = net.task2(x)\n", + "optim.step(inner_loss_2)\n", + "outer_loss_2 = net.task2(x)\n", + "outer_loss_2.backward()\n", + "\n", + "# `extract_state_dict` extracts references, so the gradient accumulates\n", + "# x.grad = (- 4 * lr * x^3 + 2 * a_0 * x) + (- 2 * lr * x + a_0)\n", + "# = (- 4 * 1 * 2^3 + 2 * 1 * 2) + (- 2 * 1 * 2 + 1)\n", + "# = -28 - 3\n", + "# = -31\n", + "print(f'Accumulated: x.grad = {x.grad!r}')" + ] + }, + { + "cell_type" : "markdown", + "metadata" : {}, + "source" : ["## 3. Gradient Transformation in `MetaOptimizer`"] + }, + { + "cell_type" : "markdown", + "metadata" : {}, + "source" : [ + "We can also use gradient normalization tricks in our `MetaOptimizer`. In fact, " + "`MetaOptimizer` descendants like `MetaSGD` are specializations of `MetaOptimizer`. " + "Specifically, `MetaSGD(net, lr=1.)` is `MetaOptimizer(net, alias.sgd(lr=1., " + "moment_requires_grad=True))`, where the flag `moment_requires_grad=True` means the momentum " + "buffers are created with `requires_grad=True`, so they also become part of the " + "computation graph.\n", + "\n", + "In the design of TorchOpt, we treat these functions as building blocks that are composed with `combine.chain`. 
" + "So we can build our own chain like `combine.chain(clip.clip_grad_norm(max_norm=1.), " + "sgd(lr=1., requires_grad=True))` to clip the gradient and update parameters using " + "`sgd`.\n", + "\n", + "$$\n", + "\\begin{aligned}\n", + " \\frac{\\partial \\mathcal{L}^{\\textrm{out}}}{\\partial x}\n", + " & = \\frac{\\partial (a_1 \\cdot x^2)}{\\partial x} \\\\\n", + " & = \\frac{\\partial a_1}{\\partial x} \\cdot x^2 + a_1 \\cdot \\frac{\\partial " + "(x^2)}{\\partial x} \\\\\n", + " & = \\frac{\\partial (a_0 - \\eta \\, g)}{\\partial x} \\cdot x^2 + (a_0 - \\eta \\, " + "g) \\cdot 2 x & \\qquad (g \\propto \\frac{\\partial " + "\\mathcal{L}^{\\textrm{in}}}{\\partial a_0} = x^2, \\ {\\lVert g \\rVert}_2 \\le " + "G_{\\max}) \\\\\n", + " & = \\frac{\\partial (a_0 - \\eta \\, \\beta^{-1} \\, x^2)}{\\partial x} \\cdot x^2 + " + "(a_0 - \\eta \\, \\beta^{-1} \\, x^2) \\cdot 2 x & \\qquad (g = \\beta^{-1} \\, x^2, \\ " + " \\beta > 0, \\ {\\lVert g \\rVert}_2 \\le G_{\\max}) \\\\\n", + " & = (- \\beta^{-1} \\, \\eta \\cdot 2 x) \\cdot x^2 + (a_0 - \\beta^{-1} \\, \\eta " + "\\, x^2) \\cdot 2 x \\\\\n", + " & = - 4 \\, \\beta^{-1} \\, \\eta \\, x^3 + 2 \\, a_0 \\, x\n", + "\\end{aligned}\n", + "$$" + ] + }, + { + "cell_type" : "code", + "execution_count" : 10, + "metadata" : {}, + "outputs" : [ + {"name" : "stdout", "output_type" : "stream", "text" : ["x.grad = tensor(-12.0000)\n"]} + ], + "source" : [ + "net = Net()\n", + "x = nn.Parameter(torch.tensor(2.0), requires_grad=True)\n", + "\n", + "optim_impl = torchopt.combine.chain(\n", + " torchopt.clip.clip_grad_norm(max_norm=2.0),\n", + " torchopt.sgd(lr=1.0, moment_requires_grad=True),\n", + ")\n", + "optim = torchopt.MetaOptimizer(net, optim_impl)\n", + "\n", + "inner_loss = net(x)\n", + "optim.step(inner_loss)\n", + "\n", + "outer_loss = net(x)\n", + "outer_loss.backward()\n", + "# Since `max_norm` is 2 and the gradient is x^2, so the scale = x^2 / 2 = 2^2 / 2 = 2\n", + "# x.grad = - 4 * lr * x^3 / scale + 2 * a_0 * x\n", + "# = - 4 * 1 * 2^3 / 2 + 2 * 1 * 2\n", + "# = -16 + 4\n", + "# = -12\n", + "print(f'x.grad = {x.grad!r}')" + ] + }, + { + "cell_type" : "markdown", + "metadata" : {}, + "source" : [ + "## 4. Learning Rate Scheduler\n", + "\n", + "TorchOpt also provides implementation of learning rate scheduler, which can be used as:" + ] + }, + { + "cell_type" : "code", + "execution_count" : 11, + "metadata" : {}, + "outputs" : [], + "source" : [ + "functional_adam = torchopt.adam(\n", + " lr=torchopt.schedule.linear_schedule(\n", + " init_value=1e-3, end_value=1e-4, transition_steps=10000, transition_begin=2000\n", + " )\n", + ")\n", + "\n", + "adam = torchopt.Adam(\n", + " net.parameters(),\n", + " lr=torchopt.schedule.linear_schedule(\n", + " init_value=1e-3, end_value=1e-4, transition_steps=10000, transition_begin=2000\n", + " ),\n", + ")\n", + "\n", + "meta_adam = torchopt.MetaAdam(\n", + " net,\n", + " lr=torchopt.schedule.linear_schedule(\n", + " init_value=1e-3, end_value=1e-4, transition_steps=10000, transition_begin=2000\n", + " ),\n", + ")" + ] + }, + { + "cell_type" : "markdown", + "metadata" : {}, + "source" : [ + "## 5. Accelerated Optimizer\n", + "\n", + "Users can use accelerated optimizer by setting the `use_accelerated_op=True`. Currently " + "we only support the Adam optimizer." 
+ ] + }, + { + "cell_type" : "markdown", + "metadata" : {}, + "source" : ["Check whether the `accelerated_op` is available:"] + }, + { + "cell_type" : "code", + "execution_count" : 12, + "metadata" : {}, + "outputs" : [ {"name" : "stdout", "output_type" : "stream", "text" : ["True\n"]} ], + "source" : ["torchopt.accelerated_op_available(torch.device('cpu'))"] + }, + { + "cell_type" : "code", + "execution_count" : 13, + "metadata" : {}, + "outputs" : [ {"name" : "stdout", "output_type" : "stream", "text" : ["True\n"]} ], + "source" : ["torchopt.accelerated_op_available(torch.device('cuda'))"] + }, + { + "cell_type" : "code", + "execution_count" : 14, + "metadata" : {}, + "outputs" : [ + { + "name" : "stdout", + "output_type" : "stream", + "text" : ["\n"] + }, + { + "data" : { + "image/svg+xml" : + "\n\n\n\n\n\n%3\n\n\n\n140450290825712\n\nouter_loss\n()\n\n\n\n140450533650240\n\nMseLossBackward0\n\n\n\n140450533650240->140450290825712\n\n\n\n\n\n140450533648560\n\nMulBackward0\n\n\n\n140450533648560->140450533650240\n\n\n\n\n\n140450533647456\n\nAddBackward0\nstep1.a\n()\n\n\n\n140450533647456->140450533648560\n\n\n\n\n\n140447435136640\n\nAccumulateGrad\n\n\n\n140447435136640->140450533647456\n\n\n\n\n\n140450533648416\n\nMulBackward0\n\n\n\n140447435136640->140450533648416\n\n\n\n\n\n140447435236512\n\nstep0.a\n()\n\n\n\n140447435236512->140447435136640\n\n\n\n\n\n140447435136688\n\nMulBackward0\n\n\n\n140447435136688->140450533647456\n\n\n\n\n\n140447554132144\n\nUpdatesOpBackward\n\n\n\n140447554132144->140447435136688\n\n\n\n\n\n140447554131664\n\nMuOpBackward\n\n\n\n140447554131664->140447554132144\n\n\n\n\n\n140447435134816\n\nMulBackward0\n\n\n\n140447435134816->140447554131664\n\n\n\n\n\n140447554131904\n\nNuOpBackward\n\n\n\n140447435134816->140447554131904\n\n\n\n\n\n140450533648992\n\nMseLossBackwardBackward0\n\n\n\n140450533648992->140447435134816\n\n\n\n\n\n140450533648416->140450533648992\n\n\n\n\n\n140450533646448\n\nPowBackward0\n\n\n\n140450533646448->140447435134816\n\n\n\n\n\n140450533646448->140450533648416\n\n\n\n\n\n140447553018176\n\nAccumulateGrad\n\n\n\n140447553018176->140450533646448\n\n\n\n\n\n140447435135536\n\nPowBackward0\n\n\n\n140447553018176->140447435135536\n\n\n\n\n\n140447553045424\n\nx\n()\n\n\n\n140447553045424->140447553018176\n\n\n\n\n\n140447435136592\n\nAccumulateGrad\n\n\n\n140447435136592->140447554131664\n\n\n\n\n\n140447552973856\n\n()\n\n\n\n140447552973856->140447554131664\n\n\n\n\n\n140447552973856->140447435136592\n\n\n\n\n\n140447553044544\n\n()\n\n\n\n140447553044544->140447554131664\n\n\n\n\n\n140447553044544->140447554131904\n\n\n\n\n\n140447554131904->140447554132144\n\n\n\n\n\n140450533648896\n\nAccumulateGrad\n\n\n\n140450533648896->140447554131904\n\n\n\n\n\n140447435236752\n\n()\n\n\n\n140447435236752->140447554131904\n\n\n\n\n\n140447435236752->140450533648896\n\n\n\n\n\n140447553045904\n\n()\n\n\n\n140447553045904->140447554132144\n\n\n\n\n\n140447435237152\n\n()\n\n\n\n140447435237152->140447554132144\n\n\n\n\n\n140447435237232\n\n()\n\n\n\n140447435237232->140447554132144\n\n\n\n\n\n140447435135536->140450533648560\n\n\n\n\n\n" + }, + "metadata" : {}, + "output_type" : "display_data" + } + ], + "source" : [ + "net = Net().to(device='cuda')\n", + "x = nn.Parameter(torch.tensor(2.0, device=torch.device('cuda')), requires_grad=True)\n", + "y = torch.tensor(1.0, device=torch.device('cuda'))\n", + "\n", + "optim = torchopt.MetaAdam(net, lr=1.0, moment_requires_grad=True, " + "use_accelerated_op=True)\n", + "\n", + "net_state_0 = 
torchopt.extract_state_dict(\n", + "    net, by='reference', enable_visual=True, visual_prefix='step0.'\n", + ")\n", + "inner_loss = F.mse_loss(net(x), y)\n", + "optim.step(inner_loss)\n", + "net_state_1 = torchopt.extract_state_dict(\n", + "    net, by='reference', enable_visual=True, visual_prefix='step1.'\n", + ")\n", + "\n", + "outer_loss = F.mse_loss(net(x), y)\n", + "display(\n", + "    torchopt.visual.make_dot(\n", + "        outer_loss, params=[net_state_0, net_state_1, {'x': x, 'outer_loss': " + "outer_loss}]\n", + "    )\n", + ")" + ] + }, + { + "cell_type" : "markdown", + "metadata" : {}, + "source" : [ + "## 6. Known Issues\n", + "\n", + "Here we record some common issues users face when using the meta-optimizer." + ] + }, + { + "cell_type" : "markdown", + "metadata" : {}, + "source" : [ + "**1. Getting a `NaN` error when using `MetaAdam` or other meta-optimizers.**\n", + "\n", + "The `NaN` error is caused by the numerical instability of `Adam` in meta-learning. " + "There is an `sqrt` operation in `Adam`'s update rule. Backpropagating through " + "the `Adam` operator introduces the second derivative of the `sqrt` operation, which is " + "not numerically stable, i.e. ${\\left. \\frac{d^2 \\sqrt{x}}{{dx}^2} \\right\\rvert}_{x = " + "0} = \\texttt{NaN}$. You can also refer to issue " + "[facebookresearch/higher#125](https://github.com/facebookresearch/higher/issues/125).\n", + "\n", + "For this problem, TorchOpt has two recommended solutions.\n", + "\n", + "* Fold the `sqrt` operation into the whole update equation and compute the derivative of " + "the output with respect to the input manually, so that the second derivative of the bare " + "`sqrt` operation is eliminated. You can achieve this by setting the flag " + "`use_accelerated_op=True`; follow the instructions in the notebook " + "[Functional Optimizer](1_Functional_Optimizer.ipynb) and the Meta-Optimizer sections above." + ] + }, + { + "cell_type" : "code", + "execution_count" : 15, + "metadata" : {}, + "outputs" : [], + "source" : ["inner_optim = torchopt.MetaAdam(net, lr=1.0, use_accelerated_op=True)"] + }, + { + "cell_type" : "markdown", + "metadata" : {}, + "source" : ["* Register a hook on the first-order gradients. During backpropagation, the " + "`NaN` gradients will be set to 0, which has a similar effect to the first " + "solution but is much slower."] + }, + { + "cell_type" : "code", + "execution_count" : 16, + "metadata" : {}, + "outputs" : [], + "source" : [ + "impl = torchopt.chain(torchopt.hook.register_hook(torchopt.hook.zero_nan_hook), " + "torchopt.adam(1e-1))\n", + "inner_optim = torchopt.MetaOptimizer(net, impl)" + ] + }, + { + "cell_type" : "markdown", + "metadata" : {}, + "source" : [ + "**2. Getting a `Trying to backward through the graph a second time` error when conducting " + "multiple meta-optimization steps.**\n", + "\n", + "Please refer to the tutorial notebook [Stop Gradient](4_Stop_Gradient.ipynb) for more " + "guidance."
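Known issue 2 is usually resolved with `torchopt.stop_gradient`, which the Stop Gradient tutorial covers in detail. The following is a hedged sketch of the MGRL-style pattern behind that issue: after every outer update, detach the network parameters and the meta-optimizer state from the old graph before starting the next bi-level iteration. It reuses the toy `Net` from Section 1 and assumes `torchopt.stop_gradient` accepts both an `nn.Module` and a `MetaOptimizer`; treat it as a sketch rather than the library's reference usage.

```python
# Hedged sketch: consecutive bi-level iterations with `torchopt.stop_gradient`.
import torch
import torch.nn as nn
import torch.nn.functional as F

import torchopt


class Net(nn.Module):
    def __init__(self):
        super().__init__()
        self.a = nn.Parameter(torch.tensor(1.0), requires_grad=True)

    def forward(self, x):
        return self.a * (x**2)


net = Net()
x = nn.Parameter(torch.tensor(2.0), requires_grad=True)
y = torch.tensor(1.0)

inner_optim = torchopt.MetaSGD(net, lr=0.01)
meta_optim = torch.optim.SGD([x], lr=0.01)

for iteration in range(3):  # three consecutive bi-level iterations
    inner_loss = F.mse_loss(net(x), y)
    inner_optim.step(inner_loss)  # differentiable inner update

    outer_loss = F.mse_loss(net(x), y)
    meta_optim.zero_grad()
    outer_loss.backward()
    meta_optim.step()

    # Without these two calls, the next `outer_loss.backward()` would try to
    # backpropagate through the previous (already freed) graph and raise the
    # "Trying to backward through the graph a second time" error.
    torchopt.stop_gradient(net)
    torchopt.stop_gradient(inner_optim)

    print(f'iteration {iteration}: x = {x.detach().item():.4f}')
```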
+ ] } - ], - "source": [ - "torchopt.accelerated_op_available(torch.device('cuda'))" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n" - ] - }, - { - "data": { - "image/svg+xml": "\n\n\n\n\n\n%3\n\n\n\n140450290825712\n\nouter_loss\n()\n\n\n\n140450533650240\n\nMseLossBackward0\n\n\n\n140450533650240->140450290825712\n\n\n\n\n\n140450533648560\n\nMulBackward0\n\n\n\n140450533648560->140450533650240\n\n\n\n\n\n140450533647456\n\nAddBackward0\nstep1.a\n()\n\n\n\n140450533647456->140450533648560\n\n\n\n\n\n140447435136640\n\nAccumulateGrad\n\n\n\n140447435136640->140450533647456\n\n\n\n\n\n140450533648416\n\nMulBackward0\n\n\n\n140447435136640->140450533648416\n\n\n\n\n\n140447435236512\n\nstep0.a\n()\n\n\n\n140447435236512->140447435136640\n\n\n\n\n\n140447435136688\n\nMulBackward0\n\n\n\n140447435136688->140450533647456\n\n\n\n\n\n140447554132144\n\nUpdatesOpBackward\n\n\n\n140447554132144->140447435136688\n\n\n\n\n\n140447554131664\n\nMuOpBackward\n\n\n\n140447554131664->140447554132144\n\n\n\n\n\n140447435134816\n\nMulBackward0\n\n\n\n140447435134816->140447554131664\n\n\n\n\n\n140447554131904\n\nNuOpBackward\n\n\n\n140447435134816->140447554131904\n\n\n\n\n\n140450533648992\n\nMseLossBackwardBackward0\n\n\n\n140450533648992->140447435134816\n\n\n\n\n\n140450533648416->140450533648992\n\n\n\n\n\n140450533646448\n\nPowBackward0\n\n\n\n140450533646448->140447435134816\n\n\n\n\n\n140450533646448->140450533648416\n\n\n\n\n\n140447553018176\n\nAccumulateGrad\n\n\n\n140447553018176->140450533646448\n\n\n\n\n\n140447435135536\n\nPowBackward0\n\n\n\n140447553018176->140447435135536\n\n\n\n\n\n140447553045424\n\nx\n()\n\n\n\n140447553045424->140447553018176\n\n\n\n\n\n140447435136592\n\nAccumulateGrad\n\n\n\n140447435136592->140447554131664\n\n\n\n\n\n140447552973856\n\n()\n\n\n\n140447552973856->140447554131664\n\n\n\n\n\n140447552973856->140447435136592\n\n\n\n\n\n140447553044544\n\n()\n\n\n\n140447553044544->140447554131664\n\n\n\n\n\n140447553044544->140447554131904\n\n\n\n\n\n140447554131904->140447554132144\n\n\n\n\n\n140450533648896\n\nAccumulateGrad\n\n\n\n140450533648896->140447554131904\n\n\n\n\n\n140447435236752\n\n()\n\n\n\n140447435236752->140447554131904\n\n\n\n\n\n140447435236752->140450533648896\n\n\n\n\n\n140447553045904\n\n()\n\n\n\n140447553045904->140447554132144\n\n\n\n\n\n140447435237152\n\n()\n\n\n\n140447435237152->140447554132144\n\n\n\n\n\n140447435237232\n\n()\n\n\n\n140447435237232->140447554132144\n\n\n\n\n\n140447435135536->140450533648560\n\n\n\n\n\n" - }, - "metadata": {}, - "output_type": "display_data" + ], + "metadata" : { + "kernelspec" : + {"display_name" : "Python 3 (ipykernel)", "language" : "python", "name" : "python3"}, + "language_info" : { + "codemirror_mode" : {"name" : "ipython", "version" : 3}, + "file_extension" : ".py", + "mimetype" : "text/x-python", + "name" : "python", + "nbconvert_exporter" : "python", + "pygments_lexer" : "ipython3", + "version" : "3.9.15" + }, + "vscode" : { + "interpreter" : + {"hash" : "2a8cc1ff2cbc47027bf9993941710d9ab9175f14080903d9c7c432ee63d681da"} } - ], - "source": [ - "net = Net().to(device='cuda')\n", - "x = nn.Parameter(torch.tensor(2.0, device=torch.device('cuda')), requires_grad=True)\n", - "y = torch.tensor(1.0, device=torch.device('cuda'))\n", - "\n", - "optim = torchopt.MetaAdam(net, lr=1.0, moment_requires_grad=True, use_accelerated_op=True)\n", - "\n", - "net_state_0 = 
torchopt.extract_state_dict(\n", - " net, by='reference', enable_visual=True, visual_prefix='step0.'\n", - ")\n", - "inner_loss = F.mse_loss(net(x), y)\n", - "optim.step(inner_loss)\n", - "net_state_1 = torchopt.extract_state_dict(\n", - " net, by='reference', enable_visual=True, visual_prefix='step1.'\n", - ")\n", - "\n", - "outer_loss = F.mse_loss(net(x), y)\n", - "display(\n", - " torchopt.visual.make_dot(\n", - " outer_loss, params=[net_state_0, net_state_1, {'x': x, 'outer_loss': outer_loss}]\n", - " )\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 6. Known Issues\n", - "\n", - "Here we record some common issues faced by users when using the meta-optimizer." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "**1. Get `NaN` error when using `MetaAdam` or other meta-optimizers.**\n", - "\n", - "The `NaN` error is because of the numerical instability of the `Adam` in meta-learning. There exist an `sqrt` operation in `Adam`'s computation process. Backpropogating through the `Adam` operator introduces the second derivation of the `sqrt` operation, which is not numerical stable, i.e. ${\\left. \\frac{d^2 \\sqrt{x}}{{dx}^2} \\right\\rvert}_{x = 0} = \\texttt{NaN}$. You can also refer to issue [facebookresearch/higher#125](https://github.com/facebookresearch/higher/issues/125).\n", - "\n", - "For this problem, TorchOpt have two recommended solutions.\n", - "\n", - "* Put the `sqrt` operation into the whole equation, and compute the derivation of the output to the input manually. The second derivation of the `sqrt` operation will be eliminated. You can achieve this by setting the flag `use_accelerated_op=True`, you can follow the instructions in notebook [Functional Optimizer](1_Functional_Optimizer.ipynb) and Meta-Optimizer." - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "metadata": {}, - "outputs": [], - "source": [ - "inner_optim = torchopt.MetaAdam(net, lr=1.0, use_accelerated_op=True)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "* Register hook to the first-order gradients. During the backpropagation, the NaN gradients will be set to 0, which will have a similar effect to the first solution but much slower. " - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "metadata": {}, - "outputs": [], - "source": [ - "impl = torchopt.chain(torchopt.hook.register_hook(torchopt.hook.zero_nan_hook), torchopt.adam(1e-1))\n", - "inner_optim = torchopt.MetaOptimizer(net, impl)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "**2. Get `Trying to backward through the graph a second time` error when conducting multiple meta-optimization.**\n", - "\n", - "Please refer to the tutorial notebook [Stop Gradient](4_Stop_Gradient.ipynb) for more guidance." 
- ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.9.15" }, - "vscode": { - "interpreter": { - "hash": "2a8cc1ff2cbc47027bf9993941710d9ab9175f14080903d9c7c432ee63d681da" - } - } - }, - "nbformat": 4, - "nbformat_minor": 4 + "nbformat" : 4, + "nbformat_minor" : 4 } diff --git a/tutorials/4_Stop_Gradient.ipynb b/tutorials/4_Stop_Gradient.ipynb index d8c24bc6..e497a9d1 100644 --- a/tutorials/4_Stop_Gradient.ipynb +++ b/tutorials/4_Stop_Gradient.ipynb @@ -1,521 +1,1888 @@ { - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# `torchopt.stop_gradient` in Meta-Learning" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "[](https://colab.research.google.com/github/metaopt/torchopt/blob/main/tutorials/4_Stop_Gradient.ipynb)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "In this tutorial, we will illustrate the usage of `torchopt.stop_gradient` with a meta-learning example. We use `torchopt.visual` to help us visualize what is going on in automatic differentiation. Firstly, we define a simple network and the objective function for inner- and outer- optimization." - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "from IPython.display import display\n", - "\n", - "import torch\n", - "import torch.nn as nn\n", - "import torch.nn.functional as F\n", - "\n", - "import torchopt\n", - "\n", - "\n", - "class Net(nn.Module):\n", - " def __init__(self, dim):\n", - " super().__init__()\n", - " self.fc = nn.Linear(dim, 1, bias=True)\n", - "\n", - " def forward(self, x):\n", - " return self.fc(x)\n", - "\n", - "\n", - "loss_fn = F.mse_loss" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We define the input `x` and output `y`. `y` will be served as the regression target in the following code." - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [], - "source": [ - "batch_size = 64\n", - "dim = 16\n", - "\n", - "x = torch.randn((batch_size, dim))\n", - "y = torch.zeros((batch_size, 1))\n", - "net = Net(dim)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Let us define the meta-parameter, we use `MetaSGD` as the inner-loop optimizer and `Adam` as the outer-loop optimizer. " - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [], - "source": [ - "meta_parameter = nn.Parameter(torch.tensor(1.0), requires_grad=True)\n", - "\n", - "optim = torchopt.MetaSGD(net, lr=1e-1)\n", - "meta_optim = torch.optim.Adam([meta_parameter], lr=1e-1)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Define the inner-loop optimization and visualize the inner-loop forward gradient flow." 
- ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [ + "cells" : [ { - "name": "stdout", - "output_type": "stream", - "text": [ - "inner loss: 0.3472\n", - "\n" - ] + "cell_type" : "markdown", + "metadata" : {}, + "source" : ["# `torchopt.stop_gradient` in Meta-Learning"] }, { - "data": { - "image/svg+xml": "\n\n\n\n\n\n%3\n\n\n\n140025091550880\n\ninner_loss\n()\n\n\n\n140028156253184\n\nMseLossBackward0\n\n\n\n140028156253184->140025091550880\n\n\n\n\n\n140028156436736\n\nAddmmBackward0\n\n\n\n140028156436736->140028156253184\n\n\n\n\n\n140025091526416\n\nAccumulateGrad\n\n\n\n140025091526416->140028156436736\n\n\n\n\n\n140028155952000\n\nstep0.fc.bias\n(1)\n\n\n\n140028155952000->140025091526416\n\n\n\n\n\n140025091525408\n\nTBackward0\n\n\n\n140025091525408->140028156436736\n\n\n\n\n\n140025091526224\n\nAccumulateGrad\n\n\n\n140025091526224->140025091525408\n\n\n\n\n\n140028155952880\n\nstep0.fc.weight\n(1, 16)\n\n\n\n140028155952880->140025091526224\n\n\n\n\n\n" - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "init_net_state = torchopt.extract_state_dict(net, enable_visual=True, visual_prefix='step0.')\n", - "\n", - "# inner loss\n", - "inner_loss = loss_fn(net(x), y)\n", - "\n", - "print(f'inner loss: {inner_loss:.4f}')\n", - "display(torchopt.visual.make_dot(inner_loss, params=(init_net_state, {'inner_loss': inner_loss})))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Conduct inner-loop optimization with `MetaSGD`, here the meta-parameter is served as a factor controlling the scale of inner-loop loss." - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [], - "source": [ - "# inner-step optimization\n", - "loss = inner_loss * meta_parameter\n", - "optim.step(loss)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We compute the outer loss and draw the full computation graph of the first bi-level process. In this graph, three main parts are included.\n", - "\n", - "- Inner-loop: forward process and inner-loss calculation\n", - "- Inner-loop optimization: `MetaSGD` optimization step given inner-loss\n", - "- Outer-loop: forward process and outer-loss calculation" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [ + "cell_type" : "markdown", + "metadata" : {}, + "source" : ["[](https://" + "colab.research.google.com/github/metaopt/torchopt/blob/main/tutorials/" + "4_Stop_Gradient.ipynb)"] + }, { - "name": "stdout", - "output_type": "stream", - "text": [ - "outer loss: 0.2039\n", - "\n" - ] + "cell_type" : "markdown", + "metadata" : {}, + "source" : ["In this tutorial, we will illustrate the usage of `torchopt.stop_gradient` with " + "a meta-learning example. We use `torchopt.visual` to help us visualize what is " + "going on in automatic differentiation. 
Firstly, we define a simple network and " + "the objective function for inner- and outer- optimization."] }, { - "data": { - "image/svg+xml": "\n\n\n\n\n\n%3\n\n\n\n140027829238416\n\nouter_loss\n()\n\n\n\n140025091525072\n\nMseLossBackward0\n\n\n\n140025091525072->140027829238416\n\n\n\n\n\n140025091525216\n\nAddmmBackward0\n\n\n\n140025091525216->140025091525072\n\n\n\n\n\n140025091526128\n\nAddBackward0\nstep1.fc.bias\n(1)\n\n\n\n140025091526128->140025091525216\n\n\n\n\n\n140025091526416\n\nAccumulateGrad\n\n\n\n140025091526416->140025091526128\n\n\n\n\n\n140028156436736\n\nAddmmBackward0\n\n\n\n140025091526416->140028156436736\n\n\n\n\n\n140028155952000\n\nstep0.fc.bias\n(1)\n\n\n\n140028155952000->140025091526416\n\n\n\n\n\n140025091524976\n\nMulBackward0\n\n\n\n140025091524976->140025091526128\n\n\n\n\n\n140025091526560\n\nViewBackward0\n\n\n\n140025091526560->140025091524976\n\n\n\n\n\n140025091525456\n\nSumBackward1\n\n\n\n140025091525456->140025091526560\n\n\n\n\n\n140025091524112\n\nMseLossBackwardBackward0\n\n\n\n140025091524112->140025091525456\n\n\n\n\n\n140024973742672\n\nTBackward0\n\n\n\n140025091524112->140024973742672\n\n\n\n\n\n140024973742288\n\nMulBackward0\n\n\n\n140024973742288->140025091524112\n\n\n\n\n\n140024973742384\n\nAccumulateGrad\n\n\n\n140024973742384->140024973742288\n\n\n\n\n\n140025091549440\n\nmeta_parameter\n()\n\n\n\n140025091549440->140024973742384\n\n\n\n\n\n140028156436736->140025091524112\n\n\n\n\n\n140025091525408\n\nTBackward0\n\n\n\n140025091525408->140028156436736\n\n\n\n\n\n140025091526224\n\nAccumulateGrad\n\n\n\n140025091526224->140025091525408\n\n\n\n\n\n140025091524928\n\nAddBackward0\nstep1.fc.weight\n(1, 16)\n\n\n\n140025091526224->140025091524928\n\n\n\n\n\n140028155952880\n\nstep0.fc.weight\n(1, 16)\n\n\n\n140028155952880->140025091526224\n\n\n\n\n\n140025091524448\n\nTBackward0\n\n\n\n140025091524448->140025091525216\n\n\n\n\n\n140025091524928->140025091524448\n\n\n\n\n\n140025091525600\n\nMulBackward0\n\n\n\n140025091525600->140025091524928\n\n\n\n\n\n140024973742144\n\nTBackward0\n\n\n\n140024973742144->140025091525600\n\n\n\n\n\n140024973742576\n\nTBackward0\n\n\n\n140024973742576->140024973742144\n\n\n\n\n\n140024973742480\n\nMmBackward0\n\n\n\n140024973742480->140024973742576\n\n\n\n\n\n140024973742672->140024973742480\n\n\n\n\n\n" - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "# Extract `state_dict`` for updated network\n", - "one_step_net_state = torchopt.extract_state_dict(net, enable_visual=True, visual_prefix='step1.')\n", - "one_step_optim_state = torchopt.extract_state_dict(optim)\n", - "\n", - "# Calculate outer loss\n", - "outer_loss = loss_fn(net(x), y)\n", - "print(f'outer loss: {outer_loss:.4f}')\n", - "display(\n", - " torchopt.visual.make_dot(\n", - " outer_loss,\n", - " params=(\n", - " init_net_state,\n", - " one_step_net_state,\n", - " {'meta_parameter': meta_parameter, 'outer_loss': outer_loss},\n", - " ),\n", - " )\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Then we backward the loss to conduct outer-loop meta-optimization." 
- ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [ + "cell_type" : "code", + "execution_count" : 1, + "metadata" : {}, + "outputs" : [], + "source" : [ + "from IPython.display import display\n", + "\n", + "import torch\n", + "import torch.nn as nn\n", + "import torch.nn.functional as F\n", + "\n", + "import torchopt\n", + "\n", + "\n", + "class Net(nn.Module):\n", + " def __init__(self, dim):\n", + " super().__init__()\n", + " self.fc = nn.Linear(dim, 1, bias=True)\n", + "\n", + " def forward(self, x):\n", + " return self.fc(x)\n", + "\n", + "\n", + "loss_fn = F.mse_loss" + ] + }, { - "name": "stdout", - "output_type": "stream", - "text": [ - "meta_parameter.grad = tensor(-0.1205)\n", - "meta_parameter = Parameter containing:\n", - "tensor(1.1000, requires_grad=True)\n" - ] - } - ], - "source": [ - "meta_optim.zero_grad()\n", - "outer_loss.backward()\n", - "print(f'meta_parameter.grad = {meta_parameter.grad!r}')\n", - "meta_optim.step()\n", - "print(f'meta_parameter = {meta_parameter!r}')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We have already conducted one bi-level optimization and optimize our meta-parameters. When you want to conduct the second bi-level optimization, you need to be careful whether you need to use the `stop_gradient` function. For example, if your new inner-loop parameters directly inherits previous inner-loop parameters (which is a common strategy in many meta-learning algorithms like Meta-Gradient Reinforcement Learning (MGRL) ([arXiv:1805.09801](https://arxiv.org/abs/1805.09801))), you might need `stop_gradient` function." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "In general, the backpropagation only frees saved tensors (often used as auxiliary data for computing the gradient) but the computation graph remains. Once the outer iteration is finished, if you want to use any intermediate network parameters produced by the inner loop for the next bi-level iteration, you should detach them from the computation graph.\n", - "\n", - "There are two main reasons:\n", - "\n", - "- The network parameters are still connected to the previous computation graph (`.grad_fn` is not `None`). If later the gradient backpropagate to these parameters, the PyTorch backward engine will try to backpropagate through the previous computation graph. This will raise a `RuntimeError`: Trying to backward through the graph a second time...\n", - "- If we do not detach the computation graph, the computation graph connected to these parameters can not be freed by GC (Garbage Collector) until these parameters are collected by GC." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Now let us see what will happen if we do not use the `stop_gradient` function before we conduct the second bi-level process." - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [ + "cell_type" : "markdown", + "metadata" : {}, + "source" : ["We define the input `x` and output `y`. 
`y` will be served as the regression " + "target in the following code."] + }, + { + "cell_type" : "code", + "execution_count" : 2, + "metadata" : {}, + "outputs" : [], + "source" : [ + "batch_size = 64\n", + "dim = 16\n", + "\n", + "x = torch.randn((batch_size, dim))\n", + "y = torch.zeros((batch_size, 1))\n", + "net = Net(dim)" + ] + }, + { + "cell_type" : "markdown", + "metadata" : {}, + "source" : ["Let us define the meta-parameter, we use `MetaSGD` as the inner-loop optimizer " + "and `Adam` as the outer-loop optimizer. "] + }, { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n" - ] + "cell_type" : "code", + "execution_count" : 3, + "metadata" : {}, + "outputs" : [], + "source" : [ + "meta_parameter = nn.Parameter(torch.tensor(1.0), requires_grad=True)\n", + "\n", + "optim = torchopt.MetaSGD(net, lr=1e-1)\n", + "meta_optim = torch.optim.Adam([meta_parameter], lr=1e-1)" + ] }, { - "data": { - "image/svg+xml": "\n\n\n\n\n\n%3\n\n\n\n140024973755152\n\nouter_loss\n()\n\n\n\n140027829363232\n\nMseLossBackward0\n\n\n\n140027829363232->140024973755152\n\n\n\n\n\n140027829363616\n\nAddmmBackward0\n\n\n\n140027829363616->140027829363232\n\n\n\n\n\n140027829366544\n\nAddBackward0\n\n\n\n140027829366544->140027829363616\n\n\n\n\n\n140025091526128\n\nAddBackward0\nstep1.fc.bias\n(1)\n\n\n\n140025091526128->140027829366544\n\n\n\n\n\n140025091725152\n\nAddmmBackward0\n\n\n\n140025091526128->140025091725152\n\n\n\n\n\n140025091526416\n\nAccumulateGrad\n\n\n\n140025091526416->140025091526128\n\n\n\n\n\n140028156436736\n\nAddmmBackward0\n\n\n\n140025091526416->140028156436736\n\n\n\n\n\n140028155952000\n\nstep0.fc.bias\n(1)\n\n\n\n140028155952000->140025091526416\n\n\n\n\n\n140025091524976\n\nMulBackward0\n\n\n\n140025091524976->140025091526128\n\n\n\n\n\n140025091526560\n\nViewBackward0\n\n\n\n140025091526560->140025091524976\n\n\n\n\n\n140025091525456\n\nSumBackward1\n\n\n\n140025091525456->140025091526560\n\n\n\n\n\n140025091524112\n\nMseLossBackwardBackward0\n\n\n\n140025091524112->140025091525456\n\n\n\n\n\n140024973742672\n\nTBackward0\n\n\n\n140025091524112->140024973742672\n\n\n\n\n\n140024973742288\n\nMulBackward0\n\n\n\n140024973742288->140025091524112\n\n\n\n\n\n140024973742384\n\nAccumulateGrad\n\n\n\n140024973742384->140024973742288\n\n\n\n\n\n140025091726064\n\nMulBackward0\n\n\n\n140024973742384->140025091726064\n\n\n\n\n\n140025091549440\n\nmeta_parameter\n()\n\n\n\n140025091549440->140024973742384\n\n\n\n\n\n140028156436736->140025091524112\n\n\n\n\n\n140025091525408\n\nTBackward0\n\n\n\n140025091525408->140028156436736\n\n\n\n\n\n140025091526224\n\nAccumulateGrad\n\n\n\n140025091526224->140025091525408\n\n\n\n\n\n140025091524928\n\nAddBackward0\nstep1.fc.weight\n(1, 16)\n\n\n\n140025091526224->140025091524928\n\n\n\n\n\n140028155952880\n\nstep0.fc.weight\n(1, 
16) [remainder of SVG output omitted: Graphviz rendering of the autograd graph in which the second bi-level iteration's backward nodes are chained onto the first iteration's graph (step0.*/step1.* parameters, meta_parameter, and their backward ops)]" - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "
╭─────────────────────────────────────── Traceback (most recent call last) ───────────────────────────────────────╮\n",
-       " /tmp/ipykernel_3962266/4178930003.py:21 in <module>                                                             \n",
-       "                                                                                                                 \n",
-       " [Errno 2] No such file or directory: '/tmp/ipykernel_3962266/4178930003.py'                                     \n",
-       "                                                                                                                 \n",
-       " /home/PanXuehai/Miniconda3/envs/torchopt/lib/python3.9/site-packages/torch/_tensor.py:487 in backward           \n",
-       "                                                                                                                 \n",
-       "    484 │   │   │   │   create_graph=create_graph,                                                               \n",
-       "    485 │   │   │   │   inputs=inputs,                                                                           \n",
-       "    486 │   │   │   )                                                                                            \n",
-       "  487 │   │   torch.autograd.backward(                                                                         \n",
-       "    488 │   │   │   self, gradient, retain_graph, create_graph, inputs=inputs                                    \n",
-       "    489 │   │   )                                                                                                \n",
-       "    490                                                                                                          \n",
-       "                                                                                                                 \n",
-       " ╭───────────────────────── locals ──────────────────────────╮                                                   \n",
-       "  create_graph = False                                                                                         \n",
-       "      gradient = None                                                                                          \n",
-       "        inputs = None                                                                                          \n",
-       "  retain_graph = None                                                                                          \n",
-       "          self = tensor(0.1203, grad_fn=<MseLossBackward0>)                                                    \n",
-       " ╰───────────────────────────────────────────────────────────╯                                                   \n",
-       "                                                                                                                 \n",
-       " /home/PanXuehai/Miniconda3/envs/torchopt/lib/python3.9/site-packages/torch/autograd/__init__.py:197 in backward \n",
-       "                                                                                                                 \n",
-       "   194 │   # The reason we repeat same the comment below is that                                                 \n",
-       "   195 │   # some Python versions print out the first line of a multi-line function                              \n",
-       "   196 │   # calls in the traceback and some print out the last line                                             \n",
-       " 197 Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the ba                   \n",
-       "   198 │   │   tensors, grad_tensors_, retain_graph, create_graph, inputs,                                       \n",
-       "   199 │   │   allow_unreachable=True, accumulate_grad=True)  # Calls into the C++ engine to r                   \n",
-       "   200                                                                                                           \n",
-       "                                                                                                                 \n",
-       " ╭──────────────────────────── locals ────────────────────────────╮                                              \n",
-       "    create_graph = False                                                                                       \n",
-       "    grad_tensors = None                                                                                        \n",
-       "   grad_tensors_ = (tensor(1.),)                                                                               \n",
-       "  grad_variables = None                                                                                        \n",
-       "          inputs = ()                                                                                          \n",
-       "    retain_graph = False                                                                                       \n",
-       "         tensors = (tensor(0.1203, grad_fn=<MseLossBackward0>),)                                               \n",
-       " ╰────────────────────────────────────────────────────────────────╯                                              \n",
-       "╰─────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯\n",
-       "RuntimeError: Trying to backward through the graph a second time (or directly access saved tensors after they have \n",
-       "already been freed). Saved intermediate values of the graph are freed when you call .backward() or autograd.grad().\n",
-       "Specify retain_graph=True if you need to backward through the graph a second time or if you need to access saved \n",
-       "tensors after calling backward.\n",
-       "
\n" + "cell_type" : "code", + "execution_count" : 4, + "metadata" : {}, + "outputs" : [ + { + "name" : "stdout", + "output_type" : "stream", + "text" : + [ "inner loss: 0.3472\n", "\n" ] + }, + { + "data" : { + "image/svg+xml" : + "\n\n\n\n\n\n%3\n\n\n\n140025091550880\n\ninner_loss\n()\n\n\n\n140028156253184\n\nMseLossBackward0\n\n\n\n140028156253184->140025091550880\n\n\n\n\n\n140028156436736\n\nAddmmBackward0\n\n\n\n140028156436736->140028156253184\n\n\n\n\n\n140025091526416\n\nAccumulateGrad\n\n\n\n140025091526416->140028156436736\n\n\n\n\n\n140028155952000\n\nstep0.fc.bias\n(1)\n\n\n\n140028155952000->140025091526416\n\n\n\n\n\n140025091525408\n\nTBackward0\n\n\n\n140025091525408->140028156436736\n\n\n\n\n\n140025091526224\n\nAccumulateGrad\n\n\n\n140025091526224->140025091525408\n\n\n\n\n\n140028155952880\n\nstep0.fc.weight\n(1, " + "16)\n\n\n\n140028155952880->140025091526224\n\n\n\n\n\n" + }, + "metadata" : {}, + "output_type" : "display_data" + } ], - "text/plain": [ - "\u001b[31m╭─\u001b[0m\u001b[31m────────────────────────────────────── \u001b[0m\u001b[1;31mTraceback \u001b[0m\u001b[1;2;31m(most recent call last)\u001b[0m\u001b[31m ──────────────────────────────────────\u001b[0m\u001b[31m─╮\u001b[0m\n", - "\u001b[31m│\u001b[0m \u001b[2;33m/tmp/ipykernel_3962266/\u001b[0m\u001b[1;33m4178930003.py\u001b[0m:\u001b[94m21\u001b[0m in \u001b[92m\u001b[0m \u001b[31m│\u001b[0m\n", - "\u001b[31m│\u001b[0m \u001b[31m│\u001b[0m\n", - "\u001b[31m│\u001b[0m \u001b[3;31m[Errno 2] No such file or directory: '/tmp/ipykernel_3962266/4178930003.py'\u001b[0m \u001b[31m│\u001b[0m\n", - "\u001b[31m│\u001b[0m \u001b[31m│\u001b[0m\n", - "\u001b[31m│\u001b[0m \u001b[2;33m/home/PanXuehai/Miniconda3/envs/torchopt/lib/python3.9/site-packages/torch/\u001b[0m\u001b[1;33m_tensor.py\u001b[0m:\u001b[94m487\u001b[0m in \u001b[92mbackward\u001b[0m \u001b[31m│\u001b[0m\n", - "\u001b[31m│\u001b[0m \u001b[31m│\u001b[0m\n", - "\u001b[31m│\u001b[0m \u001b[2m 484 \u001b[0m\u001b[2m│ │ │ │ \u001b[0mcreate_graph=create_graph, \u001b[31m│\u001b[0m\n", - "\u001b[31m│\u001b[0m \u001b[2m 485 \u001b[0m\u001b[2m│ │ │ │ \u001b[0minputs=inputs, \u001b[31m│\u001b[0m\n", - "\u001b[31m│\u001b[0m \u001b[2m 486 \u001b[0m\u001b[2m│ │ │ \u001b[0m) \u001b[31m│\u001b[0m\n", - "\u001b[31m│\u001b[0m \u001b[31m❱ \u001b[0m 487 \u001b[2m│ │ \u001b[0mtorch.autograd.backward( \u001b[31m│\u001b[0m\n", - "\u001b[31m│\u001b[0m \u001b[2m 488 \u001b[0m\u001b[2m│ │ │ \u001b[0m\u001b[96mself\u001b[0m, gradient, retain_graph, create_graph, inputs=inputs \u001b[31m│\u001b[0m\n", - "\u001b[31m│\u001b[0m \u001b[2m 489 \u001b[0m\u001b[2m│ │ \u001b[0m) \u001b[31m│\u001b[0m\n", - "\u001b[31m│\u001b[0m \u001b[2m 490 \u001b[0m \u001b[31m│\u001b[0m\n", - "\u001b[31m│\u001b[0m \u001b[31m│\u001b[0m\n", - "\u001b[31m│\u001b[0m \u001b[33m╭─\u001b[0m\u001b[33m──────────────────────── locals ─────────────────────────\u001b[0m\u001b[33m─╮\u001b[0m \u001b[31m│\u001b[0m\n", - "\u001b[31m│\u001b[0m \u001b[33m│\u001b[0m create_graph = \u001b[94mFalse\u001b[0m \u001b[33m│\u001b[0m \u001b[31m│\u001b[0m\n", - "\u001b[31m│\u001b[0m \u001b[33m│\u001b[0m gradient = \u001b[94mNone\u001b[0m \u001b[33m│\u001b[0m \u001b[31m│\u001b[0m\n", - "\u001b[31m│\u001b[0m \u001b[33m│\u001b[0m inputs = \u001b[94mNone\u001b[0m \u001b[33m│\u001b[0m \u001b[31m│\u001b[0m\n", - "\u001b[31m│\u001b[0m \u001b[33m│\u001b[0m retain_graph = \u001b[94mNone\u001b[0m \u001b[33m│\u001b[0m \u001b[31m│\u001b[0m\n", - "\u001b[31m│\u001b[0m \u001b[33m│\u001b[0m self = 
\u001b[1;35mtensor\u001b[0m\u001b[1m(\u001b[0m\u001b[94m0.1203\u001b[0m, \u001b[33mgrad_fn\u001b[0m=\u001b[1m<\u001b[0m\u001b[1;95mMseLossBackward0\u001b[0m\u001b[1m>\u001b[0m\u001b[1m)\u001b[0m \u001b[33m│\u001b[0m \u001b[31m│\u001b[0m\n", - "\u001b[31m│\u001b[0m \u001b[33m╰───────────────────────────────────────────────────────────╯\u001b[0m \u001b[31m│\u001b[0m\n", - "\u001b[31m│\u001b[0m \u001b[31m│\u001b[0m\n", - "\u001b[31m│\u001b[0m \u001b[2;33m/home/PanXuehai/Miniconda3/envs/torchopt/lib/python3.9/site-packages/torch/autograd/\u001b[0m\u001b[1;33m__init__.py\u001b[0m:\u001b[94m197\u001b[0m in \u001b[92mbackward\u001b[0m \u001b[31m│\u001b[0m\n", - "\u001b[31m│\u001b[0m \u001b[31m│\u001b[0m\n", - "\u001b[31m│\u001b[0m \u001b[2m194 \u001b[0m\u001b[2m│ \u001b[0m\u001b[2m# The reason we repeat same the comment below is that\u001b[0m \u001b[31m│\u001b[0m\n", - "\u001b[31m│\u001b[0m \u001b[2m195 \u001b[0m\u001b[2m│ \u001b[0m\u001b[2m# some Python versions print out the first line of a multi-line function\u001b[0m \u001b[31m│\u001b[0m\n", - "\u001b[31m│\u001b[0m \u001b[2m196 \u001b[0m\u001b[2m│ \u001b[0m\u001b[2m# calls in the traceback and some print out the last line\u001b[0m \u001b[31m│\u001b[0m\n", - "\u001b[31m│\u001b[0m \u001b[31m❱ \u001b[0m197 \u001b[2m│ \u001b[0mVariable._execution_engine.run_backward( \u001b[2m# Calls into the C++ engine to run the ba\u001b[0m \u001b[31m│\u001b[0m\n", - "\u001b[31m│\u001b[0m \u001b[2m198 \u001b[0m\u001b[2m│ │ \u001b[0mtensors, grad_tensors_, retain_graph, create_graph, inputs, \u001b[31m│\u001b[0m\n", - "\u001b[31m│\u001b[0m \u001b[2m199 \u001b[0m\u001b[2m│ │ \u001b[0mallow_unreachable=\u001b[94mTrue\u001b[0m, accumulate_grad=\u001b[94mTrue\u001b[0m) \u001b[2m# Calls into the C++ engine to r\u001b[0m \u001b[31m│\u001b[0m\n", - "\u001b[31m│\u001b[0m \u001b[2m200 \u001b[0m \u001b[31m│\u001b[0m\n", - "\u001b[31m│\u001b[0m \u001b[31m│\u001b[0m\n", - "\u001b[31m│\u001b[0m \u001b[33m╭─\u001b[0m\u001b[33m─────────────────────────── locals ───────────────────────────\u001b[0m\u001b[33m─╮\u001b[0m \u001b[31m│\u001b[0m\n", - "\u001b[31m│\u001b[0m \u001b[33m│\u001b[0m create_graph = \u001b[94mFalse\u001b[0m \u001b[33m│\u001b[0m \u001b[31m│\u001b[0m\n", - "\u001b[31m│\u001b[0m \u001b[33m│\u001b[0m grad_tensors = \u001b[94mNone\u001b[0m \u001b[33m│\u001b[0m \u001b[31m│\u001b[0m\n", - "\u001b[31m│\u001b[0m \u001b[33m│\u001b[0m grad_tensors_ = \u001b[1m(\u001b[0m\u001b[1;35mtensor\u001b[0m\u001b[1m(\u001b[0m\u001b[94m1\u001b[0m.\u001b[1m)\u001b[0m,\u001b[1m)\u001b[0m \u001b[33m│\u001b[0m \u001b[31m│\u001b[0m\n", - "\u001b[31m│\u001b[0m \u001b[33m│\u001b[0m grad_variables = \u001b[94mNone\u001b[0m \u001b[33m│\u001b[0m \u001b[31m│\u001b[0m\n", - "\u001b[31m│\u001b[0m \u001b[33m│\u001b[0m inputs = \u001b[1m(\u001b[0m\u001b[1m)\u001b[0m \u001b[33m│\u001b[0m \u001b[31m│\u001b[0m\n", - "\u001b[31m│\u001b[0m \u001b[33m│\u001b[0m retain_graph = \u001b[94mFalse\u001b[0m \u001b[33m│\u001b[0m \u001b[31m│\u001b[0m\n", - "\u001b[31m│\u001b[0m \u001b[33m│\u001b[0m tensors = \u001b[1m(\u001b[0m\u001b[1;35mtensor\u001b[0m\u001b[1m(\u001b[0m\u001b[94m0.1203\u001b[0m, \u001b[33mgrad_fn\u001b[0m=\u001b[1m<\u001b[0m\u001b[1;95mMseLossBackward0\u001b[0m\u001b[1m>\u001b[0m\u001b[1m)\u001b[0m,\u001b[1m)\u001b[0m \u001b[33m│\u001b[0m \u001b[31m│\u001b[0m\n", - "\u001b[31m│\u001b[0m \u001b[33m╰────────────────────────────────────────────────────────────────╯\u001b[0m \u001b[31m│\u001b[0m\n", - 
"\u001b[31m╰─────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯\u001b[0m\n", - "\u001b[1;91mRuntimeError: \u001b[0mTrying to backward through the graph a second time \u001b[1m(\u001b[0mor directly access saved tensors after they have \n", - "already been freed\u001b[1m)\u001b[0m. Saved intermediate values of the graph are freed when you call \u001b[1;35m.backward\u001b[0m\u001b[1m(\u001b[0m\u001b[1m)\u001b[0m or \u001b[1;35mautograd.grad\u001b[0m\u001b[1m(\u001b[0m\u001b[1m)\u001b[0m.\n", - "Specify \u001b[33mretain_graph\u001b[0m=\u001b[3;92mTrue\u001b[0m if you need to backward through the graph a second time or if you need to access saved \n", - "tensors after calling backward.\n" + "source" : [ + "init_net_state = torchopt.extract_state_dict(net, enable_visual=True, " + "visual_prefix='step0.')\n", + "\n", + "# inner loss\n", + "inner_loss = loss_fn(net(x), y)\n", + "\n", + "print(f'inner loss: {inner_loss:.4f}')\n", + "display(torchopt.visual.make_dot(inner_loss, params=(init_net_state, {'inner_loss': " + "inner_loss})))" ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "# Inner update with attached computation graph\n", - "inner_loss = loss_fn(net(x), y)\n", - "loss = inner_loss * meta_parameter\n", - "optim.step(loss)\n", - "\n", - "# Outer forward process\n", - "outer_loss = loss_fn(net(x), y)\n", - "display(\n", - " torchopt.visual.make_dot(\n", - " outer_loss,\n", - " params=(\n", - " init_net_state,\n", - " one_step_net_state,\n", - " {'meta_parameter': meta_parameter, 'outer_loss': outer_loss},\n", - " ),\n", - " )\n", - ")\n", - "\n", - "# Outer update\n", - "meta_optim.zero_grad()\n", - "outer_loss.backward()\n", - "meta_optim.step()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "From the graph we can see, directly conducting the second bi-level process links the graph of first and second bi-level process together. We should manually stop gradient with `torchopt.stop_gradient`. `torchopt.stop_gradient` will detach the node of gradient graph and make it become a leaf node. It allows the input of network, optimizer, or state dictionary and the gradient operation happens in an in-place manner.\n", - "\n", - "Let's use `recover_state_dict` to come back to one-step updated states." - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [], - "source": [ - "# Reset to previous one-step updated states\n", - "torchopt.recover_state_dict(net, one_step_net_state)\n", - "torchopt.recover_state_dict(optim, one_step_optim_state)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "And finally, Let's conduct the stop-gradient operation before the second meta-optimization step. 
" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [ + }, + { + "cell_type" : "markdown", + "metadata" : {}, + "source" : ["Conduct inner-loop optimization with `MetaSGD`, here the meta-parameter is " + "served as a factor controlling the scale of inner-loop loss."] + }, { - "name": "stdout", - "output_type": "stream", - "text": [ - "meta_parameter.grad = tensor(-0.0635)\n", - "meta_parameter = Parameter containing:\n", - "tensor(1.1940, requires_grad=True)\n", - "\n" - ] + "cell_type" : "code", + "execution_count" : 5, + "metadata" : {}, + "outputs" : [], + "source" : [ + "# inner-step optimization\n", + "loss = inner_loss * meta_parameter\n", + "optim.step(loss)" + ] + }, + { + "cell_type" : "markdown", + "metadata" : {}, + "source" : [ + "We compute the outer loss and draw the full computation graph of the first bi-level " + "process. In this graph, three main parts are included.\n", + "\n", + "- Inner-loop: forward process and inner-loss calculation\n", + "- Inner-loop optimization: `MetaSGD` optimization step given inner-loss\n", + "- Outer-loop: forward process and outer-loss calculation" + ] + }, + { + "cell_type" : "code", + "execution_count" : 6, + "metadata" : {}, + "outputs" : [ + { + "name" : "stdout", + "output_type" : "stream", + "text" : + [ "outer loss: 0.2039\n", "\n" ] + }, + { + "data" : { + "image/svg+xml" : + "\n\n\n\n\n\n%3\n\n\n\n140027829238416\n\nouter_loss\n()\n\n\n\n140025091525072\n\nMseLossBackward0\n\n\n\n140025091525072->140027829238416\n\n\n\n\n\n140025091525216\n\nAddmmBackward0\n\n\n\n140025091525216->140025091525072\n\n\n\n\n\n140025091526128\n\nAddBackward0\nstep1.fc.bias\n(1)\n\n\n\n140025091526128->140025091525216\n\n\n\n\n\n140025091526416\n\nAccumulateGrad\n\n\n\n140025091526416->140025091526128\n\n\n\n\n\n140028156436736\n\nAddmmBackward0\n\n\n\n140025091526416->140028156436736\n\n\n\n\n\n140028155952000\n\nstep0.fc.bias\n(1)\n\n\n\n140028155952000->140025091526416\n\n\n\n\n\n140025091524976\n\nMulBackward0\n\n\n\n140025091524976->140025091526128\n\n\n\n\n\n140025091526560\n\nViewBackward0\n\n\n\n140025091526560->140025091524976\n\n\n\n\n\n140025091525456\n\nSumBackward1\n\n\n\n140025091525456->140025091526560\n\n\n\n\n\n140025091524112\n\nMseLossBackwardBackward0\n\n\n\n140025091524112->140025091525456\n\n\n\n\n\n140024973742672\n\nTBackward0\n\n\n\n140025091524112->140024973742672\n\n\n\n\n\n140024973742288\n\nMulBackward0\n\n\n\n140024973742288->140025091524112\n\n\n\n\n\n140024973742384\n\nAccumulateGrad\n\n\n\n140024973742384->140024973742288\n\n\n\n\n\n140025091549440\n\nmeta_parameter\n()\n\n\n\n140025091549440->140024973742384\n\n\n\n\n\n140028156436736->140025091524112\n\n\n\n\n\n140025091525408\n\nTBackward0\n\n\n\n140025091525408->140028156436736\n\n\n\n\n\n140025091526224\n\nAccumulateGrad\n\n\n\n140025091526224->140025091525408\n\n\n\n\n\n140025091524928\n\nAddBackward0\nstep1.fc.weight\n(1, " + "16)\n\n\n\n140025091526224->140025091524928\n\n\n\n\n\n140028155952880\n\nstep0.fc.weight\n(1, 
16)\n\n\n\n140028155952880->140025091526224\n\n\n\n\n\n140025091524448\n\nTBackward0\n\n\n\n140025091524448->140025091525216\n\n\n\n\n\n140025091524928->140025091524448\n\n\n\n\n\n140025091525600\n\nMulBackward0\n\n\n\n140025091525600->140025091524928\n\n\n\n\n\n140024973742144\n\nTBackward0\n\n\n\n140024973742144->140025091525600\n\n\n\n\n\n140024973742576\n\nTBackward0\n\n\n\n140024973742576->140024973742144\n\n\n\n\n\n140024973742480\n\nMmBackward0\n\n\n\n140024973742480->140024973742576\n\n\n\n\n\n140024973742672->140024973742480\n\n\n\n\n\n" + }, + "metadata" : {}, + "output_type" : "display_data" + } + ], + "source" : [ + "# Extract `state_dict`` for updated network\n", + "one_step_net_state = torchopt.extract_state_dict(net, enable_visual=True, " + "visual_prefix='step1.')\n", + "one_step_optim_state = torchopt.extract_state_dict(optim)\n", + "\n", + "# Calculate outer loss\n", + "outer_loss = loss_fn(net(x), y)\n", + "print(f'outer loss: {outer_loss:.4f}')\n", + "display(\n", + " torchopt.visual.make_dot(\n", + " outer_loss,\n", + " params=(\n", + " init_net_state,\n", + " one_step_net_state,\n", + " {'meta_parameter': meta_parameter, 'outer_loss': outer_loss},\n", + " ),\n", + " )\n", + ")" + ] + }, + { + "cell_type" : "markdown", + "metadata" : {}, + "source" : ["Then we backward the loss to conduct outer-loop meta-optimization."] + }, + { + "cell_type" : "code", + "execution_count" : 7, + "metadata" : {}, + "outputs" : [ { + "name" : "stdout", + "output_type" : "stream", + "text" : [ + "meta_parameter.grad = tensor(-0.1205)\n", + "meta_parameter = Parameter containing:\n", + "tensor(1.1000, requires_grad=True)\n" + ] + } ], + "source" : [ + "meta_optim.zero_grad()\n", + "outer_loss.backward()\n", + "print(f'meta_parameter.grad = {meta_parameter.grad!r}')\n", + "meta_optim.step()\n", + "print(f'meta_parameter = {meta_parameter!r}')" + ] + }, + { + "cell_type" : "markdown", + "metadata" : {}, + "source" : + ["We have already conducted one bi-level optimization and optimize our meta-parameters. " + "When you want to conduct the second bi-level optimization, you need to be careful " + "whether you need to use the `stop_gradient` function. For example, if your new " + "inner-loop parameters directly inherits previous inner-loop parameters (which is a " + "common strategy in many meta-learning algorithms like Meta-Gradient Reinforcement " + "Learning (MGRL) ([arXiv:1805.09801](https://arxiv.org/abs/1805.09801))), you might " + "need `stop_gradient` function."] + }, + { + "cell_type" : "markdown", + "metadata" : {}, + "source" : [ + "In general, the backpropagation only frees saved tensors (often used as auxiliary data " + "for computing the gradient) but the computation graph remains. Once the outer iteration " + "is finished, if you want to use any intermediate network parameters produced by the inner " + "loop for the next bi-level iteration, you should detach them from the computation " + "graph.\n", + "\n", + "There are two main reasons:\n", + "\n", + "- The network parameters are still connected to the previous computation graph " + "(`.grad_fn` is not `None`). If later the gradient backpropagate to these parameters, the " + "PyTorch backward engine will try to backpropagate through the previous computation graph. 
" + "This will raise a `RuntimeError`: Trying to backward through the graph a second time...\n", + "- If we do not detach the computation graph, the computation graph connected to these " + "parameters can not be freed by GC (Garbage Collector) until these parameters are " + "collected by GC." + ] + }, + { + "cell_type" : "markdown", + "metadata" : {}, + "source" : ["Now let us see what will happen if we do not use the `stop_gradient` function " + "before we conduct the second bi-level process."] }, { - "data": { - "image/svg+xml": "\n\n\n\n\n\n%3\n\n\n\n140024973754912\n\nouter_loss\n()\n\n\n\n140024956770528\n\nMseLossBackward0\n\n\n\n140024956770528->140024973754912\n\n\n\n\n\n140024956772112\n\nAddmmBackward0\n\n\n\n140024956772112->140024956770528\n\n\n\n\n\n140024956770720\n\nAddBackward0\n\n\n\n140024956770720->140024956772112\n\n\n\n\n\n140024962101312\n\nAccumulateGrad\n\n\n\n140024962101312->140024956770720\n\n\n\n\n\n140024973745552\n\nAddmmBackward0\n\n\n\n140024962101312->140024973745552\n\n\n\n\n\n140025091547520\n\nstep1.detached.fc.bias\n(1)\n\n\n\n140025091547520->140024962101312\n\n\n\n\n\n140024971586864\n\nMulBackward0\n\n\n\n140024971586864->140024956770720\n\n\n\n\n\n140024973742528\n\nViewBackward0\n\n\n\n140024973742528->140024971586864\n\n\n\n\n\n140024973743968\n\nSumBackward1\n\n\n\n140024973743968->140024973742528\n\n\n\n\n\n140024973742768\n\nMseLossBackwardBackward0\n\n\n\n140024973742768->140024973743968\n\n\n\n\n\n140024973744400\n\nTBackward0\n\n\n\n140024973742768->140024973744400\n\n\n\n\n\n140024973744688\n\nMulBackward0\n\n\n\n140024973744688->140024973742768\n\n\n\n\n\n140024973745264\n\nAccumulateGrad\n\n\n\n140024973745264->140024973744688\n\n\n\n\n\n140025091549440\n\nmeta_parameter\n()\n\n\n\n140025091549440->140024973745264\n\n\n\n\n\n140024973745552->140024973742768\n\n\n\n\n\n140024973745168\n\nTBackward0\n\n\n\n140024973745168->140024973745552\n\n\n\n\n\n140024973744256\n\nAccumulateGrad\n\n\n\n140024973744256->140024973745168\n\n\n\n\n\n140024973745984\n\nAddBackward0\n\n\n\n140024973744256->140024973745984\n\n\n\n\n\n140027828983424\n\nstep1.detached.fc.weight\n(1, 16)\n\n\n\n140027828983424->140024973744256\n\n\n\n\n\n140024956771632\n\nTBackward0\n\n\n\n140024956771632->140024956772112\n\n\n\n\n\n140024973745984->140024956771632\n\n\n\n\n\n140024973743728\n\nMulBackward0\n\n\n\n140024973743728->140024973745984\n\n\n\n\n\n140024973743344\n\nTBackward0\n\n\n\n140024973743344->140024973743728\n\n\n\n\n\n140024973745312\n\nTBackward0\n\n\n\n140024973745312->140024973743344\n\n\n\n\n\n140024973743200\n\nMmBackward0\n\n\n\n140024973743200->140024973745312\n\n\n\n\n\n140024973744400->140024973743200\n\n\n\n\n\n" - }, - "metadata": {}, - "output_type": "display_data" + "cell_type" : "code", + "execution_count" : 8, + "metadata" : {}, + "outputs" : [ + { + "name" : "stdout", + "output_type" : "stream", + "text" : ["\n"] + }, + { + "data" : { + "image/svg+xml" : + 
"\n\n\n\n\n\n%3\n\n\n\n140024973755152\n\nouter_loss\n()\n\n\n\n140027829363232\n\nMseLossBackward0\n\n\n\n140027829363232->140024973755152\n\n\n\n\n\n140027829363616\n\nAddmmBackward0\n\n\n\n140027829363616->140027829363232\n\n\n\n\n\n140027829366544\n\nAddBackward0\n\n\n\n140027829366544->140027829363616\n\n\n\n\n\n140025091526128\n\nAddBackward0\nstep1.fc.bias\n(1)\n\n\n\n140025091526128->140027829366544\n\n\n\n\n\n140025091725152\n\nAddmmBackward0\n\n\n\n140025091526128->140025091725152\n\n\n\n\n\n140025091526416\n\nAccumulateGrad\n\n\n\n140025091526416->140025091526128\n\n\n\n\n\n140028156436736\n\nAddmmBackward0\n\n\n\n140025091526416->140028156436736\n\n\n\n\n\n140028155952000\n\nstep0.fc.bias\n(1)\n\n\n\n140028155952000->140025091526416\n\n\n\n\n\n140025091524976\n\nMulBackward0\n\n\n\n140025091524976->140025091526128\n\n\n\n\n\n140025091526560\n\nViewBackward0\n\n\n\n140025091526560->140025091524976\n\n\n\n\n\n140025091525456\n\nSumBackward1\n\n\n\n140025091525456->140025091526560\n\n\n\n\n\n140025091524112\n\nMseLossBackwardBackward0\n\n\n\n140025091524112->140025091525456\n\n\n\n\n\n140024973742672\n\nTBackward0\n\n\n\n140025091524112->140024973742672\n\n\n\n\n\n140024973742288\n\nMulBackward0\n\n\n\n140024973742288->140025091524112\n\n\n\n\n\n140024973742384\n\nAccumulateGrad\n\n\n\n140024973742384->140024973742288\n\n\n\n\n\n140025091726064\n\nMulBackward0\n\n\n\n140024973742384->140025091726064\n\n\n\n\n\n140025091549440\n\nmeta_parameter\n()\n\n\n\n140025091549440->140024973742384\n\n\n\n\n\n140028156436736->140025091524112\n\n\n\n\n\n140025091525408\n\nTBackward0\n\n\n\n140025091525408->140028156436736\n\n\n\n\n\n140025091526224\n\nAccumulateGrad\n\n\n\n140025091526224->140025091525408\n\n\n\n\n\n140025091524928\n\nAddBackward0\nstep1.fc.weight\n(1, " + "16)\n\n\n\n140025091526224->140025091524928\n\n\n\n\n\n140028155952880\n\nstep0.fc.weight\n(1, 16)\n\n\n\n140028155952880->140025091526224\n\n\n\n\n\n140025091726784\n\nMulBackward0\n\n\n\n140025091726784->140027829366544\n\n\n\n\n\n140025091726688\n\nViewBackward0\n\n\n\n140025091726688->140025091726784\n\n\n\n\n\n140025091725680\n\nSumBackward1\n\n\n\n140025091725680->140025091726688\n\n\n\n\n\n140025091726112\n\nMseLossBackwardBackward0\n\n\n\n140025091726112->140025091725680\n\n\n\n\n\n140025091726880\n\nTBackward0\n\n\n\n140025091726112->140025091726880\n\n\n\n\n\n140025091726064->140025091726112\n\n\n\n\n\n140025091725152->140025091726112\n\n\n\n\n\n140025091725824\n\nTBackward0\n\n\n\n140025091725824->140025091725152\n\n\n\n\n\n140025091524928->140025091725824\n\n\n\n\n\n140025091726016\n\nAddBackward0\n\n\n\n140025091524928->140025091726016\n\n\n\n\n\n140025091525600\n\nMulBackward0\n\n\n\n140025091525600->140025091524928\n\n\n\n\n\n140024973742144\n\nTBackward0\n\n\n\n140024973742144->140025091525600\n\n\n\n\n\n140024973742576\n\nTBackward0\n\n\n\n140024973742576->140024973742144\n\n\n\n\n\n140024973742480\n\nMmBackward0\n\n\n\n140024973742480->140024973742576\n\n\n\n\n\n140024973742672->140024973742480\n\n\n\n\n\n140027829365632\n\nTBackward0\n\n\n\n140027829365632->140027829363616\n\n\n\n\n\n140025091726016->140027829365632\n\n\n\n\n\n140025091726544\n\nMulBackward0\n\n\n\n140025091726544->140025091726016\n\n\n\n\n\n140025091726448\n\nTBackward0\n\n\n\n140025091726448->140025091726544\n\n\n\n\n\n140025091725584\n\nTBackward0\n\n\n\n140025091725584->140025091726448\n\n\n\n\n\n140025091727024\n\nMmBackward0\n\n\n\n140025091727024->140025091725584\n\n\n\n\n\n140025091726880->140025091727024\n\n\n\n\n\n" + }, + 
"metadata" : {}, + "output_type" : "display_data" + }, + { + "data" : { + "text/html" : [ + "
╭─────────────────────────────────────── "
+              "Traceback (most recent call last) "
+              "───────────────────────────────────────╮\n",
+              " /tmp/ipykernel_3962266/4178930003.py:21 in <module>      "
+              "                                                       \n",
+              "             "
+              "                                                                                    "
+              "                \n",
+              " [Errno "
+              "2] No such file or directory: '/tmp/ipykernel_3962266/4178930003.py'         "
+              "                            \n",
+              "             "
+              "                                                                                    "
+              "                \n",
+              " /home/PanXuehai/Miniconda3/envs/torchopt/lib/python3.9/site-packages/"
+              "torch/_tensor.py:487 in backward           \n",
+              "             "
+              "                                                                                    "
+              "                \n",
+              "    484 │   │   │   │   "
+              "create_graph=create_graph,                                                   "
+              "            \n",
+              "    485 │   │   │   │   "
+              "inputs=inputs,                                                               "
+              "            \n",
+              "    486 │   │   │   )  "
+              "                                                                                    "
+              "      \n",
+              "  487 │   │   "
+              "torch.autograd.backward(                                                     "
+              "                    \n",
+              "    488 │   │   │   "
+              "self, "
+              "gradient, retain_graph, create_graph, inputs=inputs                                 "
+              "   \n",
+              "    489 │   │   )      "
+              "                                                                                    "
+              "      \n",
+              "    490                "
+              "                                                                                    "
+              "      \n",
+              "             "
+              "                                                                                    "
+              "                \n",
+              " ╭───────────────────────── "
+              "locals ──────────────────────────╮                                           "
+              "        \n",
+              "  create_graph = "
+              "False         "
+              "                                                                                \n",
+              "      gradient = "
+              "None          "
+              "                                                                                \n",
+              "        inputs = "
+              "None          "
+              "                                                                                \n",
+              "  retain_graph = "
+              "None          "
+              "                                                                                \n",
+              "          self = "
+              "tensor(0.1203, grad_fn=<MseLossBackward0>)                                                    \n",
+              " ╰───────────────────────────────────────────────────────────╯      "
+              "                                             \n",
+              "             "
+              "                                                                                    "
+              "                \n",
+              " /home/PanXuehai/Miniconda3/envs/torchopt/lib/python3.9/site-packages/"
+              "torch/autograd/__init__.py:197 in backward \n",
+              "             "
+              "                                                                                    "
+              "                \n",
+              "   194 │   # The reason we "
+              "repeat same the comment below is that                                        "
+              "         \n",
+              "   195 │   # some Python "
+              "versions print out the first line of a multi-line function                   "
+              "           \n",
+              "   196 │   # calls in the "
+              "traceback and some print out the last line                                   "
+              "          \n",
+              " 197 │   "
+              "Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the ba   "
+              "                \n",
+              "   198 │   │   "
+              "tensors, grad_tensors_, retain_graph, create_graph, inputs,                  "
+              "                     \n",
+              "   199 │   │   "
+              "allow_unreachable=True, accumulate_grad=True)  # Calls into the C++ engine to r            "
+              "       \n",
+              "   200                 "
+              "                                                                                    "
+              "      \n",
+              "             "
+              "                                                                                    "
+              "                \n",
+              " ╭──────────────────────────── locals ────────────────────────────╮ "
+              "                                             \n",
+              "    create_graph = "
+              "False         "
+              "                                                                           "
+              "   \n",
+              "    grad_tensors = "
+              "None          "
+              "                                                                           "
+              "   \n",
+              "   grad_tensors_ = "
+              "(tensor(1.),)                          "
+              "             "
+              "                                        \n",
+              "  grad_variables = "
+              "None          "
+              "                                                                           "
+              "   \n",
+              "          inputs = "
+              "()                                         "
+              "             "
+              "                                    \n",
+              "    retain_graph = "
+              "False         "
+              "                                                                           "
+              "   \n",
+              "         tensors = "
+              "(tensor(0.1203, grad_fn=<MseLossBackward0>),)                                   "
+              "            \n",
+              " ╰────────────────────────────────────────────────────────────────╯ "
+              "                                             \n",
+              ""
+              "╰───────────────────────────────────────────────────────────────────────────────────"
+              "──────────────────────────────╯\n",
+              "RuntimeError: Trying to backward through the graph a second time "
+              "(or directly access saved tensors after "
+              "they have \n",
+              "already been freed). Saved intermediate "
+              "values of the graph are freed when you call .backward() or autograd.grad().\n",
+              "Specify retain_graph=True if you need to backward through the graph "
+              "a second time or if you need to access saved \n",
+              "tensors after calling backward.\n",
+              "
\n" + ], + "text/plain" : [ + "\u001b[31m╭─\u001b[0m\u001b[31m────────────────────────────────────── " + "\u001b[0m\u001b[1;31mTraceback \u001b[0m\u001b[1;2;31m(most recent call " + "last)\u001b[0m\u001b[31m " + "──────────────────────────────────────\u001b[0m\u001b[31m─╮\u001b[0m\n", + "\u001b[31m│\u001b[0m " + "\u001b[2;33m/tmp/ipykernel_3962266/" + "\u001b[0m\u001b[1;33m4178930003.py\u001b[0m:\u001b[94m21\u001b[0m in " + "\u001b[92m\u001b[0m " + " \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m " + " \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[3;31m[Errno 2] No such file or directory: " + "'/tmp/ipykernel_3962266/4178930003.py'\u001b[0m " + "\u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m " + " \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m " + "\u001b[2;33m/home/PanXuehai/Miniconda3/envs/torchopt/lib/python3.9/site-packages/" + "torch/\u001b[0m\u001b[1;33m_tensor.py\u001b[0m:\u001b[94m487\u001b[0m in " + "\u001b[92mbackward\u001b[0m \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m " + " \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[2m 484 \u001b[0m\u001b[2m│ │ │ │ " + "\u001b[0mcreate_graph=create_graph, " + " \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[2m 485 \u001b[0m\u001b[2m│ │ │ │ " + "\u001b[0minputs=inputs, " + " \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[2m 486 \u001b[0m\u001b[2m│ │ │ \u001b[0m) " + " " + " \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[31m❱ \u001b[0m 487 \u001b[2m│ │ " + "\u001b[0mtorch.autograd.backward( " + " \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[2m 488 \u001b[0m\u001b[2m│ │ │ " + "\u001b[0m\u001b[96mself\u001b[0m, gradient, retain_graph, create_graph, " + "inputs=inputs \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[2m 489 \u001b[0m\u001b[2m│ │ \u001b[0m) " + " " + " \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[2m 490 \u001b[0m " + " " + "\u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m " + " \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[33m╭─\u001b[0m\u001b[33m──────────────────────── locals " + "─────────────────────────\u001b[0m\u001b[33m─╮\u001b[0m " + " \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[33m│\u001b[0m create_graph = \u001b[94mFalse\u001b[0m " + " \u001b[33m│\u001b[0m " + " \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[33m│\u001b[0m gradient = \u001b[94mNone\u001b[0m " + " \u001b[33m│\u001b[0m " + " \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[33m│\u001b[0m inputs = \u001b[94mNone\u001b[0m " + " \u001b[33m│\u001b[0m " + " \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[33m│\u001b[0m retain_graph = \u001b[94mNone\u001b[0m " + " \u001b[33m│\u001b[0m " + " \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[33m│\u001b[0m self = " + "\u001b[1;35mtensor\u001b[0m\u001b[1m(\u001b[0m\u001b[94m0.1203\u001b[0m, " + "\u001b[33mgrad_fn\u001b[0m=\u001b[1m<\u001b[0m\u001b[1;95mMseLossBackward0\u001b[" + "0m\u001b[1m>\u001b[0m\u001b[1m)\u001b[0m \u001b[33m│\u001b[0m " + " \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m " + "\u001b[33m╰───────────────────────────────────────────────────────────╯\u001b[0m " + " \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m " + " \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m " + "\u001b[2;33m/home/PanXuehai/Miniconda3/envs/torchopt/lib/python3.9/site-packages/" + "torch/autograd/\u001b[0m\u001b[1;33m__init__.py\u001b[0m:\u001b[94m197\u001b[0m in " + "\u001b[92mbackward\u001b[0m \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m " + " \u001b[31m│\u001b[0m\n", + 
"\u001b[31m│\u001b[0m \u001b[2m194 \u001b[0m\u001b[2m│ \u001b[0m\u001b[2m# The " + "reason we repeat same the comment below is that\u001b[0m " + " \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[2m195 \u001b[0m\u001b[2m│ \u001b[0m\u001b[2m# some " + "Python versions print out the first line of a multi-line function\u001b[0m " + " \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[2m196 \u001b[0m\u001b[2m│ \u001b[0m\u001b[2m# calls " + "in the traceback and some print out the last line\u001b[0m " + " \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[31m❱ \u001b[0m197 \u001b[2m│ " + "\u001b[0mVariable._execution_engine.run_backward( \u001b[2m# Calls into the C++ " + "engine to run the ba\u001b[0m \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[2m198 \u001b[0m\u001b[2m│ │ \u001b[0mtensors, " + "grad_tensors_, retain_graph, create_graph, inputs, " + " \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[2m199 \u001b[0m\u001b[2m│ │ " + "\u001b[0mallow_unreachable=\u001b[94mTrue\u001b[0m, " + "accumulate_grad=\u001b[94mTrue\u001b[0m) \u001b[2m# Calls into the C++ engine to " + "r\u001b[0m \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[2m200 \u001b[0m " + " " + "\u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m " + " \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[33m╭─\u001b[0m\u001b[33m─────────────────────────── " + "locals ───────────────────────────\u001b[0m\u001b[33m─╮\u001b[0m " + " \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[33m│\u001b[0m create_graph = \u001b[94mFalse\u001b[0m " + " \u001b[33m│\u001b[0m " + " \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[33m│\u001b[0m grad_tensors = \u001b[94mNone\u001b[0m " + " \u001b[33m│\u001b[0m " + " \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[33m│\u001b[0m grad_tensors_ = " + "\u001b[1m(\u001b[0m\u001b[1;35mtensor\u001b[0m\u001b[1m(\u001b[0m\u001b[94m1\u001b[" + "0m.\u001b[1m)\u001b[0m,\u001b[1m)\u001b[0m " + "\u001b[33m│\u001b[0m " + "\u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[33m│\u001b[0m grad_variables = \u001b[94mNone\u001b[0m " + " \u001b[33m│\u001b[0m " + " \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[33m│\u001b[0m inputs = " + "\u001b[1m(\u001b[0m\u001b[1m)\u001b[0m " + "\u001b[33m│\u001b[0m " + "\u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[33m│\u001b[0m retain_graph = \u001b[94mFalse\u001b[0m " + " \u001b[33m│\u001b[0m " + " \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[33m│\u001b[0m tensors = " + "\u001b[1m(\u001b[0m\u001b[1;35mtensor\u001b[0m\u001b[1m(\u001b[0m\u001b[94m0." + "1203\u001b[0m, " + "\u001b[33mgrad_fn\u001b[0m=\u001b[1m<\u001b[0m\u001b[1;95mMseLossBackward0\u001b[" + "0m\u001b[1m>\u001b[0m\u001b[1m)\u001b[0m,\u001b[1m)\u001b[0m \u001b[33m│\u001b[0m " + " \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m " + "\u001b[33m╰────────────────────────────────────────────────────────────────╯\u001b[" + "0m \u001b[31m│\u001b[0m\n", + "\u001b[" + "31m╰────────────────────────────────────────────────────────────────────────────────" + "─────────────────────────────────╯\u001b[0m\n", + "\u001b[1;91mRuntimeError: \u001b[0mTrying to backward through the graph a second " + "time \u001b[1m(\u001b[0mor directly access saved tensors after they have \n", + "already been freed\u001b[1m)\u001b[0m. 
Saved intermediate values of the graph are " + "freed when you call " + "\u001b[1;35m.backward\u001b[0m\u001b[1m(\u001b[0m\u001b[1m)\u001b[0m or " + "\u001b[1;35mautograd.grad\u001b[0m\u001b[1m(\u001b[0m\u001b[1m)\u001b[0m.\n", + "Specify \u001b[33mretain_graph\u001b[0m=\u001b[3;92mTrue\u001b[0m if you need to " + "backward through the graph a second time or if you need to access saved \n", + "tensors after calling backward.\n" + ] + }, + "metadata" : {}, + "output_type" : "display_data" + } + ], + "source" : [ + "# Inner update with attached computation graph\n", + "inner_loss = loss_fn(net(x), y)\n", + "loss = inner_loss * meta_parameter\n", + "optim.step(loss)\n", + "\n", + "# Outer forward process\n", + "outer_loss = loss_fn(net(x), y)\n", + "display(\n", + " torchopt.visual.make_dot(\n", + " outer_loss,\n", + " params=(\n", + " init_net_state,\n", + " one_step_net_state,\n", + " {'meta_parameter': meta_parameter, 'outer_loss': outer_loss},\n", + " ),\n", + " )\n", + ")\n", + "\n", + "# Outer update\n", + "meta_optim.zero_grad()\n", + "outer_loss.backward()\n", + "meta_optim.step()" + ] + }, + { + "cell_type" : "markdown", + "metadata" : {}, + "source" : [ + "From the graph we can see, directly conducting the second bi-level process links the " + "graph of first and second bi-level process together. We should manually stop gradient " + "with `torchopt.stop_gradient`. `torchopt.stop_gradient` will detach the node of gradient " + "graph and make it become a leaf node. It allows the input of network, optimizer, or state " + "dictionary and the gradient operation happens in an in-place manner.\n", + "\n", + "Let's use `recover_state_dict` to come back to one-step updated states." + ] + }, + { + "cell_type" : "code", + "execution_count" : 9, + "metadata" : {}, + "outputs" : [], + "source" : [ + "# Reset to previous one-step updated states\n", + "torchopt.recover_state_dict(net, one_step_net_state)\n", + "torchopt.recover_state_dict(optim, one_step_optim_state)" + ] + }, + { + "cell_type" : "markdown", + "metadata" : {}, + "source" : ["And finally, Let's conduct the stop-gradient operation before the second " + "meta-optimization step. 
"] + }, + { + "cell_type" : "code", + "execution_count" : 10, + "metadata" : {}, + "outputs" : [ + { + "name" : "stdout", + "output_type" : "stream", + "text" : [ + "meta_parameter.grad = tensor(-0.0635)\n", + "meta_parameter = Parameter containing:\n", + "tensor(1.1940, requires_grad=True)\n", + "\n" + ] + }, + { + "data" : { + "image/svg+xml" : + "\n\n\n\n\n\n%3\n\n\n\n140024973754912\n\nouter_loss\n()\n\n\n\n140024956770528\n\nMseLossBackward0\n\n\n\n140024956770528->140024973754912\n\n\n\n\n\n140024956772112\n\nAddmmBackward0\n\n\n\n140024956772112->140024956770528\n\n\n\n\n\n140024956770720\n\nAddBackward0\n\n\n\n140024956770720->140024956772112\n\n\n\n\n\n140024962101312\n\nAccumulateGrad\n\n\n\n140024962101312->140024956770720\n\n\n\n\n\n140024973745552\n\nAddmmBackward0\n\n\n\n140024962101312->140024973745552\n\n\n\n\n\n140025091547520\n\nstep1.detached.fc.bias\n(1)\n\n\n\n140025091547520->140024962101312\n\n\n\n\n\n140024971586864\n\nMulBackward0\n\n\n\n140024971586864->140024956770720\n\n\n\n\n\n140024973742528\n\nViewBackward0\n\n\n\n140024973742528->140024971586864\n\n\n\n\n\n140024973743968\n\nSumBackward1\n\n\n\n140024973743968->140024973742528\n\n\n\n\n\n140024973742768\n\nMseLossBackwardBackward0\n\n\n\n140024973742768->140024973743968\n\n\n\n\n\n140024973744400\n\nTBackward0\n\n\n\n140024973742768->140024973744400\n\n\n\n\n\n140024973744688\n\nMulBackward0\n\n\n\n140024973744688->140024973742768\n\n\n\n\n\n140024973745264\n\nAccumulateGrad\n\n\n\n140024973745264->140024973744688\n\n\n\n\n\n140025091549440\n\nmeta_parameter\n()\n\n\n\n140025091549440->140024973745264\n\n\n\n\n\n140024973745552->140024973742768\n\n\n\n\n\n140024973745168\n\nTBackward0\n\n\n\n140024973745168->140024973745552\n\n\n\n\n\n140024973744256\n\nAccumulateGrad\n\n\n\n140024973744256->140024973745168\n\n\n\n\n\n140024973745984\n\nAddBackward0\n\n\n\n140024973744256->140024973745984\n\n\n\n\n\n140027828983424\n\nstep1.detached.fc.weight\n(1, " + "16)\n\n\n\n140027828983424->140024973744256\n\n\n\n\n\n140024956771632\n\nTBackward0\n\n\n\n140024956771632->140024956772112\n\n\n\n\n\n140024973745984->140024956771632\n\n\n\n\n\n140024973743728\n\nMulBackward0\n\n\n\n140024973743728->140024973745984\n\n\n\n\n\n140024973743344\n\nTBackward0\n\n\n\n140024973743344->140024973743728\n\n\n\n\n\n140024973745312\n\nTBackward0\n\n\n\n140024973745312->140024973743344\n\n\n\n\n\n140024973743200\n\nMmBackward0\n\n\n\n140024973743200->140024973745312\n\n\n\n\n\n140024973744400->140024973743200\n\n\n\n\n\n" + }, + "metadata" : {}, + "output_type" : "display_data" + } + ], + "source" : [ + "# Stop gradient and make them become the leaf node\n", + "torchopt.stop_gradient(net)\n", + "torchopt.stop_gradient(optim)\n", + "one_step_net_state_detached = torchopt.extract_state_dict(\n", + " net, enable_visual=True, visual_prefix='step1.detached.'\n", + ")\n", + "\n", + "# Inner update\n", + "inner_loss = loss_fn(net(x), y)\n", + "loss = inner_loss * meta_parameter\n", + "optim.step(loss)\n", + "\n", + "# Outer update\n", + "outer_loss = loss_fn(net(x), y)\n", + "meta_optim.zero_grad()\n", + "outer_loss.backward()\n", + "print(f'meta_parameter.grad = {meta_parameter.grad!r}')\n", + "meta_optim.step()\n", + "print(f'meta_parameter = {meta_parameter!r}')\n", + "\n", + "display(\n", + " torchopt.visual.make_dot(\n", + " outer_loss,\n", + " params=(\n", + " one_step_net_state_detached,\n", + " {'meta_parameter': meta_parameter, 'outer_loss': outer_loss},\n", + " ),\n", + " )\n", + ")" + ] + }, + { + "cell_type" : "markdown", + 
"metadata" : {}, + "source" : ["The gradient graph is the same with the first meta-optimization's gradient " + "graph and we successfully conduct the second bi-level process."] + } + ], + "metadata" : { + "kernelspec" : + {"display_name" : "Python 3 (ipykernel)", "language" : "python", "name" : "python3"}, + "language_info" : { + "codemirror_mode" : {"name" : "ipython", "version" : 3}, + "file_extension" : ".py", + "mimetype" : "text/x-python", + "name" : "python", + "nbconvert_exporter" : "python", + "pygments_lexer" : "ipython3", + "version" : "3.9.15" + }, + "vscode" : { + "interpreter" : + {"hash" : "2a8cc1ff2cbc47027bf9993941710d9ab9175f14080903d9c7c432ee63d681da"} } - ], - "source": [ - "# Stop gradient and make them become the leaf node\n", - "torchopt.stop_gradient(net)\n", - "torchopt.stop_gradient(optim)\n", - "one_step_net_state_detached = torchopt.extract_state_dict(\n", - " net, enable_visual=True, visual_prefix='step1.detached.'\n", - ")\n", - "\n", - "# Inner update\n", - "inner_loss = loss_fn(net(x), y)\n", - "loss = inner_loss * meta_parameter\n", - "optim.step(loss)\n", - "\n", - "# Outer update\n", - "outer_loss = loss_fn(net(x), y)\n", - "meta_optim.zero_grad()\n", - "outer_loss.backward()\n", - "print(f'meta_parameter.grad = {meta_parameter.grad!r}')\n", - "meta_optim.step()\n", - "print(f'meta_parameter = {meta_parameter!r}')\n", - "\n", - "display(\n", - " torchopt.visual.make_dot(\n", - " outer_loss,\n", - " params=(\n", - " one_step_net_state_detached,\n", - " {'meta_parameter': meta_parameter, 'outer_loss': outer_loss},\n", - " ),\n", - " )\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The gradient graph is the same with the first meta-optimization's gradient graph and we successfully conduct the second bi-level process." - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.9.15" }, - "vscode": { - "interpreter": { - "hash": "2a8cc1ff2cbc47027bf9993941710d9ab9175f14080903d9c7c432ee63d681da" - } - } - }, - "nbformat": 4, - "nbformat_minor": 4 + "nbformat" : 4, + "nbformat_minor" : 4 } diff --git a/tutorials/5_Implicit_Differentiation.ipynb b/tutorials/5_Implicit_Differentiation.ipynb index 23407801..a7dc8ed5 100644 --- a/tutorials/5_Implicit_Differentiation.ipynb +++ b/tutorials/5_Implicit_Differentiation.ipynb @@ -1,576 +1,590 @@ { - "cells": [ - { - "cell_type": "markdown", - "id": "8850c832-3b54-4971-8ee0-2cd64b585ea8", - "metadata": {}, - "source": [ - "# TorchOpt for Implicit Differentiation" - ] - }, - { - "cell_type": "markdown", - "id": "2b547376", - "metadata": {}, - "source": [ - "[](https://colab.research.google.com/github/metaopt/torchopt/blob/main/tutorials/5_Implicit_Differentiation.ipynb)" - ] - }, - { - "cell_type": "markdown", - "id": "8d7f9865-dc02-43d4-be90-da1160c4e4dd", - "metadata": {}, - "source": [ - "By treating the solution $\\phi^{\\star}$ as an implicit function of $\\theta$, the idea of implicit differentiation is to directly get analytical best-response derivatives $\\partial \\phi^{\\star}(\\theta)/ \\partial \\theta$ by implicit function theorem. This is suitable for algorithms when the inner-level optimal solution is achieved ${\\left. 
\\frac{\\partial F (\\phi, \\theta)}{\\partial \\phi} \\right\\rvert}_{\\phi = \\phi^{\\star}} = 0$ or reaches some stationary conditions $F (\\phi^{\\star}, \\theta) = 0$, such as [iMAML](https://arxiv.org/abs/1909.04630) and [DEQ](https://arxiv.org/abs/1909.01377)." - ] - }, - { - "cell_type": "markdown", - "id": "d7e4b9e1-115f-45ad-a9b3-ea338bcfe6dd", - "metadata": {}, - "source": [ - "In this tutorial, we will introduce how TorchOpt can be used to conduct implicit differentiation." - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "id": "8f13ae67-e328-409f-84a8-1fc425c03a66", - "metadata": {}, - "outputs": [], - "source": [ - "import functorch\n", - "import torch\n", - "import torch.nn as nn\n", - "import torch.nn.functional as F\n", - "\n", - "import torchopt" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "id": "0cdaac49-4b94-4900-9bb5-a39057ac8b21", - "metadata": {}, - "source": [ - "## 1. Functional API\n", - "\n", - "The basic functional API is `torchopt.diff.implicit.custom_root`, which is used as the decorator for the forward process implicit gradient procedures. Users are required to implement the stationary conditions for the inner-loop process, which will be used as the input of custom_root decorator. We show the pseudo code in the following part.\n", - "\n", - "```python\n", - "# Functional API for implicit gradient\n", - "def stationary(params, meta_params, data):\n", - " # stationary condition construction\n", - " return stationary condition\n", - "\n", - "# Decorator that wraps the function\n", - "# Optionally specify the linear solver (conjugate gradient or Neumann series)\n", - "@torchopt.diff.implicit.custom_root(stationary, solve=linear_solver)\n", - "def solve(params, meta_params, data):\n", - " # Forward optimization process for params\n", - " return optimal_params\n", - "\n", - "# Define params, meta_params and get data\n", - "params, meta_prams, data = ..., ..., ...\n", - "optimal_params = solve(params, meta_params, data)\n", - "loss = outer_loss(optimal_params)\n", - "\n", - "meta_grads = torch.autograd.grad(loss, meta_params)\n", - "```" - ] - }, - { - "cell_type": "markdown", - "id": "dbef87df-2164-4f1d-8919-37a6fbdc5011", - "metadata": {}, - "source": [ - "Here we use the example of [iMAML](https://arxiv.org/abs/1909.04630) as a real example. For iMAML, the inner-loop objective is described by the following equation.\n", - "\n", - "$$\n", - "{\\mathcal{Alg}}^{\\star} \\left( \\boldsymbol{\\theta}, \\mathcal{D}_{i}^{\\text{tr}} \\right) = \\underset{\\phi'}{\\operatorname{\\arg \\min}} ~ G \\left( \\boldsymbol{\\phi}', \\boldsymbol{\\theta} \\right) \\triangleq \\mathcal{L} \\left( \\boldsymbol{\\phi}', \\mathcal{D}_{i}^{\\text{tr}} \\right) + \\frac{\\lambda}{2} {\\left\\| \\boldsymbol{\\phi}' - \\boldsymbol{\\theta} \\right\\|}^{2}\n", - "$$\n", - "\n", - "According to this function, we can define the forward function `inner_solver`, where we solve this equation based on sufficient gradient descents. For such inner-loop process, the optimality condition is that the gradient w.r.t inner-loop parameter is $0$.\n", - "\n", - "$$\n", - "{\\left. \\nabla_{\\boldsymbol{\\phi}'} G \\left( \\boldsymbol{\\phi}', \\boldsymbol{\\theta} \\right) \\right\\rvert}_{\\boldsymbol{\\phi}' = \\boldsymbol{\\phi}^{\\star}} = 0\n", - "$$\n", - "\n", - "Thus we can define the optimality function by defining `imaml_objective` and make it first-order gradient w.r.t the inner-loop parameter as $0$. 
We achieve so by calling out `functorch.grad(imaml_objective, argnums=0)`. Finally, the forward function is decorated by the `@torchopt.diff.implicit.custom_root` decorator and the optimality condition we define." - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "id": "8d623b2f-48ee-4df6-a2ce-cf306b4c9067", - "metadata": {}, - "outputs": [], - "source": [ - "# Inner-loop objective function\n", - "# The optimality function: grad(imaml_objective)\n", - "def imaml_objective(params, meta_params, data):\n", - " x, y, fmodel = data\n", - " y_pred = fmodel(params, x)\n", - " regularization_loss = 0.0\n", - " for p1, p2 in zip(params, meta_params):\n", - " regularization_loss += 0.5 * torch.sum(torch.square(p1.view(-1) - p2.view(-1)))\n", - " loss = F.mse_loss(y_pred, y) + regularization_loss\n", - " return loss\n", - "\n", - "\n", - "# Optimality Condition is: the gradient w.r.t inner-loop optimal params is 0 (we achieve so by\n", - "# specifying argnums=0 in functorch.grad) the argnums=1 specify which meta-parameter we want to\n", - "# backpropogate, in this case we want to backpropogate to the initial parameters so we set it as 1.\n", - "# You can also set argnums as (1, 2) if you want to backpropogate through multiple meta-parameters\n", - "\n", - "\n", - "# Here we pass argnums=1 to the custom_root. That means we want to compute the gradient of\n", - "# optimal_params w.r.t. the 1-indexed argument in inner_solver, i.e., params.\n", - "# torchopt.linear_solve.solve_normal_cg specify that we use the conjugate gradient based linear solver\n", - "@torchopt.diff.implicit.custom_root(\n", - " functorch.grad(imaml_objective, argnums=0), # optimality function\n", - " argnums=1,\n", - " solve=torchopt.linear_solve.solve_normal_cg(maxiter=5, atol=0),\n", - ")\n", - "def inner_solver(params, meta_params, data):\n", - " # Initial functional optimizer based on TorchOpt\n", - " x, y, fmodel = data\n", - " optimizer = torchopt.sgd(lr=2e-2)\n", - " opt_state = optimizer.init(params)\n", - " with torch.enable_grad():\n", - " # Temporarily enable gradient computation for conducting the optimization\n", - " for i in range(100):\n", - " pred = fmodel(params, x)\n", - " loss = F.mse_loss(pred, y) # compute loss\n", - "\n", - " # Compute regularization loss\n", - " regularization_loss = 0.0\n", - " for p1, p2 in zip(params, meta_params):\n", - " regularization_loss += 0.5 * torch.sum(torch.square(p1.view(-1) - p2.view(-1)))\n", - " final_loss = loss + regularization_loss\n", - "\n", - " grads = torch.autograd.grad(final_loss, params) # compute gradients\n", - " updates, opt_state = optimizer.update(grads, opt_state, inplace=True) # get updates\n", - " params = torchopt.apply_updates(params, updates, inplace=True)\n", - "\n", - " optimal_params = params\n", - " return optimal_params\n", - "\n", - "\n", - "# torchopt.linear_solve.solve_inv specify that we use the Neumann Series inversion linear solver\n", - "@torchopt.diff.implicit.custom_root(\n", - " functorch.grad(imaml_objective, argnums=0), # optimality function\n", - " argnums=1,\n", - " solve=torchopt.linear_solve.solve_inv(ns=True, maxiter=100, alpha=0.1),\n", - ")\n", - "def inner_solver_inv_ns(params, meta_params, data):\n", - " # Initial functional optimizer based on TorchOpt\n", - " x, y, fmodel = data\n", - " optimizer = torchopt.sgd(lr=2e-2)\n", - " opt_state = optimizer.init(params)\n", - " with torch.enable_grad():\n", - " # Temporarily enable gradient computation for conducting the optimization\n", - " for i in range(100):\n", - " pred 
= fmodel(params, x)\n", - " loss = F.mse_loss(pred, y) # compute loss\n", - "\n", - " # Compute regularization loss\n", - " regularization_loss = 0.0\n", - " for p1, p2 in zip(params, meta_params):\n", - " regularization_loss += 0.5 * torch.sum(torch.square(p1.view(-1) - p2.view(-1)))\n", - " final_loss = loss + regularization_loss\n", - "\n", - " grads = torch.autograd.grad(final_loss, params) # compute gradients\n", - " updates, opt_state = optimizer.update(grads, opt_state, inplace=True) # get updates\n", - " params = torchopt.apply_updates(params, updates, inplace=True)\n", - "\n", - " optimal_params = params\n", - " return optimal_params" - ] - }, - { - "cell_type": "markdown", - "id": "32a75c81-d479-4120-a73d-5b2b488358d0", - "metadata": {}, - "source": [ - "In the next step, we consider a specific case for one layer neural network to fit the linear data." - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "id": "fb95538b-1fd9-4ec8-9f57-6360bedc05b7", - "metadata": {}, - "outputs": [], - "source": [ - "torch.manual_seed(0)\n", - "x = torch.randn(20, 4)\n", - "w = torch.randn(4, 1)\n", - "b = torch.randn(1)\n", - "y = x @ w + b + 0.5 * torch.randn(20, 1)" - ] - }, - { - "cell_type": "markdown", - "id": "eeb1823a-2231-4471-bb68-cce7724f2578", - "metadata": {}, - "source": [ - "We instantiate an one layer neural network, where the weights and bias are initialized with constant." - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "id": "d50a7bfe-ac69-4089-8cf8-3cbd69d6d4e7", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "class Net(nn.Module):\n", - " def __init__(self, dim):\n", - " super().__init__()\n", - " self.fc = nn.Linear(dim, 1, bias=True)\n", - " nn.init.ones_(self.fc.weight)\n", - " nn.init.zeros_(self.fc.bias)\n", - "\n", - " def forward(self, x):\n", - " return self.fc(x)\n", - "\n", - "\n", - "model = Net(4)\n", - "fmodel, meta_params = functorch.make_functional(model)\n", - "data = (x, y, fmodel)\n", - "\n", - "\n", - "# Clone function for parameters\n", - "def clone(params):\n", - " cloned = []\n", - " for item in params:\n", - " if isinstance(item, torch.Tensor):\n", - " cloned.append(item.clone().detach_().requires_grad_(True))\n", - " else:\n", - " cloned.append(item)\n", - " return tuple(cloned)" - ] - }, - { - "cell_type": "markdown", - "id": "065c36c4-89e2-4a63-8213-63db6ee3b08e", - "metadata": {}, - "source": [ - "We take the forward process by calling out the forward function, then we pass the optimal params into the outer-loop loss function." - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "id": "115e79c6-911f-4743-a2ed-e50a71c3a813", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "optimal_params = inner_solver(clone(meta_params), meta_params, data)\n", - "\n", - "outer_loss = fmodel(optimal_params, x).mean()" - ] - }, - { - "cell_type": "markdown", - "id": "e2812351-f635-496e-9732-c80831ac04a6", - "metadata": {}, - "source": [ - "Finally, we can get the meta-gradient as shown below." 
- ] - }, - { - "cell_type": "code", - "execution_count": 6, - "id": "6bdcbe8d-2336-4f80-b124-eb43c5a2fc0a", - "metadata": {}, - "outputs": [ + "cells" : [ { - "name": "stdout", - "output_type": "stream", - "text": [ - "(tensor([[-0.0369, 0.0248, 0.0347, 0.0067]]), tensor([0.3156]))\n" - ] - } - ], - "source": [ - "torch.autograd.grad(outer_loss, meta_params)" - ] - }, - { - "cell_type": "markdown", - "id": "926ae8bb", - "metadata": {}, - "source": [ - "Also we can switch to the Neumann Series inversion linear solver." - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "id": "43df0374", - "metadata": {}, - "outputs": [ + "cell_type" : "markdown", + "id" : "8850c832-3b54-4971-8ee0-2cd64b585ea8", + "metadata" : {}, + "source" : ["# TorchOpt for Implicit Differentiation"] + }, { - "name": "stdout", - "output_type": "stream", - "text": [ - "(tensor([[-0.0369, 0.0248, 0.0347, 0.0067]]), tensor([0.3156]))\n" - ] - } - ], - "source": [ - "optimal_params = inner_solver_inv_ns(clone(meta_params), meta_params, data)\n", - "outer_loss = fmodel(optimal_params, x).mean()\n", - "torch.autograd.grad(outer_loss, meta_params)" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "id": "c92e67ea-b220-4a14-a1ea-4eb3c5f52b6b", - "metadata": {}, - "source": [ - "## 2. OOP API\n", - "\n", - "The basic OOP class is the class `ImplicitMetaGradientModule`. We make the network as an `nn.Module` following a classical PyTorch style. Users need to define the stationary condition/objective function and the inner-loop solve function to enable implicit gradient computation. We show the pseudo code in the following part.\n", - "\n", - "```python\n", - "from torchopt.nn import ImplicitMetaGradientModule\n", - "\n", - "# Inherited from the class ImplicitMetaGradientModule\n", - "# Optionally specify the linear solver (conjugate gradient or Neumann series)\n", - "class InnerNet(ImplicitMetaGradientModule, linear_solve=linear_solver):\n", - " def __init__(self, meta_module):\n", - " ...\n", - "\n", - " def forward(self, batch):\n", - " # Forward process\n", - " ...\n", - "\n", - " def optimality(self, batch, labels):\n", - " # Stationary condition construction for calculating implicit gradient\n", - " # NOTE: If this method is not implemented, it will be automatically derived from the\n", - " # gradient of the `objective` function.\n", - " ...\n", - "\n", - " def objective(self, batch, labels):\n", - " # Define the inner-loop optimization objective\n", - " # NOTE: This method is optional if method `optimality` is implemented.\n", - " ...\n", - "\n", - " def solve(self, batch, labels):\n", - " # Conduct the inner-loop optimization\n", - " ...\n", - " return self # optimized module\n", - "\n", - "# Get meta_params and data\n", - "meta_params, data = ..., ...\n", - "inner_net = InnerNet()\n", - "\n", - "# Solve for inner-loop process related to the meta-parameters\n", - "optimal_inner_net = inner_net.solve(meta_params, *data)\n", - "\n", - "# Get outer-loss and solve for meta-gradient\n", - "loss = outer_loss(optimal_inner_net)\n", - "meta_grad = torch.autograd.grad(loss, meta_params)\n", - "```" - ] - }, - { - "cell_type": "markdown", - "id": "62fbe520-11d0-41ff-9b0a-c6508b1d01cf", - "metadata": {}, - "source": [ - "The class `ImplicitMetaGradientModule` is to enable the gradient flow from `self.parameters()` to `self.meta_parameters()`. In `__init__` function, users need to define the inner parameters and meta-parameters. 
By default, `ImplicitMetaGradientModule` treats all tensors and modules from input as `self.meta_parameters()`, and all tensors and modules defined in the `__init__` are regarded as `self.parameters()`. Users can also register `self.parameters()` and `self.meta_parameters()` by calling `self.register_parameter()` and `self.register_meta_parameter()` respectively." - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "id": "c3999684-f4d3-4bc0-86ab-a7e803b2fe80", - "metadata": {}, - "outputs": [ + "cell_type" : "markdown", + "id" : "2b547376", + "metadata" : {}, + "source" : ["[](https://" + "colab.research.google.com/github/metaopt/torchopt/blob/main/tutorials/" + "5_Implicit_Differentiation.ipynb)"] + }, { - "name": "stdout", - "output_type": "stream", - "text": [ - "(tensor([[-0.0369, 0.0248, 0.0347, 0.0067]]), tensor([0.3156]))\n" - ] - } - ], - "source": [ - "class InnerNet(\n", - " torchopt.nn.ImplicitMetaGradientModule,\n", - " linear_solve=torchopt.linear_solve.solve_normal_cg(maxiter=5, atol=0),\n", - "):\n", - " def __init__(self, meta_net, n_inner_iter, reg_param):\n", - " super().__init__()\n", - " # Declaration of the meta-parameter\n", - " self.meta_net = meta_net\n", - " # Get a deepcopy, register inner-parameter\n", - " self.net = torchopt.module_clone(meta_net, by='deepcopy', detach_buffers=True)\n", - " self.n_inner_iter = n_inner_iter\n", - " self.reg_param = reg_param\n", - "\n", - " def forward(self, x):\n", - " return self.net(x)\n", - "\n", - " def objective(self, x, y):\n", - " # We do not implement the optimality conditions, so it will be automatically derived from\n", - " # the gradient of the `objective` function.\n", - " y_pred = self(x)\n", - " loss = F.mse_loss(y_pred, y)\n", - " regularization_loss = 0\n", - " for p1, p2 in zip(\n", - " self.parameters(), # parameters of `self.net`\n", - " self.meta_parameters(), # parameters of `self.meta_net`\n", - " ):\n", - " regularization_loss += (\n", - " 0.5 * self.reg_param * torch.sum(torch.square(p1.view(-1) - p2.view(-1)))\n", - " )\n", - " return loss + regularization_loss\n", - "\n", - " def solve(self, x, y):\n", - " params = tuple(self.parameters())\n", - " inner_optim = torchopt.SGD(params, lr=2e-2)\n", - " with torch.enable_grad():\n", - " # Temporarily enable gradient computation for conducting the optimization\n", - " for _ in range(self.n_inner_iter):\n", - " loss = self.objective(x, y)\n", - " inner_optim.zero_grad()\n", - " # NOTE: The parameter inputs should be explicitly specified in `backward` function\n", - " # as argument `inputs`. Otherwise, if not provided, the gradient is accumulated into\n", - " # all the leaf Tensors (including the meta-parameters) that were used to compute the\n", - " # objective output. 
Alternatively, please use `torch.autograd.grad` instead.\n", - " loss.backward(inputs=params) # backward pass in inner-loop\n", - " inner_optim.step() # update inner parameters\n", - " return self\n", - "\n", - "\n", - "# Initialize the meta-network\n", - "meta_net = Net(4)\n", - "inner_net = InnerNet(meta_net, 100, reg_param=1)\n", - "\n", - "# Solve for inner-loop\n", - "optimal_inner_net = inner_net.solve(x, y)\n", - "outer_loss = optimal_inner_net(x).mean()\n", - "\n", - "# Derive the meta-gradient\n", - "torch.autograd.grad(outer_loss, meta_net.parameters())" - ] - }, - { - "cell_type": "markdown", - "id": "2b69a5d6-b5e4-4f08-af0a-40afc2382b45", - "metadata": {}, - "source": [ - "We also show an example on how to implement implicit gradient calculation when the inner-level optimal solution reaches some stationary conditions $F (\\phi^{\\star}, \\theta) = 0$, such as [DEQ](https://arxiv.org/abs/1909.01377), based on the OOP API. " - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "id": "de87c308-d847-4491-9aa1-bc393e6dd1d8", - "metadata": {}, - "outputs": [ + "cell_type" : "markdown", + "id" : "8d7f9865-dc02-43d4-be90-da1160c4e4dd", + "metadata" : {}, + "source" : + ["By treating the solution $\\phi^{\\star}$ as an implicit function of $\\theta$, the " + "idea of implicit differentiation is to directly get analytical best-response " + "derivatives $\\partial \\phi^{\\star}(\\theta)/ \\partial \\theta$ by implicit " + "function theorem. This is suitable for algorithms when the inner-level optimal " + "solution is achieved ${\\left. \\frac{\\partial F (\\phi, \\theta)}{\\partial \\phi} " + "\\right\\rvert}_{\\phi = \\phi^{\\star}} = 0$ or reaches some stationary conditions $F " + "(\\phi^{\\star}, \\theta) = 0$, such as [iMAML](https://arxiv.org/abs/1909.04630) and " + "[DEQ](https://arxiv.org/abs/1909.01377)."] + }, + { + "cell_type" : "markdown", + "id" : "d7e4b9e1-115f-45ad-a9b3-ea338bcfe6dd", + "metadata" : {}, + "source" : ["In this tutorial, we will introduce how TorchOpt can be used to conduct " + "implicit differentiation."] + }, + { + "cell_type" : "code", + "execution_count" : 1, + "id" : "8f13ae67-e328-409f-84a8-1fc425c03a66", + "metadata" : {}, + "outputs" : [], + "source" : [ + "import functorch\n", + "import torch\n", + "import torch.nn as nn\n", + "import torch.nn.functional as F\n", + "\n", + "import torchopt" + ] + }, + { + "attachments" : {}, + "cell_type" : "markdown", + "id" : "0cdaac49-4b94-4900-9bb5-a39057ac8b21", + "metadata" : {}, + "source" : [ + "## 1. Functional API\n", + "\n", + "The basic functional API is `torchopt.diff.implicit.custom_root`, which is used as the " + "decorator for the forward process implicit gradient procedures. Users are required to " + "implement the stationary conditions for the inner-loop process, which will be used as the " + "input of custom_root decorator. 
We show the pseudo code in the following part.\n", + "\n", + "```python\n", + "# Functional API for implicit gradient\n", + "def stationary(params, meta_params, data):\n", + "    # stationary condition construction\n", + "    return stationary condition\n", + "\n", + "# Decorator that wraps the function\n", + "# Optionally specify the linear solver (conjugate gradient or Neumann series)\n", + "@torchopt.diff.implicit.custom_root(stationary, solve=linear_solver)\n", + "def solve(params, meta_params, data):\n", + "    # Forward optimization process for params\n", + "    return optimal_params\n", + "\n", + "# Define params, meta_params and get data\n", + "params, meta_params, data = ..., ..., ...\n", + "optimal_params = solve(params, meta_params, data)\n", + "loss = outer_loss(optimal_params)\n", + "\n", + "meta_grads = torch.autograd.grad(loss, meta_params)\n", + "```" + ] + }, + { + "cell_type" : "markdown", + "id" : "dbef87df-2164-4f1d-8919-37a6fbdc5011", + "metadata" : {}, + "source" : [ + "Here we use [iMAML](https://arxiv.org/abs/1909.04630) as a real example. " + "For iMAML, the inner-loop objective is described by the following equation.\n", + "\n", + "$$\n", + "{\\mathcal{Alg}}^{\\star} \\left( \\boldsymbol{\\theta}, \\mathcal{D}_{i}^{\\text{tr}} " + "\\right) = \\underset{\\phi'}{\\operatorname{\\arg \\min}} ~ G \\left( " + "\\boldsymbol{\\phi}', \\boldsymbol{\\theta} \\right) \\triangleq \\mathcal{L} \\left( " + "\\boldsymbol{\\phi}', \\mathcal{D}_{i}^{\\text{tr}} \\right) + \\frac{\\lambda}{2} " + "{\\left\\| \\boldsymbol{\\phi}' - \\boldsymbol{\\theta} \\right\\|}^{2}\n", + "$$\n", + "\n", + "According to this objective, we can define the forward function `inner_solver`, which " + "solves this equation with a sufficient number of gradient descent steps. For such an " + "inner-loop process, the optimality condition is that the gradient w.r.t. the inner-loop " + "parameter is $0$.\n", + "\n", + "$$\n", + "{\\left. \\nabla_{\\boldsymbol{\\phi}'} G \\left( \\boldsymbol{\\phi}', " + "\\boldsymbol{\\theta} \\right) \\right\\rvert}_{\\boldsymbol{\\phi}' = " + "\\boldsymbol{\\phi}^{\\star}} = 0\n", + "$$\n", + "\n", + "Thus we can define the optimality function as the first-order gradient of " + "`imaml_objective` w.r.t. the inner-loop parameter, which we obtain by calling " + "`functorch.grad(imaml_objective, argnums=0)`. Finally, the forward function is decorated " + "by the `@torchopt.diff.implicit.custom_root` decorator with the optimality condition we " + "define."
+ ] + }, + { + "cell_type" : "code", + "execution_count" : 2, + "id" : "8d623b2f-48ee-4df6-a2ce-cf306b4c9067", + "metadata" : {}, + "outputs" : [], + "source" : [ + "# Inner-loop objective function\n", + "# The optimality function: grad(imaml_objective)\n", + "def imaml_objective(params, meta_params, data):\n", + " x, y, fmodel = data\n", + " y_pred = fmodel(params, x)\n", + " regularization_loss = 0.0\n", + " for p1, p2 in zip(params, meta_params):\n", + " regularization_loss += 0.5 * torch.sum(torch.square(p1.view(-1) - p2.view(-1)))\n", + " loss = F.mse_loss(y_pred, y) + regularization_loss\n", + " return loss\n", + "\n", + "\n", + "# Optimality Condition is: the gradient w.r.t inner-loop optimal params is 0 (we achieve " + "so by\n", + "# specifying argnums=0 in functorch.grad) the argnums=1 specify which meta-parameter we " + "want to\n", + "# backpropogate, in this case we want to backpropogate to the initial parameters so we " + "set it as 1.\n", + "# You can also set argnums as (1, 2) if you want to backpropogate through multiple " + "meta-parameters\n", + "\n", + "\n", + "# Here we pass argnums=1 to the custom_root. That means we want to compute the gradient " + "of\n", + "# optimal_params w.r.t. the 1-indexed argument in inner_solver, i.e., params.\n", + "# torchopt.linear_solve.solve_normal_cg specify that we use the conjugate gradient based " + "linear solver\n", + "@torchopt.diff.implicit.custom_root(\n", + " functorch.grad(imaml_objective, argnums=0), # optimality function\n", + " argnums=1,\n", + " solve=torchopt.linear_solve.solve_normal_cg(maxiter=5, atol=0),\n", + ")\n", + "def inner_solver(params, meta_params, data):\n", + " # Initial functional optimizer based on TorchOpt\n", + " x, y, fmodel = data\n", + " optimizer = torchopt.sgd(lr=2e-2)\n", + " opt_state = optimizer.init(params)\n", + " with torch.enable_grad():\n", + " # Temporarily enable gradient computation for conducting the optimization\n", + " for i in range(100):\n", + " pred = fmodel(params, x)\n", + " loss = F.mse_loss(pred, y) # compute loss\n", + "\n", + " # Compute regularization loss\n", + " regularization_loss = 0.0\n", + " for p1, p2 in zip(params, meta_params):\n", + " regularization_loss += 0.5 * torch.sum(torch.square(p1.view(-1) - " + "p2.view(-1)))\n", + " final_loss = loss + regularization_loss\n", + "\n", + " grads = torch.autograd.grad(final_loss, params) # compute gradients\n", + " updates, opt_state = optimizer.update(grads, opt_state, inplace=True) # get " + "updates\n", + " params = torchopt.apply_updates(params, updates, inplace=True)\n", + "\n", + " optimal_params = params\n", + " return optimal_params\n", + "\n", + "\n", + "# torchopt.linear_solve.solve_inv specify that we use the Neumann Series inversion linear " + "solver\n", + "@torchopt.diff.implicit.custom_root(\n", + " functorch.grad(imaml_objective, argnums=0), # optimality function\n", + " argnums=1,\n", + " solve=torchopt.linear_solve.solve_inv(ns=True, maxiter=100, alpha=0.1),\n", + ")\n", + "def inner_solver_inv_ns(params, meta_params, data):\n", + " # Initial functional optimizer based on TorchOpt\n", + " x, y, fmodel = data\n", + " optimizer = torchopt.sgd(lr=2e-2)\n", + " opt_state = optimizer.init(params)\n", + " with torch.enable_grad():\n", + " # Temporarily enable gradient computation for conducting the optimization\n", + " for i in range(100):\n", + " pred = fmodel(params, x)\n", + " loss = F.mse_loss(pred, y) # compute loss\n", + "\n", + " # Compute regularization loss\n", + " regularization_loss = 0.0\n", + " for 
p1, p2 in zip(params, meta_params):\n", + " regularization_loss += 0.5 * torch.sum(torch.square(p1.view(-1) - " + "p2.view(-1)))\n", + " final_loss = loss + regularization_loss\n", + "\n", + " grads = torch.autograd.grad(final_loss, params) # compute gradients\n", + " updates, opt_state = optimizer.update(grads, opt_state, inplace=True) # get " + "updates\n", + " params = torchopt.apply_updates(params, updates, inplace=True)\n", + "\n", + " optimal_params = params\n", + " return optimal_params" + ] + }, + { + "cell_type" : "markdown", + "id" : "32a75c81-d479-4120-a73d-5b2b488358d0", + "metadata" : {}, + "source" : ["In the next step, we consider a specific case for one layer neural network to " + "fit the linear data."] + }, + { + "cell_type" : "code", + "execution_count" : 3, + "id" : "fb95538b-1fd9-4ec8-9f57-6360bedc05b7", + "metadata" : {}, + "outputs" : [], + "source" : [ + "torch.manual_seed(0)\n", + "x = torch.randn(20, 4)\n", + "w = torch.randn(4, 1)\n", + "b = torch.randn(1)\n", + "y = x @ w + b + 0.5 * torch.randn(20, 1)" + ] + }, + { + "cell_type" : "markdown", + "id" : "eeb1823a-2231-4471-bb68-cce7724f2578", + "metadata" : {}, + "source" : ["We instantiate an one layer neural network, where the weights and bias are " + "initialized with constant."] + }, + { + "cell_type" : "code", + "execution_count" : 4, + "id" : "d50a7bfe-ac69-4089-8cf8-3cbd69d6d4e7", + "metadata" : {"tags" : []}, + "outputs" : [], + "source" : [ + "class Net(nn.Module):\n", + " def __init__(self, dim):\n", + " super().__init__()\n", + " self.fc = nn.Linear(dim, 1, bias=True)\n", + " nn.init.ones_(self.fc.weight)\n", + " nn.init.zeros_(self.fc.bias)\n", + "\n", + " def forward(self, x):\n", + " return self.fc(x)\n", + "\n", + "\n", + "model = Net(4)\n", + "fmodel, meta_params = functorch.make_functional(model)\n", + "data = (x, y, fmodel)\n", + "\n", + "\n", + "# Clone function for parameters\n", + "def clone(params):\n", + " cloned = []\n", + " for item in params:\n", + " if isinstance(item, torch.Tensor):\n", + " cloned.append(item.clone().detach_().requires_grad_(True))\n", + " else:\n", + " cloned.append(item)\n", + " return tuple(cloned)" + ] + }, + { + "cell_type" : "markdown", + "id" : "065c36c4-89e2-4a63-8213-63db6ee3b08e", + "metadata" : {}, + "source" : ["We take the forward process by calling out the forward function, then we pass " + "the optimal params into the outer-loop loss function."] + }, { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "(\n", - "│ tensor([[ 0.0272, 0.0031, -0.0156, -0.0238],\n", - "│ │ [ 0.1004, 0.0113, -0.0573, -0.0878],\n", - "│ │ [ 0.0666, 0.0075, -0.0380, -0.0583],\n", - "│ │ [ 0.1446, 0.0163, -0.0826, -0.1265]]),\n", - "│ tensor([0.0574, 0.2114, 0.1403, 0.3046])\n", - ")\n" - ] + "cell_type" : "code", + "execution_count" : 5, + "id" : "115e79c6-911f-4743-a2ed-e50a71c3a813", + "metadata" : {"tags" : []}, + "outputs" : [], + "source" : [ + "optimal_params = inner_solver(clone(meta_params), meta_params, data)\n", + "\n", + "outer_loss = fmodel(optimal_params, x).mean()" + ] + }, + { + "cell_type" : "markdown", + "id" : "e2812351-f635-496e-9732-c80831ac04a6", + "metadata" : {}, + "source" : ["Finally, we can get the meta-gradient as shown below."] + }, + { + "cell_type" : "code", + "execution_count" : 6, + "id" : "6bdcbe8d-2336-4f80-b124-eb43c5a2fc0a", + "metadata" : {}, + "outputs" : [ { + "name" : "stdout", + "output_type" : "stream", + "text" : ["(tensor([[-0.0369, 0.0248, 0.0347, 0.0067]]), tensor([0.3156]))\n"] + } ], + "source" : 
["torch.autograd.grad(outer_loss, meta_params)"] + }, + { + "cell_type" : "markdown", + "id" : "926ae8bb", + "metadata" : {}, + "source" : ["Also we can switch to the Neumann Series inversion linear solver."] + }, + { + "cell_type" : "code", + "execution_count" : 7, + "id" : "43df0374", + "metadata" : {}, + "outputs" : [ { + "name" : "stdout", + "output_type" : "stream", + "text" : ["(tensor([[-0.0369, 0.0248, 0.0347, 0.0067]]), tensor([0.3156]))\n"] + } ], + "source" : [ + "optimal_params = inner_solver_inv_ns(clone(meta_params), meta_params, data)\n", + "outer_loss = fmodel(optimal_params, x).mean()\n", + "torch.autograd.grad(outer_loss, meta_params)" + ] + }, + { + "attachments" : {}, + "cell_type" : "markdown", + "id" : "c92e67ea-b220-4a14-a1ea-4eb3c5f52b6b", + "metadata" : {}, + "source" : [ + "## 2. OOP API\n", + "\n", + "The basic OOP class is the class `ImplicitMetaGradientModule`. We make the network as an " + "`nn.Module` following a classical PyTorch style. Users need to define the stationary " + "condition/objective function and the inner-loop solve function to enable implicit " + "gradient computation. We show the pseudo code in the following part.\n", + "\n", + "```python\n", + "from torchopt.nn import ImplicitMetaGradientModule\n", + "\n", + "# Inherited from the class ImplicitMetaGradientModule\n", + "# Optionally specify the linear solver (conjugate gradient or Neumann series)\n", + "class InnerNet(ImplicitMetaGradientModule, linear_solve=linear_solver):\n", + " def __init__(self, meta_module):\n", + " ...\n", + "\n", + " def forward(self, batch):\n", + " # Forward process\n", + " ...\n", + "\n", + " def optimality(self, batch, labels):\n", + " # Stationary condition construction for calculating implicit gradient\n", + " # NOTE: If this method is not implemented, it will be automatically derived from " + "the\n", + " # gradient of the `objective` function.\n", + " ...\n", + "\n", + " def objective(self, batch, labels):\n", + " # Define the inner-loop optimization objective\n", + " # NOTE: This method is optional if method `optimality` is implemented.\n", + " ...\n", + "\n", + " def solve(self, batch, labels):\n", + " # Conduct the inner-loop optimization\n", + " ...\n", + " return self # optimized module\n", + "\n", + "# Get meta_params and data\n", + "meta_params, data = ..., ...\n", + "inner_net = InnerNet()\n", + "\n", + "# Solve for inner-loop process related to the meta-parameters\n", + "optimal_inner_net = inner_net.solve(meta_params, *data)\n", + "\n", + "# Get outer-loss and solve for meta-gradient\n", + "loss = outer_loss(optimal_inner_net)\n", + "meta_grad = torch.autograd.grad(loss, meta_params)\n", + "```" + ] + }, + { + "cell_type" : "markdown", + "id" : "62fbe520-11d0-41ff-9b0a-c6508b1d01cf", + "metadata" : {}, + "source" : + ["The class `ImplicitMetaGradientModule` is to enable the gradient flow from " + "`self.parameters()` to `self.meta_parameters()`. In `__init__` function, users need to " + "define the inner parameters and meta-parameters. By default, " + "`ImplicitMetaGradientModule` treats all tensors and modules from input as " + "`self.meta_parameters()`, and all tensors and modules defined in the `__init__` are " + "regarded as `self.parameters()`. 
Users can also register `self.parameters()` and " + "`self.meta_parameters()` by calling `self.register_parameter()` and " + "`self.register_meta_parameter()` respectively."] + }, + { + "cell_type" : "code", + "execution_count" : 8, + "id" : "c3999684-f4d3-4bc0-86ab-a7e803b2fe80", + "metadata" : {}, + "outputs" : [ { + "name" : "stdout", + "output_type" : "stream", + "text" : ["(tensor([[-0.0369, 0.0248, 0.0347, 0.0067]]), tensor([0.3156]))\n"] + } ], + "source" : [ + "class InnerNet(\n", + " torchopt.nn.ImplicitMetaGradientModule,\n", + " linear_solve=torchopt.linear_solve.solve_normal_cg(maxiter=5, atol=0),\n", + "):\n", + " def __init__(self, meta_net, n_inner_iter, reg_param):\n", + " super().__init__()\n", + " # Declaration of the meta-parameter\n", + " self.meta_net = meta_net\n", + " # Get a deepcopy, register inner-parameter\n", + " self.net = torchopt.module_clone(meta_net, by='deepcopy', detach_buffers=True)\n", + " self.n_inner_iter = n_inner_iter\n", + " self.reg_param = reg_param\n", + "\n", + " def forward(self, x):\n", + " return self.net(x)\n", + "\n", + " def objective(self, x, y):\n", + " # We do not implement the optimality conditions, so it will be automatically " + "derived from\n", + " # the gradient of the `objective` function.\n", + " y_pred = self(x)\n", + " loss = F.mse_loss(y_pred, y)\n", + " regularization_loss = 0\n", + " for p1, p2 in zip(\n", + " self.parameters(), # parameters of `self.net`\n", + " self.meta_parameters(), # parameters of `self.meta_net`\n", + " ):\n", + " regularization_loss += (\n", + " 0.5 * self.reg_param * torch.sum(torch.square(p1.view(-1) - " + "p2.view(-1)))\n", + " )\n", + " return loss + regularization_loss\n", + "\n", + " def solve(self, x, y):\n", + " params = tuple(self.parameters())\n", + " inner_optim = torchopt.SGD(params, lr=2e-2)\n", + " with torch.enable_grad():\n", + " # Temporarily enable gradient computation for conducting the optimization\n", + " for _ in range(self.n_inner_iter):\n", + " loss = self.objective(x, y)\n", + " inner_optim.zero_grad()\n", + " # NOTE: The parameter inputs should be explicitly specified in `backward` " + "function\n", + " # as argument `inputs`. Otherwise, if not provided, the gradient is " + "accumulated into\n", + " # all the leaf Tensors (including the meta-parameters) that were used to " + "compute the\n", + " # objective output. Alternatively, please use `torch.autograd.grad` " + "instead.\n", + " loss.backward(inputs=params) # backward pass in inner-loop\n", + " inner_optim.step() # update inner parameters\n", + " return self\n", + "\n", + "\n", + "# Initialize the meta-network\n", + "meta_net = Net(4)\n", + "inner_net = InnerNet(meta_net, 100, reg_param=1)\n", + "\n", + "# Solve for inner-loop\n", + "optimal_inner_net = inner_net.solve(x, y)\n", + "outer_loss = optimal_inner_net(x).mean()\n", + "\n", + "# Derive the meta-gradient\n", + "torch.autograd.grad(outer_loss, meta_net.parameters())" + ] + }, + { + "cell_type" : "markdown", + "id" : "2b69a5d6-b5e4-4f08-af0a-40afc2382b45", + "metadata" : {}, + "source" : ["We also show an example on how to implement implicit gradient calculation when " + "the inner-level optimal solution reaches some stationary conditions $F " + "(\\phi^{\\star}, \\theta) = 0$, such as " + "[DEQ](https://arxiv.org/abs/1909.01377), based on the OOP API. 
"] + }, + { + "cell_type" : "code", + "execution_count" : 9, + "id" : "de87c308-d847-4491-9aa1-bc393e6dd1d8", + "metadata" : {}, + "outputs" : [ { + "name" : "stdout", + "output_type" : "stream", + "text" : [ + "\n", + "(\n", + "│ tensor([[ 0.0272, 0.0031, -0.0156, -0.0238],\n", + "│ │ [ 0.1004, 0.0113, -0.0573, -0.0878],\n", + "│ │ [ 0.0666, 0.0075, -0.0380, -0.0583],\n", + "│ │ [ 0.1446, 0.0163, -0.0826, -0.1265]]),\n", + "│ tensor([0.0574, 0.2114, 0.1403, 0.3046])\n", + ")\n" + ] + } ], + "source" : [ + "class Net(nn.Module):\n", + " def __init__(self, dim):\n", + " super().__init__()\n", + " self.fc = nn.Linear(dim, dim)\n", + "\n", + " def forward(self, x):\n", + " return self.fc(x)\n", + "\n", + "\n", + "class InnerNet(\n", + " torchopt.nn.ImplicitMetaGradientModule,\n", + " linear_solve=torchopt.linear_solve.solve_normal_cg(maxiter=5, atol=0),\n", + "):\n", + " def __init__(self, meta_net, x0):\n", + " super().__init__()\n", + " # Register meta-parameter\n", + " self.meta_net = meta_net\n", + " # Declaration of the inner-parameter, register inner-parameter\n", + " self.x = nn.Parameter(x0.clone().detach_(), requires_grad=True)\n", + "\n", + " def forward(self, x):\n", + " return self.meta_net(x)\n", + "\n", + " def optimality(self):\n", + " # Fixed-point condition\n", + " return (self.x - self(self.x),)\n", + "\n", + " def solve(self):\n", + " # Solving inner-loop fixed-point iteration\n", + " # This is just an illustrating example for solving fixed-point iteration\n", + " # one can use more advanced method to solve fixed-point iteration\n", + " # such as anderson acceleration.\n", + " for _ in range(10):\n", + " self.x.copy_(self(self.x))\n", + " return self\n", + "\n", + "\n", + "# Initialize meta-network\n", + "torch.manual_seed(0)\n", + "meta_net = Net(4)\n", + "x0 = torch.randn(1, 4)\n", + "inner_net = InnerNet(meta_net, x0)\n", + "\n", + "# Solve for inner-loop\n", + "optimal_inner_net = inner_net.solve()\n", + "outer_loss = optimal_inner_net.x.mean()\n", + "\n", + "# Derive the meta-gradient\n", + "torch.autograd.grad(outer_loss, meta_net.parameters())" + ] + } + ], + "metadata" : { + "kernelspec" : + {"display_name" : "Python 3 (ipykernel)", "language" : "python", "name" : "python3"}, + "language_info" : { + "codemirror_mode" : {"name" : "ipython", "version" : 3}, + "file_extension" : ".py", + "mimetype" : "text/x-python", + "name" : "python", + "nbconvert_exporter" : "python", + "pygments_lexer" : "ipython3", + "version" : "3.9.15" + }, + "vscode" : { + "interpreter" : + {"hash" : "2a8cc1ff2cbc47027bf9993941710d9ab9175f14080903d9c7c432ee63d681da"} } - ], - "source": [ - "class Net(nn.Module):\n", - " def __init__(self, dim):\n", - " super().__init__()\n", - " self.fc = nn.Linear(dim, dim)\n", - "\n", - " def forward(self, x):\n", - " return self.fc(x)\n", - "\n", - "\n", - "class InnerNet(\n", - " torchopt.nn.ImplicitMetaGradientModule,\n", - " linear_solve=torchopt.linear_solve.solve_normal_cg(maxiter=5, atol=0),\n", - "):\n", - " def __init__(self, meta_net, x0):\n", - " super().__init__()\n", - " # Register meta-parameter\n", - " self.meta_net = meta_net\n", - " # Declaration of the inner-parameter, register inner-parameter\n", - " self.x = nn.Parameter(x0.clone().detach_(), requires_grad=True)\n", - "\n", - " def forward(self, x):\n", - " return self.meta_net(x)\n", - "\n", - " def optimality(self):\n", - " # Fixed-point condition\n", - " return (self.x - self(self.x),)\n", - "\n", - " def solve(self):\n", - " # Solving inner-loop fixed-point iteration\n", - " # This is 
just an illustrating example for solving fixed-point iteration\n", - " # one can use more advanced method to solve fixed-point iteration\n", - " # such as anderson acceleration.\n", - " for _ in range(10):\n", - " self.x.copy_(self(self.x))\n", - " return self\n", - "\n", - "\n", - "# Initialize meta-network\n", - "torch.manual_seed(0)\n", - "meta_net = Net(4)\n", - "x0 = torch.randn(1, 4)\n", - "inner_net = InnerNet(meta_net, x0)\n", - "\n", - "# Solve for inner-loop\n", - "optimal_inner_net = inner_net.solve()\n", - "outer_loss = optimal_inner_net.x.mean()\n", - "\n", - "# Derive the meta-gradient\n", - "torch.autograd.grad(outer_loss, meta_net.parameters())" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.9.15" }, - "vscode": { - "interpreter": { - "hash": "2a8cc1ff2cbc47027bf9993941710d9ab9175f14080903d9c7c432ee63d681da" - } - } - }, - "nbformat": 4, - "nbformat_minor": 5 + "nbformat" : 4, + "nbformat_minor" : 5 } diff --git a/tutorials/6_Zero_Order_Differentiation.ipynb b/tutorials/6_Zero_Order_Differentiation.ipynb index d6cb028c..d6e2a4e0 100644 --- a/tutorials/6_Zero_Order_Differentiation.ipynb +++ b/tutorials/6_Zero_Order_Differentiation.ipynb @@ -1,356 +1,390 @@ { - "cells": [ - { - "cell_type": "markdown", - "id": "8850c832-3b54-4971-8ee0-2cd64b585ea8", - "metadata": {}, - "source": [ - "# TorchOpt for Zero-Order Differentiation" - ] - }, - { - "cell_type": "markdown", - "id": "2b547376", - "metadata": {}, - "source": [ - "[](https://colab.research.google.com/github/metaopt/torchopt/blob/main/tutorials/6_Zero_Order_Differentiation.ipynb)" - ] - }, - { - "cell_type": "markdown", - "id": "8d7f9865-dc02-43d4-be90-da1160c4e4dd", - "metadata": {}, - "source": [ - "When the inner-loop process is non-differentiable or one wants to eliminate the heavy computation burdens in the previous two modes (brought by Hessian), one can choose ZD. ZD typically gets gradients based on zero-order estimation, such as finite-difference, or Evolutionary Strategy.\n", - "\n", - "TorchOpt offers API for ES-based differentiation. Instead of optimizing the objective $f (\\boldsymbol{\\theta}): \\mathbb{R}^n \\to \\mathbb{R}$, ES optimizes a Gaussion smoothing objective defined as $\\tilde{f}_{\\sigma} (\\boldsymbol{\\theta}) = \\mathbb{E}_{\\boldsymbol{z} \\sim \\mathcal{N}( 0, {I}_d )} [ f (\\boldsymbol{\\theta} + \\sigma \\, \\boldsymbol{z}) ]$, where $\\sigma$ denotes precision. The gradient of such objective is $\\nabla_{\\boldsymbol{\\theta}} \\tilde{f}_{\\sigma} (\\boldsymbol{\\theta}) = \\frac{1}{\\sigma} \\mathbb{E}_{\\boldsymbol{z} \\sim \\mathcal{N}( 0, {I}_d )} [ f (\\boldsymbol{\\theta} + \\sigma \\, \\boldsymbol{z}) \\cdot \\boldsymbol{z} ]$. Refer to [ES-MAML](https://arxiv.org/pdf/1910.01215.pdf) for more details." - ] - }, - { - "cell_type": "markdown", - "id": "d7e4b9e1-115f-45ad-a9b3-ea338bcfe6dd", - "metadata": {}, - "source": [ - "In this tutorial, we will introduce how TorchOpt can be used to ES-based differentiation." 
- ] - }, - { - "cell_type": "code", - "execution_count": 1, - "id": "8f13ae67-e328-409f-84a8-1fc425c03a66", - "metadata": {}, - "outputs": [], - "source": [ - "import functorch\n", - "import torch\n", - "import torch.nn as nn\n", - "import torch.nn.functional as F\n", - "\n", - "import torchopt" - ] - }, - { - "cell_type": "markdown", - "id": "0cdaac49-4b94-4900-9bb5-a39057ac8b21", - "metadata": {}, - "source": [ - "## 1. Functional API\n", - "\n", - "The basic functional API is `torchopt.diff.zero_order.zero_order`, which is used as the decorator for the forward process zero-order gradient procedures. Users are required to implement the noise sampling function, which will be used as the input of zero_order decorator. Here we show the specific meaning for each parameter used in the decorator.\n", - "\n", - "- `distribution` for noise sampling distribution. The distribution $\\lambda$ should be spherical symmetric and with a constant variance of $1$ for each element. I.e.:\n", - "\n", - " - Spherical symmetric: $\\mathbb{E}_{\\boldsymbol{z} \\sim \\lambda} [ \\boldsymbol{z} ] = \\boldsymbol{0}$.\n", - " - Constant variance of $1$ for each element: $\\mathbb{E}_{\\boldsymbol{z} \\sim \\lambda} [ {\\lvert z_i \\rvert}^2 ] = 1$.\n", - " - For example, the standard multi-dimensional normal distribution $\\mathcal{N} (\\boldsymbol{0}, \\boldsymbol{1})$.\n", - "\n", - "- `method` for different kind of algorithms, we support `'naive'` ([ES-RL](https://arxiv.org/abs/1703.03864)), `'forward'` ([Forward-FD](http://proceedings.mlr.press/v80/choromanski18a/choromanski18a.pdf)), and `'antithetic'` ([antithetic](https://arxiv.org/abs/1803.07055)).\n", - "\n", - " $$\n", - " \\begin{align*}\n", - " \\text{naive} \\qquad & \\nabla_{\\boldsymbol{\\theta}} \\tilde{f}_{\\sigma} (\\boldsymbol{\\theta}) = \\frac{1}{\\sigma} \\mathbb{E}_{\\boldsymbol{z} \\sim \\lambda} [ f (\\boldsymbol{\\theta} + \\sigma \\, \\boldsymbol{z}) \\cdot \\boldsymbol{z} ] \\\\\n", - " \\text{forward} \\qquad & \\nabla_{\\boldsymbol{\\theta}} \\tilde{f}_{\\sigma} (\\boldsymbol{\\theta}) = \\frac{1}{\\sigma} \\mathbb{E}_{\\boldsymbol{z} \\sim \\lambda} [ ( f (\\boldsymbol{\\theta} + \\sigma \\, \\boldsymbol{z}) - f (\\boldsymbol{\\theta}) ) \\cdot \\boldsymbol{z} ] \\\\\n", - " \\text{antithetic} \\qquad & \\nabla_{\\boldsymbol{\\theta}} \\tilde{f}_{\\sigma} (\\boldsymbol{\\theta}) = \\frac{1}{2 \\sigma} \\mathbb{E}_{\\boldsymbol{z} \\sim \\lambda} [ (f (\\boldsymbol{\\theta} + \\sigma \\, \\boldsymbol{z}) - f (\\boldsymbol{\\theta} + \\sigma \\, \\boldsymbol{z}) ) \\cdot \\boldsymbol{z} ]\n", - " \\end{align*}\n", - " $$\n", - "\n", - "- `argnums` specifies which parameter we want to trace the meta-gradient.\n", - "- `num_samples` specifies how many times we want to conduct the sampling.\n", - "- `sigma` is for precision. This is the scaling factor for the sampling distribution.\n", - "\n", - "We show the pseudo code in the following part.\n", - "\n", - "```python\n", - "# Functional API for zero-order differentiation\n", - "# 1. Customize the noise distribution via a distribution class\n", - "class Distribution:\n", - " def sample(self, sample_shape=torch.Size()):\n", - " # Sampling function for noise\n", - " # NOTE: The distribution should be spherical symmetric and with a constant variance of 1.\n", - " ...\n", - " return noise_batch\n", - "\n", - "distribution = Distribution()\n", - "\n", - "# 2. 
Customize the noise distribution via a sampling function\n", - "def distribution(sample_shape=torch.Size()):\n", - " # Sampling function for noise\n", - " # NOTE: The distribution should be spherical symmetric and with a constant variance of 1.\n", - " ...\n", - " return noise_batch\n", - "\n", - "# 3. Distribution can also be an instance of `torch.distributions.Distribution`, e.g., `torch.distributions.Normal(...)`\n", - "distribution = torch.distributions.Normal(loc=0, scale=1)\n", - "\n", - "# Decorator that wraps the function\n", - "@torchopt.diff.zero_order(distribution=distribution, method='naive', argnums=0, num_samples=100, sigma=0.01)\n", - "def forward(params, data):\n", - " # Forward optimization process for params\n", - " ...\n", - " return objective # the returned tensor should be a scalar tensor\n", - "\n", - "# Define params and get data\n", - "params, data = ..., ...\n", - "\n", - "# Forward pass\n", - "loss = forward(params, data)\n", - "# Backward pass using zero-order differentiation\n", - "grads = torch.autograd.grad(loss, params)\n", - "```" - ] - }, - { - "cell_type": "markdown", - "id": "dbef87df-2164-4f1d-8919-37a6fbdc5011", - "metadata": {}, - "source": [ - "Here we use the example of a linear layer as an example, note that this is just an example to show linear layer can work with ES." - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "id": "8d623b2f-48ee-4df6-a2ce-cf306b4c9067", - "metadata": {}, - "outputs": [ + "cells" : [ { - "name": "stdout", - "output_type": "stream", - "text": [ - "001: tensor(0.0265, grad_fn=)\n", - "002: tensor(0.0243, grad_fn=)\n", - "003: tensor(0.0222, grad_fn=)\n", - "004: tensor(0.0202, grad_fn=)\n", - "005: tensor(0.0184, grad_fn=)\n", - "006: tensor(0.0170, grad_fn=)\n", - "007: tensor(0.0157, grad_fn=)\n", - "008: tensor(0.0146, grad_fn=)\n", - "009: tensor(0.0137, grad_fn=)\n", - "010: tensor(0.0130, grad_fn=)\n", - "011: tensor(0.0123, grad_fn=)\n", - "012: tensor(0.0118, grad_fn=)\n", - "013: tensor(0.0114, grad_fn=)\n", - "014: tensor(0.0111, grad_fn=)\n", - "015: tensor(0.0111, grad_fn=)\n", - "016: tensor(0.0111, grad_fn=)\n", - "017: tensor(0.0113, grad_fn=)\n", - "018: tensor(0.0115, grad_fn=)\n", - "019: tensor(0.0118, grad_fn=)\n", - "020: tensor(0.0120, grad_fn=)\n", - "021: tensor(0.0121, grad_fn=)\n", - "022: tensor(0.0121, grad_fn=)\n", - "023: tensor(0.0122, grad_fn=)\n", - "024: tensor(0.0122, grad_fn=)\n", - "025: tensor(0.0122, grad_fn=)\n" - ] - } - ], - "source": [ - "torch.random.manual_seed(0)\n", - "\n", - "fmodel, params = functorch.make_functional(nn.Linear(32, 1))\n", - "x = torch.randn(64, 32) * 0.1\n", - "y = torch.randn(64, 1) * 0.1\n", - "distribution = torch.distributions.Normal(loc=0, scale=1)\n", - "\n", - "\n", - "@torchopt.diff.zero_order(\n", - " distribution=distribution, method='forward', argnums=0, num_samples=100, sigma=0.01\n", - ")\n", - "def forward_process(params, fn, x, y):\n", - " y_pred = fn(params, x)\n", - " loss = F.mse_loss(y_pred, y)\n", - " return loss\n", - "\n", - "\n", - "optimizer = torchopt.adam(lr=0.01)\n", - "opt_state = optimizer.init(params) # init optimizer\n", - "\n", - "for i in range(25):\n", - " loss = forward_process(params, fmodel, x, y) # compute loss\n", - "\n", - " grads = torch.autograd.grad(loss, params) # compute gradients\n", - " updates, opt_state = optimizer.update(grads, opt_state) # get updates\n", - " params = torchopt.apply_updates(params, updates) # update network parameters\n", - "\n", - " print(f'{i + 1:03d}: {loss!r}')" - ] - }, - { - 
"attachments": {}, - "cell_type": "markdown", - "id": "db723f6b", - "metadata": {}, - "source": [ - "## 2. OOP API\n", - "\n", - "The basic OOP API is the class `ZeroOrderGradientModule`. We make the network as an `nn.Module` following a classical PyTorch style. Users need to define the forward process zero-order gradient procedures `forward()` and a noise sampling function `sample()`. Here we show the specific meaning for each parameter used in the class.\n", - "\n", - "- `method` for different kind of algorithms, we support `'naive'` ([ES-RL](https://arxiv.org/abs/1703.03864)), `'forward'` ([Forward-FD](http://proceedings.mlr.press/v80/choromanski18a/choromanski18a.pdf)), and `'antithetic'` ([antithetic](https://d1wqtxts1xzle7.cloudfront.net/75609515/coredp2011_1web-with-cover-page-v2.pdf?Expires=1670215467&Signature=RfP~mQhhhI7aGknwXbRBgSggFrKuNTPYdyUSdMmfTxOa62QoOJAm-Xhr3F1PLyjUQc2JVxmKIKGGuyYvyfCTpB31dfmMtuVQxZMWVF-SfErTN05SliC93yjA1x1g2kjhn8bkBFdQqGl~1RQSKnhj88BakgSeDNzyCxwbD5VgR89BXRs4YIK5RBIKYtgLhoyz5jar7wHS3TJhRzs3WNeTIAjAmLqJ068oGFZ0Jr7maGquTe3w~8LEEIprJ6cyCMc6b1UUJkmwjNq0RLTVbxgFjfi4Z9kyxyJB9IOS1J25OOON4jfwh5JlXS7MVskuONUyHJim1TQ8OwCraKlBsQLPQw__&Key-Pair-Id=APKAJLOHF5GGSLRBV4ZA)).\n", - "- `num_samples` specifies how many times we want to conduct the sampling.\n", - "- `sigma` is for precision. This is the scaling factor for the sampling distribution.\n", - "\n", - "We show the pseudo code in the following part.\n", - "\n", - "```python\n", - "from torchopt.nn import ZeroOrderGradientModule\n", - "\n", - "# Inherited from the class ZeroOrderGradientModule\n", - "# Optionally specify the `method` and/or `num_samples` and/or `sigma` used for sampling\n", - "class Net(ZeroOrderGradientModule, method='naive', num_samples=100, sigma=0.01):\n", - " def __init__(self, ...):\n", - " ...\n", - "\n", - " def forward(self, batch):\n", - " # Forward process\n", - " ...\n", - " return objective # the returned tensor should be a scalar tensor\n", - "\n", - " def sample(self, sample_shape=torch.Size()):\n", - " # Generate a batch of noise samples\n", - " # NOTE: The distribution should be spherical symmetric and with a constant variance of 1.\n", - " ...\n", - " return noise_batch\n", - "\n", - "# Get model and data\n", - "net = Net(...)\n", - "data = ...\n", - "\n", - "# Forward pass\n", - "loss = Net(data)\n", - "# Backward pass using zero-order differentiation\n", - "grads = torch.autograd.grad(loss, net.parameters())\n", - "```" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "id": "b53524f5", - "metadata": {}, - "source": [ - "Here we reimplement the functional API example above with the OOP API." 
- ] - }, - { - "cell_type": "code", - "execution_count": 3, - "id": "ecc5730c", - "metadata": {}, - "outputs": [ + "cell_type" : "markdown", + "id" : "8850c832-3b54-4971-8ee0-2cd64b585ea8", + "metadata" : {}, + "source" : ["# TorchOpt for Zero-Order Differentiation"] + }, { - "name": "stdout", - "output_type": "stream", - "text": [ - "001: tensor(0.0201, grad_fn=)\n", - "002: tensor(0.0181, grad_fn=)\n", - "003: tensor(0.0167, grad_fn=)\n", - "004: tensor(0.0153, grad_fn=)\n", - "005: tensor(0.0142, grad_fn=)\n", - "006: tensor(0.0133, grad_fn=)\n", - "007: tensor(0.0125, grad_fn=)\n", - "008: tensor(0.0119, grad_fn=)\n", - "009: tensor(0.0116, grad_fn=)\n", - "010: tensor(0.0114, grad_fn=)\n", - "011: tensor(0.0112, grad_fn=)\n", - "012: tensor(0.0112, grad_fn=)\n", - "013: tensor(0.0113, grad_fn=)\n", - "014: tensor(0.0116, grad_fn=)\n", - "015: tensor(0.0118, grad_fn=)\n", - "016: tensor(0.0121, grad_fn=)\n", - "017: tensor(0.0123, grad_fn=)\n", - "018: tensor(0.0125, grad_fn=)\n", - "019: tensor(0.0127, grad_fn=)\n", - "020: tensor(0.0127, grad_fn=)\n", - "021: tensor(0.0125, grad_fn=)\n", - "022: tensor(0.0123, grad_fn=)\n", - "023: tensor(0.0120, grad_fn=)\n", - "024: tensor(0.0118, grad_fn=)\n", - "025: tensor(0.0117, grad_fn=)\n" - ] + "cell_type" : "markdown", + "id" : "2b547376", + "metadata" : {}, + "source" : ["[](https://" + "colab.research.google.com/github/metaopt/torchopt/blob/main/tutorials/" + "6_Zero_Order_Differentiation.ipynb)"] + }, + { + "cell_type" : "markdown", + "id" : "8d7f9865-dc02-43d4-be90-da1160c4e4dd", + "metadata" : {}, + "source" : [ + "When the inner-loop process is non-differentiable or one wants to eliminate the heavy " + "computation burdens in the previous two modes (brought by Hessian), one can choose ZD. ZD " + "typically gets gradients based on zero-order estimation, such as finite-difference, or " + "Evolutionary Strategy.\n", + "\n", + "TorchOpt offers API for ES-based differentiation. Instead of optimizing the objective $f " + "(\\boldsymbol{\\theta}): \\mathbb{R}^n \\to \\mathbb{R}$, ES optimizes a Gaussion " + "smoothing objective defined as $\\tilde{f}_{\\sigma} (\\boldsymbol{\\theta}) = " + "\\mathbb{E}_{\\boldsymbol{z} \\sim \\mathcal{N}( 0, {I}_d )} [ f (\\boldsymbol{\\theta} + " + "\\sigma \\, \\boldsymbol{z}) ]$, where $\\sigma$ denotes precision. The gradient of such " + "objective is $\\nabla_{\\boldsymbol{\\theta}} \\tilde{f}_{\\sigma} " + "(\\boldsymbol{\\theta}) = \\frac{1}{\\sigma} \\mathbb{E}_{\\boldsymbol{z} \\sim " + "\\mathcal{N}( 0, {I}_d )} [ f (\\boldsymbol{\\theta} + \\sigma \\, \\boldsymbol{z}) " + "\\cdot \\boldsymbol{z} ]$. Refer to [ES-MAML](https://arxiv.org/pdf/1910.01215.pdf) for " + "more details." + ] + }, + { + "cell_type" : "markdown", + "id" : "d7e4b9e1-115f-45ad-a9b3-ea338bcfe6dd", + "metadata" : {}, + "source" : ["In this tutorial, we will introduce how TorchOpt can be used to ES-based " + "differentiation."] + }, + { + "cell_type" : "code", + "execution_count" : 1, + "id" : "8f13ae67-e328-409f-84a8-1fc425c03a66", + "metadata" : {}, + "outputs" : [], + "source" : [ + "import functorch\n", + "import torch\n", + "import torch.nn as nn\n", + "import torch.nn.functional as F\n", + "\n", + "import torchopt" + ] + }, + { + "cell_type" : "markdown", + "id" : "0cdaac49-4b94-4900-9bb5-a39057ac8b21", + "metadata" : {}, + "source" : [ + "## 1. 
Functional API\n", + "\n", + "The basic functional API is `torchopt.diff.zero_order.zero_order`, which is used as the " + "decorator for the forward process zero-order gradient procedures. Users are required to " + "implement the noise sampling function, which will be used as the input of zero_order " + "decorator. Here we show the specific meaning for each parameter used in the decorator.\n", + "\n", + "- `distribution` for noise sampling distribution. The distribution $\\lambda$ should be " + "spherical symmetric and with a constant variance of $1$ for each element. I.e.:\n", + "\n", + " - Spherical symmetric: $\\mathbb{E}_{\\boldsymbol{z} \\sim \\lambda} [ " + "\\boldsymbol{z} ] = \\boldsymbol{0}$.\n", + " - Constant variance of $1$ for each element: $\\mathbb{E}_{\\boldsymbol{z} \\sim " + "\\lambda} [ {\\lvert z_i \\rvert}^2 ] = 1$.\n", + " - For example, the standard multi-dimensional normal distribution $\\mathcal{N} " + "(\\boldsymbol{0}, \\boldsymbol{1})$.\n", + "\n", + "- `method` for different kind of algorithms, we support `'naive'` " + "([ES-RL](https://arxiv.org/abs/1703.03864)), `'forward'` " + "([Forward-FD](http://proceedings.mlr.press/v80/choromanski18a/choromanski18a.pdf)), and " + "`'antithetic'` ([antithetic](https://arxiv.org/abs/1803.07055)).\n", + "\n", + " $$\n", + " \\begin{align*}\n", + " \\text{naive} \\qquad & \\nabla_{\\boldsymbol{\\theta}} \\tilde{f}_{\\sigma} " + "(\\boldsymbol{\\theta}) = \\frac{1}{\\sigma} \\mathbb{E}_{\\boldsymbol{z} \\sim \\lambda} " + "[ f (\\boldsymbol{\\theta} + \\sigma \\, \\boldsymbol{z}) \\cdot \\boldsymbol{z} ] \\\\\n", + " \\text{forward} \\qquad & \\nabla_{\\boldsymbol{\\theta}} \\tilde{f}_{\\sigma} " + "(\\boldsymbol{\\theta}) = \\frac{1}{\\sigma} \\mathbb{E}_{\\boldsymbol{z} \\sim \\lambda} " + "[ ( f (\\boldsymbol{\\theta} + \\sigma \\, \\boldsymbol{z}) - f (\\boldsymbol{\\theta}) ) " + "\\cdot \\boldsymbol{z} ] \\\\\n", + " \\text{antithetic} \\qquad & \\nabla_{\\boldsymbol{\\theta}} \\tilde{f}_{\\sigma} " + "(\\boldsymbol{\\theta}) = \\frac{1}{2 \\sigma} \\mathbb{E}_{\\boldsymbol{z} \\sim " + "\\lambda} [ (f (\\boldsymbol{\\theta} + \\sigma \\, \\boldsymbol{z}) - f " + "(\\boldsymbol{\\theta} + \\sigma \\, \\boldsymbol{z}) ) \\cdot \\boldsymbol{z} ]\n", + " \\end{align*}\n", + " $$\n", + "\n", + "- `argnums` specifies which parameter we want to trace the meta-gradient.\n", + "- `num_samples` specifies how many times we want to conduct the sampling.\n", + "- `sigma` is for precision. This is the scaling factor for the sampling distribution.\n", + "\n", + "We show the pseudo code in the following part.\n", + "\n", + "```python\n", + "# Functional API for zero-order differentiation\n", + "# 1. Customize the noise distribution via a distribution class\n", + "class Distribution:\n", + " def sample(self, sample_shape=torch.Size()):\n", + " # Sampling function for noise\n", + " # NOTE: The distribution should be spherical symmetric and with a constant " + "variance of 1.\n", + " ...\n", + " return noise_batch\n", + "\n", + "distribution = Distribution()\n", + "\n", + "# 2. Customize the noise distribution via a sampling function\n", + "def distribution(sample_shape=torch.Size()):\n", + " # Sampling function for noise\n", + " # NOTE: The distribution should be spherical symmetric and with a constant variance " + "of 1.\n", + " ...\n", + " return noise_batch\n", + "\n", + "# 3. 
+ {
+ "cell_type" : "markdown",
+ "id" : "dbef87df-2164-4f1d-8919-37a6fbdc5011",
+ "metadata" : {},
+ "source" : ["Here we use a linear layer as an example; note that this is only a simple "
+ "demonstration that a linear layer can work with ES."]
+ },
+ {
+ "cell_type" : "code",
+ "execution_count" : 2,
+ "id" : "8d623b2f-48ee-4df6-a2ce-cf306b4c9067",
+ "metadata" : {},
+ "outputs" : [ {
+ "name" : "stdout",
+ "output_type" : "stream",
+ "text" : [
+ "001: tensor(0.0265, grad_fn=)\n",
+ "002: tensor(0.0243, grad_fn=)\n",
+ "003: tensor(0.0222, grad_fn=)\n",
+ "004: tensor(0.0202, grad_fn=)\n",
+ "005: tensor(0.0184, grad_fn=)\n",
+ "006: tensor(0.0170, grad_fn=)\n",
+ "007: tensor(0.0157, grad_fn=)\n",
+ "008: tensor(0.0146, grad_fn=)\n",
+ "009: tensor(0.0137, grad_fn=)\n",
+ "010: tensor(0.0130, grad_fn=)\n",
+ "011: tensor(0.0123, grad_fn=)\n",
+ "012: tensor(0.0118, grad_fn=)\n",
+ "013: tensor(0.0114, grad_fn=)\n",
+ "014: tensor(0.0111, grad_fn=)\n",
+ "015: tensor(0.0111, grad_fn=)\n",
+ "016: tensor(0.0111, grad_fn=)\n",
+ "017: tensor(0.0113, grad_fn=)\n",
+ "018: tensor(0.0115, grad_fn=)\n",
+ "019: tensor(0.0118, grad_fn=)\n",
+ "020: tensor(0.0120, grad_fn=)\n",
+ "021: tensor(0.0121, grad_fn=)\n",
+ "022: tensor(0.0121, grad_fn=)\n",
+ "023: tensor(0.0122, grad_fn=)\n",
+ "024: tensor(0.0122, grad_fn=)\n",
+ "025: tensor(0.0122, grad_fn=)\n"
+ ]
+ } ],
+ "source" : [
+ "torch.random.manual_seed(0)\n",
+ "\n",
+ "fmodel, params = functorch.make_functional(nn.Linear(32, 1))\n",
+ "x = torch.randn(64, 32) * 0.1\n",
+ "y = torch.randn(64, 1) * 0.1\n",
+ "distribution = torch.distributions.Normal(loc=0, scale=1)\n",
+ "\n",
+ "\n",
+ "@torchopt.diff.zero_order(\n",
+ "    distribution=distribution, method='forward', argnums=0, num_samples=100, sigma=0.01\n",
+ ")\n",
+ "def forward_process(params, fn, x, y):\n",
+ "    y_pred = fn(params, x)\n",
+ "    loss = F.mse_loss(y_pred, y)\n",
+ "    return loss\n",
+ "\n",
+ "\n",
+ "optimizer = torchopt.adam(lr=0.01)\n",
+ "opt_state = optimizer.init(params)  # init optimizer\n",
+ "\n",
+ "for i in range(25):\n",
+ "    loss = forward_process(params, fmodel, x, y)  # compute loss\n",
+ "\n",
+ "    grads = torch.autograd.grad(loss, params)  # compute gradients\n",
+ "    updates, opt_state = optimizer.update(grads, opt_state)  # get updates\n",
+ "    params = torchopt.apply_updates(params, updates)  # update network parameters\n",
+ "\n",
+ "    print(f'{i + 1:03d}: {loss!r}')"
+ ]
+ },
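Since the regression objective in the cell above is itself differentiable, the zero-order gradients can be sanity-checked against ordinary autograd. A minimal sketch (an editorial aside, not a cell of the notebook; it reuses `fmodel`, `params`, `x`, `y`, and `forward_process` from the cell above, and the cosine-similarity comparison is just one reasonable choice of metric):

```python
# Zero-order gradients through the decorated forward process
loss = forward_process(params, fmodel, x, y)
zo_grads = torch.autograd.grad(loss, params)

# First-order gradients of the same MSE objective via functorch autograd
fo_grads = functorch.grad(lambda p: F.mse_loss(fmodel(p, x), y))(params)

# The two should roughly agree in direction; agreement improves as `num_samples` grows.
for zo, fo in zip(zo_grads, fo_grads):
    cos = F.cosine_similarity(zo.flatten(), fo.flatten(), dim=0)
    print(f'cosine similarity: {cos.item():.3f}')
```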
+ {
+ "attachments" : {},
+ "cell_type" : "markdown",
+ "id" : "db723f6b",
+ "metadata" : {},
+ "source" : [
+ "## 2. OOP API\n",
+ "\n",
+ "The basic OOP API is the class `ZeroOrderGradientModule`. We define the network as an "
+ "`nn.Module` following the classical PyTorch style. Users need to define the forward process "
+ "of the zero-order gradient procedure in `forward()` and a noise sampling function "
+ "`sample()`. Here we show the specific meaning of each parameter used in the class.\n",
+ "\n",
+ "- `method` selects the algorithm; we support `'naive'` "
+ "([ES-RL](https://arxiv.org/abs/1703.03864)), `'forward'` "
+ "([Forward-FD](http://proceedings.mlr.press/v80/choromanski18a/choromanski18a.pdf)), and "
+ "`'antithetic'` ([antithetic](https://arxiv.org/abs/1803.07055)).\n",
+ "- `num_samples` specifies how many times we want to conduct the sampling.\n",
+ "- `sigma` is the precision, i.e., the scaling factor for the sampling distribution.\n",
+ "\n",
+ "We show the pseudo code below.\n",
+ "\n",
+ "```python\n",
+ "from torchopt.nn import ZeroOrderGradientModule\n",
+ "\n",
+ "# Inherited from the class ZeroOrderGradientModule\n",
+ "# Optionally specify the `method` and/or `num_samples` and/or `sigma` used for sampling\n",
+ "class Net(ZeroOrderGradientModule, method='naive', num_samples=100, sigma=0.01):\n",
+ "    def __init__(self, ...):\n",
+ "        ...\n",
+ "\n",
+ "    def forward(self, batch):\n",
+ "        # Forward process\n",
+ "        ...\n",
+ "        return objective  # the returned tensor should be a scalar tensor\n",
+ "\n",
+ "    def sample(self, sample_shape=torch.Size()):\n",
+ "        # Generate a batch of noise samples\n",
+ "        # NOTE: The distribution should be spherically symmetric with a constant "
+ "variance of 1.\n",
+ "        ...\n",
+ "        return noise_batch\n",
+ "\n",
+ "# Get model and data\n",
+ "net = Net(...)\n",
+ "data = ...\n",
+ "\n",
+ "# Forward pass\n",
+ "loss = net(data)\n",
+ "# Backward pass using zero-order differentiation\n",
+ "grads = torch.autograd.grad(loss, net.parameters())\n",
+ "```"
+ ]
+ },
+ {
+ "attachments" : {},
+ "cell_type" : "markdown",
+ "id" : "b53524f5",
+ "metadata" : {},
+ "source" : ["Here we reimplement the functional API example above with the OOP API."]
+ },
+ {
+ "cell_type" : "code",
+ "execution_count" : 3,
+ "id" : "ecc5730c",
+ "metadata" : {},
+ "outputs" : [ {
+ "name" : "stdout",
+ "output_type" : "stream",
+ "text" : [
+ "001: tensor(0.0201, grad_fn=)\n",
+ "002: tensor(0.0181, grad_fn=)\n",
+ "003: tensor(0.0167, grad_fn=)\n",
+ "004: tensor(0.0153, grad_fn=)\n",
+ "005: tensor(0.0142, grad_fn=)\n",
+ "006: tensor(0.0133, grad_fn=)\n",
+ "007: tensor(0.0125, grad_fn=)\n",
+ "008: tensor(0.0119, grad_fn=)\n",
+ "009: tensor(0.0116, grad_fn=)\n",
+ "010: tensor(0.0114, grad_fn=)\n",
+ "011: tensor(0.0112, grad_fn=)\n",
+ "012: tensor(0.0112, grad_fn=)\n",
+ "013: tensor(0.0113, grad_fn=)\n",
+ "014: tensor(0.0116, grad_fn=)\n",
+ "015: tensor(0.0118, grad_fn=)\n",
+ "016: tensor(0.0121, grad_fn=)\n",
+ "017: tensor(0.0123, grad_fn=)\n",
+ "018: tensor(0.0125, grad_fn=)\n",
+ "019: tensor(0.0127, grad_fn=)\n",
+ "020: tensor(0.0127, grad_fn=)\n",
+ "021: tensor(0.0125, grad_fn=)\n",
+ "022: tensor(0.0123, grad_fn=)\n",
+ "023: tensor(0.0120, grad_fn=)\n",
+ "024: tensor(0.0118, grad_fn=)\n",
+ "025: 
tensor(0.0117, grad_fn=)\n" + ] + } ], + "source" : [ + "torch.random.manual_seed(0)\n", + "\n", + "\n", + "class Net(torchopt.nn.ZeroOrderGradientModule, method='forward', num_samples=100, " + "sigma=0.01):\n", + " def __init__(self, dim):\n", + " super().__init__()\n", + " self.fc = nn.Linear(dim, 1)\n", + " self.distribution = torch.distributions.Normal(loc=0, scale=1)\n", + "\n", + " def forward(self, x, y):\n", + " y_pred = self.fc(x)\n", + " loss = F.mse_loss(y_pred, y)\n", + " return loss\n", + "\n", + " def sample(self, sample_shape=torch.Size()):\n", + " return self.distribution.sample(sample_shape)\n", + "\n", + "\n", + "x = torch.randn(64, 32) * 0.1\n", + "y = torch.randn(64, 1) * 0.1\n", + "net = Net(dim=32)\n", + "\n", + "\n", + "optimizer = torchopt.Adam(net.parameters(), lr=0.01)\n", + "\n", + "for i in range(25):\n", + " loss = net(x, y) # compute loss\n", + "\n", + " optimizer.zero_grad()\n", + " loss.backward() # backward pass\n", + " optimizer.step() # update network parameters\n", + "\n", + " print(f'{i + 1:03d}: {loss!r}')" + ] + } + ], + "metadata" : { + "kernelspec" : + {"display_name" : "Python 3.9.15 ('torchopt')", "language" : "python", "name" : "python3"}, + "language_info" : { + "codemirror_mode" : {"name" : "ipython", "version" : 3}, + "file_extension" : ".py", + "mimetype" : "text/x-python", + "name" : "python", + "nbconvert_exporter" : "python", + "pygments_lexer" : "ipython3", + "version" : "3.9.15" + }, + "vscode" : { + "interpreter" : + {"hash" : "2a8cc1ff2cbc47027bf9993941710d9ab9175f14080903d9c7c432ee63d681da"} } - ], - "source": [ - "torch.random.manual_seed(0)\n", - "\n", - "\n", - "class Net(torchopt.nn.ZeroOrderGradientModule, method='forward', num_samples=100, sigma=0.01):\n", - " def __init__(self, dim):\n", - " super().__init__()\n", - " self.fc = nn.Linear(dim, 1)\n", - " self.distribution = torch.distributions.Normal(loc=0, scale=1)\n", - "\n", - " def forward(self, x, y):\n", - " y_pred = self.fc(x)\n", - " loss = F.mse_loss(y_pred, y)\n", - " return loss\n", - "\n", - " def sample(self, sample_shape=torch.Size()):\n", - " return self.distribution.sample(sample_shape)\n", - "\n", - "\n", - "x = torch.randn(64, 32) * 0.1\n", - "y = torch.randn(64, 1) * 0.1\n", - "net = Net(dim=32)\n", - "\n", - "\n", - "optimizer = torchopt.Adam(net.parameters(), lr=0.01)\n", - "\n", - "for i in range(25):\n", - " loss = net(x, y) # compute loss\n", - "\n", - " optimizer.zero_grad()\n", - " loss.backward() # backward pass\n", - " optimizer.step() # update network parameters\n", - "\n", - " print(f'{i + 1:03d}: {loss!r}')" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3.9.15 ('torchopt')", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.9.15" }, - "vscode": { - "interpreter": { - "hash": "2a8cc1ff2cbc47027bf9993941710d9ab9175f14080903d9c7c432ee63d681da" - } - } - }, - "nbformat": 4, - "nbformat_minor": 5 + "nbformat" : 4, + "nbformat_minor" : 5 }