From bb31de89993441224e9005926dedad95395bb058 Mon Sep 17 00:00:00 2001
From: casinca <47400729+casinca@users.noreply.github.com>
Date: Mon, 18 Nov 2024 11:52:42 +0100
Subject: [PATCH] [minor] typo & comments (#441)

* typo & comment

- safe -> save
- commenting code: batch_size, seq_len = in_idx.shape

* comment

- adding # NEW for assert num_heads % num_kv_groups == 0

* update memory wording

---------

Co-authored-by: rasbt
---
 .../07_gpt_to_llama/converting-gpt-to-llama2.ipynb | 10 +++++-----
 .../converting-llama2-to-llama3.ipynb | 14 +++++++-------
 ch05/07_gpt_to_llama/standalone-llama32.ipynb | 4 ++--
 3 files changed, 14 insertions(+), 14 deletions(-)

diff --git a/ch05/07_gpt_to_llama/converting-gpt-to-llama2.ipynb b/ch05/07_gpt_to_llama/converting-gpt-to-llama2.ipynb
index e75ff4a6..00bcabbb 100644
--- a/ch05/07_gpt_to_llama/converting-gpt-to-llama2.ipynb
+++ b/ch05/07_gpt_to_llama/converting-gpt-to-llama2.ipynb
@@ -381,7 +381,7 @@
 "id": "qcD8LSHNhBRW"
 },
 "source": [
- "- Note that we also added a `dtype=cfg[\"dtype\"]` setting above, which will allow us to load the model directly in lower precision formats later to save memory (versus instantiating it in the original 32-bit precision format and then converting it)\n",
+ "- Note that we also added a `dtype=cfg[\"dtype\"]` setting above, which will allow us to load the model directly in lower precision formats later to reduce memory usage (versus instantiating it in the original 32-bit precision format and then converting it)\n",
 "- We also set `bias=False` since Llama doesn't use any bias units"
 ]
 },
@@ -648,7 +648,7 @@
 "\n",
 "mha(example_batch)\n",
 "\n",
- "del mha # delete to safe memory"
+ "del mha # delete to free up memory"
 ]
 },
 {
@@ -781,7 +781,7 @@
 " self.out_head = nn.Linear(cfg[\"emb_dim\"], cfg[\"vocab_size\"], bias=False, dtype=cfg[\"dtype\"])\n",
 "\n",
 " def forward(self, in_idx):\n",
- " batch_size, seq_len = in_idx.shape\n",
+ " # batch_size, seq_len = in_idx.shape\n",
 " tok_embeds = self.tok_emb(in_idx)\n",
 " # pos_embeds = self.pos_emb(torch.arange(seq_len, device=in_idx.device))\n",
 " x = tok_embeds # + pos_embeds # Shape [batch_size, num_tokens, emb_size]\n",
@@ -890,7 +890,7 @@
 " \"n_heads\": 32, # Number of attention heads\n",
 " \"n_layers\": 32, # Number of layers\n",
 " \"hidden_dim\": 11008, # NEW: Size of the intermediate dimension in FeedForward\n",
- " \"dtype\": torch.bfloat16 # NEW: Lower-precision dtype to save memory\n",
+ " \"dtype\": torch.bfloat16 # NEW: Lower-precision dtype to reduce memory usage\n",
 "}"
 ]
 },
@@ -1691,7 +1691,7 @@
 "name": "python",
 "nbconvert_exporter": "python",
 "pygments_lexer": "ipython3",
- "version": "3.10.6"
+ "version": "3.11.4"
 },
 "widgets": {
 "application/vnd.jupyter.widget-state+json": {
diff --git a/ch05/07_gpt_to_llama/converting-llama2-to-llama3.ipynb b/ch05/07_gpt_to_llama/converting-llama2-to-llama3.ipynb
index eac9d582..4e211ba0 100644
--- a/ch05/07_gpt_to_llama/converting-llama2-to-llama3.ipynb
+++ b/ch05/07_gpt_to_llama/converting-llama2-to-llama3.ipynb
@@ -481,7 +481,7 @@
 " ):\n",
 " super().__init__()\n",
 " assert d_out % num_heads == 0, \"d_out must be divisible by num_heads\"\n",
- " assert num_heads % num_kv_groups == 0, \"num_heads must be divisible by num_kv_groups\"\n",
+ " assert num_heads % num_kv_groups == 0, \"num_heads must be divisible by num_kv_groups\" # NEW\n",
 "\n",
 " self.d_out = d_out\n",
 " self.num_heads = num_heads\n",
@@ -886,7 +886,7 @@
 " \"n_heads\": 32, # Number of attention heads\n",
 " \"n_layers\": 32, # Number of layers\n",
 " \"hidden_dim\": 11_008, # Size of the intermediate dimension in FeedForward\n",
- " \"dtype\": torch.bfloat16 # Lower-precision dtype to save memory\n",
+ " \"dtype\": torch.bfloat16 # Lower-precision dtype to reduce memory usage\n",
 "}"
 ]
 },
@@ -909,7 +909,7 @@
 " \"n_kv_groups\": 8, # NEW: Key-Value groups for grouped-query attention\n",
 " \"rope_base\": 500_000.0, # NEW: The base in RoPE's \"theta\" was increased to 500_000\n",
 " \"rope_freq\": None, # NEW: Additional configuration for adjusting the RoPE frequencies\n",
- " \"dtype\": torch.bfloat16 # Lower-precision dtype to save memory\n",
+ " \"dtype\": torch.bfloat16 # Lower-precision dtype to reduce memory usage\n",
 "}"
 ]
 },
@@ -2062,7 +2062,7 @@
 " \"n_kv_groups\": 8, # Key-Value groups for grouped-query attention\n",
 " \"rope_base\": 500_000.0, # The base in RoPE's \"theta\"\n",
 " \"rope_freq\": None, # Additional configuration for adjusting the RoPE frequencies\n",
- " \"dtype\": torch.bfloat16 # Lower-precision dtype to save memory\n",
+ " \"dtype\": torch.bfloat16 # Lower-precision dtype to reduce memory usage\n",
 "}\n",
 "\n",
 "LLAMA31_CONFIG_8B = {\n",
@@ -2074,7 +2074,7 @@
 " \"hidden_dim\": 14_336, # Size of the intermediate dimension in FeedForward\n",
 " \"n_kv_groups\": 8, # Key-Value groups for grouped-query attention\n",
 " \"rope_base\": 500_000.0, # The base in RoPE's \"theta\"\n",
- " \"dtype\": torch.bfloat16, # Lower-precision dtype to save memory\n",
+ " \"dtype\": torch.bfloat16, # Lower-precision dtype to reduce memory usage\n",
 " \"rope_freq\": { # NEW: RoPE frequency scaling\n",
 " \"factor\": 8.0,\n",
 " \"low_freq_factor\": 1.0,\n",
@@ -2448,7 +2448,7 @@
 " \"hidden_dim\": 14_336, # Size of the intermediate dimension in FeedForward\n",
 " \"n_kv_groups\": 8, # Key-Value groups for grouped-query attention\n",
 " \"rope_base\": 500_000.0, # The base in RoPE's \"theta\"\n",
- " \"dtype\": torch.bfloat16, # Lower-precision dtype to save memory\n",
+ " \"dtype\": torch.bfloat16, # Lower-precision dtype to reduce memory usage\n",
 " \"rope_freq\": { # NEW: RoPE frequency scaling\n",
 " \"factor\": 8.0,\n",
 " \"low_freq_factor\": 1.0,\n",
@@ -2467,7 +2467,7 @@
 " \"hidden_dim\": 8192, # NEW: Almost half the size of the intermediate dimension in FeedForward\n",
 " \"n_kv_groups\": 8, # Key-Value groups for grouped-query attention\n",
 " \"rope_base\": 500_000.0, # The base in RoPE's \"theta\"\n",
- " \"dtype\": torch.bfloat16, # Lower-precision dtype to save memory\n",
+ " \"dtype\": torch.bfloat16, # Lower-precision dtype to reduce memory usage\n",
 " \"rope_freq\": { # RoPE frequency scaling\n",
 " \"factor\": 32.0, # NEW: Adjustment of the rescaling factor\n",
 " \"low_freq_factor\": 1.0,\n",
diff --git a/ch05/07_gpt_to_llama/standalone-llama32.ipynb b/ch05/07_gpt_to_llama/standalone-llama32.ipynb
index e4c94c41..d108df3a 100644
--- a/ch05/07_gpt_to_llama/standalone-llama32.ipynb
+++ b/ch05/07_gpt_to_llama/standalone-llama32.ipynb
@@ -438,7 +438,7 @@
 " \"hidden_dim\": 8192, # Size of the intermediate dimension in FeedForward\n",
 " \"n_kv_groups\": 8, # Key-Value groups for grouped-query attention\n",
 " \"rope_base\": 500_000.0, # The base in RoPE's \"theta\"\n",
- " \"dtype\": torch.bfloat16, # Lower-precision dtype to save memory\n",
+ " \"dtype\": torch.bfloat16, # Lower-precision dtype to reduce memory usage\n",
 " \"rope_freq\": { # RoPE frequency scaling\n",
 " \"factor\": 32.0,\n",
 " \"low_freq_factor\": 1.0,\n",
@@ -458,7 +458,7 @@
 "# \"hidden_dim\": 8192, # Size of the intermediate dimension in FeedForward\n",
 "# \"n_kv_groups\": 8, # Key-Value groups for grouped-query attention\n",
 "# \"rope_base\": 500_000.0, # The base in RoPE's \"theta\"\n",
- "# \"dtype\": torch.bfloat16, # Lower-precision dtype to save memory\n",
+ "# \"dtype\": torch.bfloat16, # Lower-precision dtype to reduce memory usage\n",
 "# \"rope_freq\": { # RoPE frequency scaling\n",
 "# \"factor\": 32.0,\n",
 "# \"low_freq_factor\": 1.0,\n",
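For context on the dtype comments touched above: the point of `dtype=cfg["dtype"]` is that the weights are created in bfloat16 (2 bytes per parameter) from the start, instead of being created in float32 (4 bytes per parameter) and converted afterwards, which roughly halves the peak memory during model instantiation. The following is a minimal illustrative sketch of the two paths; the TinyBlock module, its layer sizes, and the param_bytes helper are hypothetical placeholders, not code from the notebooks in this patch.

import torch
import torch.nn as nn

# Hypothetical toy module; the 4096 -> 11008 sizes are placeholders chosen
# only to make the memory difference visible.
class TinyBlock(nn.Module):
    def __init__(self, dtype=torch.float32):
        super().__init__()
        # Passing dtype=... creates the weights directly in that precision,
        # so a full float32 copy never has to exist.
        self.fc = nn.Linear(4096, 11008, bias=False, dtype=dtype)

    def forward(self, x):
        return self.fc(x)

def param_bytes(model):
    # Total bytes occupied by the model's parameters
    return sum(p.numel() * p.element_size() for p in model.parameters())

# Path 1: instantiate directly in bfloat16 (what dtype=cfg["dtype"] enables)
direct = TinyBlock(dtype=torch.bfloat16)

# Path 2: instantiate in float32 first, then convert; the final size is the
# same, but peak memory was roughly twice as high while the float32 weights
# still existed.
converted = TinyBlock(dtype=torch.float32).to(torch.bfloat16)

print(param_bytes(direct))     # ~2 bytes per parameter
print(param_bytes(converted))  # same final size, but higher peak memory earlier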